├── .dockerignore ├── .gitattributes ├── .gitignore ├── .gitmodules ├── .jvmopts ├── .scalafmt.conf ├── API.adoc ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── build.sbt ├── config ├── docker.json ├── local.json └── prod.json ├── data ├── ait-collections.json ├── ait-users.json ├── arch-users.json └── special-collections.json ├── entrypoint.sh ├── fairscheduler.xml ├── job_scripts ├── trocr-run.py └── whisper-run.py ├── lib └── javax.servlet-api-3.1.0.jar ├── logging └── .keep ├── migration └── LegacyDatasetMigrator.scala ├── project ├── assembly.sbt ├── build.properties └── plugins.sbt ├── src ├── main │ ├── bash │ │ └── sendmail │ └── scala │ │ └── org │ │ └── archive │ │ └── webservices │ │ └── ars │ │ ├── AdminController.scala │ │ ├── ApiController.scala │ │ ├── Arch.scala │ │ ├── ArchSwaggerSupport.scala │ │ ├── BaseController.scala │ │ ├── FilesController.scala │ │ ├── JobUuidApiController.scala │ │ ├── Keystone.scala │ │ ├── ScalatraBootstrap.scala │ │ ├── SwaggerController.scala │ │ ├── WasapiController.scala │ │ ├── addons │ │ ├── AddonLoader.scala │ │ └── ArchAddon.scala │ │ ├── ait │ │ ├── Ait.scala │ │ └── AitUser.scala │ │ ├── aut │ │ ├── AutLoader.scala │ │ ├── AutUtil.scala │ │ └── TikaUtil.scala │ │ ├── io │ │ ├── CollectionCache.scala │ │ ├── FileAccessContext.scala │ │ ├── FileAccessKeyRing.scala │ │ ├── FilePointer.scala │ │ ├── IOHelper.scala │ │ ├── MemoryCompressor.scala │ │ ├── RandomFileAccess.scala │ │ ├── Vault.scala │ │ └── WebArchiveLoader.scala │ │ ├── model │ │ ├── ArchCollection.scala │ │ ├── ArchCollectionInfo.scala │ │ ├── ArchCollectionStats.scala │ │ ├── ArchConf.scala │ │ ├── ArchJobCategories.scala │ │ ├── ArchJobCategory.scala │ │ ├── ArchJobInstanceInfo.scala │ │ ├── DerivativeOutput.scala │ │ ├── DerivativeOutputCache.scala │ │ ├── LocalArchConf.scala │ │ ├── PublishedDatasets.scala │ │ ├── api │ │ │ ├── AvailableJob.scala │ │ │ ├── AvailableJobsCategory.scala │ │ │ ├── Collection.scala │ │ │ ├── Dataset.scala │ │ │ ├── DatasetFile.scala │ │ │ ├── InputSpec.scala │ │ │ ├── JobState.scala │ │ │ ├── WasapiResponse.scala │ │ │ └── package.scala │ │ ├── app │ │ │ └── RequestContext.scala │ │ ├── collections │ │ │ ├── AitCollectionSpecifics.scala │ │ │ ├── CollectionSpecifics.scala │ │ │ ├── CustomCollectionSpecifics.scala │ │ │ ├── FileCollectionSpecifics.scala │ │ │ ├── GenericRandomAccess.scala │ │ │ ├── SpecialCollectionSpecifics.scala │ │ │ ├── UnionCollectionSpecifics.scala │ │ │ └── inputspecs │ │ │ │ ├── ArchCollectionSpecLoader.scala │ │ │ │ ├── CdxQuerySpecLoader.scala │ │ │ │ ├── DatasetSpecLoader.scala │ │ │ │ ├── FileRecord.scala │ │ │ │ ├── FileRecordFactory.scala │ │ │ │ ├── FileSpecLoader.scala │ │ │ │ ├── HdfsFileRecordFactory.scala │ │ │ │ ├── HttpFileRecordFactory.scala │ │ │ │ ├── InMemoryCdxFileRecord.scala │ │ │ │ ├── InputSpec.scala │ │ │ │ ├── InputSpecLoader.scala │ │ │ │ ├── LongestPrefixProbing.scala │ │ │ │ ├── MetaFilesSpecLoader.scala │ │ │ │ ├── MetaRemoteSpecLoader.scala │ │ │ │ ├── MultiSpecLoader.scala │ │ │ │ ├── OneTimeAccess.scala │ │ │ │ ├── S3FileRecordFactory.scala │ │ │ │ ├── S3HttpFileRecordFactory.scala │ │ │ │ ├── VaultFileRecordFactory.scala │ │ │ │ └── meta │ │ │ │ ├── FileMetaData.scala │ │ │ │ ├── FileMetaField.scala │ │ │ │ ├── FileMetaFieldSummary.scala │ │ │ │ ├── FileMetaFieldType.scala │ │ │ │ ├── FileMetaFieldTypeSummary.scala │ │ │ │ └── FileMetaSummary.scala │ │ └── users │ │ │ ├── ArchUser.scala │ │ │ ├── DefaultArchUser.scala │ │ │ └── KeystoneUser.scala │ │ ├── processing │ │ ├── 
ChainedJob.scala │ │ ├── DerivationJob.scala │ │ ├── DerivationJobConf.scala │ │ ├── DerivationJobInstance.scala │ │ ├── DerivationJobParameters.scala │ │ ├── GenericJob.scala │ │ ├── GenericJobManager.scala │ │ ├── JobManager.scala │ │ ├── JobManagerBase.scala │ │ ├── JobQueue.scala │ │ ├── JobStateManager.scala │ │ ├── PartialDerivationJob.scala │ │ ├── ProcessingState.scala │ │ ├── SampleVizData.scala │ │ ├── SparkJob.scala │ │ ├── SparkJobListener.scala │ │ ├── SparkJobManager.scala │ │ ├── SparkRunner.scala │ │ └── jobs │ │ │ ├── ArsLgaGeneration.scala │ │ │ ├── ArsWaneGeneration.scala │ │ │ ├── ArsWatGeneration.scala │ │ │ ├── AudioInformationExtraction.scala │ │ │ ├── DomainFrequencyExtraction.scala │ │ │ ├── DomainGraphExtraction.scala │ │ │ ├── ImageGraphExtraction.scala │ │ │ ├── ImageInformationExtraction.scala │ │ │ ├── PdfInformationExtraction.scala │ │ │ ├── PresentationProgramInformationExtraction.scala │ │ │ ├── SpreadsheetInformationExtraction.scala │ │ │ ├── TextFilesInformationExtraction.scala │ │ │ ├── VideoInformationExtraction.scala │ │ │ ├── WebGraphExtraction.scala │ │ │ ├── WebPagesExtraction.scala │ │ │ ├── WordProcessorInformationExtraction.scala │ │ │ ├── archivespark │ │ │ ├── AiJob.scala │ │ │ ├── ArchiveSparkFlexJob.scala │ │ │ ├── ArchiveSparkNoop.scala │ │ │ ├── base │ │ │ │ ├── ArchEnrichRoot.scala │ │ │ │ ├── ArchFileRecord.scala │ │ │ │ ├── ArchFileSpec.scala │ │ │ │ ├── ArchWarcRecord.scala │ │ │ │ ├── ArchWarcSpec.scala │ │ │ │ ├── ArchiveSparkBaseJob.scala │ │ │ │ ├── ArchiveSparkEnrichJob.scala │ │ │ │ ├── FileLoad.scala │ │ │ │ ├── LocalFileCache.scala │ │ │ │ └── PlainTextLoad.scala │ │ │ ├── functions │ │ │ │ ├── ArchFileBytes.scala │ │ │ │ ├── ArchFileCache.scala │ │ │ │ ├── ArchFileProcEnrichFuncBase.scala │ │ │ │ ├── ArchWarcPayload.scala │ │ │ │ ├── CondaBasedFunction.scala │ │ │ │ ├── CoreNlpEntities.scala │ │ │ │ ├── TrOCR.scala │ │ │ │ ├── Whisper.scala │ │ │ │ ├── WhisperText.scala │ │ │ │ └── adapters │ │ │ │ │ ├── ArchArchiveSparkFunctionAdapter.scala │ │ │ │ │ ├── CondaBasedArchiveSparkFunctionAdapter.scala │ │ │ │ │ └── EntitiesAdapter.scala │ │ │ └── preset │ │ │ │ ├── EntityExtraction.scala │ │ │ │ ├── TrOcrEntityExtraction.scala │ │ │ │ ├── TrOcrProcessing.scala │ │ │ │ ├── WhisperEntityExtraction.scala │ │ │ │ ├── WhisperText.scala │ │ │ │ └── WhisperTranscription.scala │ │ │ ├── shared │ │ │ ├── ArsJob.scala │ │ │ ├── AutJob.scala │ │ │ ├── BinaryInformationAutJob.scala │ │ │ └── NetworkAutJob.scala │ │ │ └── system │ │ │ ├── DatasetPublication.scala │ │ │ ├── MetadataSummary.scala │ │ │ └── UserDefinedQuery.scala │ │ └── util │ │ ├── CacheUtil.scala │ │ ├── Common.scala │ │ ├── DatafileUtil.scala │ │ ├── DatasetUtil.scala │ │ ├── FormatUtil.scala │ │ ├── HttpUtil.scala │ │ ├── LazyCache.scala │ │ ├── PublicSuffixUtil.scala │ │ └── UUID.scala └── test │ └── scala │ └── org │ └── archive │ └── webservices │ └── ars │ ├── ApiController.scala │ ├── Fixtures.scala │ ├── JobUuidApiController.scala │ ├── UnitSpec.scala │ └── model │ └── LocalArchConfSpec.scala ├── templates ├── notebooks │ ├── audio-information.ipynb │ ├── css-file-information.ipynb │ ├── domain-frequency.ipynb │ ├── domain-graph.ipynb │ ├── html-file-information.ipynb │ ├── image-graph.ipynb │ ├── image-information.ipynb │ ├── js-file-information.ipynb │ ├── json-file-information.ipynb │ ├── pdf-information.ipynb │ ├── plain-text-file-information.ipynb │ ├── powerpoint-information.ipynb │ ├── spreadsheet-information.ipynb │ ├── video-information.ipynb │ ├── 
web-graph.ipynb │ ├── web-pages.ipynb │ ├── word-document-information.ipynb │ └── xml-file-information.ipynb ├── sendmail_failed.txt ├── sendmail_finished.txt └── sendmail_udq-finished.txt └── webapp └── .gitkeep /.dockerignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | webapp/js/dist/** linguist-generated 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/arch/7cd8a6f4d2d557275b9d19381c8ad234ca134a60/.gitmodules -------------------------------------------------------------------------------- /.jvmopts: -------------------------------------------------------------------------------- 1 | -Xms512M 2 | -Xmx4096M 3 | -Xss2M 4 | -XX:MaxMetaspaceSize=1024M -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | # Apache Spark scalafmt configuration. 2 | 3 | align = none 4 | align.openParenDefnSite = false 5 | align.openParenCallSite = false 6 | align.tokens = [] 7 | optIn = { 8 | configStyleArguments = false 9 | } 10 | danglingParentheses.preset = false 11 | docstrings.style = Asterisk 12 | maxColumn = 98 13 | runner.dialect = scala212 14 | version = 3.7.7 15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | # Read the current user's ID so that we can assign the same ID to "arch" Docker user. 3 | UID = $$(id -u) 4 | 5 | PWD=$(shell pwd) 6 | IMAGE=ait-arch 7 | BASE_IMAGE=`grep -P 'BASE_IMAGE=.+$$' Dockerfile | cut -d'=' -f2` 8 | RUN_BASE_CMD=docker run --rm -it -v $(PWD)/shared:/opt/arch/shared -p 12341:12341 9 | GPU_TEST_CMD=docker run --gpus all $(BASE_IMAGE) 10 | CPU_MSG_CMD=printf '*\n* GPU not available: Whisper and TrOCR jobs will use the CPU\n*\n' 11 | 12 | config/config.json: 13 | cp config/docker.json config/config.json 14 | 15 | .PHONY: build-docker-image 16 | build-docker-image: config/config.json 17 | docker build --build-arg UID=$(UID) . -t $(IMAGE) 18 | 19 | shared: 20 | mkdir -p shared/in/collections; \ 21 | mkdir shared/log; \ 22 | mkdir -p shared/out/custom-collections; \ 23 | mkdir shared/out/datasets; 24 | 25 | # Define a function that first tests running the BASE_IMAGE with the --gpus option, 26 | # and if that doesn't exit with an error, runs ARCH with GPU support, and otherwise 27 | # displays a message indicating that AI jobs will use the CPU and ARCH without GPU support. 28 | # usage: $(call run_docker_image_fn,,) 29 | # The "$(or $(1),--it)" adds quotes with are necessary when a first argument value is 30 | # specified and defaults to the redundant "-it" when one is not specified to prevent docker 31 | # run from complaining about "invalid reference format". 
32 | define run_docker_image_fn 33 | $(eval GPU_RUN_CMD=$(RUN_BASE_CMD) "$(or $(1),-it)" --gpus all $(IMAGE) $(2)) 34 | $(eval CPU_RUN_CMD=$(RUN_BASE_CMD) "$(or $(1),-it)" $(IMAGE) $(2)) 35 | @$(GPU_TEST_CMD) 2>/dev/null && ($(GPU_RUN_CMD) || true) || ($(CPU_MSG_CMD) && $(CPU_RUN_CMD)) 36 | endef 37 | 38 | .PHONY: run-docker-image 39 | run-docker-image: shared 40 | $(call run_docker_image_fn) 41 | 42 | lib/.symlinks-copied: 43 | docker cp $$(docker create --name arch-tmp $(IMAGE)):/opt/arch/lib . \ 44 | && docker rm arch-tmp \ 45 | && touch lib/.symlinks-copied 46 | 47 | .PHONY: run-docker-image-dev 48 | run-docker-image-dev: shared lib/.symlinks-copied 49 | $(call run_docker_image_fn,"-v $(PWD):/opt/arch") 50 | 51 | .PHONY: docker-shell 52 | docker-shell: shared lib/.symlinks-copied 53 | $(call run_docker_image_fn,"-v $(PWD):/opt/arch","bash") 54 | -------------------------------------------------------------------------------- /config/docker.json: -------------------------------------------------------------------------------- 1 | { 2 | "basePath": "/", 3 | "customCollectionPath": "/opt/arch/shared/out/custom-collections", 4 | "externalPort": 12341, 5 | "jobLoggingPath": "/opt/arch/shared/log", 6 | "keystoneBaseUrl": "http://keystone:8000", 7 | "keystonePrivateApiKey": "supersecret", 8 | "uuidJobOutPath": "/opt/arch/shared/out/datasets" 9 | } 10 | -------------------------------------------------------------------------------- /config/local.json: -------------------------------------------------------------------------------- 1 | { 2 | "aitCollectionPath": "data/in", 3 | "aitCollectionWarcDir": "arcs", 4 | "collectionCachePath": "/tmp/arch-cache", 5 | "jobOutPath": "/data/user-out", 6 | "globalJobOutPath": "/data/out", 7 | "customCollectionPath": "data/collections", 8 | "jobLoggingPath": "logging", 9 | "localTempPath": "data/tmp", 10 | "sparkMaster": "local[*]", 11 | "baseUrl": "http://127.0.0.1:12341", 12 | "loginUrl": "http://127.0.0.1:12341/ait/login?next=", 13 | "port": 12341, 14 | "githubBearer": "example_bearer_token" 15 | } 16 | -------------------------------------------------------------------------------- /config/prod.json: -------------------------------------------------------------------------------- 1 | { 2 | "aitCollectionHdfsHost": "ia802400.us.archive.org", 3 | "aitCollectionPath": "/search/ait", 4 | "aitCollectionWarcDir": "arcs", 5 | "collectionCachePath": "/tmp/arch-cache", 6 | "globalJobOutPath": "/user/arch/arch", 7 | "jobOutPath": "/user/arch/arch-users", 8 | "customCollectionPath": "/user/arch/arch-custom-collections", 9 | "jobLoggingPath": "logging", 10 | "localTempPath": "/tmp", 11 | "sparkMaster": "yarn", 12 | "baseUrl": "https://webdata.archive-it.org", 13 | "loginUrl": "https://webdata.archive-it.org/ait/login?next=", 14 | "hadoopQueue": "default", 15 | "port": 12353, 16 | "production": true 17 | } -------------------------------------------------------------------------------- /data/ait-collections.json: -------------------------------------------------------------------------------- 1 | { 2 | "ait:1451" : [ 3 | 14462, 4 | 14472, 5 | 14489 6 | ], 7 | "ait:1796" : [ 8 | 10923 9 | ], 10 | "test" : [ 11 | 1 12 | ], 13 | "ks:test" : [ 14 | 1 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /data/ait-users.json: -------------------------------------------------------------------------------- 1 | { 2 | "ids" : [ 3 | 1451, 4 | 1796 5 | ] 6 | } 7 | 
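The JSON files above configure ARCH for the Docker, local, and production environments; the Makefile copies config/docker.json to config/config.json before building the image. A minimal sketch (not part of the repository) of reading one of these files with circe, assuming the application loads config/config.json through ArchConf/LocalArchConf (not shown in this excerpt):

```scala
import io.circe.parser
import scala.io.Source

// Hypothetical helper: peek at a value in one of the config/*.json files shown above.
object ConfigPeek extends App {
  val source = Source.fromFile("config/config.json", "utf-8")
  val json = try parser.parse(source.mkString) finally source.close()
  // "externalPort" is 12341 in config/docker.json; the other environments omit it.
  val port = json.toOption.flatMap(_.hcursor.get[Int]("externalPort").toOption)
  println(s"externalPort = ${port.getOrElse(-1)}")
}
```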
-------------------------------------------------------------------------------- /data/arch-users.json: -------------------------------------------------------------------------------- 1 | { 2 | "ks:system" : { 3 | "name" : "Keystone System", 4 | "admin" : true, 5 | "apiKey" : "$pbkdf2-sha512$120000$edRi7uf7Dg18ebkFm5lphcfOAiVVCvRB$vyl48k.uOahDCmTKOqViXpw8FG7fKzkVParjfOZ/60U" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /data/special-collections.json: -------------------------------------------------------------------------------- 1 | { 2 | "collections" : { 3 | "test-collection" : { 4 | "name" : "ARCH Test Collection", 5 | "path" : "/user/helge/arch-test-collection" 6 | } 7 | }, 8 | "users" : { 9 | "test" : [ 10 | "test-collection" 11 | ], 12 | "ks:test" : [ 13 | "test-collection" 14 | ] 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | shopt -s nullglob 5 | 6 | # For any new directory in /opt/arch/in/collections that is not yet 7 | # represented as being authorized for the built-in test user, authorize it. 8 | for dir in /opt/arch/shared/in/collections/*; do 9 | collection_key=`basename $dir`; 10 | collection_name=`echo $collection_key | sed -r 's/(^|-)(\w)/ \U\2/g' | sed 's/^ //'`; 11 | cat <<< $(jq ".collections |= if has(\"$collection_key\") then . else .\"$collection_key\" = {name: \"$collection_name\", path: \"$dir\"} end | .users[\"ks:system\"] |= (.+ [\"$collection_key\"] | unique)" /opt/arch/data/special-collections.json) > /opt/arch/data/special-collections.json 12 | done 13 | 14 | exec "$@" 15 | -------------------------------------------------------------------------------- /fairscheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | FAIR10 4 | FAIR20 5 | FAIR40 6 | FAIR80 7 | FAIR160 8 | FAIR320 9 | FAIR640 10 | FAIR1280 11 | -------------------------------------------------------------------------------- /job_scripts/whisper-run.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import whisper 3 | import json 4 | import torch 5 | 6 | is_cuda = torch.cuda.is_available() 7 | device = "cuda" if is_cuda else "cpu" 8 | 9 | model_filename = sys.argv[1] 10 | out_pipe_path = sys.argv[-1] 11 | 12 | model = whisper.load_model(model_filename, device=device) 13 | 14 | def process(audio_file): 15 | result = model.transcribe(audio_file, fp16=is_cuda) 16 | return result 17 | 18 | with open(out_pipe_path, 'w') as pipe: 19 | while True: 20 | print("##", file=pipe, flush=True) 21 | input_file = sys.stdin.readline().strip() 22 | try: 23 | result = process(input_file) 24 | print(json.dumps(result["segments"]), file=pipe, flush=True) 25 | except: pass 26 | -------------------------------------------------------------------------------- /lib/javax.servlet-api-3.1.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/arch/7cd8a6f4d2d557275b9d19381c8ad234ca134a60/lib/javax.servlet-api-3.1.0.jar -------------------------------------------------------------------------------- /logging/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/arch/7cd8a6f4d2d557275b9d19381c8ad234ca134a60/logging/.keep 
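job_scripts/whisper-run.py above implements a small line-oriented protocol: it writes a `##` ready marker to the named pipe given as its last argument, reads one audio file path per line from stdin, and writes one JSON line of transcription segments per successful input (a failed input produces no JSON line, only the next `##`). Below is a minimal sketch of the driver side of that protocol; it is not part of the repository — the real driver appears to live in CondaBasedFunction.scala (listed in the tree above but not included in this excerpt) — and `stdin`/`pipePath` are assumed to come from however the script was launched:

```scala
import java.io.{BufferedReader, FileReader, PrintWriter}

// Hypothetical driver for the whisper-run.py protocol: `stdin` is the script's standard
// input and `pipePath` is the named pipe passed to the script as its last argument.
def transcribeAll(files: Seq[String], stdin: PrintWriter, pipePath: String): Seq[Option[String]] = {
  val pipe = new BufferedReader(new FileReader(pipePath))
  try {
    var readyConsumed = false // true if the next "##" marker was already read
    files.map { file =>
      if (!readyConsumed) { // wait for the script's ready marker
        var line = pipe.readLine()
        while (line != null && line.trim != "##") line = pipe.readLine()
      }
      readyConsumed = false
      stdin.println(file) // hand the next audio file path to whisper-run.py
      stdin.flush()
      val result = pipe.readLine()
      if (result == null || result.trim == "##") {
        // failed input: the script emitted no JSON, only the next ready marker
        readyConsumed = result != null
        None
      } else Some(result) // one JSON array of transcription segments
    }
  } finally pipe.close()
}
```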
-------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9") -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.3.8 -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.scalatra.sbt" % "sbt-scalatra" % "1.0.3") 2 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.0") 3 | -------------------------------------------------------------------------------- /src/main/bash/sendmail: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cat $2 >> /data/sendmail.log 4 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/AdminController.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars 2 | 3 | import org.apache.commons.io.input.BoundedInputStream 4 | import org.archive.webservices.ars.processing.JobStateManager 5 | import org.archive.webservices.ars.processing.JobStateManager.Charset 6 | import org.archive.webservices.sparkling._ 7 | import org.archive.webservices.sparkling.io.IOUtil 8 | import org.scalatra._ 9 | 10 | import java.io.{File, FileInputStream} 11 | import scala.io.Source 12 | import scala.util.Try 13 | 14 | class AdminController extends BaseController { 15 | val MaxLogLength: Int = 1.mb.toInt 16 | get("/logs/:log_type") { 17 | ensureAuth { user => 18 | if (user.isAdmin) { 19 | params("log_type") match { 20 | case "jobs" => 21 | val tail = params.get("tail").flatMap(str => Try(str.toInt).toOption).getOrElse(-1) 22 | val logFile = new File(s"${JobStateManager.LoggingDir}/${JobStateManager.JobLogFile}") 23 | val log = if (logFile.exists) { 24 | val skip = if (tail < 0) 0L else (logFile.length - tail.min(MaxLogLength)).max(0L) 25 | val in = new FileInputStream(logFile) 26 | try { 27 | IOUtil.skip(in, skip) 28 | val source = Source.fromInputStream( 29 | new BoundedInputStream(in, MaxLogLength), 30 | JobStateManager.Charset) 31 | try { 32 | source.mkString 33 | } finally { 34 | source.close() 35 | } 36 | } finally { 37 | in.close() 38 | } 39 | } else "" 40 | Ok(log, Map("Content-Type" -> "text/plain")) 41 | case "running" => 42 | val runningJobsFile = 43 | new File(s"${JobStateManager.LoggingDir}/${JobStateManager.RunningJobsFile}") 44 | val log = if (runningJobsFile.exists) { 45 | val source = Source.fromFile(runningJobsFile, Charset) 46 | try { 47 | source.mkString 48 | } finally { 49 | source.close() 50 | } 51 | } else "" 52 | Ok(log, Map("Content-Type" -> "application/json")) 53 | case "failed" => 54 | val failedJobsFile = 55 | new File(s"${JobStateManager.LoggingDir}/${JobStateManager.FailedJobsFile}") 56 | val log = if (failedJobsFile.exists) { 57 | val source = Source.fromFile(failedJobsFile, Charset) 58 | try { 59 | source.mkString 60 | } finally { 61 | source.close() 62 | } 63 | } else "" 64 | Ok(log, Map("Content-Type" -> "text/plain")) 65 | case _ => 66 | NotFound() 67 | } 68 | } else Forbidden() 69 | } 70 | } 71 | } 72 | 
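AdminController above serves raw job logs to admin users at `/admin/logs/jobs` (with an optional `tail` byte count), `/admin/logs/running`, and `/admin/logs/failed`; the controller is mounted under `/admin/*` in ScalatraBootstrap below, and authentication uses the `X-API-USER`/`X-API-KEY` headers checked by `BaseController.ensureAuth`. A minimal client sketch, not part of the repository; the base URL and credentials are placeholders:

```scala
import java.net.{HttpURLConnection, URL}
import scala.io.Source

// Hypothetical client for the admin log endpoint above.
def fetchJobLog(baseUrl: String, user: String, apiKey: String, tailBytes: Int): String = {
  val conn = new URL(s"$baseUrl/admin/logs/jobs?tail=$tailBytes")
    .openConnection().asInstanceOf[HttpURLConnection]
  conn.setRequestProperty("X-API-USER", user) // headers checked by BaseController.ensureAuth
  conn.setRequestProperty("X-API-KEY", apiKey) // the user must have isAdmin == true
  try {
    val source = Source.fromInputStream(conn.getInputStream, "utf-8")
    try source.mkString finally source.close()
  } finally conn.disconnect()
}

// e.g. fetchJobLog("http://127.0.0.1:12341", "ks:system", "<api key>", 10000)
```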
-------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/Arch.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars 2 | 3 | import _root_.io.sentry.protocol.Message 4 | import _root_.io.sentry.{Sentry, SentryEvent, SentryLevel} 5 | import org.archive.webservices.ars.addons.AddonLoader 6 | import org.archive.webservices.ars.model.ArchConf 7 | import org.archive.webservices.ars.processing.JobStateManager 8 | import org.archive.webservices.sparkling._ 9 | import org.archive.webservices.sparkling.io.IOUtil 10 | import org.archive.webservices.sparkling.util.RddUtil 11 | import org.eclipse.jetty.server.Server 12 | import org.eclipse.jetty.webapp.WebAppContext 13 | import org.scalatra.servlet.ScalatraListener 14 | 15 | import java.io.File 16 | import scala.collection.JavaConverters._ // For SentryEvent.setExtras 17 | 18 | object Arch { 19 | def start(contextPath: String, port: Int): Unit = { 20 | val server = new Server(port) 21 | 22 | val context = new WebAppContext() 23 | context.setContextPath(contextPath) 24 | context.setResourceBase("webapp") 25 | context.setInitParameter("org.eclipse.jetty.servlet.Default.dirAllowed", "false") 26 | context.setInitParameter("org.eclipse.jetty.servlet.Default.useFileMappedBuffer", "false") 27 | context.setInitParameter( 28 | ScalatraListener.LifeCycleKey, 29 | classOf[ScalatraBootstrap].getCanonicalName) 30 | context.setInitParameter( 31 | org.scalatra.EnvironmentKey, 32 | ArchConf.deploymentEnvironment match { 33 | case "DEV" => "development" 34 | case "QA" => "qa" 35 | case "PROD" => "production" 36 | }) 37 | context.setEventListeners(Array(new ScalatraListener)) 38 | 39 | server.setHandler(context) 40 | server.start() 41 | server.join() 42 | } 43 | 44 | def initSentry(): Unit = { 45 | Sentry.init(options => { 46 | options.setDsn(ArchConf.sentryDsn); 47 | options.setEnvironment(ArchConf.deploymentEnvironment); 48 | // Set traces_sample_rate to 0.10 to capture 10% of transactions for performance monitoring. 49 | options.setTracesSampleRate(0.10); 50 | }) 51 | } 52 | 53 | def reportEvent( 54 | title: String, 55 | message: String, 56 | extraContext: Map[String, Object] = Map.empty, 57 | level: SentryLevel = SentryLevel.INFO): Unit = { 58 | // Send an event to Sentry. 59 | val event = new SentryEvent() 60 | val _message = new Message() 61 | // Use the title as the message and add the message text as the extra "details" property 62 | // to make display in the client more reasonable, otherwise Sentry will display a prefix 63 | // of the message text as the event title. 
64 | _message.setMessage(title) 65 | event.setMessage(_message) 66 | event.setLevel(level) 67 | event.setExtras((Map("details" -> message) ++ extraContext).asJava) 68 | Sentry.captureEvent(event) 69 | } 70 | 71 | val reportInfo = reportEvent(_, _, _, SentryLevel.INFO) 72 | val reportWarning = reportEvent(_, _, _, SentryLevel.WARNING) 73 | val reportError = reportEvent(_, _, _, SentryLevel.ERROR) 74 | 75 | def reportException(e: Exception): Unit = Sentry.captureException(e) 76 | 77 | def main(args: Array[String]): Unit = { 78 | IOUtil.memoryBuffer = 1.mb.toInt 79 | RddUtil.saveRecordTimeoutMillis = -1 80 | AddonLoader.initializePackage("org.archive.webservices.arch.addons") 81 | JobStateManager.init() 82 | initSentry() 83 | start(ArchConf.basePath, ArchConf.internalPort) 84 | } 85 | 86 | def debugging: Boolean = new File("_debugging").exists 87 | } 88 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/ArchSwaggerSupport.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars 2 | 3 | import org.scalatra.swagger._ 4 | import org.scalatra.util.NotNothing 5 | 6 | trait ArchSwaggerSupport extends SwaggerSupport { 7 | // apiOperation wrapper to add X-API-* header params to all endpoints. 8 | def apiOp[T: Manifest: NotNothing](name: String): SwaggerSupportSyntax.OperationBuilder = 9 | (apiOperation[T](name) 10 | parameter headerParam[String]("X-API-USER").description( 11 | "The user for which this request is being made") 12 | parameter headerParam[String]("X-API-KEY").description( 13 | "An API key that's authorized to act on behalf of X-API-USER")) 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/BaseController.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars 2 | 3 | import org.archive.webservices.ars.model.app.RequestContext 4 | import org.archive.webservices.ars.model.users.ArchUser 5 | import org.scalatra._ 6 | import org.scalatra.scalate.ScalateSupport 7 | 8 | class BaseController extends ScalatraServlet with ScalateSupport { 9 | // Report and rethrow any Exceptions. 
10 | error { 11 | case e: Exception => { 12 | Arch.reportException(e) 13 | throw e 14 | } 15 | } 16 | 17 | def ensureAuth(action: RequestContext => ActionResult): ActionResult = { 18 | for { 19 | apiUser <- request.headers.get("X-API-USER") 20 | apiKey <- request.headers.get("X-API-KEY") 21 | } yield { 22 | ArchUser.get(apiUser, Some(apiKey)) match { 23 | case Some(user) => action(RequestContext(user)) 24 | case None => Forbidden() 25 | } 26 | } 27 | }.getOrElse(Forbidden()) 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/ScalatraBootstrap.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars 2 | 3 | import org.scalatra.LifeCycle 4 | import org.scalatra.swagger.ApiKey 5 | 6 | import javax.servlet.ServletContext 7 | 8 | class ScalatraBootstrap extends LifeCycle { 9 | 10 | implicit val swagger = new ArchApiSwagger 11 | swagger.addAuthorization(ApiKey("X-API-KEY")) 12 | 13 | override def init(context: ServletContext): Unit = { 14 | context.mount(new AdminController, "/admin/*") 15 | context.mount(new ApiController, "/api/*") 16 | context.mount(new JobUuidApiController, "/api/job/*") 17 | context.mount(new WasapiController, "/wasapi/*") 18 | context.mount(new FilesController, "/files/*") 19 | context.mount(new SwaggerController, "/api-docs") 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/SwaggerController.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars 2 | 3 | import org.scalatra.ScalatraServlet 4 | import org.scalatra.swagger.{ApiInfo, NativeSwaggerBase, Swagger} 5 | 6 | class SwaggerController(implicit val swagger: Swagger) 7 | extends ScalatraServlet 8 | with NativeSwaggerBase 9 | 10 | object ArchApiInfo 11 | extends ApiInfo( 12 | "The ARCH API", 13 | "Docs for the ARCH API", 14 | "https://arch.archive-it.org", 15 | "arch@archive.org", 16 | "AGPL-3.0", 17 | "https://www.gnu.org/licenses/agpl-3.0.en.html") 18 | 19 | class ArchApiSwagger extends Swagger(Swagger.SpecVersion, "2.0.0", ArchApiInfo) 20 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/addons/AddonLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.addons 2 | 3 | import java.util.jar.JarFile 4 | import scala.collection.JavaConverters._ 5 | 6 | object AddonLoader { 7 | private var init: Map[String, ArchAddon] = Map.empty 8 | 9 | def init(addon: ArchAddon): Unit = { 10 | init += addon.getClass.getName -> addon 11 | } 12 | 13 | def initializePackage(packageName: String): Unit = { 14 | println(s"Initializing add-ons of package $packageName...") 15 | val packagePath = packageName.replace('.', '/') + "/" 16 | val systemClassLoader = ClassLoader.getSystemClassLoader 17 | val systemResources = systemClassLoader.getResources(packagePath).asScala 18 | val contextClassLoader = Thread.currentThread.getContextClassLoader 19 | val threadResources = contextClassLoader.getResources(packagePath).asScala 20 | val jarEntries = (systemResources ++ threadResources) 21 | .filter(_.getProtocol == "jar") 22 | .flatMap { jar => 23 | jar.getPath.stripPrefix("file:").split('!').headOption 24 | } 25 | .flatMap(path => new JarFile(path).entries().asScala) 26 | 
.map(_.toString) 27 | val objects = jarEntries 28 | .filter(_.startsWith(packagePath)) 29 | .filter(_.endsWith("$.class")) 30 | .map(_.stripSuffix(".class").replace('/', '.')) 31 | for (objectClass <- objects) { 32 | try { 33 | Class.forName(objectClass, true, systemClassLoader) 34 | } catch { 35 | case _: ClassNotFoundException => 36 | Class.forName(objectClass, true, contextClassLoader) 37 | } 38 | } 39 | for (addon <- init.values) { 40 | println(s"Loading add-on ${addon.getClass.getName.stripSuffix("$")}...") 41 | addon.initAddon() 42 | } 43 | println(s"Initialized add-ons of package $packageName.") 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/addons/ArchAddon.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.addons 2 | 3 | trait ArchAddon { 4 | def initAddon(): Unit 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/ait/AitUser.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.ait 2 | 3 | case class AitUser(id: Int, userName: String, fullName: String, email: Option[String] = None) { 4 | def isSystemUser: Boolean = id == 0 5 | def isLoggedIn: Boolean = id >= 0 6 | } 7 | 8 | object AitUser { 9 | lazy val Empty = AitUser(-1, "", "") 10 | } 11 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/aut/AutUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.aut 2 | 3 | import io.archivesunleashed.matchbox.ExtractDomain 4 | import org.archive.webservices.ars.util.PublicSuffixUtil 5 | import org.archive.webservices.sparkling.http.HttpMessage 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | import java.io.InputStream 9 | import javax.imageio.ImageIO 10 | 11 | object AutUtil { 12 | val months = Seq( 13 | "jan", 14 | "feb", 15 | "mar", 16 | "apr", 17 | "may", 18 | "jun", 19 | "jul", 20 | "aug", 21 | "sep", 22 | "oct", 23 | "nov", 24 | "dec").zipWithIndex.map { case (s, d) => (s, ("0" + (d + 1)).takeRight(2)) } 25 | 26 | def url(r: WarcRecord): String = r.url.getOrElse("") 27 | 28 | def timestamp(r: WarcRecord): String = 29 | r.timestamp.filter(_.length >= 14).map(_.take(14)).getOrElse("") 30 | 31 | def mime(http: HttpMessage): String = http.mime.getOrElse("unknown") 32 | 33 | def checkPageMime(url: String, mime: String): Boolean = { 34 | val u = url.toLowerCase 35 | (mime == "text/html" || mime == "application/xhtml+xml" || u.endsWith("htm") || u.endsWith( 36 | "html")) && !u.endsWith("robots.txt") 37 | } 38 | 39 | def validPage(r: WarcRecord, http: HttpMessage): Boolean = { 40 | timestamp(r).nonEmpty && checkPageMime(url(r), http.mime.getOrElse("")) && http.status == 200 41 | } 42 | 43 | def extractDomainRemovePrefixWWW(url: String, publicSuffixes: Set[String]): String = { 44 | Option(if (url.trim.isEmpty) "" else ExtractDomain(url).replaceAll("^\\s*www\\.", "")) 45 | .map(_.trim) 46 | .map(PublicSuffixUtil.resolve(_, publicSuffixes)) 47 | .getOrElse("") 48 | } 49 | 50 | def extractDomainRemovePrefixWWW(url: String): String = { 51 | Option(if (url.trim.isEmpty) "" else ExtractDomain(url).replaceAll("^\\s*www\\.", "")) 52 | .map(_.trim) 53 | .getOrElse("") 54 | } 55 | 56 | def 
rfc1123toTime14(lastModifiedDate: String): String = { 57 | if (lastModifiedDate.isEmpty) { 58 | "" 59 | } else { 60 | val lc = lastModifiedDate.toLowerCase 61 | val date = months.find(m => lc.contains(m._1)).map(_._2).flatMap { m => 62 | val d = lc 63 | .replace(":", "") 64 | .split(' ') 65 | .drop(1) 66 | .map(d => (d.length, d)) 67 | .toMap 68 | for (y <- d.get(4); n <- d.get(2); t <- d.get(6)) 69 | yield y + m + n + t 70 | } 71 | date match { 72 | case Some(date) => 73 | date 74 | case None => 75 | "" 76 | } 77 | } 78 | } 79 | 80 | // see io.archivesunleashed.matchbox.ComputeImageSize 81 | def computeImageSize(in: InputStream): (Int, Int) = { 82 | val nullImage = (0, 0) 83 | try { 84 | val stream = ImageIO.createImageInputStream(in) 85 | try { 86 | val readers = ImageIO.getImageReaders(stream) 87 | if (readers.hasNext) { 88 | val reader = readers.next 89 | reader.setInput(stream) 90 | (reader.getWidth(0), reader.getHeight(0)) 91 | } else nullImage 92 | } finally { 93 | stream.close() 94 | } 95 | } catch { 96 | case e: Throwable => nullImage 97 | } 98 | } 99 | 100 | def extractLinks( 101 | func: (String, String) => Seq[(String, String, String)], 102 | url: String, 103 | body: String): Seq[(String, String, String)] = { 104 | func(url, body).flatMap { case (s, t, a) => 105 | for { 106 | source <- Option(s).map(_.trim).filter(_.nonEmpty) 107 | target <- Option(t).map(_.trim).filter(_.nonEmpty) 108 | } yield (source, target, Option(a).map(_.trim).getOrElse("")) 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/aut/TikaUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.aut 2 | 3 | import org.apache.tika.Tika 4 | import org.apache.tika.detect.DefaultDetector 5 | import org.apache.tika.parser.AutoDetectParser 6 | import org.archive.webservices.sparkling.io.IOUtil 7 | 8 | import java.io.InputStream 9 | import scala.util.Try 10 | 11 | object TikaUtil { 12 | val detector = new DefaultDetector() 13 | val parser = new AutoDetectParser(detector) 14 | val tika = new Tika(detector, parser) 15 | 16 | def mime(in: InputStream): String = { 17 | (if (in.markSupported() && IOUtil.eof(in)) None else Try(tika.detect(in)).toOption) 18 | .getOrElse("N/A") 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/io/CollectionCache.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.io 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.archive.webservices.ars.model.ArchConf 5 | import org.archive.webservices.sparkling._ 6 | import org.archive.webservices.sparkling.io._ 7 | 8 | import java.time.Instant 9 | import scala.util.Try 10 | 11 | object CollectionCache { 12 | val CacheClearThresholdBytes: Long = 1.tb 13 | 14 | private var inUse = Set.empty[String] 15 | private var lastUse = Map.empty[String, Long] 16 | 17 | def cache[R](sourceId: String)(action: String => R): R = { 18 | val dir = cacheDir(sourceId) 19 | synchronized { 20 | inUse += dir 21 | clearCache() 22 | } 23 | val path = cacheDirPath(dir) 24 | HdfsIO.fs.mkdirs(new Path(path)) 25 | val r = action(path) 26 | synchronized { 27 | inUse -= dir 28 | lastUse = lastUse.updated(dir, Instant.now.toEpochMilli) 29 | } 30 | r 31 | } 32 | 33 | def cacheDir(sourceId: String): String = IOHelper.escapePath(sourceId) 34 
| 35 | def cacheDirPath(cacheDir: String): String = ArchConf.collectionCachePath + "/" + cacheDir 36 | 37 | def cachePath(sourceId: String): String = cacheDirPath(cacheDir(sourceId)) 38 | 39 | def cachePath(sourceId: String, filename: String): String = cachePath(sourceId) + "/" + filename 40 | 41 | def clearCache(): Unit = synchronized { 42 | val fs = HdfsIO.fs 43 | var length = Try(fs.getContentSummary(new Path(ArchConf.collectionCachePath)).getLength) 44 | .getOrElse(0L) 45 | if (length > CacheClearThresholdBytes) { 46 | for (dir <- fs.listStatus(new Path(ArchConf.collectionCachePath)) 47 | if dir.isDirectory) { 48 | val path = dir.getPath 49 | val c = path.getName 50 | if (!inUse.contains(c) && !lastUse.contains(c)) { 51 | val pathLength = fs.getContentSummary(path).getLength 52 | if (fs.delete(path, true)) length -= pathLength 53 | } 54 | } 55 | val toDelete = 56 | lastUse.toSeq 57 | .filter { case (c, _) => !inUse.contains(c) } 58 | .sortBy(_._2) 59 | .map(_._1) 60 | .toIterator 61 | while (length > CacheClearThresholdBytes && toDelete.hasNext) { 62 | val path = new Path(ArchConf.collectionCachePath, toDelete.next) 63 | if (fs.exists(path)) { 64 | val pathLength = fs.getContentSummary(path).getLength 65 | if (fs.delete(path, true)) length -= pathLength 66 | } 67 | } 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/io/FileAccessContext.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.io 2 | 3 | import org.archive.webservices.ars.Arch 4 | import org.archive.webservices.ars.model.{ArchConf, LocalArchConf} 5 | import org.archive.webservices.sparkling.io.HdfsIO 6 | 7 | class FileAccessContext( 8 | val conf: ArchConf with Serializable, 9 | val useAitHdfsIO: Boolean = false, 10 | val keyRing: FileAccessKeyRing, 11 | val keyValueCache: Map[String, java.io.Serializable]) 12 | extends Serializable { 13 | @transient lazy val hdfsIO: HdfsIO = if (useAitHdfsIO) aitHdfsIO else HdfsIO 14 | @transient lazy val aitHdfsIOopt: Option[HdfsIO] = 15 | conf.aitCollectionHdfsHostPort 16 | .map { case (host, port) => HdfsIO(host, port) } 17 | 18 | def aitHdfsIO: HdfsIO = aitHdfsIOopt.getOrElse(hdfsIO) 19 | 20 | @transient private var initialized: Boolean = false 21 | def init(): Unit = if (!initialized) { 22 | initialized = true 23 | ArchConf.set(conf) 24 | Arch.initSentry() 25 | } 26 | } 27 | 28 | object FileAccessContext { 29 | var KeyValueCache = Map.empty[String, java.io.Serializable] 30 | 31 | def fromLocalArchConf: FileAccessContext = 32 | new FileAccessContext( 33 | conf = LocalArchConf.instance, 34 | keyRing = FileAccessKeyRing.system, 35 | keyValueCache = KeyValueCache) 36 | 37 | def fromLocalArchConf(alwaysAitHdfsIO: Boolean) = 38 | new FileAccessContext( 39 | conf = LocalArchConf.instance, 40 | useAitHdfsIO = alwaysAitHdfsIO, 41 | keyRing = FileAccessKeyRing.system, 42 | keyValueCache = KeyValueCache) 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/io/FileAccessKeyRing.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.io 2 | 3 | import org.archive.webservices.sparkling.io.IOUtil 4 | 5 | import java.io.File 6 | import scala.collection.JavaConverters._ 7 | 8 | class FileAccessKeyRing private (secrets: Map[String, String]) extends Serializable { 9 | import 
FileAccessKeyRing._ 10 | 11 | def forUrl(url: String): Option[(String, Array[String])] = { 12 | val (protocol, path) = { 13 | val splitAt = url.lastIndexOf(":") 14 | if (splitAt < 0) ("", url) else (url.take(splitAt), url.drop(splitAt + 1)) 15 | } 16 | val secretSplit = path 17 | .split('/') 18 | .find(_.nonEmpty) 19 | .flatMap { host => 20 | secrets.get( 21 | secretKey( 22 | protocol, 23 | (if (host.contains("@")) host.split('@').last else host).split(':').head)) 24 | } 25 | .toArray 26 | .flatMap(_.split(SecretSeparator)) 27 | secretSplit.headOption.filter(SupportedAccessMethods.contains).map((_, secretSplit.drop(1))) 28 | } 29 | } 30 | 31 | object FileAccessKeyRing { 32 | val SecretSeparator = "::" 33 | val AccessMethodS3 = "s3" 34 | val AccessMethodBasic = "basic" 35 | val AccessMethodVault = "vault" 36 | val SupportedAccessMethods = Set(AccessMethodS3, AccessMethodBasic, AccessMethodVault) 37 | val SecretEnvPrefix = "ARCH_SECRET_" 38 | val SecretsFile = ".secrets" 39 | 40 | def secretKey(protocol: String, host: String): String = { 41 | s"${protocol.toUpperCase}_${host.replace('.', '-').toUpperCase}" 42 | } 43 | 44 | def loadEnv: Map[String, String] = { 45 | System.getenv().asScala.toMap.filterKeys(_.startsWith(SecretEnvPrefix)).map { case (k, v) => 46 | k.stripPrefix(SecretEnvPrefix) -> v 47 | } 48 | } 49 | 50 | def loadSecrets: Map[String, String] = { 51 | if (new File(SecretsFile).exists) { 52 | IOUtil 53 | .lines(SecretsFile) 54 | .flatMap { line => 55 | val equalIdx = line.indexOf("=") 56 | if (equalIdx == -1) None 57 | else 58 | Some { 59 | line.take(equalIdx).trim -> line.drop(equalIdx + 1).trim 60 | } 61 | } 62 | .filter { case (k, v) => 63 | k.nonEmpty && v.nonEmpty 64 | } 65 | .toMap 66 | } else Map.empty 67 | } 68 | 69 | lazy val system: FileAccessKeyRing = new FileAccessKeyRing(loadEnv ++ loadSecrets) 70 | 71 | def forUrl(url: String): Option[(String, Array[String])] = system.forUrl(url) 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/io/FilePointer.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.io 2 | 3 | import org.archive.webservices.ars.io.FilePointer.{DefaultSource, SourceSeparator} 4 | import org.archive.webservices.sparkling.util.StringUtil 5 | 6 | case class FilePointer(url: String, filename: String) { 7 | private lazy val sourcePathSplit = { 8 | val splitAt = StringUtil.prefixBySeparator(url, "/").lastIndexOf(SourceSeparator) 9 | if (splitAt < 0) (DefaultSource, url) else (url.take(splitAt), url.drop(splitAt + 1)) 10 | } 11 | 12 | def source: String = sourcePathSplit._1 13 | def path: String = sourcePathSplit._2 14 | 15 | def relative(parent: FilePointer): FilePointer = { 16 | if (source.isEmpty) { 17 | val splitAt = parent.url.lastIndexOf('/') 18 | if (splitAt < 0) this 19 | else { 20 | FilePointer(IOHelper.concatPaths(parent.url.take(splitAt), url), filename) 21 | } 22 | } else this 23 | } 24 | } 25 | 26 | object FilePointer { 27 | val SourceSeparator = ":" 28 | val DefaultSource = "hdfs" 29 | 30 | def fromUrl(url: String): FilePointer = { 31 | val lastSlashIdx = url.lastIndexOf('/') 32 | if (lastSlashIdx < 0) { 33 | val sourceSeparatorIdx = url.lastIndexOf(SourceSeparator) 34 | if (sourceSeparatorIdx < 0) FilePointer(url, url) 35 | else FilePointer(url, url.drop(sourceSeparatorIdx + 1)) 36 | } else FilePointer(url, url.drop(lastSlashIdx + 1)) 37 | } 38 | } 39 | 
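FilePointer above encodes dataset file locations as an optional `<source>:` prefix in front of the path, falling back to `hdfs` when no prefix is present. An illustration, not part of the repository; it assumes `StringUtil.prefixBySeparator` returns the text before the first `/`, and uses `s3` (one of the access methods named in FileAccessKeyRing) purely as an example source label:

```scala
import org.archive.webservices.ars.io.FilePointer

object FilePointerExample extends App {
  val local = FilePointer.fromUrl("/user/arch/in/example.warc.gz")
  println(local.source)   // "hdfs" -- DefaultSource, since no source prefix is given
  println(local.filename) // "example.warc.gz"

  val remote = FilePointer.fromUrl("s3:some-bucket/path/example.warc.gz")
  println(remote.source)  // "s3"
  println(remote.path)    // "some-bucket/path/example.warc.gz"
}
```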
-------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/io/MemoryCompressor.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.io 2 | 3 | import org.apache.commons.compress.compressors.bzip2.{BZip2CompressorInputStream, BZip2CompressorOutputStream} 4 | import org.archive.webservices.sparkling.io.ByteArray 5 | 6 | import java.io.{ByteArrayOutputStream, EOFException, InputStream} 7 | 8 | object MemoryCompressor { 9 | val BufferSize = 1024 10 | 11 | def compress(in: InputStream): ByteArray = { 12 | val array = new ByteArray 13 | val buffer = new Array[Byte](BufferSize) 14 | val out = new ByteArrayOutputStream() 15 | val compressor = new BZip2CompressorOutputStream(out) 16 | try { 17 | var read = in.read(buffer) 18 | while (read != -1) { 19 | if (read > 0) compressor.write(buffer, 0, read) 20 | if (out.size > BufferSize) { 21 | array.append(out.toByteArray) 22 | out.reset() 23 | } 24 | read = in.read(buffer) 25 | } 26 | } catch { 27 | case _: EOFException => // ignore EOF / break loop 28 | } 29 | compressor.close() 30 | array.append(out.toByteArray) 31 | array 32 | } 33 | 34 | def decompress(array: ByteArray): InputStream = 35 | new BZip2CompressorInputStream(array.toInputStream) 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/ArchCollectionInfo.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | import _root_.io.circe.parser._ 4 | import _root_.io.circe.syntax._ 5 | import io.circe.Json 6 | import org.archive.webservices.ars.processing.{DerivationJobConf, JobManager} 7 | import org.archive.webservices.sparkling.io.HdfsIO 8 | import org.scalatra.guavaCache.GuavaCache 9 | 10 | import java.time.Instant 11 | import scala.collection.immutable.ListMap 12 | 13 | case class ArchCollectionInfo private ( 14 | collectionId: String, 15 | file: String, 16 | lastJob: Option[(String, Boolean, Long)] = None) { 17 | def lastJobId: Option[String] = lastJob.map(_._1) 18 | def lastJobSample: Option[Boolean] = lastJob.map(_._2) 19 | def lastJobTime: Option[Instant] = lastJob.map(_._3).map(Instant.ofEpochSecond) 20 | def lastJobName: Option[String] = lastJobId.flatMap(JobManager.jobs.get).map { job => 21 | job.name + (if (lastJobSample.getOrElse(false)) ArchCollectionInfo.SampleNameSuffix else "") 22 | } 23 | 24 | def setLastJob(id: String, sample: Boolean, time: Instant): ArchCollectionInfo = { 25 | copy(collectionId, file, Some(id, sample, time.getEpochSecond)) 26 | } 27 | 28 | def save(): Unit = { 29 | GuavaCache.put(ArchCollectionInfo.CachePrefix + collectionId, this, None) 30 | HdfsIO.writeLines( 31 | file, 32 | Seq((ListMap.empty[String, Json] ++ { 33 | lastJob.toSeq.flatMap { case (id, sample, time) => 34 | Seq( 35 | "lastJobId" -> id.asJson, 36 | "lastJobSample" -> sample.asJson, 37 | "lastJobEpoch" -> time.asJson) 38 | } 39 | }).asJson.spaces4), 40 | overwrite = true) 41 | } 42 | } 43 | 44 | object ArchCollectionInfo { 45 | val Charset = "utf-8" 46 | val CachePrefix = "collection-info#" 47 | val SampleNameSuffix = " (Sample)" 48 | 49 | def get(collectionId: String): Option[ArchCollectionInfo] = { 50 | GuavaCache.get(CachePrefix + collectionId).orElse { 51 | ArchCollection.get(collectionId).map { c => 52 | val file = DerivationJobConf.collectionOutPath(c) + "/info.json" 53 | val 
globalFile = DerivationJobConf.collectionOutPath(c, global = true) + "/info.json" 54 | val lastJob = Seq(file, globalFile) 55 | .filter(HdfsIO.exists) 56 | .flatMap { inFile => 57 | parse(HdfsIO.lines(inFile).mkString).right.toOption.map(_.hcursor).flatMap { cursor => 58 | cursor.get[Long]("lastJobEpoch").toOption.flatMap { epoch => 59 | cursor 60 | .get[String]("lastJobId") 61 | .toOption 62 | .map { id => 63 | (id, cursor.get[Boolean]("lastJobSample").getOrElse(false), epoch) 64 | } 65 | .orElse { 66 | cursor.get[String]("lastJobName").toOption.flatMap { name => 67 | JobManager.nameLookup.get(name.stripSuffix(SampleNameSuffix)).map { job => 68 | (job.id, name.endsWith(SampleNameSuffix), epoch) 69 | } 70 | } 71 | } 72 | } 73 | } 74 | } 75 | .sortBy(-_._3) 76 | .headOption 77 | val info = ArchCollectionInfo(collectionId, file, lastJob) 78 | GuavaCache.put(CachePrefix + collectionId, info, None) 79 | } 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/ArchCollectionStats.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | case class ArchCollectionStats(size: Long, seeds: Long = -1, lastCrawlDate: String = "") 4 | 5 | object ArchCollectionStats { 6 | val Empty: ArchCollectionStats = ArchCollectionStats(-1) 7 | } 8 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/ArchJobCategories.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | object ArchJobCategories { 4 | val None = ArchJobCategory("", "") 5 | 6 | val BinaryInformation = ArchJobCategory( 7 | "File Formats", 8 | "Find, describe, and use the files contained within a web archive, based on their format.") 9 | 10 | val Collection = ArchJobCategory( 11 | "Collection", 12 | "Discover domain-related patterns and high level information about the documents in a web archive.") 13 | 14 | val Network = ArchJobCategory("Network", "Explore connections in a web archive visually.") 15 | 16 | val Text = ArchJobCategory("Text", "Extract and analyze a web archive as text.") 17 | 18 | val System = 19 | ArchJobCategory("System", "Internal system jobs that are not meant to be exposed to users.") 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/ArchJobCategory.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | case class ArchJobCategory(name: String, description: String) { 4 | override def toString: String = name 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/ArchJobInstanceInfo.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | import _root_.io.circe.syntax._ 4 | import io.circe.Json 5 | import io.circe.parser.parse 6 | import org.archive.webservices.ars.processing.DerivationJobConf 7 | import org.archive.webservices.sparkling.io.HdfsIO 8 | import org.scalatra.guavaCache.GuavaCache 9 | 10 | import java.time.Instant 11 | import scala.collection.immutable.ListMap 12 | 13 | class ArchJobInstanceInfo private () { 14 | var uuid: 
Option[String] = None 15 | var conf: Option[DerivationJobConf] = None 16 | var started: Option[Instant] = None 17 | var finished: Option[Instant] = None 18 | 19 | def toJson: Json = { 20 | (ListMap(uuid.map("uuid" -> _.asJson).toSeq: _*) ++ { 21 | conf.map("conf" -> _.toJson) 22 | } ++ { 23 | started.map("started" -> _.getEpochSecond.asJson) 24 | } ++ { 25 | finished.map("finished" -> _.getEpochSecond.asJson) 26 | }).asJson 27 | } 28 | 29 | def save(jobOutPath: String): Unit = { 30 | val file = ArchJobInstanceInfo.infoFile(jobOutPath) 31 | GuavaCache.put(ArchJobInstanceInfo.CachePrefix + file, this, None) 32 | HdfsIO.writeLines(file, Seq(toJson.spaces4), overwrite = true) 33 | } 34 | } 35 | 36 | object ArchJobInstanceInfo { 37 | val Charset = "utf-8" 38 | val CachePrefix = "job-instance-info#" 39 | val InfoFile = "info.json" 40 | 41 | def infoFile(jobOutPath: String): String = jobOutPath + s"/$InfoFile" 42 | 43 | def apply(jobOutPath: String): ArchJobInstanceInfo = { 44 | val file = infoFile(jobOutPath) 45 | GuavaCache.get(CachePrefix + file).getOrElse { 46 | val info = if (HdfsIO.exists(file)) { 47 | parse(HdfsIO.lines(file).mkString).right.toOption.map(_.hcursor) match { 48 | case Some(cursor) => 49 | val info = new ArchJobInstanceInfo() 50 | info.uuid = cursor.get[String]("uuid").toOption 51 | info.conf = cursor.downField("conf").focus.flatMap(DerivationJobConf.fromJson) 52 | info.started = cursor.get[Long]("started").toOption.map(Instant.ofEpochSecond) 53 | info.finished = cursor.get[Long]("finished").toOption.map(Instant.ofEpochSecond) 54 | info 55 | case None => new ArchJobInstanceInfo() 56 | } 57 | } else new ArchJobInstanceInfo() 58 | GuavaCache.put(CachePrefix + file, info, None) 59 | } 60 | } 61 | 62 | def inMemory: ArchJobInstanceInfo = new ArchJobInstanceInfo() 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/DerivativeOutput.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | import _root_.io.circe._ 4 | import _root_.io.circe.syntax._ 5 | import org.apache.hadoop.fs.Path 6 | import org.archive.webservices.ars.io.IOHelper 7 | import org.archive.webservices.ars.processing.DerivationJobInstance 8 | import org.archive.webservices.ars.util.FormatUtil 9 | import org.archive.webservices.sparkling.io.HdfsIO 10 | import org.archive.webservices.sparkling.util.{DigestUtil, StringUtil} 11 | 12 | import java.io.{BufferedInputStream, FileInputStream, InputStream} 13 | import java.time.Instant 14 | import scala.util.Try 15 | 16 | trait DerivativeOutput { 17 | def filename: String 18 | def dir: String 19 | def fileType: String 20 | def mimeType: String 21 | def downloadName: String 22 | def size: Long 23 | def time: Long 24 | def lineCount: Long 25 | def checksums: Map[String, String] 26 | def prefixDownload(prefix: String): DerivativeOutput 27 | 28 | lazy val path: String = dir + "/" + filename 29 | 30 | lazy val sizeStr: String = IOHelper.sizeStr(path) 31 | 32 | lazy val timeStr: String = FormatUtil.instantTimeString(Instant.ofEpochMilli(time)) 33 | 34 | lazy val lineCountStr: Option[String] = 35 | if (lineCount < 0) None else Some(StringUtil.formatNumber(lineCount, 0)) 36 | 37 | lazy val accessToken: String = DigestUtil.sha1Base32(filename + size + time) 38 | 39 | def prefixDownload(instance: DerivationJobInstance): DerivativeOutput = { 40 | val timestamp = 
instance.info.finished.map(IOHelper.pathTimestamp).map(_ + "_") 41 | prefixDownload(instance.conf.inputSpec.id + "_" + timestamp.getOrElse("")) 42 | } 43 | } 44 | 45 | case class DerivativeOutputFile( 46 | filename: String, 47 | dir: String, 48 | fileType: String, 49 | mimeType: String, 50 | downloadName: String) 51 | extends DerivativeOutput { 52 | import DerivativeOutput._ 53 | 54 | lazy val (size, time) = Try { 55 | val status = HdfsIO.fs.getFileStatus(new Path(path)) 56 | (status.getLen, status.getModificationTime) 57 | }.getOrElse((0L, 0L)) 58 | 59 | lazy val lineCount: Long = { 60 | val p = path + LineCountFileSuffix 61 | if (HdfsIO.exists(p)) Try(HdfsIO.lines(p).head.toLong).getOrElse(-1) 62 | else -1 63 | } 64 | 65 | lazy val checksums: Map[String, String] = { 66 | val p = path + ChecksumsFileSuffix 67 | if (HdfsIO.exists(p)) 68 | parser 69 | .decode[Map[String, String]](HdfsIO.lines(p).mkString) 70 | .right 71 | .toOption 72 | .getOrElse(Map.empty) 73 | else Map.empty 74 | } 75 | 76 | def prefixDownload(prefix: String): DerivativeOutput = 77 | copy(downloadName = IOHelper.escapePath(prefix) + filename) 78 | } 79 | 80 | object DerivativeOutput { 81 | val LineCountFileSuffix = "_linecount" 82 | val ChecksumsFileSuffix = ".checksums" 83 | 84 | def apply( 85 | filename: String, 86 | dir: String, 87 | fileType: String, 88 | mimeType: String): DerivativeOutputFile = { 89 | DerivativeOutputFile(filename, dir, fileType, mimeType, filename) 90 | } 91 | 92 | def hashFile(in: InputStream): Map[String, String] = Map("md5" -> DigestUtil.md5Hex(in)) 93 | 94 | def hashFile(in: InputStream, hdfsPath: String): Unit = 95 | HdfsIO.writeLines( 96 | hdfsPath + ChecksumsFileSuffix, 97 | Seq(hashFile(in).asJson.spaces4), 98 | overwrite = true) 99 | 100 | def hashFileLocal(localPath: String, hdfsPath: String): Unit = { 101 | val in = new BufferedInputStream(new FileInputStream(localPath)) 102 | try { 103 | hashFile(in, hdfsPath) 104 | } finally { 105 | in.close() 106 | } 107 | } 108 | 109 | def hashFileHdfs(hdfsPath: String): Unit = 110 | HdfsIO.access(hdfsPath, decompress = false)(hashFile(_, hdfsPath)) 111 | } 112 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/DerivativeOutputCache.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | import _root_.io.circe.parser 4 | import _root_.io.circe.syntax._ 5 | import org.archive.webservices.ars.io.IOHelper 6 | import org.archive.webservices.sparkling.io.HdfsIO 7 | 8 | import scala.collection.immutable.ListMap 9 | 10 | case class DerivativeOutputCache(count: Int, size: Long, files: Iterator[DerivativeOutput]) 11 | 12 | object DerivativeOutputCache { 13 | case class CachedDerivativeOutput( 14 | filename: String, 15 | dir: String, 16 | fileType: String, 17 | mimeType: String, 18 | downloadName: String, 19 | size: Long, 20 | time: Long, 21 | lineCount: Long, 22 | checksums: Map[String, String]) 23 | extends DerivativeOutput { 24 | override def prefixDownload(prefix: String): DerivativeOutput = 25 | copy(downloadName = IOHelper.escapePath(prefix) + filename) 26 | } 27 | 28 | def parseLine(line: String): Option[DerivativeOutput] = { 29 | parser.parse(line).toOption.map(_.hcursor).flatMap { cursor => 30 | for { 31 | filename <- cursor.get[String]("filename").toOption 32 | dir <- cursor.get[String]("dir").toOption 33 | fileType <- cursor.get[String]("fileType").toOption 34 | mimeType <- 
cursor.get[String]("mimeType").toOption 35 | size <- cursor.get[Long]("size").toOption 36 | time <- cursor.get[Long]("time").toOption 37 | lineCount <- cursor.get[Long]("lineCount").toOption 38 | checksums = { 39 | val checksums = cursor.downField("checksums") 40 | checksums.keys.toIterator.flatten.flatMap { key => 41 | checksums.get[String](key).toOption.map(key -> _) 42 | }.toMap 43 | } 44 | } yield CachedDerivativeOutput( 45 | filename, 46 | dir, 47 | fileType, 48 | mimeType, 49 | filename, 50 | size, 51 | time, 52 | lineCount, 53 | checksums) 54 | } 55 | } 56 | 57 | def parse(cacheFile: String): Option[DerivativeOutputCache] = { 58 | val lines = HdfsIO.iterLines(cacheFile) 59 | if (lines.hasNext) { 60 | val metadata = lines.next() 61 | parser.parse(metadata).toOption.map(_.hcursor).flatMap { cursor => 62 | for { 63 | count <- cursor.get[Int]("count").toOption 64 | size <- cursor.get[Long]("size").toOption 65 | } yield DerivativeOutputCache( 66 | count, 67 | size, { 68 | lines.flatMap(parseLine) 69 | }) 70 | } 71 | } else None 72 | } 73 | 74 | def write(files: Iterator[DerivativeOutput], cacheFile: String): Unit = { 75 | val tmpFile = cacheFile + ".tmp" 76 | var count = 0 77 | var size = 0L 78 | HdfsIO.writeLines( 79 | path = tmpFile, { 80 | files.map { file => 81 | count += 1 82 | size += file.size 83 | ListMap( 84 | "filename" -> file.filename.asJson, 85 | "dir" -> file.dir.asJson, 86 | "fileType" -> file.fileType.asJson, 87 | "mimeType" -> file.mimeType.asJson, 88 | "size" -> file.size.asJson, 89 | "time" -> file.time.asJson, 90 | "lineCount" -> file.lineCount.asJson, 91 | "checksums" -> file.checksums.asJson).asJson.noSpaces 92 | } 93 | }, 94 | overwrite = true) 95 | HdfsIO.writeLines( 96 | cacheFile, { 97 | Iterator(ListMap("count" -> count.asJson, "size" -> size.asJson).asJson.noSpaces) ++ { 98 | HdfsIO.iterLines(tmpFile) 99 | } 100 | }, 101 | overwrite = true) 102 | HdfsIO.delete(tmpFile) 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/AvailableJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | import org.archive.webservices.ars.model.PublishedDatasets 4 | import org.archive.webservices.ars.processing.DerivationJob 5 | import org.scalatra.swagger.annotations.ApiModelProperty 6 | 7 | case class AvailableJob( 8 | @ApiModelProperty(description = "Unique job type identifier") uuid: String, 9 | name: String, 10 | description: String, 11 | @ApiModelProperty(description = "Whether the job output is publishable to archive.org") 12 | publishable: Boolean, 13 | @ApiModelProperty(description = "Whether the job is internal/non-user-facing use only") 14 | internal: Boolean, 15 | @ApiModelProperty(description = "A link to the job source code") codeUrl: String, 16 | @ApiModelProperty(description = "A link to information about the job") infoUrl: String) 17 | extends ApiResponseObject[AvailableJob] 18 | 19 | object AvailableJob { 20 | def apply(job: DerivationJob, isInternal: Boolean): AvailableJob = 21 | AvailableJob( 22 | uuid = job.uuid.toString, 23 | name = job.name, 24 | description = job.description, 25 | publishable = (!PublishedDatasets.ProhibitedJobs.contains(job)), 26 | internal = isInternal, 27 | codeUrl = job.codeUrl, 28 | infoUrl = job.infoUrl) 29 | } 30 | -------------------------------------------------------------------------------- 
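// Illustrative sketch (not a file in this repository): how AvailableJob values might be
// grouped into the AvailableJobsCategory responses defined below. `allJobs` stands in for
// whatever registry supplies the DerivationJob instances (e.g. JobManager); its name and
// shape here are assumptions, not the actual API.
//
//   def availableJobs(allJobs: Seq[DerivationJob]): Seq[AvailableJobsCategory] =
//     allJobs
//       .groupBy(_.category)
//       .toSeq
//       .map { case (category, jobs) => AvailableJobsCategory(category, jobs) }
--------------------------------------------------------------------------------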
/src/main/scala/org/archive/webservices/ars/model/api/AvailableJobsCategory.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 4 | import org.archive.webservices.ars.processing.DerivationJob 5 | 6 | case class AvailableJobsCategory( 7 | categoryName: String, 8 | categoryDescription: String, 9 | jobs: Seq[AvailableJob]) 10 | extends ApiResponseObject[AvailableJobsCategory] 11 | 12 | object AvailableJobsCategory { 13 | def apply(category: ArchJobCategory, jobs: Seq[DerivationJob]): AvailableJobsCategory = { 14 | val isInternal = (category == ArchJobCategories.System) 15 | AvailableJobsCategory( 16 | categoryName = category.name, 17 | categoryDescription = category.description, 18 | jobs = jobs.map(j => AvailableJob.apply(j, isInternal))) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/Collection.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | import io.circe.Json 4 | import io.circe.syntax._ 5 | import org.archive.webservices.ars.model.app.RequestContext 6 | import org.archive.webservices.ars.model.collections.CustomCollectionSpecifics 7 | import org.archive.webservices.ars.model.{ArchCollection, ArchCollectionInfo} 8 | import org.archive.webservices.ars.processing.jobs.system.UserDefinedQuery 9 | import org.archive.webservices.ars.util.FormatUtil 10 | 11 | case class Collection( 12 | id: String, 13 | name: String, 14 | public: Boolean, 15 | size: String, 16 | sortSize: Long, 17 | seeds: Long, 18 | lastCrawlDate: Option[String], 19 | lastJobId: Option[String], 20 | lastJobSample: Option[java.lang.Boolean], 21 | lastJobName: Option[String], 22 | lastJobTime: Option[String], 23 | params: Option[Json]) 24 | extends ApiResponseObject[Collection] 25 | 26 | object Collection { 27 | private def params(collection: ArchCollection): Option[Json] = 28 | if (collection.specifics.isInstanceOf[CustomCollectionSpecifics]) 29 | Some( 30 | UserDefinedQuery 31 | .parseInfo(CustomCollectionSpecifics.path(collection.id).get) 32 | .get 33 | .top 34 | .get 35 | .asObject 36 | .get 37 | .filterKeys(k => k != "name" && k != "size") 38 | .asJson) 39 | else None 40 | 41 | def apply(collection: ArchCollection)(implicit context: RequestContext): Collection = { 42 | val info = ArchCollectionInfo.get(collection.id) 43 | Collection( 44 | id = collection.id, 45 | name = collection.name, 46 | public = collection.public, 47 | size = FormatUtil.formatBytes(collection.stats.size), 48 | sortSize = collection.stats.size, 49 | seeds = collection.stats.seeds, 50 | lastCrawlDate = Option(collection.stats.lastCrawlDate).filter(_.nonEmpty), 51 | lastJobId = info.flatMap(_.lastJobId), 52 | lastJobSample = info.flatMap(_.lastJobSample).map(Boolean.box), 53 | lastJobName = info.flatMap(_.lastJobName), 54 | lastJobTime = info.flatMap(_.lastJobTime).map(FormatUtil.instantTimeString), 55 | params = params(collection)) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/Dataset.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | import 
org.archive.webservices.ars.model.ArchCollection 4 | import org.archive.webservices.ars.model.app.RequestContext 5 | import org.archive.webservices.ars.processing.DerivationJobInstance 6 | import org.archive.webservices.ars.util.{DatasetUtil, FormatUtil} 7 | 8 | case class Dataset( 9 | id: String, 10 | collectionId: String, 11 | collectionName: String, 12 | isSample: Boolean, 13 | jobId: String, 14 | category: String, 15 | name: String, 16 | sample: Int, 17 | state: String, 18 | startTime: Option[String], 19 | finishedTime: Option[String]) 20 | extends ApiResponseObject[Dataset] 21 | 22 | object Dataset { 23 | def apply(collection: ArchCollection, jobInstance: DerivationJobInstance)(implicit 24 | context: RequestContext): Dataset = { 25 | Dataset( 26 | id = DatasetUtil.formatId(collection.id, jobInstance), 27 | collectionId = collection.id, 28 | collectionName = collection.name, 29 | isSample = jobInstance.conf.isSample, 30 | jobId = jobInstance.job.uuid.toString, 31 | category = jobInstance.job.category.name, 32 | name = jobInstance.job.name, 33 | sample = jobInstance.conf.sample, 34 | state = jobInstance.stateStr, 35 | startTime = jobInstance.info.started.map(FormatUtil.instantTimeString), 36 | finishedTime = jobInstance.info.finished.map(FormatUtil.instantTimeString)) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/DatasetFile.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | import org.archive.webservices.ars.model.DerivativeOutput 4 | import org.archive.webservices.ars.util.FormatUtil 5 | 6 | import java.time.Instant 7 | 8 | case class DatasetFile( 9 | filename: String, 10 | sizeBytes: Long, 11 | mimeType: String, 12 | lineCount: Long, 13 | fileType: String, 14 | creationTime: String, 15 | md5Checksum: Option[String], 16 | accessToken: String) 17 | extends ApiResponseObject[DatasetFile] 18 | 19 | object DatasetFile { 20 | def apply(derivOut: DerivativeOutput): DatasetFile = 21 | DatasetFile( 22 | filename = derivOut.filename, 23 | sizeBytes = derivOut.size, 24 | lineCount = derivOut.lineCount, 25 | mimeType = derivOut.mimeType, 26 | fileType = derivOut.fileType, 27 | creationTime = FormatUtil.instantTimeString(Instant.ofEpochMilli(derivOut.time)), 28 | md5Checksum = derivOut.checksums.get("md5"), 29 | accessToken = derivOut.accessToken) 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/InputSpec.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | case class InputSpec( 4 | `type`: String, 5 | collectionId: Option[String], 6 | specs: Option[Seq[InputSpec]], 7 | inputType: Option[String], 8 | uuid: Option[String]) 9 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/JobState.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | import org.archive.webservices.ars.processing.{DerivationJobInstance, ProcessingState} 4 | import org.archive.webservices.ars.util.FormatUtil 5 | 6 | case class JobState( 7 | id: String, 8 | uuid: String, 9 | name: String, 10 | sample: Int, 11 | state: String, 12 | started: Boolean, 13 | finished: 
Boolean, 14 | failed: Boolean, 15 | activeStage: String, 16 | activeState: String, 17 | queue: Option[String], 18 | queuePos: Option[ 19 | Integer 20 | ], // Option type lookup fails for Int in package.scala, so use Integer 21 | startTime: Option[String], 22 | finishedTime: Option[String]) 23 | extends ApiResponseObject[JobState] 24 | 25 | object JobState { 26 | def apply(instance: DerivationJobInstance): JobState = { 27 | val active = instance.active 28 | val info = instance.info 29 | JobState( 30 | id = instance.job.id, 31 | uuid = instance.uuid, 32 | name = instance.job.name, 33 | sample = instance.conf.sample, 34 | state = instance.stateStr, 35 | started = (instance.state != ProcessingState.NotStarted), 36 | finished = (instance.state == ProcessingState.Finished), 37 | failed = (instance.state == ProcessingState.Failed), 38 | activeStage = active.job.stage, 39 | activeState = active.stateStr, 40 | queue = active.queue.map(_.name), 41 | queuePos = active.queue.map(q => active.queueIndex), 42 | startTime = info.started.map(FormatUtil.instantTimeString), 43 | finishedTime = info.finished.map(FormatUtil.instantTimeString)) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/WasapiResponse.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | case class WasapiResponseFile( 4 | filename: String, 5 | filetype: String, 6 | checksums: Map[String, String], 7 | locations: Seq[String], 8 | size: Long, 9 | collection: Option[String]) 10 | extends ApiResponseObject[WasapiResponseFile] 11 | 12 | case class WasapiResponse( 13 | count: Int, 14 | next: Option[String], 15 | previous: Option[String], 16 | files: Seq[WasapiResponseFile]) 17 | extends ApiResponseObject[WasapiResponse] 18 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/app/RequestContext.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.app 2 | 3 | import org.archive.webservices.ars.model.users.ArchUser 4 | 5 | import javax.servlet.http.HttpServletRequest 6 | 7 | class RequestContext private ( 8 | val request: Option[HttpServletRequest], 9 | val loggedIn: ArchUser, 10 | val user: ArchUser) { 11 | def isRequest: Boolean = request.nonEmpty 12 | def isInternal: Boolean = !isRequest 13 | def isUser: Boolean = loggedIn.isUser 14 | def loggedInOpt: Option[ArchUser] = loggedIn.option 15 | def userOpt: Option[ArchUser] = user.option 16 | def isAdmin: Boolean = loggedIn.isAdmin 17 | def forRequest[R](action: HttpServletRequest => Option[R]): Option[R] = request.flatMap(action) 18 | } 19 | 20 | object RequestContext { 21 | val None: RequestContext = new RequestContext(scala.None, ArchUser.None, ArchUser.None) 22 | 23 | def apply( 24 | request: Option[HttpServletRequest], 25 | loggedIn: ArchUser, 26 | user: ArchUser): RequestContext = { 27 | new RequestContext(request, loggedIn, user) 28 | } 29 | def apply(loggedIn: ArchUser, user: ArchUser)(implicit 30 | request: HttpServletRequest): RequestContext = { 31 | RequestContext(Some(request), loggedIn, user) 32 | } 33 | def apply(user: ArchUser)(implicit request: HttpServletRequest): RequestContext = { 34 | RequestContext(user, user) 35 | } 36 | def apply(user: Option[ArchUser])(implicit request: HttpServletRequest): RequestContext = { 37 | 
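// Resolve the optional user, falling back to the anonymous ArchUser.None sentinel.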
RequestContext(user.getOrElse(ArchUser.None)) 38 | } 39 | 40 | def apply(request: HttpServletRequest): RequestContext = { 41 | new RequestContext(Some(request), ArchUser.None, ArchUser.None) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/CollectionSpecifics.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.io.{FileAccessContext, FilePointer, WebArchiveLoader} 5 | import org.archive.webservices.ars.model.app.RequestContext 6 | import org.archive.webservices.ars.model.{ArchCollection, ArchCollectionStats} 7 | import org.archive.webservices.ars.processing.DerivationJobConf 8 | import org.archive.webservices.sparkling.cdx.CdxRecord 9 | 10 | import java.io.InputStream 11 | 12 | abstract class CollectionSpecifics { 13 | def id: String 14 | def inputPath: String 15 | def sourceId: String = id 16 | 17 | def collection(implicit context: RequestContext = RequestContext.None): Option[ArchCollection] 18 | def stats(implicit context: RequestContext = RequestContext.None): ArchCollectionStats 19 | def inputSize(conf: DerivationJobConf): Long = conf.inputSpec.collection.stats.size 20 | def loadWarcFiles[R](inputPath: String)(action: RDD[(FilePointer, InputStream)] => R): R 21 | 22 | def loadCdx[R](inputPath: String)(action: RDD[CdxRecord] => R): R = loadWarcFiles(inputPath) { 23 | rdd => 24 | action(WebArchiveLoader.loadCdxFromWarcGzStreams(rdd)) 25 | } 26 | 27 | def randomAccess( 28 | context: FileAccessContext, 29 | inputPath: String, 30 | pointer: FilePointer, 31 | offset: Long, 32 | positions: Iterator[(Long, Long)]): InputStream 33 | } 34 | 35 | object CollectionSpecifics { 36 | def get(id: String): Option[CollectionSpecifics] = { 37 | ArchCollection.prefix(id).map { 38 | case AitCollectionSpecifics.Prefix => new AitCollectionSpecifics(id) 39 | case SpecialCollectionSpecifics.Prefix => new SpecialCollectionSpecifics(id) 40 | case CustomCollectionSpecifics.Prefix => new CustomCollectionSpecifics(id) 41 | case UnionCollectionSpecifics.Prefix => new UnionCollectionSpecifics(id) 42 | case FileCollectionSpecifics.Prefix => new FileCollectionSpecifics(id) 43 | } 44 | } 45 | 46 | def pointer(sourceId: String, filename: String): FilePointer = 47 | FilePointer(sourceId + FilePointer.SourceSeparator + filename, filename) 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/GenericRandomAccess.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections 2 | 3 | import org.archive.webservices.ars.io.{FileAccessContext, FilePointer, RandomFileAccess} 4 | 5 | import java.io.InputStream 6 | 7 | trait GenericRandomAccess { 8 | def randomAccess( 9 | context: FileAccessContext, 10 | inputPath: String, 11 | pointer: FilePointer, 12 | offset: Long, 13 | positions: Iterator[(Long, Long)]): InputStream = 14 | RandomFileAccess.access(context, pointer, offset, positions) 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/UnionCollectionSpecifics.scala: -------------------------------------------------------------------------------- 1 | package 
org.archive.webservices.ars.model.collections 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.io.FilePointer 5 | import org.archive.webservices.ars.model.app.RequestContext 6 | import org.archive.webservices.ars.model.{ArchCollection, ArchCollectionStats} 7 | import org.archive.webservices.ars.processing.{DerivationJobConf, DerivationJobParameters} 8 | import org.archive.webservices.sparkling.cdx.CdxRecord 9 | import org.archive.webservices.sparkling.util.RddUtil 10 | 11 | import java.io.InputStream 12 | import scala.reflect.ClassTag 13 | 14 | class UnionCollectionSpecifics(val id: String) 15 | extends CollectionSpecifics 16 | with GenericRandomAccess { 17 | val (userId, collectionId) = 18 | ArchCollection.splitIdUserCollection(id.stripPrefix(UnionCollectionSpecifics.Prefix)) 19 | 20 | def inputPath: String = "" 21 | 22 | def collection(implicit 23 | context: RequestContext = RequestContext.None): Option[ArchCollection] = { 24 | Some( 25 | ArchCollection( 26 | id, 27 | collectionId, 28 | public = false, 29 | userId.map((_, UnionCollectionSpecifics.Prefix + collectionId)), 30 | sourceId)) 31 | } 32 | 33 | override def stats(implicit 34 | context: RequestContext = RequestContext.None): ArchCollectionStats = 35 | ArchCollectionStats.Empty 36 | 37 | override def inputSize(conf: DerivationJobConf): Long = { 38 | UnionCollectionSpecifics 39 | .collections(conf.params) 40 | .map(_.specifics.inputSize(conf)) 41 | .filter(_ > -1) 42 | .sum 43 | } 44 | 45 | private def loadUnion[A: ClassTag, R]( 46 | inputPath: String, 47 | load: CollectionSpecifics => (RDD[A] => R) => R)(action: RDD[A] => R): R = { 48 | def union(rdd: RDD[A], remaining: Seq[CollectionSpecifics], numPartitions: Int): R = { 49 | if (remaining.nonEmpty) { 50 | load(remaining.head) { nextRdd => 51 | union(rdd.union(nextRdd), remaining.tail, nextRdd.getNumPartitions.max(numPartitions)) 52 | } 53 | } else action(rdd.coalesce(numPartitions)) 54 | } 55 | val sourceIds = inputPath.split(',').map(_.trim).filter(_.nonEmpty).distinct 56 | union(RddUtil.emptyRDD[A], sourceIds.flatMap(CollectionSpecifics.get), 0) 57 | } 58 | 59 | def loadWarcFiles[R](inputPath: String)(action: RDD[(FilePointer, InputStream)] => R): R = { 60 | loadUnion[(FilePointer, InputStream), R](inputPath, s => s.loadWarcFiles(s.inputPath))(action) 61 | } 62 | 63 | override def loadCdx[R](inputPath: String)(action: RDD[CdxRecord] => R): R = { 64 | loadUnion[CdxRecord, R](inputPath, s => s.loadCdx(s.inputPath))(action) 65 | } 66 | } 67 | 68 | object UnionCollectionSpecifics { 69 | val Prefix = "UNION-" 70 | 71 | def collections(params: DerivationJobParameters)(implicit 72 | context: RequestContext = RequestContext.None): Seq[ArchCollection] = { 73 | params 74 | .get[Array[String]]("input") 75 | .toSeq 76 | .flatten 77 | .distinct 78 | .sorted 79 | .flatMap(ArchCollection.get(_)) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/ArchCollectionSpecLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.io.{FilePointer, WebArchiveLoader} 5 | import org.archive.webservices.ars.model.collections.inputspecs.meta.{FileMetaData, FileMetaField} 6 | 7 | import java.io.InputStream 8 | 9 | object ArchCollectionSpecLoader extends InputSpecLoader { 10 | 
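// Streams an ARCH collection's (W)ARC files as FileRecords. Each WarcFileRecord below wraps
// a single input stream and mixes in OneTimeAccess, so its content can be read only once;
// a second call to access throws.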
val specType = "collection" 11 | 12 | class WarcFileRecord(file: FilePointer, val in: InputStream) 13 | extends FileRecord 14 | with OneTimeAccess { 15 | override def filename: String = file.filename 16 | override def mime: String = WebArchiveLoader.WarcMime 17 | override def path: String = 18 | file.url.stripSuffix(file.filename).stripSuffix(FilePointer.SourceSeparator) 19 | override def pointer: FilePointer = file 20 | override lazy val meta: FileMetaData = FileMetaData( 21 | FileMetaField("filename", filename), 22 | FileMetaField("mime", mime), 23 | FileMetaField("path", path)) 24 | } 25 | 26 | override def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 27 | spec.collection.specifics.loadWarcFiles(spec.inputPath) { rdd => 28 | action(rdd.map { case (pointer, in) => 29 | new WarcFileRecord(pointer, in) 30 | }) 31 | } 32 | } 33 | 34 | override def size(spec: InputSpec): Long = spec.collection.stats.size 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/CdxQuerySpecLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.io.WebArchiveLoader 5 | import org.archive.webservices.ars.processing.jobs.system.UserDefinedQuery 6 | 7 | object CdxQuerySpecLoader extends InputSpecLoader { 8 | override def specType: String = "cdx-query" 9 | 10 | def input(spec: InputSpec): InputSpec = { 11 | spec.cursor 12 | .downField("input") 13 | .focus 14 | .map(json => InputSpec(json.hcursor)) 15 | }.getOrElse { 16 | throw new UnsupportedOperationException("No sub spec specified.") 17 | } 18 | 19 | override def size(spec: InputSpec): Long = Some(super.size(spec)).filter(_ != -1).getOrElse { 20 | input(spec).size 21 | } 22 | 23 | override def inputType(spec: InputSpec): Option[String] = Some(InputSpec.InputType.CDX) 24 | 25 | override def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = action({ 26 | for { 27 | query <- spec.params("query") 28 | } yield { 29 | for { 30 | error <- UserDefinedQuery.validateQuery(query) 31 | } throw new RuntimeException(error) 32 | 33 | WebArchiveLoader.loadCdx(input(spec)) { rdd => 34 | val queryBc = rdd.sparkContext.broadcast(query) 35 | rdd.mapPartitionsWithIndex { (idx, partition) => 36 | val cdx = UserDefinedQuery.filterQuery(partition, queryBc.value) 37 | Iterator(InMemoryCdxFileRecord(idx, cdx).asInstanceOf[FileRecord]) 38 | } 39 | } 40 | } 41 | }.getOrElse { 42 | throw new UnsupportedOperationException("missing query") 43 | }) 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/DatasetSpecLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | import org.apache.spark.rdd.RDD 3 | import org.archive.webservices.ars.io.FileAccessContext 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 5 | import org.archive.webservices.sparkling.Sparkling 6 | import org.archive.webservices.sparkling.util.RddUtil 7 | 8 | object DatasetSpecLoader extends InputSpecLoader { 9 | val specType = "dataset" 10 | 11 | override def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 12 | 
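// If the dataset can be expressed as a plain file spec, delegate to FileSpecLoader;
// otherwise parallelize the dataset's output files and read them from HDFS via
// HdfsFileRecordFactory with empty per-file metadata.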
spec.toFileSpec 13 | .map { fileSpec => 14 | FileSpecLoader.loadSpark(fileSpec)(action) 15 | } 16 | .getOrElse { 17 | val recordFactoryBc = Sparkling.sc.broadcast(HdfsFileRecordFactory()) 18 | val accessContext = FileAccessContext.fromLocalArchConf 19 | action(RddUtil.parallelize(spec.dataset.outFiles.toSeq).mapPartitions { partition => 20 | accessContext.init() 21 | val recordFactory = recordFactoryBc.value 22 | recordFactory.accessContext = accessContext 23 | val meta = FileMetaData.empty 24 | partition.map { file => 25 | recordFactory.get(file.path, file.mimeType, meta) 26 | } 27 | }) 28 | } 29 | } 30 | 31 | override def size(spec: InputSpec): Long = spec.dataset.outputSize 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/FileRecord.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.archive.webservices.ars.io.FilePointer 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 5 | 6 | import java.io.InputStream 7 | 8 | trait FileRecord { 9 | def filename: String = filePath.split('/').last 10 | def path: String = { 11 | val slashIdx = filePath.lastIndexOf('/') 12 | if (slashIdx < 0) "" else filePath.take(slashIdx) 13 | } 14 | def filePath: String = FileRecordFactory.filePath(path, filename) 15 | def mime: String 16 | def meta: FileMetaData 17 | def access: InputStream 18 | def pointer: FilePointer = FilePointer(filePath, filename) 19 | 20 | def withAccess(in: InputStream): FileRecord = { 21 | val origin = this 22 | new FileRecord { 23 | override def filename: String = origin.filename 24 | override def mime: String = origin.mime 25 | override def path: String = origin.path 26 | override def meta: FileMetaData = origin.meta 27 | override def access: InputStream = in 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/FileRecordFactory.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.archive.webservices.ars.io.{FileAccessContext, IOHelper} 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 5 | 6 | import java.io.InputStream 7 | 8 | trait FileRecordFactory extends Serializable { 9 | def companion: FileFactoryCompanion 10 | def dataSourceType: String = companion.dataSourceType 11 | @transient var accessContext: FileAccessContext = 12 | FileAccessContext.fromLocalArchConf 13 | def get(file: String, mime: String, meta: FileMetaData): FileRecord 14 | def accessFile( 15 | file: String, 16 | resolve: Boolean = true, 17 | accessContext: FileAccessContext = accessContext): InputStream 18 | } 19 | 20 | trait FileFactoryCompanion { 21 | def dataSourceType: String 22 | def apply(spec: InputSpec): FileRecordFactory 23 | } 24 | 25 | object FileRecordFactory { 26 | var factories: Seq[FileFactoryCompanion] = Seq( 27 | S3FileRecordFactory, 28 | S3HttpFileRecordFactory, 29 | HttpFileRecordFactory, 30 | HdfsFileRecordFactory, 31 | VaultFileRecordFactory) 32 | 33 | def apply(spec: InputSpec, default: FileFactoryCompanion): FileRecordFactory = { 34 | apply(spec, Some(default)) 35 | } 36 | 37 | def apply(spec: InputSpec, default: Option[FileFactoryCompanion] = 
None): FileRecordFactory = { 38 | spec 39 | .str(InputSpec.DataSourceKey) 40 | .flatMap { dataSource => 41 | factories.find { factory => 42 | factory.dataSourceType == dataSource 43 | } 44 | } 45 | .orElse(default) 46 | .getOrElse { 47 | throw new UnsupportedOperationException() 48 | } 49 | .apply(spec) 50 | } 51 | 52 | def filePath(path: String, filename: String): String = IOHelper.concatPaths(path, filename) 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/HdfsFileRecordFactory.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.archive.webservices.ars.io.FileAccessContext 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 5 | import org.archive.webservices.sparkling.io.HdfsIO 6 | 7 | import java.io.{FileNotFoundException, InputStream} 8 | 9 | class HdfsFileRecordFactory private (excludeSuffix: Option[String] = None) 10 | extends FileRecordFactory { 11 | def companion = HdfsFileRecordFactory 12 | 13 | class HdfsFileRecord private[HdfsFileRecordFactory] ( 14 | file: String, 15 | val mime: String, 16 | val meta: FileMetaData) 17 | extends FileRecord { 18 | override lazy val filePath: String = locateFile(file) 19 | override def access: InputStream = accessFile(filePath, resolve = false) 20 | } 21 | 22 | override def get(file: String, mime: String, meta: FileMetaData): FileRecord = { 23 | new HdfsFileRecord(file, mime, meta) 24 | } 25 | 26 | override def accessFile( 27 | file: String, 28 | resolve: Boolean, 29 | accessContext: FileAccessContext = accessContext): InputStream = { 30 | accessContext.hdfsIO.open(if (resolve) locateFile(file) else file) 31 | } 32 | 33 | def locateFile(filePath: String): String = { 34 | if (filePath.contains("*")) { 35 | val files = HdfsIO.files(filePath, recursive = false) 36 | val filtered = 37 | if (excludeSuffix.isEmpty) files else files.filter(!_.endsWith(excludeSuffix.get)) 38 | if (filtered.isEmpty) throw new FileNotFoundException() 39 | filtered.next 40 | } else filePath 41 | } 42 | } 43 | 44 | object HdfsFileRecordFactory extends FileFactoryCompanion { 45 | val dataSourceType: String = "hdfs" 46 | 47 | def apply(spec: InputSpec): HdfsFileRecordFactory = new HdfsFileRecordFactory( 48 | spec.str("metaSuffix")) 49 | 50 | def apply(): HdfsFileRecordFactory = new HdfsFileRecordFactory() 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/HttpFileRecordFactory.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.archive.webservices.ars.io.FileAccessContext 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 5 | 6 | import java.io.InputStream 7 | import java.net.URL 8 | 9 | class HttpFileRecordFactory(location: String) extends FileRecordFactory { 10 | def companion: FileFactoryCompanion = HttpFileRecordFactory 11 | 12 | class HttpFileRecord private[HttpFileRecordFactory] ( 13 | file: String, 14 | val mime: String, 15 | val meta: FileMetaData) 16 | extends FileRecord { 17 | override lazy val filePath: String = locateFile(file) 18 | override def access: InputStream = accessFile(filePath, resolve = false) 19 | } 20 | 21 | 
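// Illustrative usage (comment only, not part of the class); the URL and filename below are
// placeholders:
//   val factory = new HttpFileRecordFactory("https://example.org/data")
//   val record = factory.get("crawl.warc.gz", WebArchiveLoader.WarcMime, FileMetaData.empty)
//   val in = record.access // resolves the file name against `location` and opens a stream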
override def get(file: String, mime: String, meta: FileMetaData): FileRecord = { 22 | new HttpFileRecord(file, mime, meta) 23 | } 24 | 25 | def accessFile( 26 | file: String, 27 | resolve: Boolean = true, 28 | accessContext: FileAccessContext): InputStream = { 29 | val url = if (resolve) locateFile(file) else file 30 | println(s"Reading $url...") 31 | new URL(url).openStream 32 | } 33 | 34 | def locateFile(filename: String): String = FileRecordFactory.filePath(location, filename) 35 | } 36 | 37 | object HttpFileRecordFactory extends FileFactoryCompanion { 38 | val dataSourceType: String = "http" 39 | 40 | def apply(spec: InputSpec): HttpFileRecordFactory = { 41 | spec 42 | .str(InputSpec.DataLocationKey) 43 | .map(new HttpFileRecordFactory(_)) 44 | .getOrElse { 45 | throw new RuntimeException("No location URL specified.") 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/InMemoryCdxFileRecord.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.archive.webservices.ars.io.WebArchiveLoader 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.{FileMetaData, FileMetaField} 5 | import org.archive.webservices.sparkling.cdx.CdxRecord 6 | import org.archive.webservices.sparkling.io.IteratorInputStream 7 | 8 | import java.io.InputStream 9 | 10 | class InMemoryCdxFileRecord(override val filePath: String, records: Iterator[CdxRecord]) 11 | extends FileRecord { 12 | override def mime: String = WebArchiveLoader.CdxMime 13 | 14 | override def meta: FileMetaData = 15 | FileMetaData(FileMetaField("path", filePath), FileMetaField("mime", mime)) 16 | 17 | override def access: InputStream = 18 | new IteratorInputStream[CdxRecord](records, r => (r.toCdxString + "\n").getBytes) 19 | } 20 | 21 | object InMemoryCdxFileRecord { 22 | def apply(partition: Int, records: Iterator[CdxRecord]): InMemoryCdxFileRecord = { 23 | new InMemoryCdxFileRecord(s"partition-$partition.cdx.gz", records) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/InputSpecLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.io.WebArchiveLoader 5 | 6 | trait InputSpecLoader { 7 | def specType: String 8 | def inputType(spec: InputSpec): Option[String] = None 9 | def size(spec: InputSpec): Long = spec.get[Long]("size").getOrElse(-1) 10 | def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R 11 | def loadSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 12 | loadFilesSpark(spec) { rdd => 13 | val filtered = spec.inputType match { 14 | case InputSpec.InputType.WARC => 15 | rdd.filter(_.mime == WebArchiveLoader.WarcMime) 16 | case InputSpec.InputType.CDX => 17 | rdd.filter(_.mime == WebArchiveLoader.CdxMime) 18 | case _ => rdd 19 | } 20 | action(filtered) 21 | } 22 | } 23 | } 24 | 25 | object InputSpecLoader { 26 | var loaders: Seq[InputSpecLoader] = Seq( 27 | DatasetSpecLoader, 28 | ArchCollectionSpecLoader, 29 | FileSpecLoader, 30 | MetaRemoteSpecLoader, 31 | MetaFilesSpecLoader, 32 | MultiSpecLoader, 33 | CdxQuerySpecLoader) 34 | 35 | def 
get(spec: InputSpec): Option[InputSpecLoader] = { 36 | loaders.find(_.specType == spec.specType) 37 | } 38 | 39 | def loadSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 40 | spec.loader.loadSpark(spec)(action) 41 | } 42 | 43 | def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 44 | spec.loader.loadFilesSpark(spec)(action) 45 | } 46 | 47 | def size(spec: InputSpec): Long = spec.loader.size(spec) 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/LongestPrefixProbing.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import java.io.FileNotFoundException 4 | 5 | trait LongestPrefixProbing { 6 | protected def locateLongestPrefixPath(filename: String): String = { 7 | var remaining = filename 8 | var prefix = "" 9 | var next = nextPrefixes(prefix) 10 | while (next.nonEmpty) { 11 | val keys = 12 | next.map(p => (p, p.stripPrefix(prefix).stripSuffix("/"))).filter(_._2.nonEmpty) 13 | val longest = keys 14 | .filter { case (_, k) => 15 | remaining.startsWith(k) 16 | } 17 | .toSeq 18 | .sortBy(-_._2.length) 19 | .headOption 20 | .orElse { 21 | keys 22 | .filter { case (_, k) => 23 | filename.startsWith(k) 24 | } 25 | .toSeq 26 | .sortBy(-_._2.length) 27 | .headOption 28 | } 29 | if (longest.isEmpty) throw new FileNotFoundException(filename + s" ($prefix)") 30 | val (p, k) = longest.get 31 | if (k == filename) return prefix.stripSuffix("/") 32 | if (remaining.startsWith(k)) remaining = remaining.stripPrefix(k) 33 | prefix = p 34 | next = nextPrefixes(prefix) 35 | } 36 | throw new FileNotFoundException(filename + s" ($prefix)") 37 | } 38 | 39 | protected def nextPrefixes(prefix: String): Set[String] 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/MetaRemoteSpecLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | import org.apache.spark.rdd.RDD 3 | import org.apache.spark.sql.SparkSession 4 | import org.archive.webservices.ars.io.FileAccessContext 5 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 6 | import org.archive.webservices.sparkling.Sparkling 7 | import org.archive.webservices.sparkling.io.{HdfsIO, IOUtil} 8 | 9 | object MetaRemoteSpecLoader extends InputSpecLoader { 10 | val specType = "meta-remote" 11 | 12 | override def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = action({ 13 | val recordFactory = FileRecordFactory(spec) 14 | val recordFactoryBc = Sparkling.sc.broadcast(recordFactory) 15 | for { 16 | filenameKey <- spec.str("metaFilenameKey") 17 | mimeKey <- spec.str("metaMimeKey") 18 | } yield { 19 | val accessContext = FileAccessContext.fromLocalArchConf 20 | Sparkling.initPartitions(loadMeta(spec)).mapPartitions { partition => 21 | accessContext.init() 22 | val recordFactory = recordFactoryBc.value 23 | recordFactory.accessContext = accessContext 24 | partition.flatMap { meta => 25 | for { 26 | filename <- meta.str(filenameKey) 27 | mime <- meta.str(mimeKey) 28 | } yield recordFactory.get(filename, mime, meta) 29 | } 30 | } 31 | } 32 | }.getOrElse { 33 | throw new RuntimeException("No meta filename and/or mime key specified.") 34 | }) 
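// loadMeta below dispatches on the spec's meta (or data) source type: the HDFS source type
// reads the metadata parquet directly, the Vault source type first copies it to a temporary
// HDFS path, and anything else raises UnsupportedOperationException. Each parquet row becomes
// a FileMetaData whose metaFilenameKey/metaMimeKey columns identify the file and its MIME
// type (see loadFilesSpark above).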
35 | 36 | def loadMeta(spec: InputSpec): RDD[FileMetaData] = { 37 | spec 38 | .str(InputSpec.MetaSourceKey) 39 | .orElse(spec.str(InputSpec.DataSourceKey)) 40 | .flatMap { 41 | case HdfsFileRecordFactory.dataSourceType => Some(loadMetaHdfs(spec)) 42 | case VaultFileRecordFactory.dataSourceType => Some(loadMetaVault(spec)) 43 | case _ => None 44 | } 45 | .getOrElse { 46 | throw new UnsupportedOperationException() 47 | } 48 | } 49 | 50 | def loadMetaHdfs(spec: InputSpec): RDD[FileMetaData] = { 51 | spec 52 | .str(InputSpec.MetaLocationKey) 53 | .map { 54 | case location if location.endsWith(".parquet") => 55 | loadParquet(location) 56 | case _ => throw new UnsupportedOperationException() 57 | } 58 | .getOrElse { 59 | throw new RuntimeException("No meta location specified") 60 | } 61 | } 62 | 63 | def loadMetaVault(spec: InputSpec): RDD[FileMetaData] = { 64 | spec 65 | .str(InputSpec.MetaLocationKey) 66 | .map { 67 | case location if location.endsWith(".parquet") => 68 | val in = VaultFileRecordFactory(spec).accessFile(location) 69 | val tmpFile = HdfsIO.createTmpPath() 70 | val out = HdfsIO.out(tmpFile) 71 | try { 72 | IOUtil.copy(in, out) 73 | } finally { 74 | out.close() 75 | } 76 | loadParquet(tmpFile) 77 | case _ => throw new UnsupportedOperationException() 78 | } 79 | .getOrElse { 80 | throw new RuntimeException("No meta location specified") 81 | } 82 | } 83 | 84 | def loadParquet(path: String): RDD[FileMetaData] = { 85 | val dataFrame = SparkSession.builder.getOrCreate.read.parquet(path) 86 | val schema = Sparkling.sc.broadcast(dataFrame.schema) 87 | dataFrame.rdd.map { row => 88 | FileMetaData.fromParquet(schema.value, row) 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/MultiSpecLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.sparkling.util.RddUtil 5 | 6 | object MultiSpecLoader extends InputSpecLoader { 7 | override def specType: String = "multi-specs" 8 | 9 | def multiSpecs(spec: InputSpec): Iterator[InputSpec] = { 10 | spec.cursor 11 | .downField("specs") 12 | .values 13 | .toIterator 14 | .flatten 15 | .map(json => InputSpec(json.hcursor)) 16 | } 17 | 18 | override def size(spec: InputSpec): Long = Some(super.size(spec)).filter(_ != -1).getOrElse { 19 | val sizes = multiSpecs(spec).map(_.size).filter(_ != -1) 20 | if (sizes.isEmpty) -1 else sizes.sum 21 | } 22 | 23 | override def inputType(spec: InputSpec): Option[String] = { 24 | val types = multiSpecs(spec).map(_.inputType).toSet 25 | Some(if (types.size == 1) types.head else InputSpec.InputType.Files) 26 | } 27 | 28 | private def unionSpark[R](spec: InputSpec, load: InputSpec => (RDD[FileRecord] => R) => R)( 29 | action: RDD[FileRecord] => R): R = { 30 | val specs = multiSpecs(spec) 31 | var union = RddUtil.emptyRDD[FileRecord] 32 | def next: R = { 33 | if (specs.hasNext) { 34 | load(specs.next) { rdd => 35 | union = union.union(rdd) 36 | next 37 | } 38 | } else action(union) 39 | } 40 | next 41 | } 42 | 43 | override def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 44 | unionSpark[R](spec, InputSpecLoader.loadFilesSpark)(action) 45 | } 46 | 47 | override def loadSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 48 | unionSpark[R](spec, InputSpecLoader.loadSpark)(action) 
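// Same union strategy as loadFilesSpark above; since each nested InputSpecLoader.loadSpark
// call applies its own WARC/CDX mime filtering before the union, mixed sub-specs still yield
// a consistently filtered RDD.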
49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/OneTimeAccess.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import java.io.InputStream 4 | 5 | trait OneTimeAccess { this: FileRecord => 6 | private var accessed = false 7 | 8 | def in: InputStream 9 | 10 | override def access: InputStream = { 11 | if (!accessed) { 12 | accessed = true 13 | in 14 | } else throw new UnsupportedOperationException("InputStream can only be accessed once.") 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/S3HttpFileRecordFactory.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import scala.io.Source 4 | 5 | class S3HttpFileRecordFactory(location: String, longestPrefixMapping: Boolean) 6 | extends HttpFileRecordFactory(location) 7 | with LongestPrefixProbing { 8 | override def companion: FileFactoryCompanion = S3HttpFileRecordFactory 9 | 10 | override def locateFile(file: String): String = { 11 | if (longestPrefixMapping) FileRecordFactory.filePath(locateLongestPrefixPath(file), file) 12 | else super.locateFile(file) 13 | } 14 | 15 | private val prefixes = collection.mutable.Map.empty[String, Set[String]] 16 | override protected def nextPrefixes(prefix: String): Set[String] = { 17 | prefixes.getOrElseUpdate( 18 | prefix, { 19 | val url = location + "?delimiter=/&prefix=" + prefix 20 | val source = Source.fromURL(url) 21 | try { 22 | source.mkString 23 | .split('<') 24 | .filter(keyValue => keyValue.startsWith("Prefix>") || keyValue.startsWith("Key>")) 25 | .map { keyValue => 26 | keyValue.split('>').last 27 | } 28 | .toSet 29 | } finally { 30 | source.close() 31 | } 32 | }) 33 | } 34 | } 35 | 36 | object S3HttpFileRecordFactory extends FileFactoryCompanion { 37 | val dataSourceType: String = "s3-http" 38 | 39 | def apply(spec: InputSpec): S3HttpFileRecordFactory = { 40 | spec 41 | .str(InputSpec.DataLocationKey) 42 | .map { location => 43 | val longestPrefixMapping = spec.str("dataPathMapping").contains("longest-prefix") 44 | new S3HttpFileRecordFactory(location, longestPrefixMapping) 45 | } 46 | .getOrElse { 47 | throw new RuntimeException("No location URL specified.") 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/meta/FileMetaField.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs.meta 2 | 3 | import io.circe.Json 4 | 5 | import scala.reflect.ClassTag 6 | 7 | class FileMetaField private (val key: String, val value: Any, val fieldType: FileMetaFieldType) 8 | extends Serializable { 9 | def get[A: ClassTag]: Option[A] = implicitly[ClassTag[A]].unapply(value) 10 | def gets[A: ClassTag]: Seq[A] = 11 | get[Seq[_]].toSeq.flatMap(_.flatMap(implicitly[ClassTag[A]].unapply)) 12 | def toJson: Json = fieldType.toJson(value) 13 | } 14 | 15 | object FileMetaField { 16 | def apply(key: String, value: Any, fieldType: FileMetaFieldType): FileMetaField = { 17 | new FileMetaField(key, value, fieldType) 18 | } 19 | 20 | 
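// Convenience constructors that infer the FileMetaFieldType from the Scala value type: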
def apply(key: String, value: String): FileMetaField = 21 | FileMetaField(key, value, FileMetaFieldType.String) 22 | def apply(key: String, value: Int): FileMetaField = 23 | FileMetaField(key, value, FileMetaFieldType.Number) 24 | def apply(key: String, value: Long): FileMetaField = 25 | FileMetaField(key, value, FileMetaFieldType.Number) 26 | def apply(key: String, value: Double): FileMetaField = 27 | FileMetaField(key, value, FileMetaFieldType.Number) 28 | def apply(key: String, value: Boolean): FileMetaField = 29 | FileMetaField(key, value, FileMetaFieldType.Boolean) 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/meta/FileMetaFieldSummary.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs.meta 2 | 3 | import io.circe._ 4 | import io.circe.syntax._ 5 | 6 | import scala.collection.immutable.ListMap 7 | 8 | class FileMetaFieldSummary extends Serializable { 9 | var optional: Boolean = false 10 | var types: Map[FileMetaFieldType, FileMetaFieldTypeSummary] = Map.empty 11 | 12 | def add(field: FileMetaField): Unit = { 13 | val summary = types.getOrElse( 14 | field.fieldType, { 15 | val summary = field.fieldType.primitive match { 16 | case FileMetaFieldType.String => new FileMetaFieldStringTypeSummary 17 | case FileMetaFieldType.Number => new FileMetaFieldNumberTypeSummary 18 | case FileMetaFieldType.Boolean => FileMetaFieldBooleanTypeSummary 19 | } 20 | types += field.fieldType.primitive -> summary 21 | summary 22 | }) 23 | if (field.fieldType.multi) { 24 | val values = field.value.asInstanceOf[Seq[_]] 25 | summary.adds(values) 26 | } else summary.add(field.value) 27 | } 28 | 29 | def ++(that: FileMetaFieldSummary): FileMetaFieldSummary = { 30 | val summary = new FileMetaFieldSummary 31 | summary.optional = optional || that.optional 32 | summary.types = (types.keySet ++ that.types.keySet).toSeq.map { t => 33 | val thisType = types.get(t) 34 | val thatType = that.types.get(t) 35 | t -> (if (thisType.isEmpty || thatType.isEmpty) thisType.orElse(thatType).get 36 | else thisType.get ++ thatType.get) 37 | }.toMap 38 | summary 39 | } 40 | 41 | def toJson: Json = { 42 | Map("optional" -> optional.asJson, "types" -> types.values.map(_.toJson).toMap.asJson).asJson 43 | } 44 | 45 | def toJsonSchemaProperties: Seq[(String, Json)] = { 46 | if (types.size == 1) { 47 | types.head._2.toJsonSchemaProperties 48 | } else { 49 | Seq("oneOf" -> types.toSeq.map { case (_, t) => 50 | ListMap(t.toJsonSchemaProperties: _*).asJson 51 | }.asJson) 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/meta/FileMetaFieldType.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs.meta 2 | 3 | import _root_.io.circe.syntax._ 4 | import io.circe.Json 5 | 6 | trait FileMetaFieldType extends Serializable { 7 | def primitive: FileMetaFieldType = this 8 | def multi: Boolean = false 9 | def toJson(value: Any): Json 10 | } 11 | 12 | trait FileMetaFieldMultiType extends FileMetaFieldType { 13 | def primitive: FileMetaFieldType 14 | override def multi: Boolean = true 15 | def toJson(value: Any): Json = value.asInstanceOf[Seq[_]].map(primitive.toJson).asJson 16 | } 17 | 18 | object 
FileMetaFieldType { 19 | case object String extends FileMetaFieldType { 20 | override def toJson(value: Any): Json = { 21 | if (value == null) Json.Null else value.asInstanceOf[String].asJson 22 | } 23 | } 24 | 25 | case object Number extends FileMetaFieldType { 26 | override def toJson(value: Any): Json = { 27 | value match { 28 | case i: Int => i.asJson 29 | case l: Long => l.asJson 30 | case d: Double => d.asJson 31 | } 32 | } 33 | } 34 | 35 | case object Boolean extends FileMetaFieldType { 36 | override def toJson(value: Any): Json = value.asInstanceOf[Boolean].asJson 37 | } 38 | 39 | case object Strings extends FileMetaFieldMultiType { 40 | override def primitive: FileMetaFieldType = String 41 | } 42 | 43 | case object Numbers extends FileMetaFieldMultiType { 44 | override def primitive: FileMetaFieldType = Number 45 | } 46 | 47 | case object Booleans extends FileMetaFieldMultiType { 48 | override def primitive: FileMetaFieldType = Boolean 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/meta/FileMetaSummary.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs.meta 2 | 3 | import io.circe._ 4 | import io.circe.syntax._ 5 | 6 | import scala.collection.immutable.ListMap 7 | 8 | object FileMetaSummary { 9 | val MaxOptions = 10 10 | val MaxStringOptionLength = 100 11 | 12 | lazy val empty = new FileMetaSummary() 13 | } 14 | 15 | class FileMetaSummary extends Serializable { 16 | private var fields: ListMap[String, FileMetaFieldSummary] = ListMap.empty 17 | 18 | def add(meta: FileMetaData): Unit = { 19 | for (missing <- (fields -- meta.keys).values) missing.optional = true 20 | val first = fields.isEmpty 21 | for (field <- meta.fields) { 22 | fields 23 | .getOrElse( 24 | field.key, { 25 | val summary = new FileMetaFieldSummary 26 | summary.optional = !first 27 | fields += field.key -> summary 28 | summary 29 | }) 30 | .add(field) 31 | } 32 | } 33 | 34 | def ++(that: FileMetaSummary): FileMetaSummary = { 35 | if (fields.isEmpty) that 36 | else if (that.fields.isEmpty) this 37 | else { 38 | val newFields = { 39 | fields.toSeq.map(_._1).zipWithIndex ++ that.fields.toSeq.map(_._1).zipWithIndex 40 | }.groupBy(_._1) 41 | .toSeq 42 | .map { case (key, group) => 43 | (key, group.map(_._2).min) 44 | } 45 | .sortBy(_._2) 46 | .map(_._1) 47 | .map { field => 48 | val thisField = fields.get(field) 49 | val thatField = that.fields.get(field) 50 | field -> { 51 | if (thisField.isEmpty || thatField.isEmpty) { 52 | val field = thisField.orElse(thatField).get 53 | field.optional = true 54 | field 55 | } else thisField.get ++ thatField.get 56 | } 57 | } 58 | val summary = new FileMetaSummary 59 | summary.fields ++= newFields 60 | summary 61 | } 62 | } 63 | 64 | def toJson: Json = { 65 | fields.toSeq 66 | .map { case (key, field) => 67 | key -> field.toJson 68 | } 69 | .toMap 70 | .asJson 71 | } 72 | 73 | def toJsonSchema: Json = { 74 | ListMap( 75 | "$schema" -> "https://json-schema.org/draft/2020-12/schema".asJson, 76 | "type" -> "object".asJson, 77 | "required" -> fields.toSeq.filter(!_._2.optional).map(_._1).asJson, 78 | "properties" -> ListMap(fields.toSeq.map { case (key, field) => 79 | key -> (ListMap("title" -> key.asJson) ++ field.toJsonSchemaProperties) 80 | }: _*).asJson).asJson 81 | } 82 | } 83 | 
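// Illustrative example (not a file in this repository): aggregating per-file metadata into a
// FileMetaSummary and rendering it as a JSON Schema. Field names and values are made up, and
// the varargs FileMetaData(...) constructor is assumed to behave as used elsewhere in this
// package (e.g. ArchCollectionSpecLoader).
import org.archive.webservices.ars.model.collections.inputspecs.meta.{FileMetaData, FileMetaField, FileMetaSummary}

object FileMetaSummaryExample {
  def main(args: Array[String]): Unit = {
    val summary = new FileMetaSummary()
    Seq(
      FileMetaData(FileMetaField("filename", "a.warc.gz"), FileMetaField("size", 1024L)),
      FileMetaData(FileMetaField("filename", "b.warc.gz")) // no "size" field -> marked optional
    ).foreach(summary.add)
    println(summary.toJsonSchema.spaces4) // circe Json, pretty-printed with 4-space indent
  }
}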
-------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/users/DefaultArchUser.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.users 2 | 3 | case class DefaultArchUser( 4 | id: String, 5 | userName: String, 6 | fullName: String, 7 | email: Option[String], 8 | isAdmin: Boolean, 9 | isUser: Boolean = true) 10 | extends ArchUser 11 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/users/KeystoneUser.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.users 2 | 3 | import io.circe.parser 4 | import io.circe.syntax._ 5 | import org.archive.webservices.ars.model.ArchConf 6 | import requests._ 7 | 8 | object KeystoneUser { 9 | val prefix = "ks" 10 | 11 | private def parseKeystoneUserResponse(r: Response): Option[DefaultArchUser] = 12 | if (r.statusCode != 200) None 13 | else { 14 | parser.parse(r.text) match { 15 | case Left(error) => 16 | None 17 | case Right(json) => 18 | val cursor = json.hcursor 19 | Some( 20 | DefaultArchUser( 21 | id = prefix + ":" + cursor.get[String]("username").toOption.get, 22 | userName = cursor.get[String]("username").toOption.get, 23 | fullName = cursor.get[String]("fullname").toOption.get, 24 | email = cursor.get[String]("email").toOption, 25 | isAdmin = cursor.get[Boolean]("is_staff").toOption.get)) 26 | } 27 | } 28 | 29 | def get(username: String): Option[DefaultArchUser] = 30 | if (ArchConf.keystoneBaseUrl.isEmpty || ArchConf.keystonePrivateApiKey.isEmpty) { 31 | None 32 | } else { 33 | parseKeystoneUserResponse( 34 | requests.get( 35 | s"${ArchConf.keystoneBaseUrl.get}/private/api/user?username=${username}", 36 | headers = Map("X-API-Key" -> ArchConf.keystonePrivateApiKey.get), 37 | check = false)) 38 | } 39 | 40 | def login(username: String, password: String): Option[DefaultArchUser] = 41 | if (ArchConf.keystoneBaseUrl.isEmpty || ArchConf.keystonePrivateApiKey.isEmpty) { 42 | None 43 | } else { 44 | parseKeystoneUserResponse( 45 | requests.post( 46 | s"${ArchConf.keystoneBaseUrl.get}/private/api/proxy_login", 47 | data = Map("username" -> username, "password" -> password).asJson.noSpaces, 48 | headers = Map("X-API-Key" -> ArchConf.keystonePrivateApiKey.get), 49 | check = false)) 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/DerivationJobParameters.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | import io.circe.parser._ 4 | import io.circe.syntax._ 5 | import io.circe.{Decoder, Encoder, HCursor, Json} 6 | 7 | case class DerivationJobParameters(values: Map[String, Json]) extends Serializable { 8 | def size: Int = values.size 9 | def isEmpty: Boolean = size == 0 10 | def nonEmpty: Boolean = !isEmpty 11 | 12 | def set[A: Encoder](key: String, value: A): DerivationJobParameters = { 13 | DerivationJobParameters(values.updated(key, value.asJson)) 14 | } 15 | 16 | def set[A: Encoder](keyValues: (String, A)*): DerivationJobParameters = { 17 | DerivationJobParameters(values ++ keyValues.map { case (k, v) => k -> v.asJson }) 18 | } 19 | 20 | def set(keyValues: (String, Json)*): DerivationJobParameters = { 21 | DerivationJobParameters(values ++ 
keyValues) 22 | } 23 | 24 | def get[A: Decoder](key: String): Option[A] = values.get(key).flatMap(_.as[A].toOption) 25 | 26 | def toJson: Json = values.asJson 27 | } 28 | 29 | object DerivationJobParameters { 30 | val Empty: DerivationJobParameters = DerivationJobParameters(Map.empty) 31 | 32 | def fromJson(cursor: HCursor): Option[DerivationJobParameters] = cursor.keys.map { keys => 33 | val params = keys.flatMap { key => 34 | cursor.downField(key).focus.map(key -> _) 35 | }.toMap 36 | DerivationJobParameters(params) 37 | } 38 | 39 | def fromJson(json: Json): Option[DerivationJobParameters] = fromJson(json.hcursor) 40 | 41 | def fromJson(json: String): Option[DerivationJobParameters] = 42 | parse(json).right.toOption.flatMap(fromJson) 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/GenericJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | trait GenericJob extends DerivationJob { 4 | override def enqueue( 5 | conf: DerivationJobConf, 6 | get: DerivationJobInstance => Unit = _ => {}): Option[DerivationJobInstance] = { 7 | super.enqueue(conf, get).flatMap(GenericJobManager.enqueue) 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/GenericJobManager.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | object GenericJobManager extends JobManagerBase("Generic", 3) 4 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/JobQueue.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | import org.archive.webservices.ars.model.collections.inputspecs.InputSpec 4 | 5 | class JobQueue(val name: String) { 6 | private var _pos = 0 7 | private val queue = collection.mutable.Queue.empty[DerivationJobInstance] 8 | 9 | def items: Iterator[DerivationJobInstance] = queue.toIterator 10 | 11 | def isEmpty: Boolean = queue.isEmpty 12 | def nonEmpty: Boolean = queue.nonEmpty 13 | def size: Int = queue.size 14 | 15 | def enqueue(instance: DerivationJobInstance): Int = synchronized { 16 | val remainder = Int.MaxValue - _pos 17 | val thisPos = if (remainder < queue.size) queue.size - remainder else _pos + queue.size 18 | queue.enqueue(instance) 19 | instance.updateState(ProcessingState.Queued) 20 | thisPos 21 | } 22 | 23 | def dequeue: DerivationJobInstance = synchronized { 24 | if (_pos == Int.MaxValue) _pos = 1 else _pos += 1 25 | queue.dequeue 26 | } 27 | 28 | def dequeue( 29 | freeSlots: Int, 30 | excludeSources: Set[InputSpec.Identifier] = Set.empty, 31 | recentUsers: Seq[String]): Option[DerivationJobInstance] = synchronized { 32 | val idxs = recentUsers.zipWithIndex.map { case (user, idx) => user -> (idx + 1) }.toMap 33 | var minIdx = 0 34 | var minUserInstance: Option[DerivationJobInstance] = None 35 | queue 36 | .dequeueFirst { instance => 37 | instance.slots <= freeSlots && !excludeSources.contains( 38 | instance.conf.inputSpec) && instance.user.map(_.id).forall { id => 39 | idxs.get(id) match { 40 | case Some(idx) => 41 | if (minIdx == 0 || idx < minIdx) { 42 | minIdx = idx 43 | minUserInstance = Some(instance) 44 | } 45 | false 46 | case None 
=> true 47 | } 48 | } 49 | } 50 | .orElse(minUserInstance.flatMap(instance => queue.dequeueFirst(_ == instance))) 51 | } 52 | 53 | def pos: Int = _pos 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/PartialDerivationJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 3 | 4 | abstract class PartialDerivationJob(parent: ChainedJob) extends DerivationJob { 5 | override val partialOf: Option[DerivationJob] = Some(parent) 6 | override lazy val id: String = parent.id + "_" + super.id 7 | val name: String = id 8 | override lazy val uuid: String = parent.uuid 9 | override def relativeOutPath: String = parent.relativeOutPath 10 | val category: ArchJobCategory = ArchJobCategories.None 11 | val description: String = id 12 | override val templateName: Option[String] = None 13 | override def enqueue( 14 | conf: DerivationJobConf, 15 | get: DerivationJobInstance => Unit = _ => {}): Option[DerivationJobInstance] = 16 | super.enqueue(conf, get) 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/ProcessingState.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | object ProcessingState { 4 | val Strings: Seq[String] = Seq("Not started", "Queued", "Running", "Finished", "Failed") 5 | 6 | val NotStarted = 0 7 | val Queued = 1 8 | val Running = 2 9 | val Finished = 3 10 | val Failed = 4 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/SampleVizData.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | import io.circe._ 4 | import io.circe.generic.semiauto._ 5 | 6 | case class SampleVizData( 7 | nodes: Seq[(String, String)], 8 | edges: Option[Seq[(String, String)]] = None) 9 | 10 | object SampleVizData { 11 | implicit val sampleVizDataEncoder: Encoder[SampleVizData] = deriveEncoder 12 | } 13 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/SparkJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | trait SparkJob extends DerivationJob { 4 | override def enqueue( 5 | conf: DerivationJobConf, 6 | get: DerivationJobInstance => Unit = _ => {}): Option[DerivationJobInstance] = { 7 | super.enqueue(conf, get).flatMap(SparkJobManager.enqueue) 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/SparkJobListener.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | import org.apache.spark.scheduler._ 4 | import org.archive.webservices.sparkling.io.StageSyncManager 5 | 6 | import java.time.Instant 7 | 8 | object SparkJobListener extends SparkListener { 9 | private val _taskStartTimes = collection.mutable.Map.empty[String, Long] 10 | 11 | def taskStartTimes: Map[String, Long] = _taskStartTimes.toMap 12 | 13 
| def id(info: TaskInfo): String = info.id + "#" + info.taskId 14 | 15 | override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = synchronized { 16 | _taskStartTimes(id(taskStart.taskInfo)) = Instant.now.getEpochSecond 17 | } 18 | 19 | override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized { 20 | _taskStartTimes.remove(id(taskEnd.taskInfo)) 21 | } 22 | 23 | override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { 24 | StageSyncManager.cleanup(StageSyncManager.stageId(stageCompleted.stageInfo.stageId)) 25 | } 26 | 27 | def reset(): Unit = synchronized(_taskStartTimes.clear()) 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/SparkJobManager.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.SparkSession 5 | import org.archive.webservices.ars.Arch 6 | import org.archive.webservices.ars.model.ArchConf 7 | import org.archive.webservices.sparkling.Sparkling.executionContext 8 | import org.archive.webservices.sparkling.util.SparkUtil 9 | import org.archive.webservices.sparkling.{Sparkling, _} 10 | 11 | import java.io.File 12 | import java.time.Instant 13 | import scala.concurrent.Future 14 | 15 | object SparkJobManager 16 | extends JobManagerBase("Spark", 3, timeoutSecondsMinMax = Some((60 * 60, 60 * 60 * 3))) { 17 | val taskTimeoutSeconds = 60 * 60 * 12 // 12 hours 18 | val SharedSparkContext = true 19 | val SparkAllocationFile = "fairscheduler.xml" 20 | val MaxPriorityWeight = 128 21 | val PoolPrefix = "weight-" 22 | 23 | private var _context: Option[SparkContext] = None 24 | 25 | def context: Future[SparkContext] = { 26 | Future { 27 | synchronized(_context.filter(!_.isStopped).getOrElse { 28 | val context = SparkUtil.config( 29 | SparkSession.builder, 30 | appName = s"ARCH ${ArchConf.deploymentEnvironment}", 31 | executors = 15, 32 | executorCores = 4, 33 | executorMemory = "16g", 34 | queue = ArchConf.hadoopQueue, 35 | additionalConfigs = Map( 36 | "spark.master" -> ArchConf.sparkMaster, 37 | "spark.scheduler.mode" -> "FAIR", 38 | "spark.yarn.executor.memoryOverhead" -> (4.gb / 1.mb).toString, // off-heap memory in MiB 39 | "spark.scheduler.allocation.file" -> new File(SparkAllocationFile).getAbsolutePath, 40 | "spark.yarn.am.memory" -> "4096m"), 41 | verbose = true) 42 | context.setLogLevel("INFO") 43 | _context = Some(context) 44 | Sparkling.resetSparkContext(Some(context)) 45 | context.addSparkListener(SparkJobListener) 46 | println("New Spark context initialized: " + context.applicationId) 47 | context 48 | }) 49 | } 50 | } 51 | 52 | private def priorityWeight: Int = if (currentPriority == 0) 1 else currentPriority 53 | 54 | def initThread(sc: SparkContext, job: DerivationJob, conf: DerivationJobConf): Unit = { 55 | sc.setJobGroup(job.uuid, job.name + " " + conf.serialize) 56 | sc.setLocalProperty("spark.scheduler.pool", PoolPrefix + priorityWeight) 57 | } 58 | 59 | def stopContext(): Unit = synchronized { 60 | for (context <- _context) { 61 | context.stop() 62 | while (!context.isStopped) Thread.`yield`() 63 | _context = None 64 | Sparkling.resetSparkContext() 65 | } 66 | } 67 | 68 | override protected def onAllJobsFinished(): Unit = synchronized { 69 | super.onAllJobsFinished() 70 | if (!Arch.debugging) stopContext() 71 | } 72 | 73 | override protected def 
onTimeout(instances: Seq[DerivationJobInstance]): Unit = synchronized { 74 | val threshold = Instant.now.getEpochSecond - taskTimeoutSeconds 75 | val startTimes = SparkJobListener.taskStartTimes.values 76 | if (startTimes.forall(_ < threshold)) { 77 | SparkJobListener.synchronized { 78 | SparkJobListener.reset() 79 | stopContext() 80 | return 81 | } 82 | } 83 | if (numQueued > 0 && freeSlots == 0) bypassJobs() 84 | } 85 | 86 | def bypassJobs(): Boolean = synchronized { 87 | if (priorityWeight < MaxPriorityWeight && priorityRunningCount > 0) { 88 | newPriority(priorityWeight * 2) 89 | true 90 | } else false 91 | } 92 | 93 | def run(job: DerivationJob, conf: DerivationJobConf): Future[Boolean] = { 94 | if (SharedSparkContext) job.run(conf) else SparkRunner.run(job, conf) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/SparkRunner.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | import org.apache.tools.ant.taskdefs.Java 4 | import org.apache.tools.ant.{DefaultLogger, Project} 5 | import org.archive.webservices.sparkling.Sparkling.executionContext 6 | 7 | import java.io.File 8 | import scala.concurrent.duration.Duration 9 | import scala.concurrent.{Await, Future} 10 | 11 | object SparkRunner { 12 | def run(job: DerivationJob, conf: DerivationJobConf): Future[Boolean] = Future { 13 | val mainClass = getClass.getName.stripSuffix("$") 14 | val args = Seq(job.getClass.getName, conf.serialize) 15 | 16 | val command = System.getProperty("sun.java.command") 17 | val jar = command.split(' ').head 18 | val isSbt = jar.endsWith("/sbt-launch.jar") 19 | 20 | val project = new Project 21 | 22 | val logger = new DefaultLogger 23 | project.addBuildListener(logger) 24 | logger.setOutputPrintStream(System.out) 25 | logger.setErrorPrintStream(System.err) 26 | logger.setMessageOutputLevel(Project.MSG_DEBUG) 27 | 28 | val jvm = new Java 29 | jvm.setTaskName(job.getClass.getSimpleName.stripSuffix("$")) 30 | jvm.setProject(project) 31 | jvm.setFork(true) 32 | jvm.setCloneVm(true) 33 | jvm.setJar(new File(jar)) 34 | 35 | if (isSbt) jvm.createArg.setValue("runMain " + mainClass + " " + args.mkString(" ")) 36 | else { 37 | jvm.setClassname(mainClass) 38 | for (arg <- args) jvm.createArg.setValue(arg) 39 | } 40 | 41 | jvm.executeJava == 0 42 | } 43 | 44 | def main(args: Array[String]): Unit = { 45 | val Array(className, confStr) = args 46 | DerivationJobConf.deserialize(confStr) match { 47 | case Some(conf) => 48 | val job = 49 | Class.forName(className).getField("MODULE$").get(null).asInstanceOf[DerivationJob] 50 | val success = Await.result(job.run(conf), Duration.Inf) 51 | Await.ready( 52 | SparkJobManager.context.map { sc => 53 | sc.stop() 54 | while (!sc.isStopped) Thread.`yield`() 55 | }, 56 | Duration.Inf) 57 | System.exit(if (success) 0 else 1) 58 | case None => 59 | System.exit(2) 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/AudioInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 6 | import 
org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | object AudioInformationExtraction extends BinaryInformationAutJob { 9 | val name = "Audio file information" 10 | val uuid = "01895066-7db2-794b-b91b-e3f5a340e859" 11 | 12 | override val infoUrl = 13 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#audio" 14 | 15 | val description = 16 | "Locations and metadata for MP3, WAV, AAC, and other audio formatted files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 17 | 18 | val targetFile: String = "audio-information.csv.gz" 19 | 20 | def checkMime(url: String, server: String, tika: String): Boolean = 21 | tika.startsWith("audio/") 22 | 23 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/DomainFrequencyExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.functions.desc 5 | import org.apache.spark.sql.{Dataset, Row} 6 | import org.archive.webservices.ars.aut.{AutLoader, AutUtil} 7 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 8 | import org.archive.webservices.ars.processing.jobs.shared.AutJob 9 | import org.archive.webservices.ars.processing.{DerivationJobConf, ProcessingState, SampleVizData} 10 | import org.archive.webservices.ars.util.{Common, PublicSuffixUtil} 11 | import org.archive.webservices.sparkling.io.HdfsIO 12 | import org.archive.webservices.sparkling.warc.WarcRecord 13 | 14 | import java.io.PrintStream 15 | 16 | object DomainFrequencyExtraction extends AutJob[(String, Long)] { 17 | val name = "Domain frequency" 18 | val uuid = "01894bc7-ff6a-7e25-a5b5-4570425a8ab7" 19 | val category: ArchJobCategory = ArchJobCategories.Collection 20 | 21 | override val infoUrl = 22 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410734896148-ARCH-Collection-datasets#domain-frequency" 23 | 24 | val description = 25 | "The number of unique documents collected from each domain in the collection. Output: one CSV file with columns for domain and count." 
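// The `prepareRecords` override further below emits one (domain, 1L) pair per valid page, with the
// domain normalized via the broadcast public suffix list and the "www." prefix removed; `df` sums
// those pairs with reduceByKey and orders the result by descending count, so the CSV described
// above is written sorted by frequency. `sampleVizData` later reads the first lines of that CSV
// back in to provide the "topDomains" preview used by the template.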
26 | 27 | val targetFile: String = "domain-frequency.csv.gz" 28 | 29 | override def printToOutputStream(out: PrintStream): Unit = out.println("domain, count") 30 | 31 | override def df(rdd: RDD[(String, Long)]): Dataset[Row] = { 32 | val rows = rdd 33 | .reduceByKey(_ + _) 34 | .map { case (domain, count) => 35 | Row(domain, count) 36 | } 37 | AutLoader.domainFrequency(rows).orderBy(desc("count")) 38 | } 39 | 40 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[(String, Long)] = { 41 | val publicSuffixes = PublicSuffixUtil.broadcast(rdd.context) 42 | rdd 43 | .flatMap { r => 44 | Common.tryOrElse[Option[(String, Long)]](None) { 45 | r.http.filter(AutUtil.validPage(r, _)).map { _ => 46 | val url = AutUtil.url(r) 47 | (AutUtil.extractDomainRemovePrefixWWW(url, publicSuffixes.value), 1L) 48 | } 49 | } 50 | } 51 | } 52 | 53 | override val templateName: Option[String] = Some("jobs/DomainFrequencyExtraction") 54 | 55 | override def sampleVizData(conf: DerivationJobConf): Option[SampleVizData] = 56 | checkFinishedState(conf.outputPath + relativeOutPath) match { 57 | case Some(ProcessingState.Finished) => 58 | Some( 59 | SampleVizData( 60 | HdfsIO 61 | .lines(conf.outputPath + relativeOutPath + "/" + targetFile, 11) 62 | .drop(1) 63 | .flatMap { line => 64 | val comma = line.lastIndexOf(',') 65 | if (comma < 0) None 66 | else 67 | Some { 68 | val (domain, freq) = 69 | (line.take(comma).stripPrefix("\"").stripSuffix("\""), line.drop(comma + 1)) 70 | (domain, freq) 71 | } 72 | })) 73 | case _ => None 74 | } 75 | 76 | override def templateVariables(conf: DerivationJobConf): Seq[(String, Any)] = { 77 | super.templateVariables(conf) ++ Seq( 78 | "topDomains" -> 79 | sampleVizData(conf).map(_.nodes).getOrElse(Seq.empty)) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/DomainGraphExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import io.archivesunleashed.matchbox.ExtractLinks 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.functions.desc 6 | import org.apache.spark.sql.{Dataset, Row} 7 | import org.archive.webservices.ars.aut.{AutLoader, AutUtil} 8 | import org.archive.webservices.ars.processing.jobs.shared.NetworkAutJob 9 | import org.archive.webservices.ars.util.{Common, HttpUtil, PublicSuffixUtil} 10 | import org.archive.webservices.sparkling.warc.WarcRecord 11 | 12 | import java.io.PrintStream 13 | 14 | object DomainGraphExtraction extends NetworkAutJob[((String, String, String), Long)] { 15 | val name = "Domain graph" 16 | val uuid = "01895067-417d-7665-ba60-a9bb9ca0aa3e" 17 | 18 | override val infoUrl = 19 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410738717588-ARCH-Network-datasets#domain-graph" 20 | 21 | val description = 22 | "Links between domains in the collection over time. Output: one CSV file with columns for crawl date, source, target, and count." 
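// In `prepareRecords` below, each outgoing link of a valid page becomes a
// (crawl date, source domain, target domain) triple: `AutUtil.timestamp(r).take(8)` keeps the
// YYYYMMDD day and both endpoints are reduced to their domain with the "www." prefix removed.
// `df` then sums identical triples, drops edges seen five times or fewer, and orders the rows by
// descending count.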
23 | 24 | val targetFile: String = "domain-graph.csv.gz" 25 | 26 | val srcDstFields: (String, String) = ("src_domain", "dest_domain") 27 | 28 | override def printToOutputStream(out: PrintStream): Unit = 29 | out.println("crawl_date, source, target, count") 30 | 31 | override def df(rdd: RDD[((String, String, String), Long)]): Dataset[Row] = { 32 | val rows = 33 | rdd 34 | .reduceByKey(_ + _) 35 | .filter(_._2 > 5) 36 | .map { case ((date, source, target), count) => 37 | Row(date, source, target, count) 38 | } 39 | AutLoader.domainGraph(rows).orderBy(desc("count")) 40 | } 41 | 42 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[((String, String, String), Long)] = { 43 | val publicSuffixes = PublicSuffixUtil.broadcast(rdd.context) 44 | rdd 45 | .flatMap { r => 46 | r.http.filter(AutUtil.validPage(r, _)).toIterator.flatMap { http => 47 | Common 48 | .tryOrElse(Seq.empty[((String, String, String), Long)]) { 49 | val url = AutUtil.url(r) 50 | AutUtil 51 | .extractLinks(ExtractLinks.apply, url, HttpUtil.bodyString(http.body, http)) 52 | .map { case (source, target, _) => 53 | ( 54 | AutUtil.extractDomainRemovePrefixWWW(source, publicSuffixes.value), 55 | AutUtil.extractDomainRemovePrefixWWW(target, publicSuffixes.value)) 56 | } 57 | .distinct 58 | .filter { case (s, t) => s != "" && t != "" } 59 | .map { case (source, target) => 60 | ((AutUtil.timestamp(r).take(8), source, target), 1L) 61 | } 62 | } 63 | .toIterator 64 | } 65 | } 66 | } 67 | 68 | override def edgeCounts(df: Dataset[Row]): RDD[((String, String), Long)] = { 69 | val (srcField, dstField) = srcDstFields 70 | df.rdd 71 | .flatMap { row => 72 | Common.tryOrElse[Option[((String, String), Long)]](None) { 73 | Some( 74 | ( 75 | (row.getAs[String](srcField), row.getAs[String](dstField)), 76 | row.getAs[Long]("count"))) 77 | } 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/ImageGraphExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import io.archivesunleashed.matchbox.ExtractImageLinks 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.{Dataset, Row} 6 | import org.archive.webservices.ars.aut.{AutLoader, AutUtil} 7 | import org.archive.webservices.ars.processing.jobs.shared.NetworkAutJob 8 | import org.archive.webservices.ars.util.{Common, HttpUtil} 9 | import org.archive.webservices.sparkling.warc.WarcRecord 10 | 11 | import java.io.PrintStream 12 | 13 | object ImageGraphExtraction extends NetworkAutJob[Row] { 14 | val name = "Image graph" 15 | val uuid = "01895067-92fb-739c-a99d-037fde1798a4" 16 | 17 | override val infoUrl = 18 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410738717588-ARCH-Network-datasets#image-graph" 19 | 20 | val description = 21 | "Timestamp, location, and any original description for each image file in the collection. Output: one CSV with columns for crawl date, source page, image file url, and alt text." 
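// Unlike the domain graph, nothing is aggregated here: `prepareRecords` below emits one row per
// image link found on a valid page, carrying the full crawl timestamp, the embedding page URL,
// the image URL, and its alt text, and `df` simply wraps those rows via `AutLoader.imageGraph`.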
22 | 23 | val targetFile: String = "image-graph.csv.gz" 24 | 25 | val srcDstFields: (String, String) = ("src", "image_url") 26 | 27 | override def printToOutputStream(out: PrintStream): Unit = 28 | out.println("crawl_date, source, url, alt_text") 29 | 30 | override def df(rdd: RDD[Row]): Dataset[Row] = AutLoader.imageGraph(rdd) 31 | 32 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = { 33 | rdd.flatMap { r => 34 | r.http.filter(AutUtil.validPage(r, _)).toIterator.flatMap { http => 35 | Common 36 | .tryOrElse(Seq.empty[Row]) { 37 | val url = AutUtil.url(r) 38 | AutUtil 39 | .extractLinks(ExtractImageLinks.apply, url, HttpUtil.bodyString(http.body, http)) 40 | .map { case (source, target, alt) => 41 | Row(AutUtil.timestamp(r), source, target, alt) 42 | } 43 | } 44 | .toIterator 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/ImageInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import io.archivesunleashed.matchbox.GetExtensionMIME 4 | import org.apache.commons.io.FilenameUtils 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.sql.{Dataset, Row} 7 | import org.archive.webservices.ars.aut.{AutLoader, AutUtil} 8 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 9 | import org.archive.webservices.sparkling.Sparkling.executionContext 10 | import org.archive.webservices.sparkling.http.HttpMessage 11 | import org.archive.webservices.sparkling.io.InputStreamForker 12 | import org.archive.webservices.sparkling.util.{Common, DigestUtil} 13 | import org.archive.webservices.sparkling.warc.WarcRecord 14 | 15 | import java.io.{InputStream, PrintStream} 16 | import java.net.URL 17 | import scala.concurrent.duration._ 18 | import scala.concurrent.{Await, Future} 19 | import scala.util.Try 20 | 21 | object ImageInformationExtraction extends BinaryInformationAutJob { 22 | val name = "Image file information" 23 | val uuid = "01895067-d598-7db8-88ad-46fed66e27f5" 24 | 25 | override val infoUrl = 26 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#image" 27 | 28 | val description = 29 | "Locations and metadata for JPEG, PNG, GIF, and other image formatted files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 
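// `prepareRecord` below reads each image payload in a single pass: `InputStreamForker` splits the
// body into three streams that are consumed concurrently as Futures to obtain the image
// dimensions, the MD5 digest, and the SHA-1 digest, while the remaining columns (filename,
// extension, MIME types, last-modified date) are derived from the URL and the HTTP headers.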
30 | 31 | val targetFile: String = "image-information.csv.gz" 32 | 33 | override def printToOutputStream(out: PrintStream): Unit = 34 | out.println( 35 | "crawl_date, last_modified_date, url, filename, extension, mime_type_web_server, mime_type_tika, width, height, md5, sha1") 36 | 37 | override def checkMime(url: String, server: String, tika: String): Boolean = 38 | tika.startsWith("image/") 39 | 40 | override def df(rdd: RDD[Row]): Dataset[Row] = AutLoader.images(rdd) 41 | 42 | override def prepareRecord(r: WarcRecord): Option[Row] = 43 | prepareBinaryRow( 44 | r, 45 | ( 46 | url: String, 47 | http: HttpMessage, 48 | body: InputStream, 49 | tikaMime: String, 50 | crawlDate: String, 51 | lastModifiedDate: String) => { 52 | val forker = InputStreamForker(body) 53 | val Array(imageIn, md5In, sha1In) = forker.fork(3).map(Future(_)) 54 | val Seq((width: Int, height: Int), md5: String, sha1: String) = 55 | try { 56 | Await.result( 57 | Future.sequence( 58 | Seq( 59 | imageIn.map(in => Common.cleanup(AutUtil.computeImageSize(in))(in.close)), 60 | md5In.map(DigestUtil.md5Hex), 61 | sha1In.map(DigestUtil.sha1Hex))), 62 | Duration.Inf) 63 | } finally { 64 | for (s <- imageIn) Try(s.close()) 65 | for (s <- md5In) Try(s.close()) 66 | for (s <- sha1In) Try(s.close()) 67 | Try(body.close()) 68 | } 69 | 70 | val jUrl = new URL(url) 71 | val filename = FilenameUtils.getName(jUrl.getPath) 72 | val extension = GetExtensionMIME(jUrl.getPath, tikaMime) 73 | val lastModifiedDate = 74 | AutUtil.rfc1123toTime14(http.headerMap.get("last-modified").getOrElse("")) 75 | 76 | Row( 77 | crawlDate, 78 | lastModifiedDate, 79 | url, 80 | filename, 81 | extension, 82 | AutUtil.mime(http), 83 | tikaMime, 84 | width, 85 | height, 86 | md5, 87 | sha1) 88 | }) 89 | 90 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 91 | } 92 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/PdfInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | object PdfInformationExtraction extends BinaryInformationAutJob { 9 | val name = "PDF file information" 10 | val uuid = "01895068-3e02-72cb-b0d9-4e1bacc42c37" 11 | 12 | override val infoUrl = 13 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#pdf" 14 | 15 | val description = 16 | "Locations and metadata for Portable Document Format (PDF) files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 
17 | 18 | val targetFile: String = "pdf-information.csv.gz" 19 | 20 | override def checkMime(url: String, server: String, tika: String): Boolean = 21 | server == "application/pdf" // not `tika == `, which we had before, but also matches Adobe Illustrator and PostScript 22 | 23 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/PresentationProgramInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | object PresentationProgramInformationExtraction extends BinaryInformationAutJob { 9 | val name = "Presentation file information" 10 | val uuid = "01895068-a576-7a00-b4dd-2d5650bc69ab" 11 | 12 | override val infoUrl = 13 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#presentation" 14 | 15 | val description = 16 | "Locations and metadata for PowerPoint, Keynote, and other presentation formatted files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 17 | 18 | val targetFile: String = "powerpoint-information.csv.gz" 19 | 20 | val PresentationMimeTypes: Set[String] = Set( 21 | "application/vnd.apple.keynote", 22 | "application/vnd.ms-powerpoint", 23 | "application/vnd.openxmlformats-officedocument.presentationml.presentation", 24 | "application/vnd.oasis.opendocument.presentation", 25 | "application/vnd.oasis.opendocument.presentation-template", 26 | "application/vnd.sun.xml.impress", 27 | "application/vnd.sun.xml.impress.template", 28 | "application/vnd.stardivision.impress", 29 | "application/x-starimpress", 30 | "application/vnd.ms-powerpoint.addin.macroEnabled.12", 31 | "application/vnd.ms-powerpoint.presentation.macroEnabled.12", 32 | "application/vnd.ms-powerpoint.slide.macroEnabled.12", 33 | "application/vnd.ms-powerpoint.slideshow.macroEnabled.12", 34 | "application/vnd.ms-powerpoint.template.macroEnabled.12") 35 | 36 | override def checkMime(url: String, server: String, tika: String): Boolean = 37 | PresentationMimeTypes.contains(tika) 38 | 39 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/SpreadsheetInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | object SpreadsheetInformationExtraction extends BinaryInformationAutJob { 9 | val name = "Spreadsheet file information" 10 | val uuid = "01895069-192a-74f8-84a9-b14f20c20f89" 11 | 12 | override val infoUrl = 13 | 
"https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#spreadsheet" 14 | 15 | val description = 16 | "Locations and metadata for CSV, XLS, ODS, and other spreadsheet formatted files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 17 | 18 | val targetFile: String = "spreadsheet-information.csv.gz" 19 | 20 | val SpreadsheetMimeTypes: Set[String] = Set( 21 | " application/vnd.apple.numbers", 22 | "application/vnd.ms-excel", 23 | "application/vnd.ms-excel.workspace.3", 24 | "application/vnd.ms-excel.workspace.4", 25 | "application/vnd.ms-excel.sheet.2", 26 | "application/vnd.ms-excel.sheet.3", 27 | "application/vnd.ms-excel.sheet.3", 28 | "application/vnd.ms-excel.addin.macroenabled.12", 29 | "application/vnd.ms-excel.sheet.binary.macroenabled.12", 30 | "application/vnd.ms-excel.sheet.macroenabled.12", 31 | "application/vnd.ms-excel.template.macroenabled.12", 32 | "application/vnd.ms-spreadsheetml", 33 | "application/vnd.openxmlformats-officedocument.spreadsheetml.template", 34 | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 35 | "application/x-vnd.oasis.opendocument.spreadsheet-template", 36 | "application/vnd.oasis.opendocument.spreadsheet-template", 37 | "application/vnd.oasis.opendocument.spreadsheet", 38 | "application/x-vnd.oasis.opendocument.spreadsheet", 39 | "application/x-tika-msworks-spreadsheet", 40 | "application/vnd.lotus-1-2-3", 41 | "text/csv", 42 | "text/tab-separated-values") 43 | 44 | override def checkMime(url: String, server: String, tika: String): Boolean = 45 | SpreadsheetMimeTypes.contains( 46 | tika) || server == "text/csv" || server == "text/tab-separated-values" || ((url.toLowerCase 47 | .endsWith(".csv") || url.toLowerCase.endsWith(".tsv")) && tika == "text/plain") 48 | 49 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/VideoInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | object VideoInformationExtraction extends BinaryInformationAutJob { 9 | val name = "Video file information" 10 | val uuid = "01895069-a9fa-734c-b669-fcf528f85c1e" 11 | 12 | override val infoUrl = 13 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#video" 14 | 15 | val description = 16 | "Locations and metadata for MP4, MOV, AVI, and other video formatted files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 
17 | 18 | val targetFile: String = "video-information.csv.gz" 19 | 20 | override def checkMime(url: String, server: String, tika: String): Boolean = 21 | tika.startsWith("video/") 22 | 23 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/WebGraphExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import io.archivesunleashed.matchbox.ExtractLinks 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.{Dataset, Row} 6 | import org.archive.webservices.ars.aut.{AutLoader, AutUtil} 7 | import org.archive.webservices.ars.processing.jobs.shared.NetworkAutJob 8 | import org.archive.webservices.ars.util.{Common, HttpUtil} 9 | import org.archive.webservices.sparkling.warc.WarcRecord 10 | 11 | import java.io.PrintStream 12 | 13 | object WebGraphExtraction extends NetworkAutJob[Row] { 14 | val name = "Web graph" 15 | val uuid = "01895069-e74c-79de-8292-effb45265179" 16 | 17 | override val infoUrl = 18 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410738717588-ARCH-Network-datasets#web-graph" 19 | 20 | val description = 21 | "Links between all documents in the collection over time and any descriptive anchor text associated with them. Output: one CSV file with columns for crawl date, source, target, and anchor text." 22 | 23 | val targetFile: String = "web-graph.csv.gz" 24 | 25 | val srcDstFields: (String, String) = ("src", "dest") 26 | 27 | override def printToOutputStream(out: PrintStream): Unit = 28 | out.println("crawl_date, source, target, anchor_text") 29 | 30 | override def df(rdd: RDD[Row]): Dataset[Row] = AutLoader.webGraph(rdd) 31 | 32 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = { 33 | rdd.flatMap { r => 34 | r.http.filter(AutUtil.validPage(r, _)).toIterator.flatMap { http => 35 | Common 36 | .tryOrElse(Seq.empty[Row]) { 37 | val url = AutUtil.url(r) 38 | AutUtil 39 | .extractLinks(ExtractLinks.apply, url, HttpUtil.bodyString(http.body, http)) 40 | .map { case (source, target, alt) => 41 | Row(AutUtil.timestamp(r), source, target, alt) 42 | } 43 | } 44 | .toIterator 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/WebPagesExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import io.archivesunleashed.matchbox.{DetectLanguage, RemoveHTML} 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.{Dataset, Row} 6 | import org.archive.webservices.ars.aut.{AutLoader, AutUtil} 7 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 8 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 9 | import org.archive.webservices.ars.util.{HttpUtil, PublicSuffixUtil} 10 | import org.archive.webservices.sparkling.http.HttpMessage 11 | import org.archive.webservices.sparkling.warc.WarcRecord 12 | 13 | import java.io.{InputStream, PrintStream} 14 | 15 | object WebPagesExtraction extends BinaryInformationAutJob { 16 | val name = "Plain text of webpages" 17 | val uuid = "0189506a-46f3-7d73-9dcf-a8fce59c50cc" 18 | 19 | override val category: ArchJobCategory = 
ArchJobCategories.Text 20 | 21 | override val infoUrl = 22 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410760790164-ARCH-Text-datasets#webpages" 23 | 24 | val description = 25 | "Location, technical metadata, and extracted full text contents of each text-bearing document in the collection. Output: one CSV file with columns for crawl date, last modified date, domain, URL, MIME type as reported by the web server and as detected by Apache TIKA, and content." 26 | 27 | val targetFile: String = "web-pages.csv.gz" 28 | 29 | override def printToOutputStream(out: PrintStream): Unit = 30 | out.println( 31 | "crawl_date, last_modified_date, domain,url, mime_type_web_server, mime_type_tika, language, content") 32 | 33 | override def checkMime(url: String, server: String, tika: String): Boolean = 34 | AutUtil.checkPageMime(url, server) 35 | 36 | override def df(rdd: RDD[Row]): Dataset[Row] = AutLoader.webpages(rdd) 37 | 38 | override def prepareRecord(r: WarcRecord): Option[Row] = 39 | throw new RuntimeException( 40 | "This method should not be called in WebPagesExtraction, see #prepareRecords") 41 | 42 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = { 43 | val publicSuffixes = PublicSuffixUtil.broadcast(rdd.context) 44 | rdd.flatMap { r => 45 | prepareBinaryRow( 46 | r, 47 | ( 48 | url: String, 49 | http: HttpMessage, 50 | body: InputStream, 51 | tikaMime: String, 52 | crawlDate: String, 53 | lastModifiedDate: String) => { 54 | val bodyString = HttpUtil.bodyString(body, http) 55 | val content = RemoveHTML(bodyString) 56 | Row( 57 | crawlDate, 58 | lastModifiedDate, 59 | AutUtil.extractDomainRemovePrefixWWW(url, publicSuffixes.value), 60 | url, 61 | AutUtil.mime(http), 62 | tikaMime, 63 | DetectLanguage(content), 64 | content) 65 | }) 66 | } 67 | } 68 | 69 | override val templateName: Option[String] = Some("jobs/DefaultAutJob") 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/WordProcessorInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | object WordProcessorInformationExtraction extends BinaryInformationAutJob { 9 | val name = "Word processing file information" 10 | val uuid = "0189506a-d09d-7571-9d3c-a44698d58d39" 11 | 12 | override val infoUrl = 13 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#word" 14 | 15 | val description = 16 | "Locations and metadata for DOC, RTF, ODT, and other word processing files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 
17 | 18 | val targetFile: String = "word-document-information.csv.gz" 19 | 20 | val WordProcessorMimeTypes: Set[String] = Set( 21 | "application/vnd.lotus-wordpro", 22 | "application/vnd.kde.kword", 23 | "application/vnd.ms-word.document.macroEnabled.12", 24 | "application/vnd.ms-word.template.macroEnabled.12", 25 | "application/vnd.oasis.opendocument.text", 26 | "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml", 27 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 28 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml", 29 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml", 30 | "application/vnd.wordperfect", 31 | "application/wordperfect5.1", 32 | "application/msword", 33 | "application/vnd.ms-word.document.macroEnabled.12", 34 | "application/vnd.ms-word.template.macroEnabled.12", 35 | "application/vnd.apple.pages", 36 | "application/macwriteii", 37 | "application/vnd.ms-works", 38 | "application/rtf") 39 | 40 | override def checkMime(url: String, server: String, tika: String): Boolean = 41 | WordProcessorMimeTypes.contains(tika) 42 | 43 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/AiJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.processing.DerivationJobConf 5 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchiveSparkEnrichJob} 6 | import org.archive.webservices.sparkling.io.StageSyncManager 7 | 8 | abstract class AiJob extends ArchiveSparkEnrichJob { 9 | // override def maxInputSize: Int = 5000 // limit the input size to avoid extraordinarily long jobs 10 | 11 | override def enrich( 12 | rdd: RDD[ArchEnrichRoot[_]], 13 | conf: DerivationJobConf): RDD[ArchEnrichRoot[_]] = { 14 | StageSyncManager.sync(super.enrich(rdd, conf)) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/ArchiveSparkFlexJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark 2 | 3 | import io.circe.{HCursor, Json} 4 | import org.archive.webservices.archivespark.model.EnrichFunc 5 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 6 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord} 7 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.{ArchArchiveSparkFunctionAdapter, EntitiesAdapter} 8 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.{Whisper, WhisperText} 9 | import org.archive.webservices.ars.processing.{DerivationJobConf, DerivationJobParameters} 10 | import org.archive.webservices.sparkling.util.StringUtil 11 | 12 | object ArchiveSparkFlexJob extends AiJob { 13 | val uuid: String = "018f52cc-d917-71ac-9e64-19fb219114a4" 14 | 15 | val name: String = id 16 | val description: String = "ArchiveSpark flex job " 17 | val category: ArchJobCategory = ArchJobCategories.None 18 | 19 | override def 
genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 20 | val mime = conf.params.values 21 | .get("mime") 22 | .toSeq 23 | .flatMap { mime => 24 | if (mime.isString) mime.asString.toSeq 25 | else if (mime.isArray) mime.asArray.toSeq.flatMap(_.flatMap(_.asString)) 26 | else Seq.empty 27 | } 28 | .toSet 29 | if (mime.isEmpty) { 30 | super.genericPredicate(conf) 31 | } else { record => 32 | mime.contains(record.mime) || mime.contains(StringUtil.prefixToSeparator(record.mime, "/")) 33 | } 34 | } 35 | 36 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 37 | val superFilter = super.warcPredicate(conf) 38 | val status = conf.params.values.get("status").toSeq.flatMap { status => 39 | if (status.isNumber) status.asNumber.flatMap(_.toInt).toSeq 40 | else if (status.isArray) 41 | status.asArray.toSeq.flatMap(_.flatMap(_.asNumber).flatMap(_.toInt)) 42 | else Seq.empty 43 | } 44 | if (status.isEmpty) { 45 | superFilter 46 | } else { warc => 47 | superFilter(warc) && { 48 | status.exists { s => 49 | warc.status == s || (s < 100 && (warc.status / 10 == s || (s < 10 && warc.status / 100 == s))) 50 | } 51 | } 52 | } 53 | } 54 | 55 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 56 | conf.params.values.get("functions").toSeq.flatMap(_.asArray.toSeq.flatten).map { function => 57 | ArchiveSparkFlexJob.initFunction(function) 58 | } 59 | } 60 | 61 | val adapters: Map[String, ArchArchiveSparkFunctionAdapter[_]] = 62 | Seq(EntitiesAdapter, Whisper, WhisperText).flatMap { adapter => 63 | Iterator(adapter.name -> adapter, adapter.name.toLowerCase -> adapter) 64 | }.toMap 65 | 66 | private def initFunction[A]( 67 | func: ArchArchiveSparkFunctionAdapter[A], 68 | cursor: HCursor): EnrichFunc[ArchEnrichRoot[_], A, _] = { 69 | val dependency = 70 | cursor.downField("on").focus.map(initFunction).flatMap(func.toDependencyPointer) 71 | cursor.downField("params").focus.flatMap(DerivationJobParameters.fromJson) match { 72 | case Some(params) => func.withParams(params, on = dependency) 73 | case None => func.noParams(on = dependency) 74 | } 75 | } 76 | 77 | def initFunction(definition: Json): EnrichFunc[ArchEnrichRoot[_], _, _] = { 78 | if (definition.isString) { 79 | adapters.get(definition.asString.get).map(_.noParams) 80 | } else if (definition.isObject) { 81 | val cursor = definition.hcursor 82 | cursor.get[String]("name").toOption.flatMap { name => 83 | adapters.get(name).map(initFunction(_, cursor)) 84 | } 85 | } else None 86 | }.getOrElse { 87 | throw new UnsupportedOperationException() 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/ArchiveSparkNoop.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.DerivationJobConf 6 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchiveSparkBaseJob} 7 | 8 | object ArchiveSparkNoop extends ArchiveSparkBaseJob { 9 | val name: String = id 10 | val uuid: String = "018d1cef-c91d-7d51-9cf4-05fe51900321" 11 | val description: String = 12 | "An ArchiveSpark job that does nothing. 
Output: records turned into ArchiveSpark JSON format without any enrichment function applied." 13 | val category: ArchJobCategory = ArchJobCategories.None 14 | 15 | override def enrich( 16 | rdd: RDD[ArchEnrichRoot[_]], 17 | conf: DerivationJobConf): RDD[ArchEnrichRoot[_]] = rdd 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/ArchEnrichRoot.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.archive.webservices.archivespark.model.TypedEnrichRoot 4 | import org.archive.webservices.archivespark.model.dataloads.{ByteLoad, TextLoad} 5 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 6 | 7 | trait ArchEnrichRoot[+Meta] 8 | extends TypedEnrichRoot[Meta] 9 | with FileLoad.Root 10 | with ByteLoad.Root 11 | with TextLoad.Root 12 | with PlainTextLoad.Root 13 | with LocalFileCache { 14 | def mime: String 15 | def meta: FileMetaData 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/ArchFileRecord.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import io.circe.Json 4 | import org.archive.webservices.archivespark.functions.StringContent 5 | import org.archive.webservices.archivespark.model.EnrichRootCompanion 6 | import org.archive.webservices.archivespark.model.dataloads.{ByteLoad, DataLoad, TextLoad} 7 | import org.archive.webservices.archivespark.model.pointers.FieldPointer 8 | import org.archive.webservices.archivespark.util.Json.json 9 | import org.archive.webservices.ars.model.collections.inputspecs.FileRecord 10 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 11 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.{ArchFileBytes, ArchFileCache} 12 | import org.archive.webservices.sparkling.io.IOUtil 13 | import org.archive.webservices.sparkling.logging.{Log, LogContext} 14 | 15 | import java.io.{File, InputStream} 16 | import scala.collection.immutable.ListMap 17 | import scala.util.Try 18 | 19 | class ArchFileRecord(record: FileRecord) extends ArchEnrichRoot[FileRecord] { 20 | implicit private val logContext: LogContext = LogContext(this) 21 | 22 | override def companion: EnrichRootCompanion[ArchFileRecord] = ArchFileRecord 23 | override def get: FileRecord = record 24 | 25 | override def metaToJson: Json = { 26 | json( 27 | ListMap[String, Any]( 28 | "filename" -> record.filename, 29 | "mime" -> Try(record.mime).fold("Error: " + _.getMessage, identity), 30 | "path" -> Try(record.path).fold("Error: " + _.getMessage, identity))) 31 | } 32 | 33 | def mime: String = record.mime 34 | 35 | def meta: FileMetaData = record.meta 36 | 37 | override def payloadAccess: InputStream = { 38 | Log.info(s"Accessing ${record.filename}...") 39 | IOUtil.supportMark(record.access) 40 | } 41 | 42 | override def cacheLocal(): File = { 43 | Log.info(s"Caching ${record.filename}...") 44 | super.cacheLocal() 45 | } 46 | } 47 | 48 | object ArchFileRecord extends EnrichRootCompanion[ArchFileRecord] { 49 | override def dataLoad[T](load: DataLoad[T]): Option[FieldPointer[ArchFileRecord, T]] = 50 | (load match { 51 | case FileLoad => 
Some(ArchFileCache) 52 | case ByteLoad => Some(ArchFileBytes) 53 | case TextLoad | PlainTextLoad => Some(StringContent) 54 | case _ => None 55 | }).map(_.asInstanceOf[FieldPointer[ArchFileRecord, T]]) 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/ArchFileSpec.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD 5 | import org.archive.webservices.archivespark.dataspecs.DataSpec 6 | import org.archive.webservices.ars.model.collections.inputspecs.FileRecord 7 | 8 | class ArchFileSpec(@transient val rdd: RDD[FileRecord]) 9 | extends DataSpec[FileRecord, ArchFileRecord] { 10 | override def load(sc: SparkContext, minPartitions: Int): RDD[FileRecord] = rdd 11 | override def parse(file: FileRecord): Option[ArchFileRecord] = Some(new ArchFileRecord(file)) 12 | } 13 | 14 | object ArchFileSpec { 15 | def apply(rdd: RDD[FileRecord]) = new ArchFileSpec(rdd) 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/ArchWarcRecord.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import io.circe.Json 4 | import org.archive.webservices.archivespark.functions.{HtmlText, StringContent} 5 | import org.archive.webservices.archivespark.model.EnrichRootCompanion 6 | import org.archive.webservices.archivespark.model.dataloads.{ByteLoad, DataLoad, TextLoad} 7 | import org.archive.webservices.archivespark.model.pointers.FieldPointer 8 | import org.archive.webservices.archivespark.specific.warc.WarcLikeRecord 9 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 10 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.{ArchFileCache, ArchWarcPayload} 11 | import org.archive.webservices.sparkling.cdx.CdxRecord 12 | import org.archive.webservices.sparkling.io.IOUtil 13 | import org.archive.webservices.sparkling.logging.{Log, LogContext} 14 | import org.archive.webservices.sparkling.warc.WarcRecord 15 | 16 | import java.io.{File, InputStream} 17 | 18 | class ArchWarcRecord(val warc: WarcRecord) extends ArchEnrichRoot[CdxRecord] with WarcLikeRecord { 19 | implicit private val logContext: LogContext = LogContext(this) 20 | 21 | override def companion: EnrichRootCompanion[ArchWarcRecord] = ArchWarcRecord 22 | 23 | override lazy val get: CdxRecord = { 24 | warc.toCdx(0L, handleRevisits = true, handleOthers = true).get 25 | } 26 | 27 | def mime: String = warc.http.flatMap(_.mime).getOrElse("/") 28 | 29 | override lazy val meta: FileMetaData = FileMetaData.fromCdx(get) 30 | 31 | override def metaToJson: Json = meta.toJson 32 | 33 | override def payloadAccess: InputStream = IOUtil.supportMark(warc.http.map(_.body).getOrElse(warc.payload)) 34 | 35 | override def cacheLocal(): File = { 36 | Log.info(s"Caching ${warc.url.getOrElse("N/A")}...") 37 | super.cacheLocal() 38 | } 39 | } 40 | 41 | object ArchWarcRecord extends EnrichRootCompanion[ArchWarcRecord] { 42 | override def dataLoad[T](load: DataLoad[T]): Option[FieldPointer[ArchWarcRecord, T]] = { 43 | (load match { 44 | case FileLoad => Some(ArchFileCache) 45 | case ByteLoad => 
Some(ArchWarcPayload) 46 | case TextLoad => Some(StringContent) 47 | case PlainTextLoad => Some(HtmlText) 48 | case _ => None 49 | }).map(_.asInstanceOf[FieldPointer[ArchWarcRecord, T]]) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/ArchWarcSpec.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD 5 | import org.archive.webservices.archivespark.dataspecs.DataSpec 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | class ArchWarcSpec(@transient val rdd: RDD[WarcRecord]) 9 | extends DataSpec[WarcRecord, ArchWarcRecord] { 10 | override def load(sc: SparkContext, minPartitions: Int): RDD[WarcRecord] = rdd 11 | override def parse(warc: WarcRecord): Option[ArchWarcRecord] = Some(new ArchWarcRecord(warc)) 12 | } 13 | 14 | object ArchWarcSpec { 15 | def apply(rdd: RDD[WarcRecord]) = new ArchWarcSpec(rdd) 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/ArchiveSparkEnrichJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.archivespark.model.EnrichFunc 5 | import org.archive.webservices.archivespark.model.dataloads.ByteLoad 6 | import org.archive.webservices.archivespark.model.pointers.DataLoadPointer 7 | import org.archive.webservices.archivespark.util.Bytes 8 | import org.archive.webservices.ars.processing.DerivationJobConf 9 | 10 | abstract class ArchiveSparkEnrichJob extends ArchiveSparkBaseJob { 11 | def byteLoad: DataLoadPointer[ArchEnrichRoot[_], Bytes] = 12 | ArchiveSparkEnrichJob.byteLoad 13 | 14 | def fileLoad: DataLoadPointer[ArchEnrichRoot[_], String] = 15 | ArchiveSparkEnrichJob.fileLoad 16 | 17 | def plainTextLoad: DataLoadPointer[ArchEnrichRoot[_], String] = 18 | ArchiveSparkEnrichJob.plainTextLoad 19 | 20 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] 21 | 22 | override def enrich( 23 | rdd: RDD[ArchEnrichRoot[_]], 24 | conf: DerivationJobConf): RDD[ArchEnrichRoot[_]] = { 25 | val funcs = functions(conf) 26 | var enriched = if (funcs.length <= 1) rdd else { 27 | val (longest, longestPath) = funcs.map(f => (f, f.dependencyPath.toSet)).maxBy(_._2.size) 28 | if (funcs.exists(f => f != longest && !longestPath.contains(f))) rdd.map { r => 29 | r.cacheEnabled = true 30 | r 31 | } else rdd 32 | } 33 | for (func <- funcs) enriched = enriched.enrich(func) 34 | enriched.map { r => 35 | r.clearCache() 36 | r 37 | } 38 | } 39 | } 40 | 41 | object ArchiveSparkEnrichJob { 42 | val byteLoad: DataLoadPointer[ArchEnrichRoot[_], Bytes] = 43 | DataLoadPointer(ByteLoad) 44 | 45 | val fileLoad: DataLoadPointer[ArchEnrichRoot[_], String] = 46 | DataLoadPointer(FileLoad) 47 | 48 | val plainTextLoad: DataLoadPointer[ArchEnrichRoot[_], String] = 49 | DataLoadPointer(PlainTextLoad) 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/FileLoad.scala: 
-------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.archive.webservices.archivespark.model.dataloads.DataLoad 4 | 5 | object FileLoad extends DataLoad[String] { 6 | trait Root extends DataLoadRoot 7 | } 8 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/LocalFileCache.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.apache.commons.io.input.BoundedInputStream 4 | import org.archive.webservices.archivespark.util.Bytes 5 | import org.archive.webservices.sparkling._ 6 | import org.archive.webservices.sparkling.io.IOUtil 7 | import org.archive.webservices.sparkling.logging.{Log, LogContext} 8 | 9 | import java.io.{BufferedInputStream, File, FileInputStream, InputStream} 10 | import scala.util.Try 11 | 12 | object LocalFileCache { 13 | val MaxMemoryCacheSize: Long = 1.mb 14 | } 15 | 16 | trait LocalFileCache { 17 | implicit private val logContext: LogContext = LogContext(this) 18 | 19 | @transient private var _memoryCache: Option[Array[Byte]] = None 20 | @transient private var _localCacheFile: Option[File] = None 21 | 22 | @transient var cacheEnabled = false 23 | 24 | def isLocalCached: Boolean = _localCacheFile.isDefined 25 | 26 | def localCacheFile: Option[File] = _localCacheFile 27 | 28 | private def cacheLocal(in: => Option[InputStream]): File = _localCacheFile.getOrElse { 29 | synchronized(_localCacheFile.getOrElse { 30 | val file = IOUtil.tmpFile 31 | Log.info(s"Caching to ${file.getPath}...") 32 | val out = IOUtil.fileOut(file) 33 | try { 34 | for (bytes <- _memoryCache) { 35 | out.write(bytes) 36 | _memoryCache = None 37 | } 38 | for (s <- in) { 39 | try { 40 | IOUtil.copy(s, out) 41 | } finally { 42 | s.close() 43 | } 44 | } 45 | } finally out.close() 46 | _localCacheFile = Some(file) 47 | Log.info(s"Cached ${file.getPath}.") 48 | file 49 | }) 50 | } 51 | 52 | def cacheLocal(): File = cacheLocal(if (_memoryCache.isDefined) None else Some(payloadAccess)) 53 | 54 | def clearCache(): Unit = if (_localCacheFile.isDefined || _memoryCache.isDefined) synchronized { 55 | for (file <- _localCacheFile) file.delete() 56 | _localCacheFile = None 57 | _memoryCache = None 58 | } 59 | 60 | def localFileCache: Option[InputStream] = _localCacheFile.map { file => 61 | new BufferedInputStream(new FileInputStream(file)) 62 | } 63 | 64 | def cachePayload(): Unit = if (_memoryCache.isEmpty && _localCacheFile.isEmpty) { 65 | synchronized { 66 | if (_memoryCache.isEmpty && _localCacheFile.isEmpty) { 67 | val in = payloadAccess 68 | try { 69 | val bounded = new BoundedInputStream(in, LocalFileCache.MaxMemoryCacheSize + 1) 70 | val array = IOUtil.bytes(bounded) 71 | _memoryCache = Some(array) 72 | if (array.length > LocalFileCache.MaxMemoryCacheSize) cacheLocal(Some(in)) 73 | } catch { 74 | case e: Exception => 75 | // skip if payload can't be read, e.g. 
malformed HTTP stream / decoding error 76 | Log.error(e.getMessage) 77 | } finally { 78 | Try(in.close()) 79 | } 80 | } 81 | } 82 | } 83 | 84 | def cachedPayload: Bytes = Bytes.either({ 85 | if (cacheEnabled) cachePayload() 86 | _memoryCache.map(Left(_)).getOrElse { 87 | _localCacheFile.map(file => Right(new FileInputStream(file))).getOrElse { 88 | Right(payloadAccess) 89 | } 90 | } 91 | }) 92 | 93 | def payloadAccess: InputStream 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/PlainTextLoad.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.archive.webservices.archivespark.model.dataloads.DataLoad 4 | 5 | object PlainTextLoad extends DataLoad[String] { 6 | trait Root extends DataLoadRoot 7 | } 8 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/ArchFileBytes.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import org.archive.webservices.archivespark.model.pointers.FieldPointer 4 | import org.archive.webservices.archivespark.model.{Derivatives, EnrichFunc, TypedEnrichRoot, TypedEnrichable} 5 | import org.archive.webservices.archivespark.util.Bytes 6 | import org.archive.webservices.ars.model.collections.inputspecs.FileRecord 7 | import org.archive.webservices.ars.processing.jobs.archivespark.base.LocalFileCache 8 | import org.archive.webservices.sparkling._ 9 | 10 | import scala.util.Try 11 | 12 | object ArchFileBytes 13 | extends EnrichFunc[TypedEnrichRoot[FileRecord] with LocalFileCache, FileRecord, Bytes] { 14 | val MaxContentLength: Long = 1.mb 15 | 16 | val source: FieldPointer[TypedEnrichRoot[FileRecord] with LocalFileCache, FileRecord] = 17 | FieldPointer.root[TypedEnrichRoot[FileRecord] with LocalFileCache, FileRecord] 18 | 19 | val fields: Seq[String] = Seq("bytes") 20 | 21 | override def derive(source: TypedEnrichable[FileRecord], derivatives: Derivatives): Unit = { 22 | derivatives << Try(source.asInstanceOf[LocalFileCache].cachedPayload).toOption 23 | .getOrElse(Bytes(source.get.access)) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/ArchFileCache.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import org.archive.webservices.archivespark.model.pointers.FieldPointer 4 | import org.archive.webservices.archivespark.model.{Derivatives, EnrichFunc, EnrichRoot, TypedEnrichable} 5 | import org.archive.webservices.ars.processing.jobs.archivespark.base.LocalFileCache 6 | 7 | object ArchFileCache extends EnrichFunc[EnrichRoot with LocalFileCache, Any, String] { 8 | val source: FieldPointer[EnrichRoot with LocalFileCache, Any] = FieldPointer(Seq.empty) 9 | val fields: Seq[String] = Seq("filePath") 10 | override val isTransparent: Boolean = true 11 | override def derive(source: TypedEnrichable[Any], derivatives: Derivatives): Unit = { 12 | derivatives << source.asInstanceOf[LocalFileCache].cacheLocal().getPath 13 | } 14 | } 15 |
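A minimal usage sketch (not part of the repository) of the caching pieces above: LocalFileCache keeps payloads up to MaxMemoryCacheSize (1 MB) in memory and spills anything larger to a temp file, ArchFileCache derives that temp-file path as the "filePath" field, and ArchFileBytes derives the cached payload as the "bytes" field. The ToyRecord class and the 2 MB dummy payload below are assumptions for illustration only.

// Illustrative sketch, not repository code: a toy record mixing in LocalFileCache
// to show the memory-then-temp-file caching behaviour the enrich functions rely on.
import java.io.{ByteArrayInputStream, InputStream}
import org.archive.webservices.ars.processing.jobs.archivespark.base.LocalFileCache

object LocalFileCacheSketch extends App {
  class ToyRecord(bytes: Array[Byte]) extends LocalFileCache {
    override def payloadAccess: InputStream = new ByteArrayInputStream(bytes)
  }

  val record = new ToyRecord(Array.fill(2 * 1024 * 1024)(0: Byte)) // ~2 MB, above the 1 MB in-memory limit
  record.cacheEnabled = true
  record.cachePayload()                // too large for the memory cache, so it spills to a temp file
  println(record.isLocalCached)        // true: the payload now lives in a local temp file
  println(record.cacheLocal().getPath) // the path ArchFileCache would expose as "filePath"
  record.clearCache()                  // deletes the temp file and drops any in-memory copy
}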
-------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/ArchWarcPayload.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import org.archive.webservices.archivespark.model.pointers.FieldPointer 4 | import org.archive.webservices.archivespark.model.{Derivatives, EnrichFunc, TypedEnrichable} 5 | import org.archive.webservices.archivespark.specific.warc.functions._ 6 | import org.archive.webservices.archivespark.util.Bytes 7 | import org.archive.webservices.ars.processing.jobs.archivespark.base.ArchWarcRecord 8 | import org.archive.webservices.sparkling.cdx.CdxRecord 9 | 10 | class ArchWarcPayload private (http: Boolean = true) 11 | extends EnrichFunc[ArchWarcRecord, CdxRecord, Bytes] { 12 | import WarcPayloadFields._ 13 | 14 | val source: FieldPointer[ArchWarcRecord, CdxRecord] = 15 | FieldPointer.root[ArchWarcRecord, CdxRecord] 16 | 17 | val fields: Seq[String] = { 18 | if (http) Seq(RecordHeader, HttpStatusLine, HttpHeader, Payload) 19 | else Seq(RecordHeader, Payload) 20 | } 21 | 22 | override val defaultField: String = Payload 23 | 24 | override def derive(source: TypedEnrichable[CdxRecord], derivatives: Derivatives): Unit = { 25 | val record = source.asInstanceOf[ArchWarcRecord] 26 | val warc = record.warc 27 | derivatives << warc.headers.toMap 28 | if (http) { 29 | for (msg <- warc.http) { 30 | derivatives << msg.statusLine 31 | derivatives << msg.headers 32 | derivatives << record.cachedPayload 33 | } 34 | } else { 35 | derivatives << record.cachedPayload 36 | } 37 | } 38 | } 39 | 40 | object ArchWarcPayload extends ArchWarcPayload(http = true) { 41 | def apply(http: Boolean = true) = new ArchWarcPayload(http) 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/CoreNlpEntities.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import org.archive.webservices.archivespark.functions.{Entities, EntitiesConstants} 4 | import org.archive.webservices.archivespark.model.{EnrichFunc, EnrichRoot} 5 | import org.archive.webservices.ars.processing.jobs.archivespark.base.LocalFileCache 6 | import org.archive.webservices.sparkling.io.StageSyncManager 7 | import org.archive.webservices.sparkling.util.IteratorUtil 8 | 9 | import java.util.Properties 10 | 11 | class CoreNlpEntities( 12 | properties: Properties = EntitiesConstants.DefaultProps, 13 | filterLatin: Boolean = false) extends Entities(properties, filterLatin = filterLatin) { 14 | override def initPartition(partition: Iterator[EnrichRoot]): Iterator[EnrichRoot] = { 15 | StageSyncManager.lockMutex() 16 | super.initPartition(partition) 17 | } 18 | 19 | override def cleanup(): Unit = StageSyncManager.unlockMutex() 20 | 21 | override def enrichPartition[R <: EnrichRoot](partition: Iterator[R], func: EnrichFunc[R, _, _]): Iterator[R] = { 22 | IteratorUtil.preload(partition.map { r => 23 | r.asInstanceOf[LocalFileCache].cachePayload() 24 | r 25 | }, numPreload = 50, parallelism = 2)(func.enrich) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- 
/src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/TrOCR.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.CondaBasedArchiveSparkFunctionAdapter 4 | 5 | object TrOCR extends CondaBasedArchiveSparkFunctionAdapter[String] { 6 | override def func: CondaBasedFunction[String] = new TrOCR 7 | } 8 | 9 | class TrOCR extends CondaBasedFunction[String] { 10 | override val label: String = "trocr" 11 | override val dataDir: String = s"$label/20240807195100" 12 | override val condaEnv: String = s"conda-$label-env" 13 | override val pythonFile: String = s"$label-run.py" 14 | override val additionalPackages: Seq[String] = 15 | Seq(s"$label-models.tar.gz", "craft-pytorch.tar.gz") 16 | override val pythonArgumentFiles: Seq[String] = Seq( 17 | s"$label-base-handwritten", 18 | "weights/craft_mlt_25k.pth", 19 | "weights/craft_refiner_CTW1500.pth") 20 | 21 | override def processOutput(output: String): Option[String] = Some(output) 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/Whisper.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import _root_.io.circe.parser._ 4 | import io.circe.Json 5 | import org.archive.webservices.ars.Arch 6 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.CondaBasedArchiveSparkFunctionAdapter 7 | 8 | object Whisper extends CondaBasedArchiveSparkFunctionAdapter[Json] { 9 | override def func: CondaBasedFunction[Json] = new Whisper 10 | } 11 | 12 | class Whisper extends CondaBasedFunction[Json] { 13 | override val label: String = "whisper" 14 | override val dataDir: String = s"$label/20240807195100" 15 | override val condaEnv: String = s"conda-$label-env" 16 | override val pythonFile: String = s"$label-run.py" 17 | override val pythonArgumentFiles: Seq[String] = Seq("base.en.pt") 18 | 19 | override def processOutput(output: String): Option[Json] = { 20 | val trim = output.trim 21 | if (trim.isEmpty) None 22 | else 23 | parse(trim) match { 24 | case Left(failure) => 25 | Arch.reportError( 26 | s"ArchiveSpark Whisper Output JSON Parsing Error", 27 | failure.getMessage(), 28 | Map("output" -> output)) 29 | None 30 | case Right(json) => Some(json) 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/WhisperText.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import io.circe.Json 4 | import org.archive.webservices.archivespark.model.{EnrichFunc, EnrichRoot, GlobalEnrichFunc} 5 | import org.archive.webservices.ars.processing.DerivationJobParameters 6 | import org.archive.webservices.ars.processing.jobs.archivespark.base.LocalFileCache 7 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.ArchArchiveSparkFunctionAdapter 8 | 9 | object WhisperText extends ArchArchiveSparkFunctionAdapter[Json] { 10 | override def initFunc(params: DerivationJobParameters): EnrichFunc[_, Json, _] = { 
11 | params.get[Double]("maxNoSpeechProb") match { 12 | case Some(maxNoSpeechProb) => new WhisperText(maxNoSpeechProb) 13 | case None => super.initFunc(params) 14 | } 15 | } 16 | 17 | override def baseFunc: EnrichFunc[_, Json, _] = new WhisperText(0.5) 18 | } 19 | 20 | class WhisperText(maxNoSpeechProb: Double) 21 | extends GlobalEnrichFunc[EnrichRoot with LocalFileCache, Json, String] { 22 | val func: EnrichFunc[EnrichRoot with LocalFileCache, Json, String] = Whisper.func.map("text") { 23 | json => 24 | json.asArray.toSeq.flatten 25 | .map(_.hcursor) 26 | .filter { cursor => 27 | cursor.get[Double]("no_speech_prob").exists(_ <= maxNoSpeechProb) 28 | } 29 | .flatMap { cursor => 30 | cursor.get[String]("text").toOption 31 | } 32 | .mkString 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/adapters/ArchArchiveSparkFunctionAdapter.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.archivespark.model.pointers.FieldPointer 5 | import org.archive.webservices.ars.processing.DerivationJobParameters 6 | import org.archive.webservices.ars.processing.jobs.archivespark.base.ArchEnrichRoot 7 | 8 | import scala.util.Try 9 | 10 | trait ArchArchiveSparkFunctionAdapter[Source] { 11 | def name: String = baseFunc.getClass.getSimpleName.stripSuffix("$") 12 | def baseFunc: EnrichFunc[_, Source, _] 13 | def defaultDependency: Option[FieldPointer[ArchEnrichRoot[_], Source]] = None 14 | def noParams(on: Option[FieldPointer[ArchEnrichRoot[_], Source]]) 15 | : EnrichFunc[ArchEnrichRoot[_], Source, _] = { 16 | val dependency = on.orElse(defaultDependency) 17 | dependency 18 | .map(baseFunc.on(_)) 19 | .getOrElse(baseFunc) 20 | .asInstanceOf[EnrichFunc[ArchEnrichRoot[_], Source, _]] 21 | } 22 | def noParams: EnrichFunc[ArchEnrichRoot[_], Source, _] = noParams(None) 23 | def withParams( 24 | params: DerivationJobParameters, 25 | on: Option[FieldPointer[ArchEnrichRoot[_], Source]] = None) 26 | : EnrichFunc[ArchEnrichRoot[_], Source, _] = { 27 | if (params.isEmpty) noParams(on) 28 | else { 29 | val dependency = on.orElse(defaultDependency) 30 | val func = initFunc(params) 31 | dependency 32 | .map(func.on) 33 | .getOrElse(func) 34 | .asInstanceOf[EnrichFunc[ArchEnrichRoot[_], Source, _]] 35 | } 36 | } 37 | def initFunc(params: DerivationJobParameters): EnrichFunc[_, Source, _] = baseFunc 38 | def toDependencyPointer(func: EnrichFunc[ArchEnrichRoot[_], _, _]) 39 | : Option[FieldPointer[ArchEnrichRoot[_], Source]] = Try { 40 | func.asInstanceOf[FieldPointer[ArchEnrichRoot[_], Source]] 41 | }.toOption 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/adapters/CondaBasedArchiveSparkFunctionAdapter.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters 2 | import org.archive.webservices.archivespark.model.EnrichFunc 3 | import org.archive.webservices.ars.processing.DerivationJobParameters 4 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.CondaBasedFunction 5 | 6 | trait 
CondaBasedArchiveSparkFunctionAdapter[Source] 7 | extends ArchArchiveSparkFunctionAdapter[Source] { 8 | def func: CondaBasedFunction[Source] 9 | 10 | override def baseFunc: EnrichFunc[_, Source, _] = func.asInstanceOf[EnrichFunc[_, Source, _]] 11 | 12 | override def initFunc(params: DerivationJobParameters): EnrichFunc[_, Source, _] = { 13 | val f = func 14 | f.initFunc(params) 15 | f.asInstanceOf[EnrichFunc[_, Source, _]] 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/adapters/EntitiesAdapter.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters 2 | import edu.stanford.nlp.pipeline.StanfordCoreNLP 3 | import org.archive.webservices.archivespark.functions.{Entities, EntitiesConstants} 4 | import org.archive.webservices.archivespark.model.EnrichFunc 5 | import org.archive.webservices.archivespark.model.pointers.DataLoadPointer 6 | import org.archive.webservices.ars.processing.DerivationJobParameters 7 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchiveSparkEnrichJob} 8 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.CoreNlpEntities 9 | 10 | import java.util.Properties 11 | import scala.collection.JavaConverters.asScalaSetConverter 12 | 13 | object EntitiesAdapter extends ArchArchiveSparkFunctionAdapter[String] { 14 | override lazy val baseFunc: Entities = new CoreNlpEntities() 15 | 16 | override def defaultDependency: Option[DataLoadPointer[ArchEnrichRoot[_], String]] = Some( 17 | ArchiveSparkEnrichJob.plainTextLoad) 18 | 19 | override def initFunc(params: DerivationJobParameters): EnrichFunc[_, String, _] = { 20 | val langParam = params.get[String]("lang").map(_.toLowerCase) 21 | langParam match { 22 | case Some("chinese") => new CoreNlpEntities(properties(langParam)) 23 | case _ => new CoreNlpEntities(properties(langParam), filterLatin = true) 24 | } 25 | } 26 | 27 | def properties(lang: Option[String] = None): Properties = { 28 | val default = EntitiesConstants.DefaultProps 29 | lang match { 30 | case Some(l) => 31 | val props = new StanfordCoreNLP(l).getProperties 32 | for (p <- default.stringPropertyNames.asScala) 33 | props.setProperty(p, default.getProperty(p)) 34 | props 35 | case None => default 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/preset/EntityExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.preset 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord, ArchiveSparkEnrichJob} 6 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.EntitiesAdapter 7 | import org.archive.webservices.ars.processing.{DerivationJobConf, DerivationJobParameters} 8 | 9 | object EntityExtraction extends ArchiveSparkEnrichJob { 10 | val name: String = "Named entities" 11 | val description: String = 12 | "Names of persons, organizations, and geographic locations detected in each text-bearing 
document in the collection. Output: one or more JSONL files comprising a JSON object for each input record." 13 | 14 | val uuid: String = "018d114d-3426-730e-94a1-b56ca73fc1ad" 15 | 16 | override val infoUrl = 17 | "https://arch-webservices.zendesk.com/hc/en-us/articles/15810489328276-ARCH-named-entities-datasets" 18 | 19 | val category: ArchJobCategory = ArchJobCategories.Text 20 | 21 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 22 | val superFilter = super.warcPredicate(conf) 23 | warc => superFilter(warc) && warc.status == 200 24 | } 25 | 26 | override def genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 27 | record => record.mime.startsWith("text/") 28 | } 29 | 30 | def entitiesFunc(params: DerivationJobParameters): EnrichFunc[ArchEnrichRoot[_], _, _] = { 31 | EntitiesAdapter.withParams(params) 32 | } 33 | 34 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 35 | Seq(entitiesFunc(conf.params)) 36 | } 37 | } -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/preset/TrOcrEntityExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.preset 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.DerivationJobConf 6 | import org.archive.webservices.ars.processing.jobs.archivespark.AiJob 7 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord} 8 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.TrOCR 9 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.EntitiesAdapter 10 | 11 | object TrOcrEntityExtraction extends AiJob { 12 | val uuid: String = "019078a8-7b16-7a87-8b50-a30166e547dd" 13 | 14 | val name: String = "Named entities from text recognition" 15 | val description: String = 16 | "Names of persons, organizations, geographic locations, and dates from text recognized in collection images. Output: one or more JSONL files comprising a JSON object for each input record." 
17 | 18 | override def infoUrl: String = "https://arch-webservices.zendesk.com/hc/en-us/articles/15810489328276-ARCH-named-entities-datasets" 19 | 20 | override val category: ArchJobCategory = ArchJobCategories.Text 21 | 22 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 23 | val superFilter = super.warcPredicate(conf) 24 | warc => superFilter(warc) && warc.status == 200 25 | } 26 | 27 | override def genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 28 | record => record.mime.startsWith("image/") 29 | } 30 | 31 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 32 | val text = TrOCR.noParams 33 | val entities = EntitiesAdapter.noParams(on = EntitiesAdapter.toDependencyPointer(text)) 34 | Seq(text, entities) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/preset/TrOcrProcessing.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.preset 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.DerivationJobConf 6 | import org.archive.webservices.ars.processing.jobs.archivespark.AiJob 7 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord} 8 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.TrOCR 9 | 10 | object TrOcrProcessing extends AiJob { 11 | val uuid: String = "019078a5-c6f3-7051-bb71-5b1f135307df" 12 | 13 | val name: String = "Text recognition" 14 | val description: String = 15 | "Text recognized and transcribed from images in a collection, including handwriting. Output: one or more JSONL files comprising a JSON object for each input record." 
16 | 17 | override def infoUrl: String = "https://arch-webservices.zendesk.com/hc/en-us/articles/14410760790164-ARCH-Text-datasets#ocr" 18 | 19 | override val category: ArchJobCategory = ArchJobCategories.Text 20 | 21 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 22 | val superFilter = super.warcPredicate(conf) 23 | warc => superFilter(warc) && warc.status == 200 24 | } 25 | 26 | override def genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 27 | record => record.mime.startsWith("image/") 28 | } 29 | 30 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 31 | Seq(TrOCR.noParams) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/preset/WhisperEntityExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.preset 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.DerivationJobConf 6 | import org.archive.webservices.ars.processing.jobs.archivespark 7 | import org.archive.webservices.ars.processing.jobs.archivespark.AiJob 8 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord} 9 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.EntitiesAdapter 10 | 11 | object WhisperEntityExtraction extends AiJob { 12 | val uuid: String = "018f7b09-f7ca-756d-a4ca-69cea914185d" 13 | 14 | val name: String = "Named entities from speech recognition" 15 | val description: String = 16 | "Names of persons, organizations, geographic locations, and dates in text transcribed from collection audio and video documents. Output: one or more JSONL files comprising a JSON object for each input record." 
17 | 18 | override def infoUrl: String = "https://arch-webservices.zendesk.com/hc/en-us/articles/15810489328276-ARCH-named-entities-datasets" 19 | 20 | override val category: ArchJobCategory = ArchJobCategories.Text 21 | 22 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 23 | val superFilter = super.warcPredicate(conf) 24 | warc => superFilter(warc) && warc.status == 200 25 | } 26 | 27 | override def genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 28 | record => record.mime.startsWith("audio/") || record.mime.startsWith("video/") 29 | } 30 | 31 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 32 | val whisperText = archivespark.functions.WhisperText.withParams(conf.params) 33 | val entities = EntitiesAdapter.noParams(on = EntitiesAdapter.toDependencyPointer(whisperText)) 34 | Seq(whisperText, entities) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/preset/WhisperText.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.preset 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.DerivationJobConf 6 | import org.archive.webservices.ars.processing.jobs.archivespark 7 | import org.archive.webservices.ars.processing.jobs.archivespark.AiJob 8 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord} 9 | 10 | object WhisperText extends AiJob { 11 | val uuid: String = "0191e26a-056c-77e2-8fe0-dfba9928b3e2" 12 | 13 | val name: String = "Speech recognition" 14 | val description: String = 15 | "Text transcribed from speech recognized in collection audio and video documents. Output: one or more JSONL files comprising a JSON object for each input record." 
16 | 17 | override def infoUrl: String = "https://arch-webservices.zendesk.com/hc/en-us/articles/14410760790164-ARCH-Text-datasets#speech" 18 | 19 | override val category: ArchJobCategory = ArchJobCategories.Text 20 | 21 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 22 | val superFilter = super.warcPredicate(conf) 23 | warc => superFilter(warc) && warc.status == 200 24 | } 25 | 26 | override def genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 27 | record => record.mime.startsWith("audio/") || record.mime.startsWith("video/") 28 | } 29 | 30 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 31 | Seq(archivespark.functions.WhisperText.withParams(conf.params)) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/preset/WhisperTranscription.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.preset 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.DerivationJobConf 6 | import org.archive.webservices.ars.processing.jobs.archivespark.AiJob 7 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord} 8 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.Whisper 9 | 10 | object WhisperTranscription extends AiJob { 11 | val uuid: String = "018f7b0a-4f3c-7846-862a-ff1ae26ce139" 12 | 13 | val name: String = "Speech recognition (raw)" 14 | val description: String = 15 | "Raw transcription output and technical metadata from speech recognized in collection audio and video documents. Output: one or more JSONL files comprising a JSON object for each input record." 
16 | 17 | override def infoUrl: String = "https://arch-webservices.zendesk.com/hc/en-us/articles/14410760790164-ARCH-Text-datasets#speech" 18 | 19 | override val category: ArchJobCategory = ArchJobCategories.Text 20 | 21 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 22 | val superFilter = super.warcPredicate(conf) 23 | warc => superFilter(warc) && warc.status == 200 24 | } 25 | 26 | override def genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 27 | record => record.mime.startsWith("audio/") || record.mime.startsWith("video/") 28 | } 29 | 30 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 31 | Seq(Whisper.noParams) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/shared/ArsJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.shared 2 | 3 | import org.archive.webservices.ars.WasapiController 4 | import org.archive.webservices.ars.model.ArchConf 5 | import org.archive.webservices.ars.processing.{DerivationJob, DerivationJobConf} 6 | 7 | trait ArsJob extends DerivationJob { 8 | override def templateVariables(conf: DerivationJobConf): Seq[(String, Any)] = { 9 | val wasapiUrl = ArchConf.baseUrl + { 10 | "/wasapi/v1/jobs/" + id + "/result?collection=" + conf.inputSpec.collectionId + { 11 | if (conf.isSample) "&sample=true" else "" 12 | } 13 | } 14 | super.templateVariables(conf) ++ Seq( 15 | "wasapiUrl" -> wasapiUrl, 16 | "wasapiPages" -> (outFiles(conf).size.toDouble / WasapiController.FixedPageSize).ceil.toInt) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/system/MetadataSummary.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.system 2 | 3 | import org.archive.webservices.ars.model.collections.inputspecs.InputSpecLoader 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaSummary 5 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory, ArchJobInstanceInfo, DerivativeOutput} 6 | import org.archive.webservices.ars.processing._ 7 | import org.archive.webservices.sparkling.Sparkling.executionContext 8 | import org.archive.webservices.sparkling.io._ 9 | 10 | import scala.concurrent.Future 11 | 12 | object MetadataSummary extends SparkJob { 13 | val name = "Metadata Summary" 14 | val uuid = "4a3fae37-99de-4a64-843d-bce3a44807b1" 15 | val category: ArchJobCategory = ArchJobCategories.System 16 | def description = "Summarizes metadata of a given input spec" 17 | 18 | val relativeOutPath: String = s"/$id" 19 | 20 | val SummaryFile = "summary.json" 21 | 22 | def run(conf: DerivationJobConf): Future[Boolean] = { 23 | SparkJobManager.context.map { sc => 24 | SparkJobManager.initThread(sc, MetadataSummary, conf) 25 | InputSpecLoader.loadSpark(conf.inputSpec) { rdd => 26 | val summary = rdd 27 | .mapPartitions { partition => 28 | val summary = new FileMetaSummary() 29 | for (f <- partition) summary.add(f.meta) 30 | Iterator(summary) 31 | } 32 | .fold(FileMetaSummary.empty)(_ ++ _) 33 | HdfsIO.writeLines( 34 | conf.outputPath + relativeOutPath + "/" + SummaryFile, 35 | lines = Seq(summary.toJsonSchema.spaces4)) 36 | true 37 | } 38 | } 39 | } 40 | 
41 | override def history(conf: DerivationJobConf): DerivationJobInstance = { 42 | val instance = super.history(conf) 43 | val started = 44 | HdfsIO.exists(conf.outputPath + relativeOutPath + "/" + ArchJobInstanceInfo.InfoFile) 45 | if (started) { 46 | val completed = HdfsIO.exists(conf.outputPath + relativeOutPath + "/" + SummaryFile) 47 | instance.state = if (completed) ProcessingState.Finished else ProcessingState.Failed 48 | } 49 | instance 50 | } 51 | 52 | override def outFiles(conf: DerivationJobConf): Iterator[DerivativeOutput] = Iterator( 53 | DerivativeOutput(SummaryFile, conf.outputPath + relativeOutPath, "JSON", "application/json")) 54 | 55 | override val templateName: Option[String] = Some("jobs/DefaultArsJob") 56 | 57 | override def reset(conf: DerivationJobConf): Unit = 58 | HdfsIO.delete(conf.outputPath + relativeOutPath) 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/CacheUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import org.scalatra.ActionResult 4 | import org.scalatra.guavaCache.GuavaCache 5 | 6 | import javax.servlet.http.HttpServletRequest 7 | import scala.concurrent.duration._ 8 | 9 | object CacheUtil { 10 | val Charset: String = "UTF-8" 11 | 12 | val RequestCacheDuration: Duration = 10.minutes 13 | 14 | def cache[R](key: String, enabled: Boolean = true, ttl: Option[Duration] = None)( 15 | value: => R): R = 16 | if (enabled) { 17 | GuavaCache.get[R](key) match { 18 | case Some(cached) => cached 19 | case None => 20 | val v = value 21 | GuavaCache.put(key, v, ttl) 22 | v 23 | } 24 | } else value 25 | 26 | def put[R](key: String, value: R, enabled: Boolean = true, ttl: Option[Duration] = None): R = { 27 | GuavaCache.put(key, value, ttl) 28 | value 29 | } 30 | 31 | def get[R](key: String): Option[R] = GuavaCache.get[R](key) 32 | 33 | def cacheRequest( 34 | request: HttpServletRequest, 35 | enabled: Boolean = true, 36 | subjects: Set[Any] = Set.empty)(value: => ActionResult): ActionResult = 37 | if (enabled) { 38 | val key = "request#" + request.getRequestURI + "?" 
+ request.getQueryString 39 | Iterator 40 | .continually { 41 | GuavaCache.get[Option[ActionResult]](key) match { 42 | case Some(cached) => 43 | if (cached.isEmpty) Thread.sleep(1000) 44 | cached 45 | case None => 46 | GuavaCache.put(key, None, None) 47 | try { 48 | val result = value 49 | if (result.status.code == 200) 50 | GuavaCache.put(key, Some(result), Some(RequestCacheDuration)) 51 | else GuavaCache.remove(key) 52 | Some(result) 53 | } catch { 54 | case e: Exception => 55 | GuavaCache.remove(key) 56 | throw e 57 | } 58 | } 59 | } 60 | .flatten 61 | .next() 62 | } else value 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/Common.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | object Common { 4 | def tryOrElse[R](orElse: => R)(action: => R): R = { 5 | try { 6 | action 7 | } catch { 8 | case e: Exception => 9 | e.printStackTrace() 10 | orElse 11 | } 12 | } 13 | 14 | def retryWhile(cond: => Boolean, sleepMs: Int, maxTimes: Int, sleepInc: Int => Int): Boolean = { 15 | var sleep = sleepMs 16 | var times = 1 17 | var result = cond 18 | while (result && times < maxTimes) { 19 | Thread.sleep(sleep) 20 | sleep = sleepInc(sleep) 21 | times += 1 22 | result = cond 23 | } 24 | !result 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/DatafileUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import io.circe.parser.parse 4 | import io.circe.{Json, JsonObject} 5 | 6 | import java.io.{File, PrintWriter} 7 | import scala.io.Source 8 | import scala.util.Try 9 | 10 | object DatafileUtil { 11 | private def getPath(filename: String) = s"data/$filename" 12 | 13 | def load(filename: String): Json = { 14 | val source = Source.fromFile(getPath(filename), "utf-8") 15 | Try { 16 | try { 17 | parse(source.getLines.mkString).right.get 18 | } finally { 19 | source.close() 20 | } 21 | }.getOrElse(Json.fromJsonObject(JsonObject.empty)) 22 | } 23 | 24 | def store(filename: String, json: Json): Unit = { 25 | val path = getPath(filename) 26 | val source = Source.fromFile(path, "utf-8") 27 | try { 28 | val pw = new PrintWriter(new File(path)) 29 | pw.write(json.toString) 30 | pw.close() 31 | } finally { 32 | source.close() 33 | } 34 | } 35 | 36 | def loadArchUsers() = load("arch-users.json") 37 | def storeArchUsers(json: Json) = store("arch-users.json", json) 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/DatasetUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import org.archive.webservices.ars.model.ArchCollection 4 | import org.archive.webservices.ars.model.users.ArchUser 5 | import org.archive.webservices.ars.processing.{DerivationJobConf, DerivationJobInstance, JobManager} 6 | 7 | object DatasetUtil { 8 | def formatId(collectionId: String, job: DerivationJobInstance): String = { 9 | s"${collectionId}:${if (job.conf.isSample) "1" else "0"}:${job.job.id}" 10 | } 11 | 12 | def parseId( 13 | datasetId: String, 14 | user: ArchUser): Option[(ArchCollection, DerivationJobInstance)] = { 15 | val Array(collectionId, isSample, jobId) = 16 | 
datasetId.reverse.split(":", 3).map(_.reverse).reverse 17 | val sample = if (isSample == "1") true else false 18 | for { 19 | collection <- ArchCollection.get(ArchCollection.userCollectionId(collectionId, user)) 20 | job <- ( 21 | JobManager 22 | .getInstanceOrGlobal( 23 | jobId, 24 | DerivationJobConf.collection(collection, sample = sample, global = false), 25 | Some(DerivationJobConf.collection(collection, sample = sample, global = true))) 26 | ) 27 | } yield (collection, job) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/FormatUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import org.archive.webservices.sparkling.util.StringUtil 4 | 5 | import java.time.Instant 6 | 7 | object FormatUtil { 8 | def formatBytes(bytes: Long): String = { 9 | val units = Seq("B", "KB", "MB", "GB", "TB", "PB") 10 | if (bytes < 0) "0 " + units.head 11 | else { 12 | var unitIdx = 0 13 | var b = bytes.toDouble 14 | while (b > 1024 && unitIdx < units.length - 1) { 15 | unitIdx += 1 16 | b = b / 1024 17 | } 18 | StringUtil.formatNumber(b, 1) + " " + units(unitIdx) 19 | } 20 | } 21 | 22 | def instantTimeString(instant: Instant): String = 23 | instant.toString 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/HttpUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import org.apache.commons.io.input.BoundedInputStream 4 | import org.archive.webservices.ars.model.ArchConf 5 | import org.archive.webservices.sparkling._ 6 | import org.archive.webservices.sparkling.html.HtmlProcessor 7 | import org.archive.webservices.sparkling.http.HttpMessage 8 | import org.archive.webservices.sparkling.io.CatchingInputStream 9 | 10 | import java.io.InputStream 11 | import java.net.{HttpURLConnection, InetSocketAddress, Proxy, URL} 12 | import javax.net.ssl.HttpsURLConnection 13 | 14 | object HttpUtil { 15 | val MaxContentLength: Long = 1.mb 16 | 17 | def bodyString(body: InputStream, http: HttpMessage): String = { 18 | val boundedBody = new BoundedInputStream(body, MaxContentLength) 19 | HtmlProcessor.readStream( 20 | new CatchingInputStream(boundedBody), 21 | http.charset.toSeq ++ HttpMessage.BodyCharsets) 22 | } 23 | 24 | lazy val proxy: Proxy = { 25 | val split = ArchConf.httpProxy.split(':') 26 | if (split.length > 1) { 27 | new Proxy(Proxy.Type.HTTP, new InetSocketAddress(split.head, split(1).toInt)) 28 | } else Proxy.NO_PROXY 29 | } 30 | 31 | def openConnection(url: String): HttpURLConnection = { 32 | val u = new URL(url) 33 | if (ArchConf.httpProxyHosts.contains(u.getHost)) { 34 | u.openConnection(proxy) 35 | } else u.openConnection() 36 | }.asInstanceOf[HttpURLConnection] 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/LazyCache.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import io.circe.Json 4 | import io.circe.syntax.EncoderOps 5 | import org.archive.webservices.sparkling.io.HdfsIO 6 | import org.scalatra.{ActionResult, Ok} 7 | 8 | import scala.concurrent.ExecutionContext.Implicits.global 9 | import scala.concurrent.Future 10 | 11 | object LazyCache { 12 | 
val writingSuffix = "_writing" 13 | 14 | private var lazyFuture = Future(true) 15 | private var processing = Map.empty[String, Future[Boolean]] 16 | 17 | def getOrCache[A]( 18 | cacheFile: String)(parse: String => Option[A], write: String => Unit): Future[A] = 19 | synchronized { 20 | val parsed = getIfCached(cacheFile)(parse) 21 | if (parsed.isDefined) return Future(parsed.get) 22 | processing 23 | .getOrElse( 24 | cacheFile, { 25 | lazyFuture = lazyFuture.map(_ => { 26 | val tmpFilePath = cacheFile + writingSuffix 27 | write(tmpFilePath) 28 | HdfsIO.rename(tmpFilePath, cacheFile) 29 | true 30 | }) 31 | processing += cacheFile -> lazyFuture 32 | lazyFuture 33 | }) 34 | .map { _ => 35 | processing -= cacheFile 36 | parse(cacheFile).get 37 | } 38 | } 39 | 40 | def getIfCached[A](cacheFile: String)(parse: String => Option[A]): Option[A] = { 41 | if (HdfsIO.exists(cacheFile)) parse(cacheFile) else None 42 | } 43 | 44 | def lazyJsonResponse[A]( 45 | cached: Option[Future[A]])(orElse: => A, json: A => Json): ActionResult = { 46 | lazyJsonResponse[A, A](cached, identity, orElse, json) 47 | } 48 | 49 | def lazyJsonResponse[A, B]( 50 | cached: Option[Future[A]], 51 | map: A => B, 52 | orElse: => B, 53 | json: B => Json): ActionResult = { 54 | cached match { 55 | case Some(future) => 56 | if (future.isCompleted) { 57 | Ok( 58 | json(map(future.value.flatMap(_.toOption).get)).spaces4, 59 | Map("Content-Type" -> "application/json")) 60 | } else { 61 | Ok(Map("lazy" -> true).asJson.spaces4, Map("Content-Type" -> "application/json")) 62 | } 63 | case None => 64 | Ok(json(orElse).spaces4, Map("Content-Type" -> "application/json")) 65 | } 66 | } 67 | 68 | def lazyProcess[A](cached: Option[Future[A]], orElse: => A)(process: A => Unit): Unit = { 69 | lazyProcess[A, A](cached, identity, orElse)(process) 70 | } 71 | 72 | def lazyProcess[A, B](cached: Option[Future[A]], map: A => B, orElse: => B)( 73 | process: B => Unit): Unit = { 74 | cached match { 75 | case Some(future) => 76 | future.onComplete(v => process(map(v.get))) 77 | case None => 78 | process(orElse) 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/PublicSuffixUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.broadcast.Broadcast 5 | import org.archive.webservices.ars.model.ArchConf 6 | 7 | import scala.io.Source 8 | 9 | import org.archive.webservices.ars.model.ArchConf 10 | 11 | object PublicSuffixUtil { 12 | private var _broadcast: Option[(String, Broadcast[Set[String]])] = None 13 | 14 | def broadcast(sc: SparkContext): Broadcast[Set[String]] = { 15 | if (_broadcast.isDefined && _broadcast.get._1 == sc.applicationId) _broadcast.get._2 16 | else { 17 | for ((_, bc) <- _broadcast) bc.destroy() 18 | val bc = sc.broadcast(Suffixes) 19 | _broadcast = Some((sc.applicationId, bc)) 20 | bc 21 | } 22 | } 23 | 24 | lazy val Suffixes: Set[String] = { 25 | val source = Source 26 | .fromURL(ArchConf.publicSuffixListUrl, "utf-8") 27 | try { 28 | source.getLines 29 | .map(_.trim) 30 | .filter(_.nonEmpty) 31 | .filter(!_.startsWith("//")) 32 | .toSet 33 | } catch { 34 | case _: Exception => 35 | Set.empty 36 | } finally { 37 | source.close() 38 | } 39 | } 40 | 41 | def resolve(host: String): String = resolve(host, Suffixes) 42 | 43 | def resolve(host: String, suffixes: Set[String]): String = { 44 
| val hostSplit = host.split('.') 45 | hostSplit.tails 46 | .filter(_.length > 1) 47 | .find { domain => 48 | val suffix = domain.tail 49 | suffixes.contains(suffix.mkString(".")) || (suffix.length > 1 && { 50 | suffixes.contains("*." + suffix.tail.mkString(".")) 51 | }) 52 | } 53 | .getOrElse(hostSplit) 54 | .mkString(".") 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/UUID.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import com.fasterxml.uuid.Generators 4 | 5 | object UUID { 6 | def uuid7 = { 7 | // see https://github.com/cowtowncoder/java-uuid-generator 8 | Generators.timeBasedEpochGenerator().generate() 9 | } 10 | 11 | def uuid7str: String = uuid7.toString 12 | } 13 | -------------------------------------------------------------------------------- /src/test/scala/org/archive/webservices/ars/ApiController.scala: -------------------------------------------------------------------------------- 1 | package test.org.archive.webservices.ars 2 | 3 | import io.circe.parser.parse 4 | import org.scalatra.test.scalatest._ 5 | 6 | import org.archive.webservices.ars.{ApiController, DefaultController} 7 | 8 | import Fixtures._ 9 | 10 | class ApiControllerSpec extends UnitSpec { 11 | addServlet(classOf[DefaultController], "/*") 12 | addServlet(classOf[ApiController], "/api/*") 13 | 14 | test("/api/collections returns status 403 when not authenticated") { 15 | get("/api/collections") { 16 | status should equal (403) 17 | } 18 | } 19 | 20 | test("/api/collections returns status 200 and count=0 when no collections exist") { 21 | loggedInAs(makeArchUser()) { 22 | get("/api/collections") { 23 | status should equal (200) 24 | val cur = parse(body).right.get.hcursor 25 | cur.get[Int]("count").right.get should equal (0) 26 | } 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/org/archive/webservices/ars/Fixtures.scala: -------------------------------------------------------------------------------- 1 | package test.org.archive.webservices.ars 2 | 3 | import java.io.{File, PrintWriter} 4 | 5 | import scala.io.Source 6 | 7 | import io.circe.Json 8 | import io.circe.syntax._ 9 | import io.circe.parser.parse 10 | 11 | import org.archive.webservices.sparkling.util.DigestUtil 12 | 13 | import org.archive.webservices.ars.model.users.ArchUser 14 | 15 | object Fixtures { 16 | private def load(path: String): Json = { 17 | val source = Source.fromFile(path) 18 | val json = parse(source.getLines.mkString).right.get 19 | source.close() 20 | json 21 | } 22 | 23 | private def store(path: String, json: Json): Unit = { 24 | val source = Source.fromFile(path) 25 | val pw = new PrintWriter(new File(path)) 26 | pw.write(json.toString) 27 | pw.close() 28 | } 29 | 30 | def makeArchUser(admin: Boolean = false): ArchUser = { 31 | // Insert a randomly-generated user into the arch-users.json file and 32 | // return the corresponding ArchUser instance. 33 | val path = "data/arch-users.json" 34 | val json = load(path) 35 | var userId = java.time.Instant.now.toEpochMilli.toString 36 | // In the event of a collision, append a "0". 
37 | val existingUserIds = json.hcursor.keys.get.toSet 38 | while (existingUserIds.contains(userId)) { 39 | userId += "0" 40 | } 41 | store(path, json.deepMerge(Map( 42 | userId -> Map( 43 | "name" -> userId.asJson, 44 | "password" -> s"sha1:${DigestUtil.sha1Base32(userId)}".asJson, 45 | "admin" -> admin.asJson 46 | )).asJson 47 | )) 48 | ArchUser.invalidateData() 49 | ArchUser.get(s"arch:$userId").get 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/test/scala/org/archive/webservices/ars/JobUuidApiController.scala: -------------------------------------------------------------------------------- 1 | package test.org.archive.webservices.ars 2 | 3 | import io.circe.parser.parse 4 | import io.circe.syntax._ 5 | import org.scalatra.test.scalatest._ 6 | 7 | import org.archive.webservices.ars.{ApiController, DefaultController, JobUuidApiController} 8 | import org.archive.webservices.ars.processing.jobs.DomainFrequencyExtraction 9 | 10 | import Fixtures._ 11 | 12 | class JobUuidApiControllerSpec extends UnitSpec { 13 | addServlet(classOf[DefaultController], "/*") 14 | addServlet(classOf[JobUuidApiController], "/api/job/*") 15 | addServlet(classOf[ApiController], "/api/*") 16 | 17 | test("Generating a DomainFrequencyExtraction on the test collection works") { 18 | val user = makeArchUser() 19 | loggedInAs(user) { 20 | post(s"/api/runjob/${DomainFrequencyExtraction.id}?sample=false", 21 | parse(s""" 22 | { 23 | "user": "${user.userName}", 24 | "inputSpec": { 25 | "type": "collection", 26 | "collectionId": "SPECIAL-test-collection" 27 | }, 28 | "params": { 29 | "dataset": "${DomainFrequencyExtraction.id}" 30 | } 31 | } 32 | """).toOption.get.toString.getBytes, 33 | Map("content-type" -> "application/json") 34 | ) { 35 | status should equal (200) 36 | val cur = parse(body).right.get.hcursor 37 | cur.get[String]("id").right.get should equal (DomainFrequencyExtraction.id) 38 | cur.get[String]("name").right.get should equal (DomainFrequencyExtraction.name) 39 | cur.get[Int]("sample").right.get should equal (-1) 40 | cur.get[String]("state").right.get should equal ("Running") 41 | cur.get[Boolean]("started").right.get should equal (true) 42 | cur.get[Boolean]("finished").right.get should equal (false) 43 | cur.get[Boolean]("failed").right.get should equal (false) 44 | cur.get[String]("activeStage").right.get should equal ("Processing") 45 | cur.get[String]("activeState").right.get should equal ("Running") 46 | 47 | val uuid = cur.get[String]("uuid").right.get 48 | 49 | Thread.sleep(15000) 50 | } 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/test/scala/org/archive/webservices/ars/UnitSpec.scala: -------------------------------------------------------------------------------- 1 | package test.org.archive.webservices.ars 2 | 3 | import java.io.File 4 | 5 | import org.apache.commons.io.FileUtils 6 | import org.eclipse.jetty.server.Server 7 | import org.scalamock.scalatest.MockFactory 8 | import org.scalatest.{BeforeAndAfterAll, FunSuiteLike} 9 | import org.scalatra.test.scalatest._ 10 | 11 | import org.archive.webservices.ars.model.ArchConf 12 | import org.archive.webservices.ars.model.users.ArchUser 13 | 14 | /* Common Base Test Class */ 15 | abstract class UnitSpec extends ScalatraSuite with FunSuiteLike with MockFactory with BeforeAndAfterAll { 16 | private val dataDir = "data" 17 | private val backupDataDir = ".data-test-bak" 18 | 19 | // Configure tests to use our non-standard webapp 
path. 20 | servletContextHandler.setResourceBase("webapp") 21 | 22 | override def beforeAll { 23 | super.beforeAll() 24 | // Assert that the configured environment is valid for testing. 25 | assert(ArchConf.deploymentEnvironment == "DEV") 26 | 27 | // Create a backup of the existing data directory. 28 | FileUtils.copyDirectory(new File(dataDir), new File(backupDataDir)) 29 | } 30 | 31 | override def afterAll { 32 | super.afterAll() 33 | // Restore the pre-existing data directory. 34 | FileUtils.copyDirectory(new File(backupDataDir), new File(dataDir)) 35 | } 36 | 37 | // https://stackoverflow.com/a/34030731 38 | def setEnv(key: String, value: String) = { 39 | val field = System.getenv().getClass.getDeclaredField("m") 40 | field.setAccessible(true) 41 | val map = 42 | field.get(System.getenv()).asInstanceOf[java.util.Map[java.lang.String, java.lang.String]] 43 | map.put(key, value) 44 | } 45 | 46 | def loggedInAs[A](user: ArchUser)(test: => A): A = { 47 | session { 48 | post("/login", params = Seq(("username", user.fullName), ("password", user.fullName))) { 49 | status should equal (302) 50 | } 51 | test 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/test/scala/org/archive/webservices/ars/model/LocalArchConfSpec.scala: -------------------------------------------------------------------------------- 1 | package test.org.archive.webservices.ars 2 | 3 | import org.archive.webservices.ars.model.LocalArchConf 4 | 5 | class LocalArchConfSpec extends UnitSpec { 6 | setEnv("ARCH_BASE_PATH", "/") 7 | setEnv("ARCH_PROTO", "http") 8 | setEnv("ARCH_HOST", "arch.archive-it.org") 9 | setEnv("ARCH_EXTERNAL_PORT", "80") 10 | 11 | def conf = { 12 | mock[LocalArchConf] 13 | } 14 | 15 | test("baseUrl excludes port when proto=http and port=80") { 16 | setEnv("ARCH_PROTO", "http") 17 | setEnv("ARCH_EXTERNAL_PORT", "80") 18 | conf.baseUrl should be("http://arch.archive-it.org") 19 | } 20 | 21 | test("baseUrl excludes port when proto=https and port=443") { 22 | setEnv("ARCH_PROTO", "https") 23 | setEnv("ARCH_EXTERNAL_PORT", "443") 24 | conf.baseUrl should be("https://arch.archive-it.org") 25 | } 26 | 27 | test("baseUrl includes port when proto=http and port!=80") { 28 | setEnv("ARCH_PROTO", "http") 29 | setEnv("ARCH_EXTERNAL_PORT", "81") 30 | conf.baseUrl should be("http://arch.archive-it.org:81") 31 | } 32 | 33 | test("baseUrl includes port when proto=https and port!=443") { 34 | setEnv("ARCH_PROTO", "https") 35 | setEnv("ARCH_EXTERNAL_PORT", "444") 36 | conf.baseUrl should be("https://arch.archive-it.org:444") 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /templates/sendmail_failed.txt: -------------------------------------------------------------------------------- 1 | From: arch-no-reply@archive-it.org 2 | To: helge@archive.org, archiveit-alerts@archive.org, kody@archive.org, karlb@archive.org, tpadilla@archive.org 3 | Subject: ARCH: $jobName on $collectionName has FAILED! 4 | MIME-Version: 1.0 5 | Content-Type: multipart/alternative; boundary=ARCHARCHARCH 6 | 7 | --ARCHARCHARCH 8 | Content-Type: text/plain; charset=utf-8 9 | Hi Helge, 10 | =============================================== 11 | 12 | $jobName job on $collectionName has FAILED for $userName ($accountId). 13 | 14 | --ARCHARCHARCH 15 | Content-Type: text/html; charset=utf-8 16 | 17 | 18 | 19 | 20 | 21 | 22 |

!!! JOB FAILED !!! 23 | $jobName job on $collectionName has FAILED for $userName ($accountId).
24 | 25 | 26 | --ARCHARCHARCH-- 27 | -------------------------------------------------------------------------------- /templates/sendmail_finished.txt: -------------------------------------------------------------------------------- 1 | From: arch-no-reply@archive-it.org 2 | To: $to 3 | Subject: ARCH: Your $jobName dataset from $collectionName is ready to use 4 | MIME-Version: 1.0 5 | Content-Type: multipart/alternative; boundary=ARCHARCHARCH 6 | 7 | --ARCHARCHARCH 8 | Content-Type: text/plain; charset=utf-8 9 | Hello $userName, 10 | 11 | ARCH has created your $jobName dataset from $collectionName. You may find it here: $datasetUrl 12 | 13 | Best, 14 | The ARCH team 15 | --ARCHARCHARCH 16 | Content-Type: text/html; charset=utf-8 17 | 18 | 19 | 20 | 21 | 22 | 23 |

Hello $userName, 24 | 25 | ARCH has created your $jobName dataset from $collectionName. You may find it here: $datasetUrl 26 | 27 | 28 | Best, 29 | 30 | The ARCH team 31 | 32 | 33 | Having trouble? Let us know! 34 |
35 | 36 | 37 | --ARCHARCHARCH-- 38 | -------------------------------------------------------------------------------- /templates/sendmail_udq-finished.txt: -------------------------------------------------------------------------------- 1 | From: arch-no-reply@archive-it.org 2 | To: $to 3 | Subject: ARCH: Your custom collection “$udqCollectionName” is ready to use 4 | MIME-Version: 1.0 5 | Content-Type: multipart/alternative; boundary=ARCHARCHARCH 6 | 7 | --ARCHARCHARCH 8 | Content-Type: text/plain; charset=utf-8 9 | Hello $userName, 10 | 11 | ARCH has created your custom collection, “$udqCollectionName.” You may find and use it here: $collectionsUrl 12 | 13 | Best, 14 | The ARCH team 15 | --ARCHARCHARCH 16 | Content-Type: text/html; charset=utf-8 17 | 18 | 19 | 20 | 21 | 22 | 23 |

Hello $userName, 24 | 25 | ARCH has created your custom collection, “$udqCollectionName.” You may find and use it here: $collectionsUrl 26 | 27 | 28 | Best, 29 | 30 | The ARCH team 31 | 32 | 33 | Having trouble? Let us know! 34 |
35 | 36 | 37 | --ARCHARCHARCH-- 38 | -------------------------------------------------------------------------------- /webapp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/arch/7cd8a6f4d2d557275b9d19381c8ad234ca134a60/webapp/.gitkeep --------------------------------------------------------------------------------