├── .DS_Store ├── .coveragerc ├── .gitignore ├── .pylintrc ├── .travis.yml ├── DECISIONS.md ├── Dockerfile ├── LICENSE ├── Pipfile ├── Pipfile.lock ├── README.md ├── app.py ├── compiler ├── __init__.py ├── celery.py ├── celeryconfig.py ├── compiler.py ├── config.py ├── controllers.py ├── domain.py ├── factory.py ├── routes.py ├── services │ ├── __init__.py │ ├── filemanager │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_filemanager.py │ └── store │ │ ├── __init__.py │ │ ├── schema │ │ └── status.json │ │ └── tests.py ├── tests │ ├── Dockerfile │ ├── __init__.py │ ├── data │ │ ├── 1602.00123.tar │ │ └── real-test.tar.gz │ ├── test.aux │ ├── test.pdf │ ├── test.tar.gz │ ├── test.tex │ ├── test_app.py │ ├── test_compiler.py │ └── test_controllers.py ├── util.py └── worker.py ├── deploy ├── bin.md5 ├── compiler │ ├── Chart.yaml │ ├── README.md │ ├── templates │ │ ├── 00-service.yaml │ │ ├── 10-deployment.yaml │ │ └── 20-ingress.yaml │ └── values.yaml ├── install_helm.sh ├── make_and_push_images.sh └── publish_helm_chart.sh ├── docker-compose.yaml ├── docs ├── Makefile └── source │ ├── api │ ├── compiler.celery.rst │ ├── compiler.celeryconfig.rst │ ├── compiler.compiler.rst │ ├── compiler.config.rst │ ├── compiler.controllers.rst │ ├── compiler.domain.rst │ ├── compiler.factory.rst │ ├── compiler.routes.rst │ ├── compiler.rst │ ├── compiler.services.filemanager.rst │ ├── compiler.services.filemanager.tests.rst │ ├── compiler.services.filemanager.tests.test_filemanager.rst │ ├── compiler.services.rst │ ├── compiler.services.store.rst │ ├── compiler.services.store.tests.rst │ ├── compiler.tests.rst │ ├── compiler.tests.test_app.rst │ ├── compiler.tests.test_compiler.rst │ ├── compiler.tests.test_controllers.rst │ ├── compiler.util.rst │ ├── compiler.worker.rst │ └── modules.rst │ ├── architecture.rst │ ├── conf.py │ └── index.rst ├── mock_vault.py ├── mypy.ini ├── profile.yml ├── schema ├── openapi.yaml └── resources │ ├── compilationInfo.json │ 
├── compilationStatus.json │ └── requestCompilation.json ├── test.py ├── tests ├── docstyle.sh ├── lint.sh └── static.sh ├── update-docs.sh ├── uwsgi.ini └── wsgi.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-compiler/3b80ce72ee9d519f15635e3819dcf8b45460cc4a/.DS_Store -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | */app.py 4 | */bootstrap.py 5 | */wsgi.py 6 | */config.py 7 | docs/* 8 | */test* 9 | */worker.py 10 | mock*.py 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # VSCode 107 | .vscode 108 | settings.json 109 | 110 | .DS_Store 111 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code 6 | extension-pkg-whitelist= 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=CVS 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 
14 | ignore-patterns=test.* 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. 21 | jobs=1 22 | 23 | # List of plugins (as comma separated values of Python module names) to load, 24 | # usually to register additional checkers. 25 | load-plugins= 26 | 27 | # Pickle collected data for later comparisons. 28 | persistent=yes 29 | 30 | # Specify a configuration file. 31 | #rcfile= 32 | 33 | # Allow loading of arbitrary C extensions. Extensions are imported into the 34 | # active Python interpreter and may run arbitrary code. 35 | unsafe-load-any-extension=no 36 | 37 | 38 | [MESSAGES CONTROL] 39 | 40 | # Only show warnings with the listed confidence levels. Leave empty to show 41 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED 42 | confidence= 43 | 44 | # Disable the message, report, category or checker with the given id(s). You 45 | # can either give multiple identifiers separated by comma (,) or put this 46 | # option multiple times (only on the command line, not in the configuration 47 | # file where it should appear only once). You can also use "--disable=all" to 48 | # disable everything first and then reenable specific checks. For example, if 49 | # you want to run only the similarities checker, you can use "--disable=all 50 | # --enable=similarities". 
If you want to run only the classes checker, but have 51 | # no Warning level messages displayed, use"--disable=all --enable=classes 52 | # --disable=W" 53 | disable=blacklisted-name,invalid-name,import-error,print-statement,parameter-unpacking,unpacking-in-except,old-raise-syntax,backtick,long-suffix,old-ne-operator,old-octal-literal,import-star-module-level,parse-error,raw-checker-failed,bad-inline-option,locally-disabled,locally-enabled,file-ignored,suppressed-message,useless-suppression,deprecated-pragma,too-many-return-statements,too-many-arguments,too-many-locals,arguments-differ,signature-differs,unused-import,redefined-builtin,broad-except,apply-builtin,basestring-builtin,buffer-builtin,cmp-builtin,coerce-builtin,execfile-builtin,file-builtin,long-builtin,raw_input-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,no-absolute-import,old-division,dict-iter-method,dict-view-method,next-method-called,metaclass-assignment,indexing-exception,raising-string,reload-builtin,oct-method,hex-method,nonzero-method,cmp-method,input-builtin,round-builtin,intern-builtin,unichr-builtin,map-builtin-not-iterating,zip-builtin-not-iterating,range-builtin-not-iterating,filter-builtin-not-iterating,using-cmp-argument,eq-without-hash,div-method,idiv-method,rdiv-method,exception-message-attribute,invalid-str-codec,sys-max-int,bad-python3-import,deprecated-string-function,deprecated-str-translate-call,too-few-public-methods,no-member 54 | 55 | # Enable the message, report, category or checker with the given id(s). You can 56 | # either give multiple identifier separated by comma (,) or put this option 57 | # multiple time (only on the command line, not in the configuration file where 58 | # it should appear only once). See also the "--disable" option for examples. 
59 | enable= 60 | 61 | 62 | [REPORTS] 63 | 64 | # Python expression which should return a note less than 10 (10 is the highest 65 | # note). You have access to the variables error, warning, statement which 66 | # respectively contain the number of error / warning messages and the total 67 | # number of statements analyzed. This is used by the global evaluation report 68 | # (RP0004). 69 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 70 | 71 | # Template used to display messages. This is a python new-style format string 72 | # used to format the message information. See doc for all details 73 | #msg-template= 74 | 75 | # Set the output format. Available formats are text, parseable, colorized, json 76 | # and msvs (visual studio). You can also give a reporter class, e.g. 77 | # mypackage.mymodule.MyReporterClass. 78 | output-format=parseable 79 | 80 | # Tells whether to display a full report or only the messages 81 | reports=no 82 | 83 | # Activate the evaluation score. 
84 | score=yes 85 | 86 | 87 | [REFACTORING] 88 | 89 | # Maximum number of nested blocks for function / method body 90 | max-nested-blocks=5 91 | 92 | 93 | [BASIC] 94 | 95 | # Naming hint for argument names 96 | argument-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 97 | 98 | # Regular expression matching correct argument names 99 | argument-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 100 | 101 | # Naming hint for attribute names 102 | attr-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 103 | 104 | # Regular expression matching correct attribute names 105 | attr-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 106 | 107 | # Bad variable names which should always be refused, separated by a comma 108 | bad-names=foo,bar,baz,toto,tutu,tata 109 | 110 | # Naming hint for class attribute names 111 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 112 | 113 | # Regular expression matching correct class attribute names 114 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 115 | 116 | # Naming hint for class names 117 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 118 | 119 | # Regular expression matching correct class names 120 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 121 | 122 | # Naming hint for constant names 123 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 124 | 125 | # Regular expression matching correct constant names 126 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 127 | 128 | # Minimum line length for functions/classes that require docstrings, shorter 129 | # ones are exempt. 
130 | docstring-min-length=-1 131 | 132 | # Naming hint for function names 133 | function-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 134 | 135 | # Regular expression matching correct function names 136 | function-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 137 | 138 | # Good variable names which should always be accepted, separated by a comma 139 | good-names=i,j,k,ex,Run,_ 140 | 141 | # Include a hint for the correct naming format with invalid-name 142 | include-naming-hint=no 143 | 144 | # Naming hint for inline iteration names 145 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 146 | 147 | # Regular expression matching correct inline iteration names 148 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 149 | 150 | # Naming hint for method names 151 | method-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 152 | 153 | # Regular expression matching correct method names 154 | method-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 155 | 156 | # Naming hint for module names 157 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 158 | 159 | # Regular expression matching correct module names 160 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 161 | 162 | # Colon-delimited sets of names that determine each other's naming style when 163 | # the name regexes allow several styles. 164 | name-group= 165 | 166 | # Regular expression which should only match function or class names that do 167 | # not require a docstring. 168 | no-docstring-rgx=^_ 169 | 170 | # List of decorators that produce properties, such as abc.abstractproperty. Add 171 | # to this list to register other decorators that produce valid properties. 
172 | property-classes=abc.abstractproperty 173 | 174 | # Naming hint for variable names 175 | variable-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 176 | 177 | # Regular expression matching correct variable names 178 | variable-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 179 | 180 | 181 | [SIMILARITIES] 182 | 183 | # Ignore comments when computing similarities. 184 | ignore-comments=yes 185 | 186 | # Ignore docstrings when computing similarities. 187 | ignore-docstrings=yes 188 | 189 | # Ignore imports when computing similarities. 190 | ignore-imports=no 191 | 192 | # Minimum lines number of a similarity. 193 | min-similarity-lines=4 194 | 195 | 196 | [SPELLING] 197 | 198 | # Spelling dictionary name. Available dictionaries: . 199 | spelling-dict= 200 | 201 | # List of comma separated words that should not be checked. 202 | spelling-ignore-words= 203 | 204 | # A path to a file that contains private dictionary; one word per line. 205 | spelling-private-dict-file= 206 | 207 | # Tells whether to store unknown words to indicated private dictionary in 208 | # --spelling-private-dict-file option instead of raising a message. 209 | spelling-store-unknown-words=no 210 | 211 | 212 | [TYPECHECK] 213 | 214 | # List of decorators that produce context managers, such as 215 | # contextlib.contextmanager. Add to this list to register other decorators that 216 | # produce valid context managers. 217 | contextmanager-decorators=contextlib.contextmanager 218 | 219 | # List of members which are set dynamically and missed by pylint inference 220 | # system, and so shouldn't trigger E1101 when accessed. Python regular 221 | # expressions are accepted. 222 | generated-members= 223 | 224 | # Tells whether missing members accessed in mixin class should be ignored. A 225 | # mixin class is detected if its name ends with "mixin" (case insensitive). 
226 | ignore-mixin-members=yes 227 | 228 | # This flag controls whether pylint should warn about no-member and similar 229 | # checks whenever an opaque object is returned when inferring. The inference 230 | # can return multiple potential results while evaluating a Python object, but 231 | # some branches might not be evaluated, which results in partial inference. In 232 | # that case, it might be useful to still emit no-member and other checks for 233 | # the rest of the inferred objects. 234 | ignore-on-opaque-inference=yes 235 | 236 | # List of class names for which member attributes should not be checked (useful 237 | # for classes with dynamically set attributes). This supports the use of 238 | # qualified names. 239 | ignored-classes=optparse.Values,thread._local,_thread._local 240 | 241 | # List of module names for which member attributes should not be checked 242 | # (useful for modules/projects where namespaces are manipulated during runtime 243 | # and thus existing member attributes cannot be deduced by static analysis. It 244 | # supports qualified module names, as well as Unix pattern matching. 245 | ignored-modules= 246 | 247 | # Show a hint with possible names when a member name was not found. The aspect 248 | # of finding the hint is based on edit distance. 249 | missing-member-hint=yes 250 | 251 | # The minimum edit distance a name should have in order to be considered a 252 | # similar match for a missing member name. 253 | missing-member-hint-distance=1 254 | 255 | # The total number of similar names that should be taken in consideration when 256 | # showing a hint for a missing member. 257 | missing-member-max-choices=1 258 | 259 | 260 | [MISCELLANEOUS] 261 | 262 | # List of note tags to take in consideration, separated by a comma. 
263 | notes=FIXME,XXX,TODO 264 | 265 | 266 | [LOGGING] 267 | 268 | # Logging modules to check that the string format arguments are in logging 269 | # function parameter format 270 | logging-modules=logging 271 | 272 | 273 | [VARIABLES] 274 | 275 | # List of additional names supposed to be defined in builtins. Remember that 276 | # you should avoid to define new builtins when possible. 277 | additional-builtins= 278 | 279 | # Tells whether unused global variables should be treated as a violation. 280 | allow-global-unused-variables=yes 281 | 282 | # List of strings which can identify a callback function by name. A callback 283 | # name must start or end with one of those strings. 284 | callbacks=cb_,_cb 285 | 286 | # A regular expression matching the name of dummy variables (i.e. expectedly 287 | # not used). 288 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 289 | 290 | # Argument names that match this expression will be ignored. Default to name 291 | # with leading underscore 292 | ignored-argument-names=_.*|^ignored_|^unused_ 293 | 294 | # Tells whether we should check for unused import in __init__ files. 295 | init-import=no 296 | 297 | # List of qualified module names which can have objects that can redefine 298 | # builtins. 299 | redefining-builtins-modules=six.moves,future.builtins 300 | 301 | 302 | [FORMAT] 303 | 304 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 305 | expected-line-ending-format= 306 | 307 | # Regexp for a line that is allowed to be longer than the limit. 308 | ignore-long-lines=^\s*(# )??$ 309 | 310 | # Number of spaces of indent required inside a hanging or continued line. 311 | indent-after-paren=4 312 | 313 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 314 | # tab). 315 | indent-string=' ' 316 | 317 | # Maximum number of characters on a single line. 
318 | max-line-length=100 319 | 320 | # Maximum number of lines in a module 321 | max-module-lines=1000 322 | 323 | # List of optional constructs for which whitespace checking is disabled. `dict- 324 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 325 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 326 | # `empty-line` allows space-only lines. 327 | no-space-check=trailing-comma,dict-separator 328 | 329 | # Allow the body of a class to be on the same line as the declaration if body 330 | # contains single statement. 331 | single-line-class-stmt=no 332 | 333 | # Allow the body of an if to be on the same line as the test if there is no 334 | # else. 335 | single-line-if-stmt=no 336 | 337 | 338 | [DESIGN] 339 | 340 | # Maximum number of arguments for function / method 341 | max-args=5 342 | 343 | # Maximum number of attributes for a class (see R0902). 344 | max-attributes=7 345 | 346 | # Maximum number of boolean expressions in a if statement 347 | max-bool-expr=5 348 | 349 | # Maximum number of branch for function / method body 350 | max-branches=12 351 | 352 | # Maximum number of locals for function / method body 353 | max-locals=15 354 | 355 | # Maximum number of parents for a class (see R0901). 356 | max-parents=7 357 | 358 | # Maximum number of public methods for a class (see R0904). 359 | max-public-methods=20 360 | 361 | # Maximum number of return / yield for function / method body 362 | max-returns=6 363 | 364 | # Maximum number of statements in function / method body 365 | max-statements=50 366 | 367 | # Minimum number of public methods for a class (see R0903). 368 | min-public-methods=2 369 | 370 | 371 | [CLASSES] 372 | 373 | # List of method names used to declare (i.e. assign) instance attributes. 374 | defining-attr-methods=__init__,__new__,setUp 375 | 376 | # List of member names, which should be excluded from the protected access 377 | # warning. 
378 | exclude-protected=_asdict,_fields,_replace,_source,_make 379 | 380 | # List of valid names for the first argument in a class method. 381 | valid-classmethod-first-arg=cls 382 | 383 | # List of valid names for the first argument in a metaclass class method. 384 | valid-metaclass-classmethod-first-arg=mcs 385 | 386 | 387 | [IMPORTS] 388 | 389 | # Allow wildcard imports from modules that define __all__. 390 | allow-wildcard-with-all=no 391 | 392 | # Analyse import fallback blocks. This can be used to support both Python 2 and 393 | # 3 compatible code, which means that the block might have code that exists 394 | # only in one or another interpreter, leading to false positives when analysed. 395 | analyse-fallback-blocks=no 396 | 397 | # Deprecated modules which should not be used, separated by a comma 398 | deprecated-modules=optparse,tkinter.tix 399 | 400 | # Create a graph of external dependencies in the given file (report RP0402 must 401 | # not be disabled) 402 | ext-import-graph= 403 | 404 | # Create a graph of every (i.e. internal and external) dependencies in the 405 | # given file (report RP0402 must not be disabled) 406 | import-graph= 407 | 408 | # Create a graph of internal dependencies in the given file (report RP0402 must 409 | # not be disabled) 410 | int-import-graph= 411 | 412 | # Force import order to recognize a module as part of the standard 413 | # compatibility libraries. 414 | known-standard-library= 415 | 416 | # Force import order to recognize a module as part of a third party library. 417 | known-third-party=enchant 418 | 419 | 420 | [EXCEPTIONS] 421 | 422 | # Exceptions that will emit a warning when being caught. 
Defaults to 423 | # "Exception" 424 | overgeneral-exceptions=Exception 425 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: required 3 | services: 4 | - docker 5 | cache: pip 6 | env: 7 | - MIN_SCORE=9 BOTO_CONFIG=/dev/null # This is the dumbest thing. https://github.com/travis-ci/travis-ci/issues/7940 8 | os: 9 | - linux 10 | python: 11 | - "3.6" 12 | script: 13 | - pip install pipenv 14 | - pipenv sync --dev 15 | - pipenv run openapi-spec-validator schema/openapi.yaml 16 | - pipenv run nose2 --with-coverage 17 | after_success: 18 | - coveralls 19 | - "./tests/docstyle.sh compiler" 20 | - "./tests/lint.sh compiler" 21 | - "./tests/static.sh compiler" 22 | deploy: 23 | - provider: script 24 | script: 25 | ./deploy/make_and_push_images.sh compiler ARXIVNG-2462 26 | on: 27 | all_branches: true 28 | - provider: script 29 | script: 30 | ./deploy/make_and_push_images.sh compiler ARXIVNG-2462 && 31 | ./deploy/install_helm.sh development && 32 | ./deploy/publish_helm_chart.sh 33 | on: 34 | tags: true 35 | -------------------------------------------------------------------------------- /DECISIONS.md: -------------------------------------------------------------------------------- 1 | # Decision log 2 | 3 | ## 2019-07-23: Don't store task state in S3 4 | 5 | In previous versions, both the compilation product (e.g. PDF) and information 6 | about the state of the compilation task were stored in S3. The main driver for 7 | this was to avoid introducing a new state store (e.g. a database) just to store 8 | a little bit of metadata about a compilation task. 9 | 10 | As reported by e.g. https://github.com/arXiv/arxiv-submission-ui/issues/123 11 | we ran into some performance issues, and some lost state. 
Specifically, the 12 | state of the compilation process was not always successfully stored in S3, 13 | leading to an unacceptable split-brain situation. The fix for this seemed to be 14 | to consult the task backend (Redis) for the "real" state of the compilation 15 | task, which raises the question of why we would store compilation status 16 | metadata anywhere else in the first place. 17 | 18 | A simpler approach (that still avoids a database) is to rely entirely on the 19 | task/result backend (Redis) for the state of the compilation process. By 20 | setting [``result_extended = 21 | True``](http://docs.celeryproject.org/en/latest/userguide/configuration.html#result-extended) 22 | and handling failures gracefully, we can ensure that the original parameters 23 | for the compilation (e.g. owner) can always be successfully obtained. 24 | 25 | Incidentally, this also speeds up read-only requests for compilation status 26 | by something like 10x. -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # arxiv/compiler 2 | 3 | ARG BASE_VERSION=0.16.6 4 | 5 | FROM arxiv/base:${BASE_VERSION} 6 | 7 | WORKDIR /opt/arxiv/ 8 | 9 | COPY Pipfile Pipfile.lock /opt/arxiv/ 10 | RUN pipenv install && rm -rf ~/.cache/pip 11 | 12 | ENV PATH="/opt/arxiv:${PATH}" \ 13 | LOGLEVEL=40 \ 14 | ARXIV_HOME="https://arxiv.org" \ 15 | APPLICATION_ROOT="/" 16 | 17 | # Add the code in this repo. 
18 | COPY wsgi.py uwsgi.ini app.py /opt/arxiv/ 19 | COPY compiler/ /opt/arxiv/compiler/ 20 | 21 | EXPOSE 8000 22 | 23 | ENTRYPOINT ["pipenv", "run"] 24 | CMD ["uwsgi", "--ini", "/opt/arxiv/uwsgi.ini"] 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Cornell University Library 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | flask = ">=1.0.3" 8 | requests = ">=2.22.0" 9 | boto3 = "==1.9.194" 10 | celery = "==4.1.0" 11 | kombu = "==4.1.0" 12 | mysqlclient = "==1.4.2" 13 | arxiv-base = "==0.16.6" 14 | arxiv-auth = "==0.4.2rc1" 15 | redis = "==2.10.6" 16 | jsonschema = "*" 17 | arxiv-vault = "==0.1.1rc15" 18 | dataclasses = "==0.6" 19 | docker = ">=4.0.1" 20 | urllib3 = ">=1.24.2" 21 | Jinja2 = ">=2.10.1" 22 | 23 | [dev-packages] 24 | pytest = "*" 25 | moto = "==1.3.13" 26 | openapi-spec-validator = "==0.2.8" 27 | mypy = "==0.701" 28 | pylint = "<2" 29 | "nose2" = "*" 30 | coveralls = "*" 31 | coverage = "*" 32 | importlib_resources = "*" 33 | sphinx = "*" 34 | sphinx-autodoc-typehints = "*" 35 | pydocstyle = "*" 36 | 37 | [requires] 38 | python_version = "3.6" 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # arXiv compiler service 2 | 3 | *NOTE:* This service is not under active development, and should not be used for attempting to build a "local arXiv-like" 4 | compiler. For such a service, please see the [arXiv/submission-tools](https://github.com/arXiv/submission-tools) project. 5 | 6 | __This project has been archived, as of 2025-01-28__ 7 | 8 | The build service compiles LaTeX source packages into PDF, PostScript, 9 | and other formats. This service encompasses the arXiv TeX tree. Compilation 10 | logs are also made available, for example to provide submitters feedback about 11 | compilation failures or warnings. 
12 | 13 | There are three moving parts: 14 | - The compiler service API, a Flask web app that handles new requests for 15 | compilation and makes the results available (or not). 16 | - The compiler worker, a Celery app that grabs sources, dispatches compilation, 17 | and stores the results. 18 | - The converter Docker image, which does the actual work of compilation and is 19 | executed by the compiler worker. This contains the arXiv TeX tree. 20 | 21 | In addition, the following infrastructure parts are required: 22 | - Redis, used as a task queue between the API and the worker. 23 | - S3, used to store the result of compilation tasks. 24 | 25 | ## TODO 26 | 27 | - [ ] Update the ``schema/``, and implement resource URLs in the response data. 28 | 29 | ## Running the compiler service locally 30 | 31 | The easiest way to get up and running is to launch the whole service group 32 | using Docker Compose. You will need to pull the 33 | [converter](https://github.com/arXiv/arxiv-converter/tree/develop) image ahead 34 | of time, or it will be pulled on the first compilation request. Note that you 35 | will need to provide valid AWS credentials with ECR read access as env vars 36 | when starting docker-compose. 37 | 38 | If you do not have an instance of the file manager service running, you can 39 | try compiling published sources on the public arXiv.org site. 40 | 41 | You will also need a directory that the worker can use as /tmp space. 42 | 43 | For example: 44 | 45 | ```bash 46 | $ mkdir /tmp/compilestuff # Docker needs access to this. 47 | $ export DIND_SOURCE_ROOT=/tmp/compilestuff 48 | $ export CONVERTER_DOCKER_IMAGE=[name (including transport) of converter image] 49 | $ export FILEMANAGER_ENDPOINT=https://arxiv.org # Get public sources. 
50 | $ export FILEMANAGER_CONTENT_PATH=/src/{source_id} 51 | $ export AWS_ACCESS_KEY_ID={access key with ECR access} 52 | $ export AWS_SECRET_ACCESS_KEY={secret key with ECR access} 53 | ``` 54 | 55 | And then run with: 56 | 57 | ```bash 58 | $ docker-compose build # Build the local images. 59 | $ docker-compose up # Start the service group. 60 | ``` 61 | 62 | Give it a few seconds; Localstack needs to come up (provides a local S3), and 63 | a bucket will be created. 64 | 65 | ### Authentication + Authorization 66 | 67 | To use the API you will need an auth token with scopes ``compile:read`` and 68 | ``compile:create``. The easiest way to generate one of these is to use the 69 | helper script 70 | [here](https://github.com/arXiv/arxiv-auth/blob/develop/generate_token.py). 71 | Make sure that you use the same ``JWT_SECRET`` that is used in 72 | ``docker-compose.yaml``. 73 | 74 | 75 | ```bash 76 | JWT_SECRET=foosecret pipenv run python generate_token.py 77 | ``` 78 | 79 | You should pass this token as the value of the ``Authorization`` header in 80 | all requests to the API. For example: 81 | 82 | ```bash 83 | curl -XPOST -H "Authorization: [auth token]" http://127.0.0.1:8000/... 84 | ``` 85 | 86 | For requests in your browser, you can use something like 87 | [requestly](https://chrome.google.com/webstore/detail/requestly-redirect-url-mo/mdnleldcmiljblolnjhpnblkcekpdkpa?hl=en) 88 | to automatically add the auth header to your requests. 89 | 90 | [Advanced Rest Client](https://install.advancedrestclient.com/install) is also 91 | handy for this purpose. 
92 | 93 | ### Request compilation 94 | 95 | ```bash 96 | $ curl -XPOST -i -H 'Authorization: {JWT}' -d '{"source_id":"1602.00123","checksum":"\"Tue, 02 Feb 2016 01:04:33 GMT\"","output_format":"pdf"}' http://localhost:8000/ 97 | HTTP/1.0 202 ACCEPTED 98 | Content-Type: application/json 99 | Content-Length: 3 100 | Location: http://localhost:8000/task/53cccb2e-faf7-4dfa-b8de-63854bd08b0a 101 | Server: Werkzeug/0.14.1 Python/3.6.4 102 | Date: Mon, 12 Nov 2018 10:41:42 GMT 103 | 104 | {} 105 | ``` 106 | 107 | You should get ``202 Accepted`` with headers that look like this: 108 | 109 | ```bash 110 | content-type: application/json 111 | content-length: 3 112 | location: http://127.0.0.1:8000/1901.00123/%22Thu%2C%2003%20Jan%202019%2001:04:33%20GMT%22/pdf 113 | server: Werkzeug/0.15.2 Python/3.6.5 114 | date: Wed, 08 May 2019 20:49:26 GMT 115 | ``` 116 | 117 | The ``location`` is the status resource for the compilation task. 118 | 119 | #### What's with the ``checksum`` entry in the request payload? 120 | 121 | Note that the compiler service assumes that the checksum of the source package 122 | is included in the ``ETag`` header (this is the behavior of the file manager 123 | service). If you are pulling sources from the core arXiv site (as above), this 124 | will be a datestamp instead of a checksum.
You can get the current ETag for a 125 | source package like this: 126 | 127 | ```bash 128 | $ curl -I https://arxiv.org/src/1901.00123 129 | HTTP/1.1 200 OK 130 | Date: Wed, 08 May 2019 20:29:32 GMT 131 | Server: Apache 132 | ETag: "Thu, 03 Jan 2019 01:04:33 GMT" 133 | Expires: Thu, 09 May 2019 00:00:00 GMT 134 | Content-Encoding: x-gzip 135 | Content-Disposition: attachment; filename="arXiv-1901-00123v1.tar.gz" 136 | Strict-Transport-Security: max-age=31536000 137 | Set-Cookie: browser=128.84.116.178.1557347372434122; path=/; max-age=946080000; domain=.arxiv.org 138 | Last-Modified: Thu, 03 Jan 2019 01:04:33 GMT 139 | Content-Length: 111118 140 | Vary: User-Agent 141 | Content-Type: application/x-eprint-tar 142 | ``` 143 | 144 | **Note that the quotation marks are included as part of the value of the ETag/checksum 145 | field in the request.** 146 | 147 | ### Checking the compilation status endpoint 148 | 149 | You can request the status endpoint like this: 150 | 151 | ```bash 152 | $ curl -H 'Authorization: {JWT}' http://127.0.0.1:8000/1901.00123/%22Thu%2C%2003%20Jan%202019%2001:04:33%20GMT%22/pdf 153 | { 154 | "checksum": "\"Thu, 03 Jan 2019 01:04:33 GMT\"", 155 | "description": "", 156 | "output_format": "pdf", 157 | "owner": null, 158 | "reason": null, 159 | "size_bytes": 598859, 160 | "source_id": "1901.00123", 161 | "status": "completed", 162 | "task_id": "1901.00123/\"Thu, 03 Jan 2019 01:04:33 GMT\"/pdf" 163 | } 164 | ``` 165 | 166 | ### Getting the content 167 | 168 | You can get the content at: 169 | http://127.0.0.1:8000/1901.00123/%22Thu%2C%2003%20Jan%202019%2001:04:33%20GMT%22/pdf/content 170 | 171 | 172 | ## Documentation 173 | 174 | The latest documentation can be found at 175 | https://arxiv.github.io/arxiv-compiler. 
176 | 177 | ### Building 178 | 179 | ```bash 180 | sphinx-apidoc -o docs/source/api/ -e -f -M compiler *test*/* 181 | cd docs/ 182 | make html SPHINXBUILD=$(pipenv --venv)/bin/sphinx-build 183 | ``` 184 | 185 | 186 | ## License 187 | 188 | See [LICENSE](./LICENSE). 189 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | """Provides application for development purposes.""" 2 | 3 | import time 4 | from compiler.factory import create_app 5 | from compiler.services import store 6 | 7 | app = create_app() 8 | -------------------------------------------------------------------------------- /compiler/__init__.py: -------------------------------------------------------------------------------- 1 | """arXiv compiler service.""" 2 | -------------------------------------------------------------------------------- /compiler/celery.py: -------------------------------------------------------------------------------- 1 | """Initialize the Celery application.""" 2 | 3 | from . import celeryconfig 4 | from celery import Celery 5 | 6 | celery_app = Celery('compiler') 7 | """The celery application instance used in both the API and the worker.""" 8 | celery_app.config_from_object('compiler.celeryconfig') 9 | celery_app.autodiscover_tasks(['compiler'], related_name='compiler', 10 | force=True) 11 | -------------------------------------------------------------------------------- /compiler/celeryconfig.py: -------------------------------------------------------------------------------- 1 | """ 2 | Celery configuration module. 3 | 4 | See `the celery docs 5 | `_. 
"""Configuration for the compiler service."""

from os import environ
import tempfile

NAMESPACE = environ.get('NAMESPACE')
"""Namespace in which this service is deployed; to qualify keys for secrets."""

DEBUG = environ.get('DEBUG') == '1'
"""Enable/disable debug mode; only the exact value ``'1'`` enables it."""

SERVER_NAME = environ.get('SERVER_NAME', None)
"""
Inform the application what host and port it is bound to. Required for
subdomain route matching support.

If set, will be used for the session cookie domain if SESSION_COOKIE_DOMAIN is
not set. Modern web browsers will not allow setting cookies for domains without
a dot.

If set, url_for can generate external URLs with only an application context
instead of a request context.

Default: None
"""

APPLICATION_ROOT = environ.get('APPLICATION_ROOT', '/')
"""
If the application does not occupy a whole domain or subdomain this can be set
to the path where the application is configured to live. This is for session
cookie as path value.
"""

JWT_SECRET = environ.get('JWT_SECRET', 'foosecret')
"""Secret key for auth tokens."""

SECRET_KEY = environ.get('FLASK_SECRET', 'fooflasksecret')
"""Flask secret key, read from the ``FLASK_SECRET`` environment variable."""

FILEMANAGER_HOST = environ.get('FILEMANAGER_SERVICE_HOST', 'arxiv.org')
"""Hostname of the filemanager service."""

FILEMANAGER_PORT = environ.get('FILEMANAGER_SERVICE_PORT', '443')
"""Filemanager service HTTP(S) port."""

FILEMANAGER_PROTO = environ.get('FILEMANAGER_SERVICE_PORT_443_PROTO',
                                environ.get('FILEMANAGER_PROTO', 'https'))
"""Protocol for calling the filemanager service. Default is ``https``."""

FILEMANAGER_PATH = environ.get('FILEMANAGER_PATH', 'filemanager/api')
"""Path to the base filemanager service API endpoint."""

FILEMANAGER_ENDPOINT = environ.get(
    'FILEMANAGER_ENDPOINT',
    f'{FILEMANAGER_PROTO}://{FILEMANAGER_HOST}:{FILEMANAGER_PORT}'
    f'/{FILEMANAGER_PATH}'
)
"""Full URI for the base filemanager service API endpoint."""

FILEMANAGER_VERIFY = bool(int(environ.get('FILEMANAGER_VERIFY', '1')))
"""Enable/disable TLS certificate verification for the filemanager service."""

FILEMANAGER_VERIFY_CHECKSUM = \
    bool(int(environ.get('FILEMANAGER_VERIFY_CHECKSUM', '1')))
"""Enable/disable verification of the source package checksum."""

FILEMANAGER_CONTENT_PATH = environ.get('FILEMANAGER_CONTENT_PATH',
                                       '/{source_id}/content')
"""
Sub-path template for retrieving source packages from the filemanager service.

Should use the `curly-brace format syntax
<https://docs.python.org/3/library/string.html#format-string-syntax>`_.
Currently supports the ``source_id`` key.
"""

FILEMANAGER_STATUS_ENDPOINT = environ.get('FILEMANAGER_STATUS_ENDPOINT',
                                          'status')
"""Sub-path of the filemanager status endpoint. Default is ``status``."""

# Configuration for object store.
S3_ENDPOINT = environ.get('S3_ENDPOINT', None)
"""AWS S3 endpoint. Default is ``None`` (use the "real" S3 service)."""

S3_VERIFY = bool(int(environ.get('S3_VERIFY', 1)))
"""Enable/disable TLS certificate verification for S3."""

S3_BUCKET = environ.get('S3_BUCKET', f'compiler-submission-{NAMESPACE}')
"""Bucket for storing compilation products and logs."""

AWS_ACCESS_KEY_ID = environ.get('AWS_ACCESS_KEY_ID', None)
"""Access key ID for AWS, authorized for S3 access."""

AWS_SECRET_ACCESS_KEY = environ.get('AWS_SECRET_ACCESS_KEY', None)
"""Secret key for AWS, authorized for S3 access."""

AWS_REGION = environ.get('AWS_REGION', 'us-east-1')
"""AWS region. Defaults to ``us-east-1``."""

REDIS_ENDPOINT = environ.get('REDIS_ENDPOINT')
"""Hostname of the Redis cluster endpoint."""

CONVERTER_DOCKER_IMAGE = environ.get('CONVERTER_DOCKER_IMAGE')
"""Image name (including tag) for the TeX converter."""

CONVERTER_IMAGE_PULL = bool(int(environ.get('CONVERTER_IMAGE_PULL', '1')))
"""Whether or not to pull the converter image if it is not present."""

# Only create a temporary directory when no root is configured. Passing
# ``tempfile.mkdtemp()`` as the default argument of ``environ.get`` would
# create (and leak) a directory on every import, even when the environment
# variable is set.
DIND_SOURCE_ROOT = (environ['DIND_SOURCE_ROOT']
                    if 'DIND_SOURCE_ROOT' in environ
                    else tempfile.mkdtemp())
"""
Path where sources are stored on the docker host that runs converter.

This must be the same underlying volume as :const:`WORKER_SOURCE_ROOT`.
"""

WORKER_SOURCE_ROOT = environ.get('WORKER_SOURCE_ROOT', '/tmp')
"""
Path where sources are stored on the worker.

This must be the same underlying volume as :const:`DIND_SOURCE_ROOT`.
"""

VERBOSE_COMPILE = bool(int(environ.get('VERBOSE_COMPILE', 0)))
"""If 1 (True), converter image is run in verbose mode."""

# NOTE(review): presumably consumed by the arxiv auth package; confirm there.
AUTH_UPDATED_SESSION_REF = True

LOGLEVEL = 10
"""Log level; ``10`` is the numeric value of ``logging.DEBUG``."""

VAULT_ENABLED = bool(int(environ.get('VAULT_ENABLED', '0')))
"""Enable/disable secret retrieval from Vault."""

KUBE_TOKEN = environ.get('KUBE_TOKEN', 'fookubetoken')
"""Service account token for authenticating with Vault. May be a file path."""

VAULT_HOST = environ.get('VAULT_HOST', 'foovaulthost')
"""Vault hostname/address."""

VAULT_PORT = environ.get('VAULT_PORT', '1234')
"""Vault API port."""

VAULT_ROLE = environ.get('VAULT_ROLE', 'compiler')
"""Vault role linked to this application's service account."""

VAULT_CERT = environ.get('VAULT_CERT')
"""Path to CA certificate for TLS verification when talking to Vault."""

VAULT_SCHEME = environ.get('VAULT_SCHEME', 'https')
"""Default is ``https``."""

# Suffix appended to Vault mount points; empty only for ``production``.
# NOTE: when ``NAMESPACE`` is unset this evaluates to ``'-None'``.
NS_AFFIX = '' if NAMESPACE == 'production' else f'-{NAMESPACE}'

VAULT_REQUESTS = [
    {'type': 'generic',
     'name': 'JWT_SECRET',
     'mount_point': f'secret{NS_AFFIX}/',
     'path': 'jwt',
     'key': 'jwt-secret',
     'minimum_ttl': 3600},
    {'type': 'aws',
     'name': 'AWS_S3_CREDENTIAL',
     'mount_point': f'aws{NS_AFFIX}/',
     'role': environ.get('VAULT_CREDENTIAL')}
]
"""Requests for Vault secrets."""

WAIT_FOR_SERVICES = bool(int(environ.get('WAIT_FOR_SERVICES', '0')))
"""If True, the app factory waits for upstream services at startup."""

WAIT_ON_STARTUP = int(environ.get('WAIT_ON_STARTUP', '0'))
"""Seconds the app factory sleeps before checking upstream services."""

WAIT_FOR_WORKER = int(environ.get('WAIT_FOR_WORKER', '0'))
"""If non-zero, the app factory also waits for the compiler worker."""

DOCKER_HOST = environ.get('DOCKER_HOST', 'unix:///var/run/docker.sock')
"""Docker host URL. Defaults to the local UNIX socket."""
urlsafe_base64_alphabet = (set(range(65, 91))         # A-Z
                           | set(range(97, 123))      # a-z
                           | set(range(48, 58))       # 0-9
                           | set((45, 95, 61)))       # -_=
"""
Ordinal representation of the URL-safe Base64 alphabet.

RFC 3548 `defines <https://tools.ietf.org/html/rfc3548.html#section-3>`_ the
base 64 alphabet as ``A-Za-z0-9+/=``. The Python base64 module `describes
<https://docs.python.org/3/library/base64.html#base64.urlsafe_b64encode>`_
the URL-safe alphabet as the standard Base64 alphabet with ``-``
substituted for ``+`` and ``_`` substituted for ``/``.
"""


def is_urlsafe_base64(val: str) -> bool:
    """
    Determine whether a string is exclusively from the urlsafe base64 alphabet.

    See :const:`.urlsafe_base64_alphabet`.

    Parameters
    ----------
    val : str
        Candidate string, e.g. a source package checksum.

    Returns
    -------
    bool
        ``True`` if every character of ``val`` is in the URL-safe Base64
        alphabet. An empty string yields ``True``. Only the alphabet is
        checked, not length or padding.

    """
    # ``all`` stops at the first character outside the alphabet instead of
    # building an intermediate set of every ordinal in the string.
    return all(ord(char) in urlsafe_base64_alphabet for char in val)
def get_status(source_id: str, checksum: str, output_format: str,
               is_authorized: Callable = lambda task: True) -> Response:
    """
    Report the current state of a compilation task.

    See ``schema/resources/compilationStatus.json``.

    Parameters
    ----------
    source_id : str
        Identifier for the source package.
    checksum : str
        Checksum of the source package to compile.
    output_format : str
        Desired output format; must be a value of :class:`.Format`.
    is_authorized : callable
        Called with the retrieved task; must return ``True`` if the requester
        may view it. The default allows everything.

    Returns
    -------
    dict
        Response data (the task rendered with ``to_dict()``).
    int
        HTTP status code.
    dict
        Headers to add to response; carries the task owner as
        ``ARXIV-OWNER``.

    Raises
    ------
    :class:`.BadRequest`
        If any of the three identifying parameters fails validation.
    :class:`.NotFound`
        If no task exists for the requested compilation.
    :class:`.Forbidden`
        If ``is_authorized`` rejects the task.

    """
    valid_source_id = _validate_source_id(source_id)
    valid_checksum = _validate_checksum(checksum)
    target_format = _validate_output_format(output_format)
    logger.debug('get_status for %s, %s, %s', valid_source_id, valid_checksum,
                 output_format)

    try:
        task = compiler.get_task(valid_source_id, valid_checksum,
                                 target_format)
    except compiler.NoSuchTask as exc:
        raise NotFound('No such compilation task') from exc

    # Only expose the task to requesters who pass the injected check.
    if is_authorized(task):
        body = task.to_dict()
        return body, status.OK, {'ARXIV-OWNER': task.owner}
    raise Forbidden('Access denied')
def get_log(source_id: str, checksum: str, output_format: str,
            is_authorized: Callable = lambda task: True) -> Response:
    """
    Retrieve the log produced by a compilation.

    Parameters
    ----------
    source_id : str
        Identifier for the source package.
    checksum : str
        Checksum of the source package to compile.
    output_format : str
        Desired output format; must be a value of :class:`.Format`.
    is_authorized : callable
        Called with the retrieved task; must return ``True`` if the requester
        may view it. The default allows everything.

    Returns
    -------
    dict
        Response data: the log ``stream``, its ``content_type`` and a
        ``filename``. Empty when redirecting to the status endpoint.
    int
        HTTP status code.
    dict
        Headers to add to response.

    Raises
    ------
    :class:`.BadRequest`
        If any of the three identifying parameters fails validation.
    :class:`.NotFound`
        If there is no such task, or the store has no log for it.
    :class:`.Forbidden`
        If ``is_authorized`` rejects the task.

    """
    ident = _validate_source_id(source_id)
    digest = _validate_checksum(checksum)
    fmt = _validate_output_format(output_format)

    # Look up the task first so that authorization can be checked before
    # anything is read from the store.
    try:
        task = compiler.get_task(ident, digest, fmt)
    except compiler.NoSuchTask as exc:
        raise NotFound('No such task') from exc

    if not is_authorized(task):
        raise Forbidden('Access denied')

    # ``is_completed`` is true for both completed and failed tasks; anything
    # still running is sent to the status endpoint instead.
    if not task.is_completed:
        return _redirect_to_status(ident, digest, fmt)

    session = Store.current_session()
    try:
        log = session.retrieve_log(ident, digest, fmt)
    except DoesNotExist as exc:
        raise NotFound('No such compilation product') from exc

    # The filename carries the extension of the requested product format,
    # although the content itself is served as plain text.
    payload = {
        'stream': log.stream,
        'content_type': 'text/plain',
        'filename': f'{ident}.{fmt.ext}'
    }
    return (payload, status.OK,
            {'ARXIV-OWNER': task.owner, 'ETag': log.checksum})
def _get_owner(source_id: str, checksum: str, token: str) -> Optional[str]:
    """
    Get the owner of the upload source package.

    Parameters
    ----------
    source_id : str
        Identifier for the source package.
    checksum : str
        Checksum of the source package.
    token : str
        Auth token passed through to the file manager service.

    Returns
    -------
    str or None
        Whatever the file manager session reports as the owner.

    Raises
    ------
    :class:`.Forbidden`
        If the file manager rejects the request as forbidden or unauthorized.
    :class:`.NotFound`
        If the owner lookup fails for any other reason.

    """
    fm = filemanager.FileManager.current_session()
    logger.debug('Check for source')
    try:
        owner: Optional[str] = fm.owner(source_id, checksum, token)
    except (filemanager.exceptions.RequestForbidden,
            filemanager.exceptions.RequestUnauthorized) as e:
        # Authorization failures must be matched before the catch-all below.
        # Previously a nested ``except Exception`` converted them to
        # ``NotFound`` first, so this handler could never run.
        logger.debug('Not authorized to check source')
        raise Forbidden('Not authorized to access source') from e
    except Exception as e:
        raise NotFound('No such source') from e
    return owner
class Task(NamedTuple):
    """Represents the state of a compilation product in the store."""

    # Here are the actual slots/fields.
    status: Status
    """
    The status of the compilation.

    If :attr:`Status.COMPLETED`, the current file corresponding to the format
    of this compilation status is the product of this compilation.
    """

    source_id: Optional[str] = None
    """Identifier of the source package that was (or is being) compiled."""

    output_format: Optional[Format] = None
    """
    The target format of the compilation.

    One of :attr:`PDF`, :attr:`DVI`, or :attr:`PS`.
    """

    checksum: Optional[str] = None
    """
    Checksum of the source tarball from the file management service.

    This is likely to be a checksum of some kind, but may be something else.
    """

    task_id: Optional[str] = None
    """If a task exists for this compilation, the unique task ID."""

    reason: Reason = Reason.NONE
    """An explanation of the current status. E.g. why did it fail."""

    description: str = ""
    """A description of the outcome."""

    size_bytes: int = 0
    """Size of the product."""

    owner: Optional[str] = None
    """The owner of this resource."""

    @property
    def is_completed(self) -> bool:
        """
        Indicate whether or not this task is completed.

        A failed task also counts as completed: it is no longer running.
        """
        return self.status in (Status.COMPLETED, Status.FAILED)

    @property
    def is_failed(self) -> bool:
        """Indicate whether or not this task has failed."""
        return self.status is Status.FAILED

    @property
    def is_in_progress(self) -> bool:
        """Indicate whether or not this task is still in progress."""
        return self.status is Status.IN_PROGRESS

    @property
    def ext(self) -> str:
        """
        Filename extension for the compilation product.

        Raises :class:`TypeError` if no output format is set.
        """
        if self.output_format is None:
            raise TypeError('Output format `None` has no extension')
        return self.output_format.ext

    @property
    def content_type(self) -> str:
        """
        Mime type for the output format of this compilation.

        Raises :class:`TypeError` if no output format is set.
        """
        if self.output_format is None:
            raise TypeError('Output format `None` has no content type')
        return self.output_format.content_type

    def to_dict(self) -> dict:
        """
        Generate a dict representation of this object.

        Enum-valued fields are rendered as their raw values, so the result
        can be passed back to :meth:`from_dict`.
        """
        return {
            'source_id': self.source_id,
            'output_format':
                self.output_format.value if self.output_format else None,
            'checksum': self.checksum,
            'task_id': self.task_id,
            'status': self.status.value if self.status else None,
            'reason': self.reason.value if self.reason else None,
            'description': self.description,
            'size_bytes': self.size_bytes,
            'owner': self.owner
        }

    @classmethod
    def from_dict(cls, data: dict) -> 'Task':
        """
        Generate a :class:`.Task` instance from raw data.

        Parameters
        ----------
        data : dict
            Raw field values, e.g. as produced by :meth:`to_dict`. ``status``
            is required. ``output_format`` and ``reason`` may be missing or
            ``None``. The dict is not modified.

        Returns
        -------
        :class:`.Task`

        Raises
        ------
        KeyError
            If ``status`` is missing.
        ValueError
            If an enum-valued field carries an unknown value.

        """
        # Work on a copy so the caller's dict is left untouched.
        fields = dict(data)
        output_format = fields.get('output_format')
        # ``to_dict`` emits ``None`` when no format is set; keep it as
        # ``None`` rather than failing the enum lookup.
        fields['output_format'] = \
            Format(output_format) if output_format is not None else None
        fields['status'] = Status(fields['status'])
        # ``Reason(None)`` resolves to ``Reason.NONE``.
        fields['reason'] = Reason(fields.get('reason'))
        return cls(**fields)
def create_app() -> Flask:
    """
    Create an instance of the compiler service app.

    Loads configuration, attaches the arXiv base and auth extensions,
    registers the API blueprint and JSON error handlers, wraps the app in
    WSGI middleware and, if so configured, blocks until upstream services
    respond.

    Returns
    -------
    :class:`flask.Flask`
        The configured application.

    """
    # Imported here rather than at module level.
    # NOTE(review): presumably to avoid an import cycle or import-time
    # side effects -- confirm.
    from . import celeryconfig
    app = Flask(__name__)
    # The service integrations get to touch the app before ``config.py`` is
    # loaded, so values in ``config.py`` are applied on top of whatever
    # they set.
    filemanager.FileManager.init_app(app)
    store.Store.init_app(app)
    app.config.from_pyfile('config.py')
    celery_app.config_from_object(celeryconfig)

    Base(app)
    auth.Auth(app)

    app.register_blueprint(routes.blueprint)
    register_error_handlers(app)

    middleware = [auth.middleware.AuthMiddleware]

    # When Vault is enabled its middleware is placed first in the list,
    # ahead of the auth middleware.
    if app.config['VAULT_ENABLED']:
        middleware.insert(0, vault.middleware.VaultMiddleware)
    wrap(app, middleware)
    if app.config['VAULT_ENABLED']:
        # NOTE(review): looks like this forces an initial secrets load at
        # start-up -- confirm against the vault middleware.
        app.middlewares['VaultMiddleware'].update_secrets({})

    # Leaving this here for future performance tuning. - Erick
    #
    # app.config['PROFILE'] = True
    # app.config['DEBUG'] = True
    # app.wsgi_app = ProfilerMiddleware(app.wsgi_app, restrictions=[100],
    #                                   sort_by=('cumtime', ))
    #

    if app.config['WAIT_FOR_SERVICES']:
        with app.app_context():     # type: ignore
            logger.info('initialize and wait for upstream services')
            # Adding a wait here can help keep boto3 from getting stuck if
            # we are starting localstack at the same time. This can probably
            # just be 0 (default) in production.
            time.sleep(app.config['WAIT_ON_STARTUP'])
            filemanager_service = filemanager.FileManager.current_session()
            store_service = store.Store.current_session()
            # The store is initialized first; only the file manager is
            # polled with ``wait_for``.
            store_service.initialize()
            wait_for(filemanager_service)

            if app.config['WAIT_FOR_WORKER']:
                # The ``compiler`` module itself is polled; the extra
                # keyword is forwarded to its ``is_available``.
                wait_for(compiler, await_result=True)   # type: ignore

        logger.info('All upstream services are available; ready to start')

    return app
def authorizer(scope: Scope) -> Callable[[Task], bool]:
    """
    Make an authorizer function for injection into a controller.

    Parameters
    ----------
    scope : :class:`Scope`
        The scope that the request session is checked against for the
        task being accessed.

    Returns
    -------
    callable
        Accepts a :class:`Task` and returns ``True`` if the task has no
        owner, if the session is authorized for ``scope`` on the task, or
        if the session's user is the task's owner; ``False`` otherwise.

    """
    def inner(task: Task) -> bool:
        """Check whether the session is authorized for a specific resource."""
        if not task.owner:  # If there is no owner, this is a public resource.
            return True
        if request.auth.is_authorized(scope, task.task_id):
            return True
        user = request.auth.user
        # Identifiers are compared as strings so that an integer user ID
        # matches an owner stored as text. ``bool`` keeps the declared
        # return type honest when there is no user on the session.
        return bool(user and str(user.user_id) == str(task.owner))
    return inner
@blueprint.route(f'{base_url}/product', methods=['GET'])
@scoped(scopes.READ_COMPILE, resource=resource_id)
def get_product(source_id: str, checksum: str, output_format: str) -> Response:
    """
    Get a compilation product.

    Parameters
    ----------
    source_id : str
    checksum : str
    output_format : str
        Together these identify the compilation task whose product is
        requested.

    Returns
    -------
    :class:`Response`
        A redirect when the controller answers with a 3xx code; otherwise
        the product content as a file response carrying the controller's
        status code and, when one was provided, its ETag.

    """
    data, code, head = controllers.get_product(source_id, checksum,
                                               output_format,
                                               authorizer(scopes.READ_COMPILE))
    if 299 < code < 400:
        return redirect(head['Location'], code=code)
    response: Response = send_file(data['stream'],
                                   mimetype=data['content_type'],
                                   attachment_filename=data['filename'])
    # The header is looked up with ``.get`` and so may be absent; only set
    # an ETag when there is one, rather than passing ``None`` along.
    etag = head.get('ETag')
    if etag is not None:
        response.set_etag(etag)
    response.status_code = code
    return response
class FileManager(service.HTTPIntegration):
    """Encapsulates a connection with the file management service."""

    class Meta:
        """Configuration for :class:`FileManager`."""

        service_name = "filemanager"

    def is_available(self, **kwargs: Any) -> bool:
        """
        Check our connection to the filemanager service.

        Parameters
        ----------
        timeout : float
            Optional keyword argument passed to the status request.
            Defaults to 2.0.

        Returns
        -------
        bool
            ``True`` only if the status endpoint responded with 200;
            ``False`` on any other status code or on any error.

        """
        config = get_application_config()
        status_endpoint = config.get('FILEMANAGER_STATUS_ENDPOINT', 'status')
        timeout: float = kwargs.get('timeout', 2.0)
        try:
            response = self.request('get', status_endpoint, timeout=timeout)
            return bool(response.status_code == 200)
        except Exception as e:
            # Deliberately broad: whatever went wrong, the answer to "is it
            # available?" is no.
            logger.error('Error when calling filemanager: %s', e)
            return False

    @property
    def _must_verify_checksum(self) -> bool:
        """Whether ETags must be checked (``FILEMANAGER_VERIFY_CHECKSUM``)."""
        config = get_application_config()
        return bool(config.get('FILEMANAGER_VERIFY_CHECKSUM', True))

    def _content_path(self, source_id: str) -> str:
        """Build the content endpoint path for a source package."""
        config = get_application_config()
        content_endpoint = config.get('FILEMANAGER_CONTENT_PATH',
                                      '/{source_id}/content')
        path: str = content_endpoint.format_map(Default(source_id=source_id))
        return path

    def owner(self, source_id: str, checksum: str, token: str) \
            -> Optional[str]:
        """
        Get the owner of a source package.

        Parameters
        ----------
        source_id : str
            Identifier of the source package.
        checksum : str
            Expected ETag of the source package content.
        token : str
            Passed along with the request to the file manager.

        Returns
        -------
        str or None
            Value of the ``ARXIV-OWNER`` response header, if present.

        Raises
        ------
        RuntimeError
            If checksum verification is enabled and the ETag returned by
            the file manager differs from ``checksum``.

        """
        path = self._content_path(source_id)
        response = self.request('head', path, token)
        if self._must_verify_checksum and response.headers['ETag'] != checksum:
            logger.error('checksum does not match: %s != %s',
                         response.headers['ETag'], checksum)
            raise RuntimeError('Not the resource we were looking for')
        owner: Optional[str] = response.headers.get('ARXIV-OWNER')
        return owner

    def get_source_content(self, source_id: str, token: str,
                           save_to: str = '/tmp') -> SourcePackage:
        """
        Retrieve the sanitized/processed upload package.

        Parameters
        ----------
        source_id : str
            Unique long-lived identifier for the upload.
        token : str
            Passed along with the request to the file manager.
        save_to : str
            Directory into which source should be saved.

        Returns
        -------
        :class:`SourcePackage`
            Carries the path of the file written under ``save_to`` and the
            ETag returned with the content.

        Raises
        ------
        RuntimeError
            If the filename supplied by the file manager would place the
            file outside of ``save_to``.

        """
        logger.debug('Get upload content for: %s', source_id)
        path = self._content_path(source_id)
        response = self.request('get', path, token)
        logger.debug('Got response with status %s', response.status_code)
        source_file_path = self._save_content(path, source_id, response,
                                              save_to)
        logger.debug('wrote source content to %s', source_file_path)
        return SourcePackage(source_id=source_id, path=source_file_path,
                             etag=response.headers['ETag'])

    def _save_content(self, path: str, source_id: str,
                      response: requests.Response, source_dir: str) -> str:
        """
        Write response content to a file inside ``source_dir``.

        The filename comes from the ``content-disposition`` header when
        there is one, and is otherwise derived from ``source_id``.

        Returns
        -------
        str
            Absolute path of the file that was written.

        Raises
        ------
        RuntimeError
            If the resulting path is not strictly inside ``source_dir``.

        """
        # Get the filename from the response headers.
        match = re.search('filename=(.+)',
                          response.headers.get('content-disposition', ''))
        if match:
            filename = match.group(1).strip('"')
        else:   # Or make one ourselves.
            filename = f'{source_id}.tar.gz'

        # There is a bug on the production public site: source downloads have
        # .gz extension, but are not in fact gzipped tarballs. Drop exactly
        # that suffix (``str.rstrip`` would strip characters, not a suffix,
        # and returns a new string rather than modifying in place).
        if path.startswith('https://arxiv.org/src') \
                and filename.endswith('.gz'):
            filename = filename[:-len('.gz')]

        # Compare whole path components of absolute paths. A plain string
        # prefix test would accept a sibling such as ``/tmpevil`` for
        # ``/tmp`` and could never succeed for a relative ``source_dir``.
        root = os.path.abspath(source_dir)
        source_file_path = os.path.abspath(os.path.join(root, filename))
        if source_file_path == root \
                or os.path.commonpath([root, source_file_path]) != root:
            logger.error('Source file path would escape working filesystem'
                         ' context; may be malicious: %s', source_file_path)
            raise RuntimeError(f'Bad source file path: {source_file_path}')

        with open(source_file_path, 'wb') as f:
            for chunk in response.iter_content(1024):
                if chunk:   # Skip empty chunks.
                    f.write(chunk)
        return source_file_path

    def get_upload_info(self, source_id: str, token: str) -> SourcePackageInfo:
        """
        Get the current state of the source package/upload workspace.

        Parameters
        ----------
        source_id : str
            Identifier of the source package.
        token : str
            Passed along with the request to the file manager.

        Returns
        -------
        :class:`SourcePackageInfo`
            Carries the ETag returned by the content endpoint.

        """
        logger.debug('Get upload info for: %s', source_id)
        path = self._content_path(source_id)
        # Only the response headers are of interest here.
        _, _, headers = self.json('get', path, token)
        logger.debug('Got response with etag %s', headers['ETag'])
        return SourcePackageInfo(source_id=source_id, etag=headers['ETag'])
class TestServiceStatus(TestCase):
    """Test :func:`.FileManager.get_status`."""

    @mock.patch('arxiv.integration.api.service.current_app', mock_app)
    @mock.patch('arxiv.integration.api.service.requests.Session')
    def test_status(self, mock_Session):
        """Get the status of the file manager service successfully."""
        # The status endpoint answers 200 with a small JSON document.
        status_response = mock.MagicMock(
            status_code=status.OK,
            json=mock.MagicMock(return_value={'iam': 'ok'})
        )
        http_session = mock.MagicMock()
        http_session.get = mock.MagicMock(return_value=status_response)
        mock_Session.return_value = http_session

        self.assertEqual(FileManager.get_status(), {'iam': 'ok'},
                         "Gets the response content from the status enpoint")
@mock.patch('arxiv.integration.api.service.current_app', mock_app)
@mock.patch('arxiv.integration.api.service.requests.Session')
def test_get_upload_info_nonexistant(self, mock_Session):
    """Asking about an upload workspace that does not exist raises."""
    missing_workspace = '123456'
    # Every GET on the mocked session answers 404.
    not_found_session = self.session(status_code=status.NOT_FOUND)
    mock_Session.return_value = not_found_session

    self.assertRaises(exceptions.NotFound, FileManager.get_upload_info,
                      missing_workspace, 'footoken')
@mock.patch('arxiv.integration.api.service.current_app', mock_app)
@mock.patch('arxiv.integration.api.service.requests.Session')
def test_get_upload_info_bad_json(self, mock_Session):
    """FM service returns a body that cannot be decoded as JSON."""
    source_id = '123456'
    # The response looks fine (200) until its JSON body is decoded.
    decode_error = json.decoder.JSONDecodeError('nope', 'nope', 0)
    undecodable_response = mock.MagicMock(
        status_code=status.OK,
        json=mock.MagicMock(side_effect=decode_error)
    )
    mock_Session.return_value = mock.MagicMock(
        get=mock.MagicMock(return_value=undecodable_response)
    )

    with self.assertRaises(exceptions.BadResponse):
        FileManager.get_upload_info(source_id, 'footoken')
class TestGetUpload(TestCase):
    """:func:`FileManager.get_source_content` returns the upload content."""

    CONTENT = b'foocontent'
    """Body streamed by the mock content responses."""

    def session(self, status_code=status.OK, method="get", json=None,
                content="", headers=None):
        """
        Make a mock session.

        ``json`` and ``headers`` default to a fresh empty dict on each
        call, so that nothing is shared between tests.
        """
        json = {} if json is None else json
        headers = {} if headers is None else headers
        return mock.MagicMock(**{
            method: mock.MagicMock(
                return_value=mock.MagicMock(
                    status_code=status_code,
                    json=mock.MagicMock(
                        return_value=json
                    ),
                    content=content,
                    headers=headers
                )
            )
        })

    def _content_session(self, headers):
        """Make a mock session whose GET succeeds and streams content."""
        return mock.MagicMock(
            get=mock.MagicMock(
                return_value=mock.MagicMock(
                    status_code=status.OK,
                    iter_content=mock.MagicMock(return_value=[self.CONTENT]),
                    headers=headers
                )
            )
        )

    @staticmethod
    def _remove(path):
        """Delete a file written by a test, if it is still there."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    @mock.patch('arxiv.integration.api.service.current_app', mock_app)
    @mock.patch('arxiv.integration.api.service.requests.Session')
    def test_get_upload(self, mock_Session):
        """Get upload that exists."""
        etag = 'asdf12345checksum'
        source_id = '123456'
        mock_Session.return_value = self._content_session({'ETag': etag})
        info = FileManager.get_source_content(source_id, 'footoken')
        # Don't leave the downloaded file behind in the save directory.
        self.addCleanup(self._remove, info.path)
        self.assertIsInstance(info, domain.SourcePackage)
        self.assertEqual(info.etag, etag)
        self.assertEqual(info.source_id, source_id)
        self.assertIsInstance(info.path, str)

    @mock.patch('arxiv.integration.api.service.current_app', mock_app)
    @mock.patch('arxiv.integration.api.service.requests.Session')
    def test_get_upload_with_filename(self, mock_Session):
        """Get upload with an explicit filename in ``content-disposition``."""
        etag = 'asdf12345checksum'
        source_id = '123456'
        mock_Session.return_value = self._content_session(
            {'ETag': etag, 'content-disposition': 'filename=foo.tar.gz'}
        )
        info = FileManager.get_source_content(source_id, 'footoken')
        # Don't leave the downloaded file behind in the save directory.
        self.addCleanup(self._remove, info.path)
        self.assertIsInstance(info, domain.SourcePackage)
        self.assertEqual(info.etag, etag)
        self.assertEqual(info.source_id, source_id)
        self.assertIsInstance(info.path, str)
        self.assertEqual(info.path, '/tmp/foo.tar.gz')
        self.assertTrue(os.path.exists(info.path))

    @mock.patch('arxiv.integration.api.service.current_app', mock_app)
    @mock.patch('arxiv.integration.api.service.requests.Session')
    def test_get_upload_with_suspicious_filename(self, mock_Session):
        """Get upload with a suspicious filename in ``content-disposition``."""
        etag = 'asdf12345checksum'
        source_id = '123456'
        filename = '../whereDoesThisGetWritten.txt'
        mock_Session.return_value = self._content_session(
            {'ETag': etag, 'content-disposition': f'filename={filename}'}
        )
        with self.assertRaises(RuntimeError):
            FileManager.get_source_content(source_id, 'footoken')

    @mock.patch('arxiv.integration.api.service.current_app', mock_app)
    @mock.patch('arxiv.integration.api.service.requests.Session')
    def test_get_upload_with_malicious_filename(self, mock_Session):
        """Get upload with a malicious filename in ``content-disposition``."""
        etag = 'asdf12345checksum'
        source_id = '123456'
        filename = '//bin/bash'
        mock_Session.return_value = self._content_session(
            {'ETag': etag, 'content-disposition': f'filename={filename}'}
        )
        with self.assertRaises(RuntimeError):
            FileManager.get_source_content(source_id, 'footoken')

    @mock.patch('arxiv.integration.api.service.current_app', mock_app)
    @mock.patch('arxiv.integration.api.service.requests.Session')
    def test_get_upload_nonexistant(self, mock_Session):
        """Get content for an upload workspace that does not exist."""
        source_id = '123456'
        mock_Session.return_value = self.session(status.NOT_FOUND)
        with self.assertRaises(exceptions.NotFound):
            FileManager.get_source_content(source_id, 'footoken')
def _new_client(self, config: Optional[Config] = None) -> boto3.client:
    """
    Create a new S3 client from this session's connection parameters.

    Parameters
    ----------
    config : :class:`botocore.config.Config` or None
        Client configuration (e.g. retries and timeouts). When given, it
        is passed on to :func:`boto3.client`.

    Returns
    -------
    An S3 client, as returned by ``boto3.client('s3', ...)``.

    """
    # Only add credentials to the client if they are explicitly set.
    # If they are not set, boto3 falls back to environment variables and
    # credentials files.
    params: Dict[str, Any] = {'region_name': self._region_name}
    if self._aws_access_key_id and self._aws_secret_access_key:
        params['aws_access_key_id'] = self._aws_access_key_id
        params['aws_secret_access_key'] = self._aws_secret_access_key
    if self._endpoint_url:
        params['endpoint_url'] = self._endpoint_url
        params['verify'] = self._verify
    # Honor the caller's client configuration; without this the timeouts
    # and retry limits requested by callers would never take effect.
    if config is not None:
        params['config'] = config
    # Never write the secret key to the log.
    loggable = {name: '***' if name == 'aws_secret_access_key' else value
                for name, value in params.items()}
    logger.debug('new client with params %s', loggable)
    return boto3.client('s3', **params)
def _wait_for_bucket(self, retries: int = 0, delay: int = 0) -> None:
    """
    Wait for the bucket to become available.

    Parameters
    ----------
    retries : int
        Passed to the ``bucket_exists`` waiter as ``MaxAttempts``.
    delay : int
        Passed to the ``bucket_exists`` waiter as ``Delay``.

    Raises
    ------
    :class:`NoSuchBucket`, :class:`DoesNotExist` or RuntimeError
        As raised for a ``ClientError`` by the client error handler.
    RuntimeError
        If the waiter gave up before the bucket became available.

    """
    # Local import: only ``ClientError`` is imported at module level.
    from botocore.exceptions import WaiterError

    try:
        waiter = self.client.get_waiter('bucket_exists')
        waiter.wait(
            Bucket=self._bucket,
            WaiterConfig={
                'Delay': delay,
                'MaxAttempts': retries
            }
        )
    except ClientError as exc:
        self._handle_client_error(exc)
    except WaiterError as exc:
        # Waiters report failure (including running out of attempts) with
        # WaiterError rather than ClientError; surface it the same way the
        # rest of this class surfaces unrecoverable storage problems.
        logger.error('Gave up waiting for bucket %s: %s', self._bucket, exc)
        raise RuntimeError(
            f'Bucket {self._bucket} did not become available'
        ) from exc
def retrieve(self, src_id: str, chk: str, out_fmt: Format) -> Product:
    """
    Retrieve a compilation product.

    Parameters
    ----------
    src_id : str
        Source package identifier; first segment of the object key.
    chk : str
        Checksum of the source package; second segment of the object key.
    out_fmt : enum
        One of :attr:`Format`. Supplies both the format segment and the
        file extension of the object key.

    Returns
    -------
    :class:`Product`
        The stored object's body, along with its ETag minus the first and
        last characters.

    """
    object_key = self.KEY.format(
        src_id=src_id,
        chk=chk,
        out_fmt=out_fmt.value,
        ext=out_fmt.ext,
    )
    s3_object = self._get(object_key)
    body = s3_object['Body']
    # Drop the first and last characters of the ETag (presumably the
    # double quotes that surround it).
    bare_etag = s3_object['ETag'][1:-1]
    return Product(stream=body, checksum=bare_etag)
220 | 221 | Returns 222 | ------- 223 | :class:`Product` 224 | 225 | """ 226 | key = self.LOG_KEY.format(src_id=src_id, chk=chk, 227 | out_fmt=out_fmt.value, ext=out_fmt.ext) 228 | resp = self._get(key) 229 | return Product(stream=resp['Body'], checksum=resp['ETag'][1:-1]) 230 | 231 | def _create_bucket(self, retries: int = 2, read_timeout: int = 5, 232 | connect_timeout: int = 5) -> None: 233 | """Create S3 bucket.""" 234 | config = Config(retries={'max_attempts': retries}, 235 | read_timeout=read_timeout, 236 | connect_timeout=connect_timeout) 237 | client = self._new_client(config=config) 238 | client.create_bucket(Bucket=self._bucket) 239 | 240 | def _get(self, key: str) -> dict: 241 | resp: dict 242 | try: 243 | resp = self.client.get_object(Bucket=self._bucket, Key=key) 244 | except ClientError as e: 245 | self._handle_client_error(e) 246 | return resp 247 | 248 | def _put(self, key: str, body: bytes, content_type: str) -> None: 249 | try: 250 | self.client.put_object(Body=body, Bucket=self._bucket, 251 | ContentMD5=hash_content(body), 252 | ContentType=content_type, Key=key) 253 | except ClientError as exc: 254 | self._handle_client_error(exc) 255 | 256 | @classmethod 257 | def init_app(cls, app: Flask) -> None: 258 | """Set defaults for required configuration parameters.""" 259 | app.config.setdefault('AWS_REGION', 'us-east-1') 260 | app.config.setdefault('AWS_ACCESS_KEY_ID', None) 261 | app.config.setdefault('AWS_SECRET_ACCESS_KEY', None) 262 | app.config.setdefault('S3_ENDPOINT', None) 263 | app.config.setdefault('S3_VERIFY', True) 264 | app.config.setdefault('S3_BUCKET', 'arxiv-compiler') 265 | 266 | @classmethod 267 | def get_session(cls) -> 'Store': 268 | """Create a new :class:`Store` from the application configuration.""" 269 | config = get_application_config() 270 | return cls(config['S3_BUCKET'], 271 | config['S3_VERIFY'], 272 | config['AWS_REGION'], 273 | config['S3_ENDPOINT'], 274 | config['AWS_ACCESS_KEY_ID'], 275 | config['AWS_SECRET_ACCESS_KEY']) 276 | 277 
| @classmethod 278 | def current_session(cls) -> 'Store': 279 | """Get the current store session for this application.""" 280 | g = get_application_global() 281 | if g is None: 282 | return cls.get_session() 283 | if 'store' not in g: 284 | g.store = cls.get_session() 285 | store: Store = g.store 286 | return store 287 | -------------------------------------------------------------------------------- /compiler/services/store/schema/status.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Compilation status", 3 | "description": "Describes the state of compilation products in the store", 4 | "additionalProperties": false, 5 | "required": ["compilations"], 6 | "type": "object", 7 | "properties": { 8 | "compilations": { 9 | "type": "array", 10 | "items": { 11 | "properties": { 12 | "source_checksum": { 13 | "description": "Checksum of the source tarball from the file management service.", 14 | "type": "string" 15 | }, 16 | "task_id": { 17 | "description": "If a task exists for this compilation, the unique task ID.", 18 | "oneOf": [ 19 | {"type": "string"}, 20 | {"type": "null"} 21 | ] 22 | }, 23 | "status": { 24 | "description": "The status of the compilation. 
If `current`, the current file corresponding to the format of this compilation status is the product of this compilation.", 25 | "type": "string", 26 | "enum": ["current", "in_progress", "failed"] 27 | }, 28 | "format": { 29 | "description": "The target format of the compilation.", 30 | "type": "string", 31 | "enum": ["pdf", "dvi", "ps"] 32 | } 33 | } 34 | } 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /compiler/services/store/tests.py: -------------------------------------------------------------------------------- 1 | """Tests for :mod:`compiler.services.store`.""" 2 | 3 | from unittest import TestCase, mock 4 | from moto import mock_s3 5 | import io 6 | from datetime import datetime 7 | 8 | from .. import Store 9 | from .. import store as store_ 10 | from ... import domain 11 | 12 | mock_app_config = mock.MagicMock(return_value={ 13 | 'S3_ENDPOINT': None, 14 | 'S3_VERIFY': True, 15 | 'S3_BUCKET': 'arxiv-compiler', 16 | 'AWS_ACCESS_KEY_ID': 'foo_id', 17 | 'AWS_SECRET_ACCESS_KEY': 'foosecretkey', 18 | 'AWS_REGION': 'us-east-1' 19 | }) 20 | 21 | 22 | class TestStore(TestCase): 23 | """Test methods on :mod:`compiler.services.store`.""" 24 | 25 | @mock_s3 26 | @mock.patch(f'{store_.__name__}.get_application_config', mock_app_config) 27 | def test_store_retrieve(self): 28 | """Test storing and retrieving compilation products.""" 29 | store = Store.current_session() 30 | content = io.BytesIO(b'somepdfcontent') 31 | store._create_bucket() 32 | status_pdf = domain.Task( 33 | source_id='12345', 34 | output_format=domain.Format.PDF, 35 | checksum='abc123checksum', 36 | task_id='foo-task-1234-6789', 37 | size_bytes=309192, 38 | status=domain.Status.COMPLETED 39 | ) 40 | product = domain.Product(stream=content) 41 | store.store(product, status_pdf) 42 | returned = store.retrieve('12345', 'abc123checksum', 43 | domain.Format.PDF) 44 | self.assertEqual(returned.stream.read(), b'somepdfcontent') 45 | 46 | with 
self.assertRaises(store_.DoesNotExist): 47 | store.retrieve('12345', 'foocheck', 48 | domain.Format.PS) 49 | 50 | @mock_s3 51 | @mock.patch(f'{store_.__name__}.get_application_config', mock_app_config) 52 | def test_store_retrieve_log(self): 53 | """Test storing and retrieving compilation logs.""" 54 | store = Store.current_session() 55 | content = io.BytesIO(b'some log output') 56 | store._create_bucket() 57 | status_pdf = domain.Task( 58 | source_id='12345', 59 | output_format=domain.Format.PDF, 60 | checksum='abc123checksum', 61 | task_id='foo-task-1234-6789', 62 | size_bytes=0, 63 | status=domain.Status.COMPLETED 64 | ) 65 | product = domain.Product(stream=content) 66 | store.store_log(product, status_pdf) 67 | 68 | returned = store.retrieve_log('12345', 'abc123checksum', 69 | domain.Format.PDF) 70 | self.assertEqual(returned.stream.read(), b'some log output') 71 | 72 | with self.assertRaises(store_.DoesNotExist): 73 | store.retrieve('12345', 'foocheck', 74 | domain.Format.PS) 75 | -------------------------------------------------------------------------------- /compiler/tests/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.7 2 | CMD cp /autotex/test.pdf /out/test.pdf 3 | -------------------------------------------------------------------------------- /compiler/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for the arXiv compiler service.""" 2 | -------------------------------------------------------------------------------- /compiler/tests/data/1602.00123.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-compiler/3b80ce72ee9d519f15635e3819dcf8b45460cc4a/compiler/tests/data/1602.00123.tar -------------------------------------------------------------------------------- /compiler/tests/data/real-test.tar.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-compiler/3b80ce72ee9d519f15635e3819dcf8b45460cc4a/compiler/tests/data/real-test.tar.gz -------------------------------------------------------------------------------- /compiler/tests/test.aux: -------------------------------------------------------------------------------- 1 | \relax 2 | \@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}} 3 | -------------------------------------------------------------------------------- /compiler/tests/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-compiler/3b80ce72ee9d519f15635e3819dcf8b45460cc4a/compiler/tests/test.pdf -------------------------------------------------------------------------------- /compiler/tests/test.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arXiv/arxiv-compiler/3b80ce72ee9d519f15635e3819dcf8b45460cc4a/compiler/tests/test.tar.gz -------------------------------------------------------------------------------- /compiler/tests/test.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | 3 | \title{Test Article} 4 | \author{arXiv-NG Team} 5 | \date{September 2018} 6 | 7 | \begin{document} 8 | 9 | \maketitle 10 | 11 | \section{Introduction} 12 | 13 | \end{document} -------------------------------------------------------------------------------- /compiler/tests/test_app.py: -------------------------------------------------------------------------------- 1 | """Test the compiler application as a whole.""" 2 | 3 | from unittest import TestCase, mock 4 | import io 5 | 6 | from arxiv.integration.api import status, service 7 | from arxiv.users.helpers import generate_token 8 | from arxiv.users.auth import scopes 9 | 10 | from .. 
import factory, compiler, domain 11 | from ..services import store, filemanager 12 | 13 | 14 | OS_ENVIRON = {'JWT_SECRET': 'foosecret'} 15 | 16 | 17 | class TestCompilerApp(TestCase): 18 | """Test the app API.""" 19 | 20 | def setUp(self): 21 | """Create a test app and client.""" 22 | self.app = factory.create_app() 23 | self.client = self.app.test_client() 24 | self.app.config['JWT_SECRET'] = 'foosecret' 25 | self.app.config['S3_BUCKET'] = 'test-submission-bucket' 26 | self.app.config['AWS_ACCESS_KEY_ID'] = 'fookey' 27 | self.app.config['AWS_SECRET_ACCESS_KEY'] = 'foosecret' 28 | self.user_id = '123' 29 | with self.app.app_context(): 30 | self.token = generate_token(self.user_id, 'foo@user.com', 31 | 'foouser', 32 | scope=[scopes.CREATE_COMPILE, 33 | scopes.READ_COMPILE]) 34 | 35 | @mock.patch(f'{compiler.__name__}.do_nothing', mock.MagicMock()) 36 | @mock.patch(f'{service.__name__}.requests.Session') 37 | @mock.patch(f'{store.__name__}.boto3.client', mock.MagicMock()) 38 | def test_get_status(self, mock_session): 39 | """GET the ``getServiceStatus`` endpoint.""" 40 | mock_session.return_value.get.return_value.status_code = status.OK 41 | response = self.client.get('/status') 42 | self.assertEqual(response.status_code, status.OK) 43 | 44 | def test_get_nonexistant(self): 45 | """GET a nonexistent endpoint.""" 46 | response = self.client.get('/nowhere') 47 | self.assertEqual(response.status_code, status.NOT_FOUND) 48 | self.assertEqual( 49 | response.json, 50 | {'reason': 'The requested URL was not found on the server. 
If you' 51 | ' entered the URL manually please check your spelling' 52 | ' and try again.'} 53 | ) 54 | 55 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 56 | def test_post_bad_request(self): 57 | """POST the ``requestCompilation`` endpoint without data.""" 58 | response = self.client.post('/', headers={'Authorization': self.token}) 59 | self.assertEqual(response.status_code, status.BAD_REQUEST) 60 | self.assertEqual( 61 | response.json, 62 | {'reason': 'The browser (or proxy) sent a request that this server' 63 | ' could not understand.'} 64 | ) 65 | 66 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 67 | @mock.patch('compiler.controllers.compiler') 68 | @mock.patch('compiler.controllers.Store') 69 | @mock.patch('compiler.controllers.filemanager.FileManager') 70 | def test_post_request_compile(self, mock_fm, mock_store, mock_compiler): 71 | """POST the ``requestCompilation`` endpoint with valid data.""" 72 | mock_compiler.NoSuchTask = compiler.NoSuchTask 73 | mock_fm.current_session.return_value.owner.return_value = None 74 | mock_compiler.get_task.side_effect = compiler.NoSuchTask 75 | 76 | response = self.client.post('/', json={ 77 | 'source_id': '54', 78 | 'checksum': 'a1b2c3d4=', 79 | 'output_format': 'pdf' 80 | }, 81 | headers={'Authorization': self.token} 82 | ) 83 | 84 | self.assertEqual(response.status_code, status.ACCEPTED) 85 | self.assertEqual(response.headers['Location'], 86 | 'http://localhost/54/a1b2c3d4%3D/pdf') 87 | 88 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 89 | @mock.patch('compiler.controllers.compiler') 90 | @mock.patch('compiler.controllers.Store') 91 | def test_post_compilation_product_exists(self, mock_store, mock_compiler): 92 | """POST ``requestCompilation`` for existant product.""" 93 | mock_compiler.NoSuchTask = compiler.NoSuchTask 94 | 95 | source_id = '54' 96 | checksum = 'a1b2c3d4=' 97 | fmt = 'pdf' 98 | owner = self.user_id 99 | 100 | comp_status = 
domain.Task.from_dict({ 101 | 'status': 'completed', 102 | 'reason': None, 103 | 'source_id': source_id, 104 | 'output_format': fmt, 105 | 'checksum': checksum, 106 | 'size_bytes': 123456, 107 | 'owner': owner, 108 | 'task_id': f'{source_id}/{checksum}/{fmt}' 109 | }) 110 | mock_compiler.get_task.return_value = comp_status 111 | 112 | response = self.client.post('/', json={ 113 | 'source_id': source_id, 114 | 'checksum': checksum, 115 | 'output_format': fmt 116 | }, 117 | headers={'Authorization': self.token} 118 | ) 119 | 120 | self.assertEqual(response.status_code, status.SEE_OTHER) 121 | self.assertEqual(response.headers['Location'], 122 | f'http://localhost/{source_id}/a1b2c3d4%3D/{fmt}') 123 | 124 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 125 | @mock.patch('compiler.controllers.compiler') 126 | @mock.patch('compiler.controllers.Store') 127 | def test_product_exists_unauthorized(self, mock_store, mock_compiler): 128 | """POST ``requestCompilation`` for existant product, wrong owner.""" 129 | mock_compiler.NoSuchTask = compiler.NoSuchTask 130 | 131 | source_id = '54' 132 | checksum = 'a1b2c3d4=' 133 | fmt = 'pdf' 134 | owner = '84843' 135 | 136 | comp_status = domain.Task.from_dict({ 137 | 'status': 'completed', 138 | 'reason': None, 139 | 'source_id': source_id, 140 | 'output_format': fmt, 141 | 'checksum': checksum, 142 | 'size_bytes': 123456, 143 | 'owner': owner, 144 | 'task_id': f'{source_id}/{checksum}/{fmt}' 145 | }) 146 | mock_compiler.get_task.return_value = comp_status 147 | 148 | response = self.client.post('/', json={ 149 | 'source_id': source_id, 150 | 'checksum': checksum, 151 | 'output_format': fmt 152 | }, 153 | headers={'Authorization': self.token} 154 | ) 155 | 156 | self.assertEqual(response.status_code, status.FORBIDDEN) 157 | 158 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 159 | @mock.patch('compiler.controllers.compiler') 160 | @mock.patch('compiler.controllers.Store') 161 | 
@mock.patch('compiler.controllers.filemanager.FileManager') 162 | def test_post_task_start_failed(self, mock_fm, mock_store, mock_compiler): 163 | """Could not start compilation.""" 164 | mock_compiler.NoSuchTask = compiler.NoSuchTask 165 | mock_compiler.TaskCreationFailed = compiler.TaskCreationFailed 166 | mock_fm.current_session.return_value.owner.return_value = None 167 | mock_compiler.get_task.side_effect = compiler.NoSuchTask 168 | 169 | def raise_creation_failed(*args, **kwargs): 170 | raise compiler.TaskCreationFailed('Nope') 171 | 172 | mock_compiler.start_compilation.side_effect = raise_creation_failed 173 | 174 | response = self.client.post('/', json={ 175 | 'source_id': '54', 176 | 'checksum': 'a1b2c3d4=', 177 | 'output_format': 'pdf' 178 | }, 179 | headers={'Authorization': self.token} 180 | ) 181 | 182 | self.assertEqual(response.status_code, 183 | status.INTERNAL_SERVER_ERROR) 184 | 185 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 186 | @mock.patch('compiler.controllers.compiler') 187 | @mock.patch('compiler.controllers.Store') 188 | def test_get_status_completed(self, mock_store, mock_compiler): 189 | """GET the ``getCompilationStatus`` endpoint with valid data.""" 190 | mock_compiler.NoSuchTask = compiler.NoSuchTask 191 | 192 | source_id = '54' 193 | checksum = 'a1b2c3d4=' 194 | fmt = 'pdf' 195 | owner = self.user_id 196 | 197 | comp_status = domain.Task.from_dict({ 198 | 'status': 'completed', 199 | 'reason': None, 200 | 'source_id': source_id, 201 | 'output_format': fmt, 202 | 'checksum': checksum, 203 | 'size_bytes': 123456, 204 | 'owner': owner, 205 | 'task_id': f'{source_id}/{checksum}/{fmt}' 206 | }) 207 | mock_compiler.get_task.return_value \ 208 | = comp_status 209 | 210 | response = self.client.get( 211 | f'/{source_id}/{checksum}/{fmt}', 212 | headers={'Authorization': self.token} 213 | ) 214 | 215 | self.assertEqual(response.status_code, status.OK) 216 | self.assertDictEqual(response.json, comp_status.to_dict()) 
217 | 218 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 219 | @mock.patch('compiler.controllers.compiler') 220 | @mock.patch('compiler.controllers.Store') 221 | def test_get_status_not_owner(self, mock_store, mock_compiler): 222 | """Someone other than the owner requests ``getCompilationStatus``.""" 223 | mock_compiler.NoSuchTask = compiler.NoSuchTask 224 | 225 | source_id = '54' 226 | checksum = 'a1b2c3d4=' 227 | fmt = 'pdf' 228 | owner = '5943' 229 | 230 | comp_status = domain.Task.from_dict({ 231 | 'status': 'completed', 232 | 'reason': None, 233 | 'source_id': source_id, 234 | 'output_format': fmt, 235 | 'checksum': checksum, 236 | 'size_bytes': 123456, 237 | 'owner': owner, 238 | 'task_id': f'{source_id}/{checksum}/{fmt}' 239 | }) 240 | mock_compiler.get_task.return_value \ 241 | = comp_status 242 | 243 | response = self.client.get( 244 | f'/{source_id}/{checksum}/{fmt}', 245 | headers={'Authorization': self.token} 246 | ) 247 | 248 | self.assertEqual(response.status_code, status.FORBIDDEN, 249 | 'Forbidden user gets a 403 Forbidden response.') 250 | 251 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 252 | @mock.patch('compiler.controllers.compiler') 253 | @mock.patch('compiler.controllers.Store') 254 | def test_get_status_nonexistant(self, mock_store, mock_compiler): 255 | """GET ``getCompilationStatus`` for nonexistant task.""" 256 | mock_compiler.NoSuchTask = compiler.NoSuchTask 257 | 258 | source_id = '54' 259 | checksum = 'a1b2c3d4=' 260 | fmt = 'pdf' 261 | 262 | mock_compiler.get_task.side_effect = compiler.NoSuchTask 263 | 264 | response = self.client.get( 265 | f'/{source_id}/{checksum}/{fmt}', 266 | headers={'Authorization': self.token} 267 | ) 268 | 269 | self.assertEqual(response.status_code, status.NOT_FOUND) 270 | 271 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 272 | @mock.patch('compiler.controllers.compiler') 273 | @mock.patch('compiler.controllers.Store') 274 | def 
test_get_status_invalid_format(self, mock_store, mock_compiler): 275 | """GET ``getCompilationStatus`` for unsupported format.""" 276 | source_id = '54' 277 | checksum = 'a1b2c3d4=' 278 | fmt = 'wav' 279 | response = self.client.get( 280 | f'/{source_id}/{checksum}/{fmt}', 281 | headers={'Authorization': self.token} 282 | ) 283 | 284 | self.assertEqual(response.status_code, status.BAD_REQUEST) 285 | 286 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 287 | @mock.patch('compiler.controllers.compiler') 288 | @mock.patch('compiler.controllers.Store') 289 | def test_get_log(self, mock_store, mock_compiler): 290 | """GET the ``getCompilationLog`` endpoint with valid data.""" 291 | mock_compiler.NoSuchTask = compiler.NoSuchTask 292 | 293 | source_id = '54' 294 | checksum = 'a1b2c3d4=' 295 | fmt = 'pdf' 296 | owner = self.user_id 297 | 298 | comp_status = domain.Task.from_dict({ 299 | 'status': 'completed', 300 | 'reason': None, 301 | 'source_id': source_id, 302 | 'output_format': fmt, 303 | 'checksum': checksum, 304 | 'size_bytes': 123456, 305 | 'owner': owner, 306 | 'task_id': f'{source_id}/{checksum}/{fmt}' 307 | }) 308 | comp_log = domain.Product(stream=io.BytesIO(b'foologcontent')) 309 | mock_compiler.get_task.return_value \ 310 | = comp_status 311 | mock_store.current_session.return_value.retrieve_log.return_value \ 312 | = comp_log 313 | 314 | response = self.client.get( 315 | f'/{source_id}/{checksum}/{fmt}/log', 316 | headers={'Authorization': self.token} 317 | ) 318 | 319 | self.assertEqual(response.status_code, status.OK) 320 | self.assertEqual(response.data, b'foologcontent', 321 | "Returns the raw log content") 322 | self.assertEqual(response.headers['Content-Type'], 323 | 'text/plain; charset=utf-8') 324 | 325 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 326 | @mock.patch('compiler.controllers.compiler') 327 | @mock.patch('compiler.controllers.Store') 328 | def test_get_log_not_owner(self, mock_store, 
mock_compiler): 329 | """GET the ``getCompilationLog`` by someone who is not the owner.""" 330 | mock_compiler.NoSuchTask = compiler.NoSuchTask 331 | 332 | source_id = '54' 333 | checksum = 'a1b2c3d4=' 334 | fmt = 'pdf' 335 | owner = '98766' 336 | 337 | comp_status = domain.Task.from_dict({ 338 | 'status': 'completed', 339 | 'reason': None, 340 | 'source_id': source_id, 341 | 'output_format': fmt, 342 | 'checksum': checksum, 343 | 'size_bytes': 123456, 344 | 'owner': owner, 345 | 'task_id': f'{source_id}/{checksum}/{fmt}' 346 | }) 347 | comp_log = domain.Product(stream=io.BytesIO(b'foologcontent')) 348 | mock_compiler.get_task.return_value \ 349 | = comp_status 350 | mock_store.current_session.return_value.retrieve_log.return_value \ 351 | = comp_log 352 | 353 | response = self.client.get( 354 | f'/{source_id}/{checksum}/{fmt}/log', 355 | headers={'Authorization': self.token} 356 | ) 357 | 358 | self.assertEqual(response.status_code, status.FORBIDDEN) 359 | 360 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 361 | @mock.patch('compiler.controllers.compiler') 362 | @mock.patch('compiler.controllers.Store') 363 | def test_get_log_nononexistant(self, mock_store, mock_compiler): 364 | """GET the ``getCompilationLog`` for nonexistant compilation.""" 365 | mock_compiler.NoSuchTask = compiler.NoSuchTask 366 | 367 | source_id = '54' 368 | checksum = 'a1b2c3d4=' 369 | fmt = 'pdf' 370 | 371 | mock_compiler.get_task.side_effect = compiler.NoSuchTask 372 | 373 | response = self.client.get( 374 | f'/{source_id}/{checksum}/{fmt}/log', 375 | headers={'Authorization': self.token} 376 | ) 377 | 378 | self.assertEqual(response.status_code, status.NOT_FOUND) 379 | 380 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 381 | @mock.patch('compiler.controllers.compiler') 382 | @mock.patch('compiler.controllers.Store') 383 | def test_get_log_invalid_format(self, mock_store, mock_compiler): 384 | """GET the ``getCompilationLog`` for unsupported 
format.""" 385 | source_id = '54' 386 | checksum = 'a1b2c3d4=' 387 | fmt = 'wav' 388 | 389 | response = self.client.get( 390 | f'/{source_id}/{checksum}/{fmt}/log', 391 | headers={'Authorization': self.token} 392 | ) 393 | 394 | self.assertEqual(response.status_code, status.BAD_REQUEST) 395 | 396 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 397 | @mock.patch('compiler.controllers.compiler') 398 | @mock.patch('compiler.controllers.Store') 399 | def test_get_product(self, mock_store, mock_compiler): 400 | """GET the ``getCompilationProduct`` endpoint with valid data.""" 401 | mock_compiler.NoSuchTask = compiler.NoSuchTask 402 | 403 | source_id = '54' 404 | checksum = 'a1b2c3d4=' 405 | fmt = 'pdf' 406 | owner = self.user_id 407 | 408 | comp_status = domain.Task.from_dict({ 409 | 'status': 'completed', 410 | 'reason': None, 411 | 'source_id': source_id, 412 | 'output_format': fmt, 413 | 'checksum': checksum, 414 | 'size_bytes': 123456, 415 | 'owner': owner, 416 | 'task_id': f'{source_id}/{checksum}/{fmt}' 417 | }) 418 | comp_product = domain.Product( 419 | stream=io.BytesIO(b'fooproductcontents'), 420 | checksum='productchxm' 421 | ) 422 | mock_compiler.get_task.return_value \ 423 | = comp_status 424 | mock_store.current_session.return_value.retrieve.return_value \ 425 | = comp_product 426 | 427 | response = self.client.get( 428 | f'/{source_id}/{checksum}/{fmt}/product', 429 | headers={'Authorization': self.token} 430 | ) 431 | 432 | self.assertEqual(response.status_code, status.OK) 433 | self.assertEqual(response.data, b'fooproductcontents', 434 | "Returns the raw product content") 435 | self.assertEqual(response.headers['Content-Type'], 'application/pdf') 436 | self.assertEqual(response.headers['ETag'], '"productchxm"') 437 | 438 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 439 | @mock.patch('compiler.controllers.compiler') 440 | @mock.patch('compiler.controllers.Store') 441 | def test_get_product_not_owner(self, 
mock_store, mock_compiler): 442 | """GET the ``getCompilationProduct`` by someone not the owner.""" 443 | mock_compiler.NoSuchTask = compiler.NoSuchTask 444 | 445 | source_id = '54' 446 | checksum = 'a1b2c3d4=' 447 | fmt = 'pdf' 448 | owner = '98766' 449 | 450 | comp_status = domain.Task.from_dict({ 451 | 'status': 'completed', 452 | 'reason': None, 453 | 'source_id': source_id, 454 | 'output_format': fmt, 455 | 'checksum': checksum, 456 | 'size_bytes': 123456, 457 | 'owner': owner, 458 | 'task_id': f'{source_id}/{checksum}/{fmt}' 459 | }) 460 | mock_compiler.get_task.return_value = comp_status 461 | 462 | response = self.client.get( 463 | f'/{source_id}/{checksum}/{fmt}/product', 464 | headers={'Authorization': self.token} 465 | ) 466 | 467 | self.assertEqual(response.status_code, status.FORBIDDEN) 468 | 469 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 470 | @mock.patch('compiler.controllers.compiler') 471 | @mock.patch('compiler.controllers.Store') 472 | def test_get_product_nononexistant(self, mock_store, mock_compiler): 473 | """GET the ``getCompilationProduct`` for nonexistant compilation.""" 474 | mock_compiler.NoSuchTask = compiler.NoSuchTask 475 | 476 | source_id = '54' 477 | checksum = 'a1b2c3d4=' 478 | fmt = 'pdf' 479 | 480 | mock_compiler.get_task.side_effect = compiler.NoSuchTask 481 | 482 | response = self.client.get( 483 | f'/{source_id}/{checksum}/{fmt}/product', 484 | headers={'Authorization': self.token} 485 | ) 486 | 487 | self.assertEqual(response.status_code, status.NOT_FOUND) 488 | 489 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 490 | @mock.patch('compiler.controllers.compiler') 491 | @mock.patch('compiler.controllers.Store') 492 | def test_get_product_invalid_format(self, mock_store, mock_compiler): 493 | """GET the ``getCompilationProduct`` for unsupported format.""" 494 | source_id = '54' 495 | checksum = 'a1b2c3d4=' 496 | fmt = 'wav' 497 | 498 | response = self.client.get( 499 | 
f'/{source_id}/{checksum}/{fmt}/product', 500 | headers={'Authorization': self.token} 501 | ) 502 | 503 | self.assertEqual(response.status_code, status.BAD_REQUEST) 504 | 505 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 506 | @mock.patch('compiler.controllers.compiler') 507 | @mock.patch('compiler.controllers.Store') 508 | def test_get_product_while_recompiling(self, mock_store, mock_compiler): 509 | """GET ``getCompilationProduct`` while recompilation is in progress.""" 510 | mock_compiler.NoSuchTask = compiler.NoSuchTask 511 | 512 | source_id = '54' 513 | checksum = 'a1b2c3d4=' 514 | fmt = 'pdf' 515 | owner = self.user_id 516 | 517 | comp_status = domain.Task.from_dict({ 518 | 'status': 'in_progress', 519 | 'reason': None, 520 | 'source_id': source_id, 521 | 'output_format': fmt, 522 | 'checksum': checksum, 523 | 'size_bytes': 0, 524 | 'owner': owner, 525 | 'task_id': f'{source_id}/{checksum}/{fmt}' 526 | }) 527 | comp_product = domain.Product( 528 | stream=io.BytesIO(b'fooproductcontents'), 529 | checksum='productchxm' 530 | ) 531 | mock_compiler.get_task.return_value = comp_status 532 | mock_store.current_session.return_value.retrieve.return_value \ 533 | = comp_product 534 | 535 | response = self.client.get( 536 | f'/{source_id}/{checksum}/{fmt}/product', 537 | headers={'Authorization': self.token} 538 | ) 539 | 540 | self.assertEqual(response.status_code, status.SEE_OTHER, 541 | 'Returns a redirect response') 542 | self.assertTrue( 543 | response.headers['Location'].endswith('/54/a1b2c3d4%3D/pdf'), 544 | 'Redirects to status endpoint' 545 | ) 546 | 547 | @mock.patch('arxiv.users.auth.middleware.os.environ', OS_ENVIRON) 548 | @mock.patch('compiler.controllers.compiler') 549 | @mock.patch('compiler.controllers.Store') 550 | def test_get_log_while_recompiling(self, mock_store, mock_compiler): 551 | """GET ``getCompilationLog`` while recompilation is in progress.""" 552 | mock_compiler.NoSuchTask = compiler.NoSuchTask 553 | 554 | source_id 
= '54' 555 | checksum = 'a1b2c3d4=' 556 | fmt = 'pdf' 557 | owner = self.user_id 558 | 559 | comp_status = domain.Task.from_dict({ 560 | 'status': 'in_progress', 561 | 'reason': None, 562 | 'source_id': source_id, 563 | 'output_format': fmt, 564 | 'checksum': checksum, 565 | 'size_bytes': 0, 566 | 'owner': owner, 567 | 'task_id': f'{source_id}/{checksum}/{fmt}' 568 | }) 569 | mock_compiler.get_task.return_value = comp_status 570 | response = self.client.get( 571 | f'/{source_id}/{checksum}/{fmt}/log', 572 | headers={'Authorization': self.token} 573 | ) 574 | 575 | self.assertEqual(response.status_code, status.SEE_OTHER, 576 | 'Returns a redirect response') 577 | self.assertTrue( 578 | response.headers['Location'].endswith('/54/a1b2c3d4%3D/pdf'), 579 | 'Redirects to status endpoint' 580 | ) 581 | -------------------------------------------------------------------------------- /compiler/tests/test_controllers.py: -------------------------------------------------------------------------------- 1 | """Tests for :mod:`compiler.controllers`.""" 2 | 3 | from unittest import TestCase, mock 4 | import io 5 | from http import HTTPStatus as status 6 | 7 | from flask import Flask 8 | from werkzeug.datastructures import MultiDict 9 | from werkzeug.exceptions import NotFound, BadRequest 10 | 11 | from ..domain import Task, Product, Format, Status 12 | from .. 
import controllers, compiler 13 | from ..services import store, filemanager 14 | 15 | 16 | def mock_url_for(endpoint, **kwargs): 17 | """Simple mock for :func:`flask.url_for`.""" 18 | params = '/'.join(map(str, kwargs.values())) 19 | return f'http://{endpoint}/{params}' 20 | 21 | 22 | def raise_store_does_not_exist(*args, **kwargs): 23 | raise store.DoesNotExist('Nope!') 24 | 25 | 26 | def raise_no_such_task(*args, **kwargs): 27 | raise compiler.NoSuchTask('Nope!') 28 | 29 | 30 | class TestRequestCompilation(TestCase): 31 | """Tests for :func:`controllers.compile`.""" 32 | 33 | def setUp(self): 34 | """Create an app.""" 35 | self.app = Flask(__name__) 36 | filemanager.FileManager.init_app(self.app) 37 | 38 | def test_request_missing_parameter(self): 39 | """Request for a new compilation with missing parameter.""" 40 | with self.assertRaises(BadRequest): 41 | controllers.compile( 42 | MultiDict({'checksum': 'as12345'}), 43 | 'footoken', 44 | mock.MagicMock() 45 | ) 46 | 47 | with self.assertRaises(BadRequest): 48 | controllers.compile( 49 | MultiDict({'source_id': '1234'}), 50 | 'footoken', 51 | mock.MagicMock() 52 | ) 53 | 54 | def test_bad_checksum(self): 55 | """Request for a new compilation with a bad checksum value.""" 56 | request_params = MultiDict({ 57 | 'source_id': '1234', 58 | 'checksum': 'as12345!@#$', 59 | 'output_format': 'pdf' 60 | }) 61 | with self.assertRaises(BadRequest): 62 | controllers.compile(request_params, 'footoken', mock.MagicMock()) 63 | 64 | def test_bad_source_id(self): 65 | """Request for a new compilation with a bad source_id value.""" 66 | request_params = MultiDict({ 67 | 'source_id': '1234!@#$', 68 | 'checksum': 'as12345=', 69 | 'output_format': 'pdf' 70 | }) 71 | with self.assertRaises(BadRequest): 72 | controllers.compile(request_params, 'footoken', mock.MagicMock()) 73 | 74 | def test_bad_format(self): 75 | """Request for a new compilation with a bad output_format value.""" 76 | request_params = MultiDict({ 77 | 'source_id': 
'1234', 78 | 'checksum': 'as12345=', 79 | 'output_format': 'fdp' 80 | }) 81 | with self.assertRaises(BadRequest): 82 | controllers.compile(request_params, 'footoken', mock.MagicMock()) 83 | 84 | @mock.patch(f'{controllers.__name__}.url_for', mock_url_for) 85 | @mock.patch(f'{controllers.__name__}.compiler') 86 | @mock.patch(f'{controllers.__name__}.Store') 87 | @mock.patch(f'{controllers.__name__}.filemanager.FileManager') 88 | def test_compile_de_novo(self, mock_fm, mock_store, mock_compiler): 89 | """Request for a new compilation.""" 90 | mock_fm.current_session.return_value.owner.return_value = None 91 | mock_compiler.NoSuchTask = compiler.NoSuchTask 92 | mock_compiler.get_task.side_effect = compiler.NoSuchTask 93 | task_id = '123::asdf12345zxcv::pdf' 94 | token = 'footoken' 95 | mock_compiler.start_compilation.return_value = task_id 96 | 97 | request_data = MultiDict({ 98 | 'source_id': '1234', 99 | 'checksum': 'asdf12345zxcv', 100 | 'output_format': 'pdf' 101 | }) 102 | with self.app.app_context(): 103 | response_data = controllers.compile( 104 | request_data, 105 | token, 106 | mock.MagicMock() 107 | ) 108 | data, code, headers = response_data 109 | self.assertEqual(code, status.ACCEPTED) 110 | self.assertIn('Location', headers) 111 | self.assertIn(str(request_data['source_id']), headers['Location']) 112 | self.assertIn(request_data['checksum'], headers['Location']) 113 | self.assertIn(request_data['output_format'], headers['Location']) 114 | 115 | @mock.patch(f'{controllers.__name__}.url_for', mock_url_for) 116 | @mock.patch(f'{controllers.__name__}.compiler') 117 | @mock.patch(f'{controllers.__name__}.Store') 118 | def test_compile_exists(self, mock_store, mock_compiler): 119 | """Request for a compilation that already exists.""" 120 | task_id = '123::asdf12345zxcv::pdf' 121 | source_id = '1234' 122 | checksum = 'asdf12345zxcv' 123 | output_format = 'pdf' 124 | token = "footoken" 125 | mock_compiler.get_task.return_value \ 126 | = Task(source_id=source_id, 
127 | output_format=Format.PDF, 128 | status=Status.COMPLETED, 129 | task_id=task_id, 130 | checksum=checksum) 131 | request_data = MultiDict({'source_id': source_id, 'checksum': checksum, 132 | 'output_format': output_format}) 133 | mock_session = mock.MagicMock() 134 | response_data = controllers.compile(request_data, token, mock_session) 135 | data, code, headers = response_data 136 | self.assertEqual(code, status.SEE_OTHER) 137 | self.assertIn('Location', headers) 138 | self.assertIn(str(request_data['source_id']), headers['Location']) 139 | self.assertIn(request_data['checksum'], headers['Location']) 140 | self.assertIn(request_data['output_format'], headers['Location']) 141 | 142 | 143 | class TestGetTask(TestCase): 144 | """Tests for :func:`controllers.get_status`.""" 145 | 146 | def test_bad_checksum(self): 147 | """Request for status with a bad checksum value.""" 148 | with self.assertRaises(BadRequest): 149 | controllers.get_status('1234', 'as12345!@#$', 'pdf') 150 | 151 | def test_bad_source_id(self): 152 | """Request for status with a bad source_id value.""" 153 | with self.assertRaises(BadRequest): 154 | controllers.get_status('1234!@#$', 'as12345=', 'pdf') 155 | 156 | def test_bad_format(self): 157 | """Request for status with a bad output_format value.""" 158 | with self.assertRaises(BadRequest): 159 | controllers.get_status('1234', 'as12345=', 'fdp') 160 | 161 | @mock.patch(f'{controllers.__name__}.url_for', mock_url_for) 162 | @mock.patch(f'{controllers.__name__}.compiler') 163 | def test_get_info_completed(self, mock_compiler): 164 | """Request for a completed compilation.""" 165 | task_id = '123::asdf12345zxcv::pdf' 166 | source_id = '1234' 167 | checksum = 'asdf12345zxcv' 168 | output_format = 'pdf' 169 | mock_compiler.get_task.return_value \ 170 | = Task(source_id=source_id, 171 | output_format=Format.PDF, 172 | status=Status.COMPLETED, 173 | task_id=task_id, 174 | checksum=checksum) 175 | response_data = controllers.get_status(source_id, 
checksum, 176 | output_format) 177 | data, code, headers = response_data 178 | self.assertEqual(code, status.OK) 179 | 180 | @mock.patch(f'{controllers.__name__}.url_for', mock_url_for) 181 | @mock.patch(f'{controllers.__name__}.compiler') 182 | def test_get_info_in_progress(self, mock_compiler): 183 | """Request for a compilation in progress.""" 184 | task_id = 'task1234' 185 | source_id = '1234' 186 | checksum = 'asdf12345zxcv' 187 | output_format = 'pdf' 188 | mock_compiler.get_task.return_value \ 189 | = Task(source_id=source_id, 190 | output_format=Format.PDF, 191 | status=Status.IN_PROGRESS, 192 | task_id=task_id, 193 | checksum=checksum) 194 | response_data = controllers.get_status(source_id, checksum, 195 | output_format) 196 | data, code, headers = response_data 197 | self.assertEqual(code, status.OK) 198 | 199 | @mock.patch(f'{controllers.__name__}.url_for', mock_url_for) 200 | @mock.patch(f'{controllers.__name__}.compiler') 201 | def test_get_info_nonexistant(self, mock_compiler): 202 | """Request for a nonexistant compilation.""" 203 | mock_compiler.NoSuchTask = compiler.NoSuchTask 204 | source_id = '1234' 205 | checksum = 'asdf12345zxcv' 206 | output_format = 'pdf' 207 | mock_compiler.get_task.side_effect = compiler.NoSuchTask 208 | 209 | with self.assertRaises(NotFound): 210 | controllers.get_status(source_id, checksum, output_format) 211 | 212 | @mock.patch(f'{controllers.__name__}.url_for', mock_url_for) 213 | @mock.patch(f'{controllers.__name__}.compiler') 214 | def test_get_status_completed(self, mock_compiler): 215 | """Request for a completed compilation.""" 216 | task_id = 'task1234' 217 | source_id = '1234' 218 | checksum = 'asdf12345zxcv' 219 | output_format = 'pdf' 220 | mock_compiler.get_task.return_value \ 221 | = Task(source_id=source_id, 222 | output_format=Format.PDF, 223 | status=Status.COMPLETED, 224 | task_id=task_id, 225 | checksum=checksum) 226 | response_data = controllers.get_status(source_id, checksum, 227 | output_format) 228 | 
data, code, headers = response_data 229 | self.assertEqual(code, status.OK) 230 | 231 | @mock.patch(f'{controllers.__name__}.url_for', mock_url_for) 232 | @mock.patch(f'{controllers.__name__}.compiler') 233 | def test_get_status_in_progress(self, mock_compiler): 234 | """Request for a completed compilation.""" 235 | task_id = 'task1234' 236 | source_id = '1234' 237 | checksum = 'asdf12345zxcv' 238 | output_format = Format.PDF 239 | mock_compiler.get_task.return_value \ 240 | = Task(source_id=source_id, 241 | output_format=Format.PDF, 242 | status=Status.IN_PROGRESS, 243 | task_id=task_id, 244 | checksum=checksum) 245 | response_data = controllers.get_status(source_id, checksum, 246 | output_format.value) 247 | data, code, headers = response_data 248 | self.assertEqual(code, status.OK) 249 | 250 | 251 | class TestGetProduct(TestCase): 252 | """Tests for :func:`controllers.get_product`.""" 253 | 254 | def test_bad_checksum(self): 255 | """Request with a bad checksum value.""" 256 | with self.assertRaises(BadRequest): 257 | controllers.get_product('1234', 'as12345!@#$', 'pdf') 258 | 259 | def test_bad_source_id(self): 260 | """Request with a bad source_id value.""" 261 | with self.assertRaises(BadRequest): 262 | controllers.get_product('1234!@#$', 'as12345=', 'pdf') 263 | 264 | def test_bad_format(self): 265 | """Request with a bad output_format value.""" 266 | with self.assertRaises(BadRequest): 267 | controllers.get_product('1234', 'as12345=', 'fdp') 268 | 269 | @mock.patch(f'{controllers.__name__}.url_for', mock_url_for) 270 | @mock.patch(f'{controllers.__name__}.compiler', mock.MagicMock()) 271 | @mock.patch(f'{controllers.__name__}.Store') 272 | def test_get_product_completed(self, mock_store): 273 | """Request for a completed compilation product.""" 274 | task_id = 'task1234' 275 | source_id = '1234' 276 | checksum = 'asdf12345zxcv' 277 | output_format = 'pdf' 278 | product_checksum = 'thechecksumoftheproduct' 279 | 
mock_store.current_session.return_value.retrieve.return_value \ 280 | = Product(stream=io.BytesIO(b'foocontent'), 281 | checksum=product_checksum) 282 | response_data = controllers.get_product( 283 | source_id, 284 | checksum, 285 | output_format 286 | ) 287 | data, code, headers = response_data 288 | self.assertEqual(code, status.OK) 289 | self.assertEqual(headers['ETag'], product_checksum) 290 | 291 | @mock.patch(f'{controllers.__name__}.url_for', mock_url_for) 292 | @mock.patch(f'{controllers.__name__}.compiler') 293 | @mock.patch(f'{controllers.__name__}.Store') 294 | def test_get_product_nonexistant(self, mock_store, mock_compiler): 295 | """Request for a nonexistant compilation product.""" 296 | mock_compiler.NoSuchTask = compiler.NoSuchTask 297 | source_id = '1234' 298 | checksum = 'asdf12345zxcv' 299 | output_format = 'pdf' 300 | mock_compiler.get_task.side_effect = compiler.NoSuchTask 301 | 302 | with self.assertRaises(NotFound): 303 | controllers.get_product(source_id, checksum, output_format) 304 | 305 | 306 | class TestGetCompilationLog(TestCase): 307 | """Tests for :func:`controllers.get_log`.""" 308 | 309 | def test_bad_checksum(self): 310 | """Request with a bad checksum value.""" 311 | with self.assertRaises(BadRequest): 312 | controllers.get_log('1234', 'as12345!@#$', 'pdf') 313 | 314 | def test_bad_source_id(self): 315 | """Request with a bad source_id value.""" 316 | with self.assertRaises(BadRequest): 317 | controllers.get_log('1234!@#$', 'as12345=', 'pdf') 318 | 319 | def test_bad_format(self): 320 | """Request with a bad output_format value.""" 321 | with self.assertRaises(BadRequest): 322 | controllers.get_log('1234', 'as12345=', 'fdp') 323 | 324 | @mock.patch(f'{controllers.__name__}.url_for', mock_url_for) 325 | @mock.patch(f'{controllers.__name__}.compiler', mock.MagicMock()) 326 | @mock.patch(f'{controllers.__name__}.Store') 327 | def test_get_log_completed(self, mock_store): 328 | """Request log for a completed compilation.""" 329 | 
task_id = 'task1234' 330 | source_id = '1234' 331 | checksum = 'asdf12345zxcv' 332 | output_format = 'pdf' 333 | product_checksum = 'thechecksumoftheproduct' 334 | mock_store.current_session.return_value.retrieve_log.return_value \ 335 | = Product(stream=io.BytesIO(b'foolog'), 336 | checksum=product_checksum) 337 | response_data = controllers.get_log( 338 | source_id, 339 | checksum, 340 | output_format 341 | ) 342 | data, code, headers = response_data 343 | self.assertEqual(code, status.OK) 344 | self.assertEqual(headers['ETag'], product_checksum) 345 | 346 | @mock.patch(f'{controllers.__name__}.url_for', mock_url_for) 347 | @mock.patch(f'{controllers.__name__}.compiler') 348 | @mock.patch(f'{controllers.__name__}.Store') 349 | def test_get_log_nonexistant(self, mock_store, mock_compiler): 350 | """Request for a nonexistant compilation log.""" 351 | mock_compiler.NoSuchTask = compiler.NoSuchTask 352 | source_id = '1234' 353 | checksum = 'asdf12345zxcv' 354 | output_format = 'pdf' 355 | mock_compiler.get_task.side_effect = compiler.NoSuchTask 356 | 357 | with self.assertRaises(NotFound): 358 | controllers.get_log(source_id, checksum, output_format) 359 | -------------------------------------------------------------------------------- /compiler/util.py: -------------------------------------------------------------------------------- 1 | """Helpers and utilities for the compilation service.""" 2 | 3 | from typing import Iterator 4 | from io import BytesIO, SEEK_END 5 | 6 | 7 | class ResponseStream(object): 8 | """Streaming wrapper for bytes-producing iterators.""" 9 | 10 | def __init__(self, iterator: Iterator) -> None: 11 | """Set the bytes-producing iterator.""" 12 | self.read = iterator 13 | 14 | # def read(self, *args, **kwargs) -> Iterator: 15 | # """Get bytes from the stream.""" 16 | # return self._iterator(chunk_size=1, decode_unicode=False) 17 | -------------------------------------------------------------------------------- /compiler/worker.py: 
-------------------------------------------------------------------------------- 1 | """Initialize the Celery application.""" 2 | 3 | import os 4 | from typing import Any 5 | from base64 import b64decode 6 | 7 | import docker 8 | from celery.signals import task_prerun, celeryd_init, worker_init, celeryd_init 9 | import boto3 10 | 11 | from arxiv.vault.manager import ConfigManager 12 | from .factory import create_app as create_flask_app 13 | from .celery import celery_app 14 | 15 | app = create_flask_app() 16 | app.app_context().push() # type: ignore 17 | 18 | if app.config['VAULT_ENABLED']: 19 | __secrets__ = app.middlewares['VaultMiddleware'].secrets 20 | else: 21 | __secrets__ = None 22 | 23 | 24 | @celeryd_init.connect 25 | def get_secrets(*args: Any, **kwargs: Any) -> None: 26 | """Collect any required secrets from Vault, and get the convert image.""" 27 | if not app.config['VAULT_ENABLED']: 28 | print('Vault not enabled; skipping') 29 | return 30 | for key, value in __secrets__.yield_secrets(): 31 | app.config[key] = value 32 | print('updated secrets') 33 | 34 | 35 | @celeryd_init.connect 36 | def verify_converter_image_up_to_date(*args: Any, **kwargs: Any) -> None: 37 | """Upon startup, pull the compiler image.""" 38 | image = app.config['CONVERTER_DOCKER_IMAGE'] 39 | ecr_registry, _ = image.split('/', 1) 40 | client = docker.from_env() 41 | 42 | # Get login credentials from AWS for the ECR registry. 43 | ecr = boto3.client('ecr', 44 | region_name=app.config.get('AWS_REGION', 'us-east-1')) 45 | response = ecr.get_authorization_token() 46 | token = b64decode(response['authorizationData'][0]['authorizationToken']) 47 | username, password = token.decode('utf-8').split(':', 1) 48 | 49 | # Log in to the ECR registry with Docker. 
50 | client.login(username, password, registry=ecr_registry) 51 | client.images.pull(image) 52 | 53 | 54 | @task_prerun.connect 55 | def verify_secrets_up_to_date(*args: Any, **kwargs: Any) -> None: 56 | """Verify that any required secrets from Vault are up to date.""" 57 | if not app.config['VAULT_ENABLED']: 58 | print('Vault not enabled; skipping') 59 | return 60 | for key, value in __secrets__.yield_secrets(): 61 | app.config[key] = value 62 | print('updated secrets') 63 | -------------------------------------------------------------------------------- /deploy/bin.md5: -------------------------------------------------------------------------------- 1 | 0ee0dbc791be702dbedc7f8145c997dc kubectl 2 | 79b60dbc556b2fab591c2c76dafb9eb3 get_helm.sh 3 | -------------------------------------------------------------------------------- /deploy/compiler/Chart.yaml: -------------------------------------------------------------------------------- 1 | name: compiler 2 | version: 0.0.1 3 | appVersion: 0.0.1 4 | description: LaTeX compiler service 5 | sources: 6 | - https://github.com/arxiv/arxiv-compiler 7 | engine: gotpl 8 | -------------------------------------------------------------------------------- /deploy/compiler/README.md: -------------------------------------------------------------------------------- 1 | # Deployment Instructions for compiler 2 | 3 | To install `compiler` and `compiler-worker` to the development namespace in the 4 | kubernetes cluster: 5 | 6 | 7 | ```bash 8 | helm install ./ --name=compiler --set=image.tag=6b1b6a4 \ 9 | --tiller-namespace=development --namespace=development \ 10 | --set=vault.enabled=1 --set=vault.port=8200 --set=vault.host= \ 11 | --set=ingress.host=development.arxiv.org \ 12 | --set=redis.host= 13 | ``` 14 | 15 | This will create *at least* 1 pod for `compiler` and *at least* 1 pod for `compiler-worker`, depending on the how the values for `scaling.worker_replicas` and `scaling.api_replicas` are set; the defaults for both is 3. 
16 | 17 | The `compiler` pod(s) run a single container called `compiler` and the `compiler-worker` pods run two containers: `compiler-dind-daemon` and `arxiv-compiler-worker`. 18 | 19 | 20 | To delete the pods associated with `compiler` and `compiler-worker`, run: 21 | ``` 22 | helm del --purge compiler --tiller-namespace=development 23 | ``` 24 | 25 | Notes: 26 | - `image.tag`: this refers to the tag in [dockerhub](https://hub.docker.com/repository/docker/arxiv/compiler) 27 | - `vault.host`: the actual IP of the Vault host can be retrieved from most of the other pods 28 | - `redis.host`: the Redis cluster is provisioned separately from k8s. See AWS ElastiCache dashboard; get the endpoint from the `tasks-development` cluster (without the port). 29 | -------------------------------------------------------------------------------- /deploy/compiler/templates/00-service.yaml: -------------------------------------------------------------------------------- 1 | kind: Service 2 | metadata: 3 | annotations: 4 | prometheus.io/scrape: 'true' 5 | name: "{{ default "compiler" .Values.service.name }}" 6 | namespace: "{{ .Values.namespace }}" 7 | labels: 8 | subsystem: "{{ .Values.labels.subsystem }}" 9 | container: "{{ default "compiler" .Values.deployment.name }}" 10 | service-group: "{{ .Values.labels.service_group }}" 11 | log-style: uwsgi 12 | env: "{{ .Values.namespace }}" 13 | spec: 14 | type: NodePort 15 | ports: 16 | - port: 80 17 | targetPort: 8000 18 | selector: 19 | subsystem: "{{ .Values.labels.subsystem }}" 20 | container: "{{ default "compiler" .Values.deployment.name }}" 21 | -------------------------------------------------------------------------------- /deploy/compiler/templates/10-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | namespace: "{{ .Values.namespace }}" 5 | name: compiler 6 | labels: 7 | subsystem: "{{ .Values.labels.subsystem }}" 8 | 
service-group: "{{ .Values.labels.service_group }}" 9 | 10 | --- 11 | apiVersion: apps/v1beta1 12 | kind: Deployment 13 | metadata: 14 | name: "{{ default "compiler" .Values.deployment.name }}" 15 | namespace: "{{ .Values.namespace }}" 16 | labels: 17 | subsystem: "{{ .Values.labels.subsystem }}" 18 | container: "{{ default "compiler" .Values.deployment.name }}" 19 | service-group: "{{ .Values.labels.service_group }}" 20 | log-style: uwsgi 21 | spec: 22 | replicas: {{ int .Values.scaling.api_replicas }} 23 | template: 24 | metadata: 25 | labels: 26 | subsystem: "{{ .Values.labels.subsystem }}" 27 | container: "{{ default "compiler" .Values.deployment.name }}" 28 | service-group: "{{ .Values.labels.service_group }}" 29 | log-style: uwsgi 30 | # annotations: 31 | # prometheus.io/scrape: 'true' 32 | spec: 33 | serviceAccount: compiler 34 | volumes: 35 | - name: vault-token 36 | emptyDir: {} 37 | - name: vault-certificate 38 | secret: 39 | secretName: vault-certificate 40 | 41 | containers: 42 | - name: "{{ default "compiler" .Values.deployment.name }}" 43 | image: arxiv/compiler:{{ .Values.image.tag }} 44 | imagePullPolicy: Always 45 | volumeMounts: 46 | - name: vault-token 47 | mountPath: /etc/vault 48 | - name: vault-certificate 49 | mountPath: /etc/vault-certificate 50 | readOnly: true 51 | ports: 52 | - containerPort: 8000 53 | 54 | readinessProbe: 55 | periodSeconds: 30 56 | httpGet: 57 | path: "{{ .Values.ingress.path }}/status" 58 | port: 8000 59 | 60 | resources: 61 | limits: 62 | cpu: 300m 63 | memory: 512Mi 64 | requests: 65 | cpu: 200m 66 | memory: 256Mi 67 | # livenessProbe: 68 | # initialDelaySeconds: 2 69 | # periodSeconds: 5 70 | # httpGet: 71 | # path: /status 72 | # port: 8000 73 | # readinessProbe: 74 | # periodSeconds: 5 75 | # httpGet: 76 | # path: /status 77 | # port: 8000 78 | env: 79 | - name: LOGLEVEL 80 | value: "{{ .Values.loglevel }}" 81 | - name: REDIS_ENDPOINT 82 | value: "{{ .Values.redis.host }}" 83 | - name: VAULT_ENABLED 84 | value: 
"1" 85 | - name: VAULT_HOST 86 | value: "{{ .Values.vault.host }}" 87 | - name: VAULT_PORT 88 | value: "{{ .Values.vault.port }}" 89 | - name: VAULT_CERT 90 | value: /etc/vault-certificate/vaulttls.cert.pem 91 | - name: VAULT_ROLE 92 | value: "{{ .Values.vault.role }}-{{ .Values.namespace }}" 93 | - name: VAULT_CREDENTIAL 94 | value: "{{ .Values.vault.credential }}" 95 | - name: S3_SUBMISSION_BUCKET 96 | value: "{{ .Values.s3.submission.bucket }}-{{ .Values.namespace }}" 97 | - name: KUBE_TOKEN 98 | value: /var/run/secrets/kubernetes.io/serviceaccount/token 99 | - name: FILEMANAGER_PROTO 100 | value: "{{ .Values.filemanager.proto }}" 101 | - name: NAMESPACE 102 | value: "{{ .Values.namespace }}" 103 | - name: APPLICATION_ROOT 104 | value: "{{ .Values.ingress.path }}" 105 | - name: WAIT_FOR_SERVICES 106 | value: "1" 107 | - name: WAIT_ON_STARTUP 108 | value: "10" 109 | - name: WAIT_FOR_WORKER 110 | value: "1" 111 | 112 | 113 | --- 114 | apiVersion: apps/v1beta1 115 | kind: Deployment 116 | metadata: 117 | name: "{{ default "compiler" .Values.deployment.name }}-worker" 118 | namespace: "{{ .Values.namespace }}" 119 | labels: 120 | subsystem: "{{ .Values.labels.subsystem }}" 121 | container: "{{ default "compiler" .Values.deployment.name }}-worker" 122 | service-group: backend 123 | log-style: celery 124 | spec: 125 | replicas: {{ int .Values.scaling.worker_replicas }} 126 | template: 127 | metadata: 128 | labels: 129 | subsystem: "{{ .Values.labels.subsystem }}" 130 | container: "{{ default "compiler" .Values.deployment.name }}-worker" 131 | service-group: "{{ .Values.labels.service_group }}" 132 | log-style: celery 133 | # annotations: 134 | # prometheus.io/scrape: 'true' 135 | spec: 136 | serviceAccount: compiler 137 | volumes: 138 | - name: vault-token 139 | emptyDir: {} 140 | - name: docker-graph-storage 141 | emptyDir: {} 142 | - name: run 143 | emptyDir: {} 144 | - name: sources 145 | emptyDir: {} 146 | - name: vault-certificate 147 | secret: 148 | secretName: 
vault-certificate 149 | 150 | containers: 151 | - name: compiler-dind-daemon 152 | image: docker:18.09.2-dind 153 | resources: 154 | requests: 155 | memory: "512Mi" 156 | cpu: "500m" 157 | limits: 158 | memory: "1Gi" 159 | cpu: "1" 160 | securityContext: 161 | privileged: true 162 | volumeMounts: 163 | - name: docker-graph-storage 164 | mountPath: /var/lib/docker 165 | - name: sources 166 | mountPath: /sources 167 | - name: vault-token 168 | mountPath: /etc/vault 169 | - name: vault-certificate 170 | mountPath: /etc/vault-certificate 171 | readOnly: true 172 | 173 | - name: arxiv-compiler-worker 174 | image: arxiv/compiler:{{ .Values.image.tag }} 175 | imagePullPolicy: Always 176 | ports: 177 | - containerPort: 8000 178 | resources: 179 | limits: 180 | cpu: 300m 181 | memory: 256Mi 182 | requests: 183 | cpu: 100m 184 | memory: 128Mi 185 | volumeMounts: 186 | - name: run 187 | mountPath: /var/run/celery 188 | - name: sources 189 | mountPath: /sources 190 | - name: vault-token 191 | mountPath: /etc/vault 192 | - name: vault-certificate 193 | mountPath: /etc/vault-certificate 194 | readOnly: true 195 | command: ['pipenv', 'run', 'celery', 'worker', '-A', 'compiler.worker.celery_app', '-l', 'INFO', '-E', '--concurrency=2'] 196 | env: 197 | - name: REDIS_ENDPOINT 198 | value: "{{ .Values.redis.host }}" 199 | - name: VAULT_ENABLED 200 | value: "1" 201 | - name: VAULT_HOST 202 | value: "{{ .Values.vault.host }}" 203 | - name: VAULT_PORT 204 | value: "{{ .Values.vault.port }}" 205 | - name: VAULT_CERT 206 | value: /etc/vault-certificate/vaulttls.cert.pem 207 | - name: VAULT_ROLE 208 | value: "{{ .Values.vault.role }}-{{ .Values.namespace }}" 209 | - name: VAULT_CREDENTIAL 210 | value: "{{ .Values.vault.credential }}" 211 | - name: S3_SUBMISSION_BUCKET 212 | value: "{{ .Values.s3.submission.bucket }}-{{ .Values.namespace }}" 213 | - name: KUBE_TOKEN 214 | value: /var/run/secrets/kubernetes.io/serviceaccount/token 215 | - name: DOCKER_HOST 216 | value: tcp://localhost:2375 
217 | - name: CONVERTER_DOCKER_IMAGE 218 | value: "{{ .Values.converter.image }}" 219 | - name: DIND_SOURCE_ROOT 220 | value: /sources 221 | - name: VERBOSE_COMPILE 222 | value: "1" 223 | - name: WORKER_SOURCE_ROOT 224 | value: /sources 225 | - name: WAIT_FOR_SERVICES 226 | value: "1" 227 | - name: WAIT_ON_STARTUP 228 | value: "10" 229 | - name: LOGLEVEL 230 | value: "{{ .Values.loglevel }}" 231 | - name: FILEMANAGER_PROTO 232 | value: "{{ .Values.filemanager.proto }}" 233 | - name: NAMESPACE 234 | value: "{{ .Values.namespace }}" 235 | # {{ if .Values.filemanager.override }} 236 | # - name: FILEMANAGER_SERVICE_HOST 237 | # value: "{{ .Values.filemanager.host }}" 238 | # - name: FILEMANAGER_SERVICE_PORT 239 | # value: "{{ .Values.filemanager.port }}" 240 | # - name: FILEMANAGER_SERVICE_PORT_443_PROTO 241 | # value: "{{ .Values.filemanager.proto }}" 242 | # - name: FILEMANAGER_PATH 243 | # value: "{{ .Values.filemanager.path }}" 244 | # - name: FILEMANAGER_CONTENT_PATH 245 | # value: "{{ .Values.filemanager.content_path }}" 246 | # - name: FILEMANAGER_VERIFY_CHECKSUM 247 | # value: "{{ .Values.filemanager.verify_checksum }}" 248 | # - name: FILEMANAGER_STATUS_ENDPOINT 249 | # value: "{{ .Values.filemanager.status_endpoint }}" 250 | # {{ end }} 251 | -------------------------------------------------------------------------------- /deploy/compiler/templates/20-ingress.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Ingress 3 | metadata: 4 | name: "{{ default "compiler" .Values.ingress.name }}" 5 | namespace: "{{ .Values.namespace }}" 6 | labels: 7 | subsystem: "{{ .Values.labels.subsystem }}" 8 | container: "{{ default "compiler" .Values.deployment.name }}" 9 | service-group: api 10 | annotations: 11 | ingress.kubernetes.io/configuration-snippet: | 12 | more_set_headers "Request-Id: $req_id"; 13 | ingress.kubernetes.io/limit-connections: "4" 14 | ingress.kubernetes.io/limit-rps: "16" 
15 | ingress.kubernetes.io/rewrite-target: / 16 | spec: 17 | tls: # This will use the default certificate for the ingress controller. 18 | - hosts: 19 | - "{{ .Values.ingress.host }}" 20 | rules: 21 | - host: "{{ .Values.ingress.host }}" 22 | http: 23 | paths: 24 | - path: "{{ default "/compiler" .Values.ingress.path }}" 25 | backend: 26 | serviceName: "{{ default "compiler" .Values.service.name }}" 27 | servicePort: 80 28 | -------------------------------------------------------------------------------- /deploy/compiler/values.yaml: -------------------------------------------------------------------------------- 1 | labels: 2 | subsystem: submission-moderation 3 | service_group: backend 4 | 5 | scaling: 6 | worker_replicas: "3" 7 | api_replicas: "3" 8 | 9 | redis: 10 | host: task-queue-development 11 | port: 6379 12 | namespace: development 13 | image: 14 | tag: "0.0" 15 | 16 | deployment: 17 | name: compiler 18 | service: 19 | name: compiler 20 | 21 | redis: 22 | host: changeme 23 | 24 | vault: 25 | host: changeme 26 | port: changeme 27 | role: compiler 28 | credential: compiler 29 | 30 | s3: 31 | submission: 32 | bucket: compiler-submission 33 | region: us-east-1 34 | bucket: compiler-submission 35 | region: us-east-1 36 | 37 | 38 | converter: 39 | image: "arxiv/converter:0.10.1" 40 | 41 | loglevel: 10 42 | 43 | filemanager: 44 | override: false 45 | host: "arxiv.org" 46 | port: "443" 47 | proto: "http" 48 | path: "/" 49 | content_path: "/src/{source_id}" 50 | verify_checksum: "0" 51 | status_endpoint: "" 52 | 53 | 54 | ingress: 55 | host: "development.arxiv.org" 56 | path: "/compiler" 57 | -------------------------------------------------------------------------------- /deploy/install_helm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ENVIRONMENT=$1 4 | TOKEN_NAME=USER_TOKEN_$(echo $ENVIRONMENT | awk '{print toupper($0)}') 5 | SA_NAME=USER_SA_$(echo $ENVIRONMENT | awk '{print toupper($0)}') 6 | 
DEPLOYMENT_HOSTNAME_VAR=DEPLOYMENT_DOMAIN_$(echo $ENVIRONMENT | awk '{print toupper($0)}') 7 | USER_TOKEN=${!TOKEN_NAME} 8 | USER_SA=${!SA_NAME} 9 | HELM_RELEASE=${CHART_NAME}-${ENVIRONMENT} 10 | DEPLOYMENT_HOSTNAME=${!DEPLOYMENT_HOSTNAME_VAR} 11 | 12 | # Install kubectl & Helm 13 | curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.9.2/bin/linux/amd64/kubectl 14 | curl https://raw.githubusercontent.com/kubernetes/helm/master/scripts/get > get_helm.sh 15 | 16 | md5sum -c ./deploy/bin.md5 17 | if [[ "$?" != "0" ]]; then 18 | echo "One or more binary does not match expected checksum" 19 | exit 1 20 | fi 21 | 22 | chmod +x ./kubectl 23 | sudo mv ./kubectl /usr/local/bin/kubectl 24 | echo "Intalled kubectl" 25 | 26 | chmod 700 get_helm.sh 27 | sudo ./get_helm.sh -v v2.8.0 || echo "Helm already installed" 28 | echo "Intalled Helm" 29 | 30 | # Configure Kubernetes & Helm 31 | echo $CA_CERT | base64 --decode > ${HOME}/ca.crt 32 | 33 | kubectl config set-cluster $CLUSTER_NAME --embed-certs=true --server=$CLUSTER_ENDPOINT --certificate-authority=${HOME}/ca.crt 34 | kubectl config set-credentials $USER_SA --token=$(echo $USER_TOKEN | base64 --decode) 35 | kubectl config set-context travis --cluster=$CLUSTER_NAME --user=$USER_SA --namespace=$ENVIRONMENT 36 | kubectl config use-context travis 37 | kubectl config current-context 38 | echo "Configured kubectl" 39 | 40 | helm init --client-only --tiller-namespace $ENVIRONMENT 41 | echo "Set up helm client" 42 | 43 | # Add S3 repo. Requires AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY to be set 44 | # in the environment. 
45 | helm plugin install https://github.com/hypnoglow/helm-s3.git || echo "Helm S3 already installed" 46 | helm repo add arxiv $HELM_REPOSITORY 47 | helm repo update 48 | echo "Updated Helm repo" 49 | -------------------------------------------------------------------------------- /deploy/make_and_push_images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o pipefail 4 | set -o errexit 5 | set -o nounset 6 | 7 | # Used to deploy 8 | 9 | export LABEL=$1 10 | export BASE_VERSION=$2 11 | export LOGLEVEL=40 12 | export IMAGE_NAME=arxiv/${LABEL} 13 | if [ -z "${TRAVIS_TAG}" ]; then 14 | export SOURCE_REF=${TRAVIS_COMMIT} 15 | else 16 | export SOURCE_REF=${TRAVIS_TAG} 17 | fi 18 | 19 | git fetch --unshallow || echo "Repository is already complete" 20 | docker login -u "$DOCKERHUB_USERNAME" -p "$DOCKERHUB_PASSWORD" 21 | docker build ./ -t ${IMAGE_NAME}:${SOURCE_REF} -f ./Dockerfile --build-arg=BASE_VERSION=${BASE_VERSION} 22 | docker push ${IMAGE_NAME}:${SOURCE_REF} 23 | -------------------------------------------------------------------------------- /deploy/publish_helm_chart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o pipefail 4 | set -o errexit 5 | set -o nounset 6 | 7 | 8 | export LOGLEVEL=40 9 | if [ -z "${TRAVIS_TAG}" ]; then 10 | export SOURCE_REF=${TRAVIS_COMMIT} 11 | else 12 | export SOURCE_REF=${TRAVIS_TAG} 13 | fi 14 | 15 | helm package --version ${SOURCE_REF} --app-version ${SOURCE_REF} ./deploy/compiler/ 16 | helm s3 push compiler-${SOURCE_REF}.tgz arxiv || echo "This chart version already published" 17 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | x-base-service: 3 | &base-service 4 | stdin_open: true 5 | tty: true 6 | environment: 7 | SECRET_KEY: "not secure 
only use for development" 8 | JWT_SECRET: "foosecret" 9 | DOCKER_HOST: "unix:///var/run/docker.sock" 10 | REDIS_ENDPOINT: "compiler-test-redis" 11 | AWS_ACCESS_KEY_ID: "${AWS_ACCESS_KEY_ID}" 12 | AWS_SECRET_ACCESS_KEY: "${AWS_SECRET_ACCESS_KEY}" 13 | AWS_S3_REGION_NAME: "us-east-1" 14 | S3_ENDPOINT: "https://compiler-test-localstack:4572" 15 | S3_VERIFY: 0 16 | CONVERTER_DOCKER_IMAGE: "626657773168.dkr.ecr.us-east-1.amazonaws.com/arxiv/converter:0.9" 17 | FILEMANAGER_SERVICE_HOST: "arxiv.org" 18 | FILEMANAGER_SERVICE_PORT: "443" 19 | FILEMANAGER_SERVICE_PORT_443_PROTO: "https" 20 | FILEMANAGER_PATH: "/" 21 | FILEMANAGER_CONTENT_PATH: "/src/{source_id}" 22 | FILEMANAGER_STATUS_ENDPOINT: "" 23 | FILEMANAGER_VERIFY_CHECKSUM: 0 24 | LOGLEVEL: 10 25 | FLASK_APP: /opt/arxiv/app.py 26 | FLASK_DEBUG: 1 27 | DIND_SOURCE_ROOT: "${DIND_SOURCE_ROOT}" 28 | VERBOSE_COMPILE: 1 29 | VAULT_ENABLED: "0" 30 | NAMESPACE: "production" 31 | KUBE_TOKEN: "fookubetoken" 32 | WAIT_FOR_SERVICES: 1 33 | WAIT_ON_STARTUP: 5 34 | 35 | services: 36 | compiler-test-redis: 37 | image: redis 38 | container_name: compiler-test-redis 39 | networks: 40 | - compiler-test 41 | ports: 42 | - "6379:6379" 43 | logging: 44 | driver: none 45 | 46 | compiler-test-localstack: 47 | image: atlassianlabs/localstack 48 | container_name: compiler-test-localstack 49 | networks: 50 | - compiler-test 51 | ports: 52 | - "4572:4572" 53 | - "4568:4568" 54 | environment: 55 | USE_SSL: 'true' 56 | DEBUG: 'true' 57 | logging: 58 | driver: none 59 | 60 | compiler-test-api: 61 | << : *base-service 62 | build: 63 | context: . 64 | args: 65 | BASE_VERSION: "0.16.1" 66 | # command: pipenv run flask run -h 0.0.0.0 -p 8000 67 | depends_on: 68 | - "compiler-test-localstack" 69 | - "compiler-test-redis" 70 | - "compiler-test-worker" 71 | networks: 72 | - compiler-test 73 | ports: 74 | - "8000:8000" 75 | 76 | compiler-test-worker: 77 | << : *base-service 78 | build: 79 | context: . 
80 | args: 81 | BASE_VERSION: "0.16.1" 82 | command: celery worker -A compiler.worker.celery_app --loglevel=INFO -E --concurrency=2 83 | depends_on: 84 | - "compiler-test-localstack" 85 | - "compiler-test-redis" 86 | networks: 87 | - compiler-test 88 | 89 | volumes: 90 | - "${DIND_SOURCE_ROOT}:/tmp" 91 | - /var/run/docker.sock:/var/run/docker.sock 92 | 93 | networks: 94 | compiler-test: 95 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | -------------------------------------------------------------------------------- /docs/source/api/compiler.celery.rst: -------------------------------------------------------------------------------- 1 | compiler.celery module 2 | ====================== 3 | 4 | .. automodule:: compiler.celery 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.celeryconfig.rst: -------------------------------------------------------------------------------- 1 | compiler.celeryconfig module 2 | ============================ 3 | 4 | .. 
automodule:: compiler.celeryconfig 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.compiler.rst: -------------------------------------------------------------------------------- 1 | compiler.compiler module 2 | ======================== 3 | 4 | .. automodule:: compiler.compiler 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.config.rst: -------------------------------------------------------------------------------- 1 | compiler.config module 2 | ====================== 3 | 4 | .. automodule:: compiler.config 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.controllers.rst: -------------------------------------------------------------------------------- 1 | compiler.controllers module 2 | =========================== 3 | 4 | .. automodule:: compiler.controllers 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.domain.rst: -------------------------------------------------------------------------------- 1 | compiler.domain module 2 | ====================== 3 | 4 | .. automodule:: compiler.domain 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.factory.rst: -------------------------------------------------------------------------------- 1 | compiler.factory module 2 | ======================= 3 | 4 | .. 
automodule:: compiler.factory 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.routes.rst: -------------------------------------------------------------------------------- 1 | compiler.routes module 2 | ====================== 3 | 4 | .. automodule:: compiler.routes 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.rst: -------------------------------------------------------------------------------- 1 | compiler package 2 | ================ 3 | 4 | .. automodule:: compiler 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | 14 | compiler.services 15 | compiler.tests 16 | 17 | Submodules 18 | ---------- 19 | 20 | .. toctree:: 21 | 22 | compiler.celery 23 | compiler.celeryconfig 24 | compiler.compiler 25 | compiler.config 26 | compiler.controllers 27 | compiler.domain 28 | compiler.factory 29 | compiler.routes 30 | compiler.util 31 | compiler.worker 32 | 33 | -------------------------------------------------------------------------------- /docs/source/api/compiler.services.filemanager.rst: -------------------------------------------------------------------------------- 1 | compiler.services.filemanager package 2 | ===================================== 3 | 4 | .. automodule:: compiler.services.filemanager 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. 
toctree:: 13 | 14 | compiler.services.filemanager.tests 15 | 16 | -------------------------------------------------------------------------------- /docs/source/api/compiler.services.filemanager.tests.rst: -------------------------------------------------------------------------------- 1 | compiler.services.filemanager.tests package 2 | =========================================== 3 | 4 | .. automodule:: compiler.services.filemanager.tests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | 14 | compiler.services.filemanager.tests.test_filemanager 15 | 16 | -------------------------------------------------------------------------------- /docs/source/api/compiler.services.filemanager.tests.test_filemanager.rst: -------------------------------------------------------------------------------- 1 | compiler.services.filemanager.tests.test\_filemanager module 2 | ============================================================ 3 | 4 | .. automodule:: compiler.services.filemanager.tests.test_filemanager 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.services.rst: -------------------------------------------------------------------------------- 1 | compiler.services package 2 | ========================= 3 | 4 | .. automodule:: compiler.services 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | 14 | compiler.services.filemanager 15 | compiler.services.store 16 | 17 | -------------------------------------------------------------------------------- /docs/source/api/compiler.services.store.rst: -------------------------------------------------------------------------------- 1 | compiler.services.store package 2 | =============================== 3 | 4 | .. 
automodule:: compiler.services.store 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | 14 | compiler.services.store.tests 15 | 16 | -------------------------------------------------------------------------------- /docs/source/api/compiler.services.store.tests.rst: -------------------------------------------------------------------------------- 1 | compiler.services.store.tests module 2 | ==================================== 3 | 4 | .. automodule:: compiler.services.store.tests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.tests.rst: -------------------------------------------------------------------------------- 1 | compiler.tests package 2 | ====================== 3 | 4 | .. automodule:: compiler.tests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | 14 | compiler.tests.test_app 15 | compiler.tests.test_compiler 16 | compiler.tests.test_controllers 17 | 18 | -------------------------------------------------------------------------------- /docs/source/api/compiler.tests.test_app.rst: -------------------------------------------------------------------------------- 1 | compiler.tests.test\_app module 2 | =============================== 3 | 4 | .. automodule:: compiler.tests.test_app 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.tests.test_compiler.rst: -------------------------------------------------------------------------------- 1 | compiler.tests.test\_compiler module 2 | ==================================== 3 | 4 | .. 
automodule:: compiler.tests.test_compiler 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.tests.test_controllers.rst: -------------------------------------------------------------------------------- 1 | compiler.tests.test\_controllers module 2 | ======================================= 3 | 4 | .. automodule:: compiler.tests.test_controllers 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.util.rst: -------------------------------------------------------------------------------- 1 | compiler.util module 2 | ==================== 3 | 4 | .. automodule:: compiler.util 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/compiler.worker.rst: -------------------------------------------------------------------------------- 1 | compiler.worker module 2 | ====================== 3 | 4 | .. automodule:: compiler.worker 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/modules.rst: -------------------------------------------------------------------------------- 1 | compiler 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | compiler 8 | -------------------------------------------------------------------------------- /docs/source/architecture.rst: -------------------------------------------------------------------------------- 1 | Architecture 2 | ============ 3 | 4 | This section provides a high-level overview of the compilation service, 5 | including its context within the arXiv software system, the primary containers 6 | by which the service is comprised, and the main functional components within 7 | those containers. 
8 | 9 | Context 10 | ------- 11 | The compilation service encapsulates the core TeX compilation functionality 12 | of the arXiv system. This functionality is leveraged in four contexts: 13 | 14 | 1. Compilation of submission content (TeX, PS) to PDF during the submission 15 | process so that the submitter can preview their e-print as it will appear 16 | on arXiv. 17 | #. Providing compiled PDFs during moderation, to facilitate quality assurance 18 | checks and moderator review. 19 | #. Compilation of PDF, PS, DVI, and other derived formats at announcement time, 20 | to be included in the canonical record and distributed via the public 21 | website. 22 | #. Compilation of sources provided by API consumers, e.g. for overlay journals 23 | or authoring platforms to validate compatibility with arXiv and/or provide 24 | previews to authors. 25 | 26 | The current implementation of the compiler service focuses on the first two 27 | contexts, specifically on PDF outputs. 28 | 29 | The third context (announcement) will be supported during later milestones of 30 | the arXiv-NG project. 31 | 32 | The fourth context is currently aspirational. 33 | 34 | Containers 35 | ---------- 36 | The compilation service is deployed as three containers: 37 | 38 | 1. The compiler API, a Flask WSGI application that handles requests for 39 | compilation, dispatches compilation tasks to the worker, and provides access 40 | to compiled products and logs. 41 | #. The compiler worker, a Celery application that handles compilation tasks 42 | by retrieving source content, executing the 43 | `converter <https://github.com/arXiv/arxiv-converter>`_, and storing the 44 | resultant products and logs. 45 | #. A Docker-in-Docker (DinD) container, within which the converter image is 46 | executed. The DinD exposes its API to the compiler worker. 47 | 48 | The API and worker containers rely on two infrastructure services: 49 | 50 | 1.
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file contains only a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, as shown here.
#
import os
import sys
# os.path.abspath resolves these against the current working directory, so the
# entries added here depend on where the build is started from.
sys.path.insert(0, os.path.abspath('.'))
sys.path.append(os.path.abspath('..'))
sys.path.append(os.path.abspath('../..'))

# Loading this configuration creates an application and pushes its context as
# a side effect. Presumably this lets autodoc import modules that expect an
# active application context -- TODO confirm.
from compiler.factory import create_app
app = create_app()
app.app_context().push()


# -- Project information -----------------------------------------------------

project = 'arXiv Compilation Service'
copyright = '2018, arXiv-NG Team'
author = 'arXiv-NG Team'

# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = '0.1'


# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx_autodoc_typehints',
    'sphinx.ext.autosummary',
    'sphinx.ext.napoleon',
    'sphinx.ext.intersphinx',
    'sphinx.ext.graphviz',
    'sphinx.ext.todo',
    'sphinx.ext.coverage',
    'sphinx.ext.mathjax',
    'sphinx.ext.ifconfig',
    'sphinx.ext.viewcode',
    'sphinx.ext.githubpages'
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by the theme itself. Builtin themes use these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}


# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'arXivCompilationServicedoc'


# -- Options for LaTeX output ------------------------------------------------

# Every entry below is commented out, so this dictionary is empty.
latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'arXivCompilationService.tex', 'arXiv Compilation Service Documentation',
     'arXiv-NG Team', 'manual'),
]


# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'arxivcompilationservice', 'arXiv Compilation Service Documentation',
     [author], 1)
]


# -- Options for Texinfo output ----------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
# NOTE(review): the description below is still the generic placeholder text.
texinfo_documents = [
    (master_doc, 'arXivCompilationService', 'arXiv Compilation Service Documentation',
     author, 'arXivCompilationService', 'One line description of project.',
     'Miscellaneous'),
]


# -- Options for Epub output -------------------------------------------------

# Bibliographic Dublin Core info.
epub_title = project

# The unique identifier of the text. This can be an ISBN number
# or the project homepage.
#
# epub_identifier = ''

# A unique identification for the text.
#
# epub_uid = ''

# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']


# -- Extension configuration -------------------------------------------------

# -- Options for intersphinx extension ---------------------------------------

# Maps each cross-reference prefix to the base URL of an external Sphinx
# project. 'arxiv.taxonomy' and 'arxiv.base' point at the same URL.
intersphinx_mapping = {
    'python': ('https://docs.python.org/3.6', None),
    'arxitecture': ('https://arxiv.github.io/arxiv-arxitecture/', None),
    'arxiv.taxonomy': ('https://arxiv.github.io/arxiv-base', None),
    'arxiv.base': ('https://arxiv.github.io/arxiv-base', None),
    'arxiv.users': ('https://arxiv.github.io/arxiv-auth', None),
    'browse': ('https://arxiv.github.io/arxiv-browse/', None),
    'search': ('https://arxiv.github.io/arxiv-search/', None),
    'zero': ('https://arxiv.github.io/arxiv-zero/', None),
}
"""Mock endpoint for Vault secrets."""

from flask import Flask, send_file, jsonify, request
from datetime import datetime

application = Flask(__name__)

# Monotonic counters used to make each issued token / request ID unique.
TOK_ID = 0
KV_ID = 0
AWS_ID = 0

# Issue time of every token handed out by ``log_in``, keyed by integer ID.
tokens = {}


@application.route('/v1/auth/kubernetes/login', methods=['POST'])
def log_in():
    """
    Issue a new client token.

    The token is the next value of the ``TOK_ID`` counter; its issue time is
    recorded in ``tokens`` under the integer ID, while the response carries
    the ID as a string.
    """
    global TOK_ID
    TOK_ID += 1
    tokens[TOK_ID] = datetime.now()
    return jsonify({'auth': {'client_token': f'{TOK_ID}'}})


# The ``<path:path>`` variable is required: the view takes a ``path``
# argument, and without a URL variable Flask would call it with no arguments
# and every request would fail with a TypeError. The ``path`` converter also
# accepts secret paths that contain slashes.
@application.route('/v1/secret/data/<path:path>')
def get_kv_secret(path):
    """
    Get a key/value secret.

    The same canned payload is returned for every ``path``; only the request
    ID changes between calls.
    """
    global KV_ID
    KV_ID += 1
    return jsonify({
        "request_id": f"foo-request-{KV_ID}",
        "lease_id": "",
        "renewable": False,
        "lease_duration": 0,
        "data": {
            "data": {
                "jwt-secret": "foosecret"
            },
            "metadata": {
                "created_time": "2019-04-18T12:58:32.820693897Z",
                "deletion_time": "",
                "destroyed": False,
                "version": 1
            }
        },
        "wrap_info": None,
        "warnings": None,
        "auth": None
    })


# As above, the ``<role>`` variable supplies the view's ``role`` argument.
@application.route('/v1/aws/creds/<role>')
def get_aws_secret(role):
    """
    Get an AWS credential.

    Returns fixed credentials; ``role`` is only echoed in the lease ID.
    """
    global AWS_ID
    AWS_ID += 1
    return jsonify({
        "request_id": f"a-request-id-{AWS_ID}",
        "lease_id": f"aws/creds/{role}/a-lease-id-{AWS_ID}",
        "renewable": True,
        "lease_duration": 3600,
        "data": {
            "access_key": "ASDF1234",
            "secret_key": "xljadslklk3mlkmlkmxklmx09j3990j",
            "security_token": None
        },
        "wrap_info": None,
        "warnings": None,
        "auth": None
    })
# POST is accepted because the handler reads the token from a JSON request
# body; GET is kept so that existing clients of the mock keep working.
@application.route('/v1/auth/token/lookup', methods=['GET', 'POST'])
def look_up_a_token():
    """
    Look up an auth token.

    Expects a JSON body of the form ``{"token": "<client token>"}`` and
    returns canned token metadata whose creation and issue times come from
    the moment the token was recorded in ``tokens``.

    Responds with HTTP 403 and an ``errors`` list when the body is missing,
    malformed, or names a token that was never issued, instead of failing
    with an unhandled exception.
    """
    # ``force`` parses the body regardless of content type; ``silent`` turns
    # a missing or invalid body into ``None`` rather than an error response.
    payload = request.get_json(force=True, silent=True)
    tok = payload.get('token') if isinstance(payload, dict) else None

    # Tokens are stored under integer IDs but handed to clients as strings,
    # so try the raw value first and then its integer form.
    try:
        issued = tokens.get(tok) or tokens.get(int(tok))
    except (TypeError, ValueError):
        issued = None

    if issued is None:
        response = jsonify({"errors": ["bad token"]})
        response.status_code = 403
        return response

    return jsonify({
        "data": {
            "accessor": "8609694a-cdbc-db9b-d345-e782dbb562ed",
            "creation_time": int(round(issued.timestamp(), 0)),
            "creation_ttl": 2764800,
            "display_name": "fooname",
            "entity_id": "7d2e3179-f69b-450c-7179-ac8ee8bd8ca9",
            "expire_time": "2018-05-19T11:35:54.466476215-04:00",
            "explicit_max_ttl": 0,
            "id": "cf64a70f-3a12-3f6c-791d-6cef6d390eed",
            "identity_policies": [
                "dev-group-policy"
            ],
            "issue_time": issued.isoformat(),
            "meta": {
                "username": "tesla"
            },
            "num_uses": 0,
            "orphan": True,
            "path": "auth/kubernetes/login",
            "policies": [
                "default"
            ],
            "renewable": True,
            "ttl": 2764790
        }
    })
/profile.yml: -------------------------------------------------------------------------------- 1 | # Application profile 2 | 3 | application: 4 | slug: compiler # image: arxiv/filemanager 5 | type: wsgi # exposes port 8000 6 | subsystem: submission-moderation 7 | service_group: backend 8 | resources: 9 | memory: 10 | min: 128Mi 11 | max: 256Mi 12 | cpu: 13 | min: 100m 14 | max: 300m 15 | dependencies: 16 | - name: compiler-submission 17 | type: s3 18 | description: Contains compiler products. 19 | - name: task-queue 20 | type: redis 21 | description: Task queue and result backend for Celery. 22 | secrets: 23 | - name: JWT_SECRET 24 | type: generic 25 | - name: write-compiler-submission 26 | type: aws 27 | resources: 28 | - action: 29 | - s3:GetObject 30 | - s3:PutObject 31 | - s3:ListBucket 32 | - s3:DeleteObject 33 | resource: s3 34 | name: compiler-submission 35 | 36 | 37 | application: 38 | slug: compiler-worker # image: arxiv/filemanager 39 | type: worker # exposes port 8000 40 | subsystem: submission-moderation 41 | service_group: backend 42 | resources: 43 | memory: 44 | min: 128Mi 45 | max: 256Mi 46 | cpu: 47 | min: 100m 48 | max: 300m 49 | dependencies: 50 | - name: compiler-submission 51 | type: s3 52 | description: Contains compiler products. 53 | - name: converter-image 54 | type: image 55 | var: CONVERTER_DOCKER_IMAGE 56 | value: 626657773168.dkr.ecr.us-east-1.amazonaws.com/arxiv/converter:0.8 57 | description: Image to be executed within DinD container. 58 | - name: task-queue 59 | type: redis 60 | description: Task queue and result backend for Celery. 
61 | secrets: 62 | - name: jwt-secret 63 | type: generic 64 | var: JWT_SECRET 65 | - name: write-compiler-submission 66 | type: aws 67 | resources: 68 | - action: 69 | - s3:GetObject 70 | - s3:PutObject 71 | - s3:ListBucket 72 | - s3:DeleteObject 73 | resource: s3 74 | name: compiler-submission 75 | - action: 76 | - "ecr:GetAuthorizationToken" 77 | - "ecr:BatchCheckLayerAvailability" 78 | - "ecr:GetDownloadUrlForLayer" 79 | - "ecr:GetRepositoryPolicy" 80 | - "ecr:DescribeRepositories" 81 | - "ecr:ListImages" 82 | - "ecr:DescribeImages" 83 | - "ecr:BatchGetImage" 84 | resource: ecr 85 | name: converter-image 86 | -------------------------------------------------------------------------------- /schema/openapi.yaml: -------------------------------------------------------------------------------- 1 | openapi: "3.0.0" 2 | info: 3 | version: "0.1" 4 | title: "arXiv Compiler Service" 5 | contact: 6 | name: "arXiv API Team" 7 | email: nextgen@arxiv.org 8 | license: 9 | name: MIT 10 | 11 | components: 12 | parameters: 13 | source_id: 14 | name: source_id 15 | in: path 16 | description: The upload workspace ID. 17 | required: true 18 | schema: 19 | type: integer 20 | checksum: 21 | name: checksum 22 | in: path 23 | description: The checksum of the upload workspace being compiled. 24 | required: true 25 | schema: 26 | type: string 27 | output_format: 28 | name: output_format 29 | in: path 30 | description: The compilation output format. 31 | required: true 32 | schema: 33 | type: string 34 | 35 | paths: 36 | /status: 37 | get: 38 | operationId: getServiceStatus 39 | summary: | 40 | Get information about the current status of the compilation service. 41 | responses: 42 | '200': 43 | description: OK 44 | '503': 45 | description: Service Unavailable 46 | 47 | /: 48 | post: 49 | description: | 50 | Using the source_id, checksum, and format, use the Celery AsyncTask API 51 | to check the existence/status of a compilation.
If it does not exist, 52 | create an asynchronous task using the source_id, checksum, and format 53 | as a joint key and as call parameters. In either case, redirect (303) 54 | to getCompilationStatus. 55 | operationId: requestCompilation 56 | requestBody: 57 | required: true 58 | content: 59 | application/json: 60 | schema: 61 | $ref: 'resources/requestCompilation.json' 62 | responses: 63 | '202': 64 | description: Accepted for compilation. 65 | headers: 66 | Location: 67 | description: The URL for the compilation task. 68 | schema: 69 | type: string 70 | 71 | 72 | /{source_id}/{checksum}/{output_format}: 73 | description: | 74 | This resource represents the compilation and its products. This should 75 | tell the client about the compilation task itself (e.g. when it started, 76 | its status, parameters), the location of the product of the compilation 77 | (e.g. the PDF, DVI, PS), and the location of the log. 78 | parameters: 79 | - $ref: '#/components/parameters/source_id' 80 | - $ref: '#/components/parameters/checksum' 81 | - $ref: '#/components/parameters/output_format' 82 | get: 83 | operationId: getCompilationStatus 84 | responses: 85 | '200': 86 | description: OK 87 | content: 88 | application/json: 89 | schema: 90 | $ref: 'resources/compilationStatus.json' 91 | '303': 92 | description: Redirect to getCompilationStatus for the compilation. 93 | '404': 94 | description: No compilation has been requested. 95 | 96 | 97 | /{source_id}/{checksum}/{output_format}/log: 98 | description: | 99 | The log bytestream itself. 100 | TODO: add text/plain mime-type, etc. to response description.
101 | parameters: 102 | - $ref: '#/components/parameters/source_id' 103 | - $ref: '#/components/parameters/checksum' 104 | - $ref: '#/components/parameters/output_format' 105 | get: 106 | operationId: getCompilationLog 107 | responses: 108 | '200': 109 | description: OK 110 | 111 | /{source_id}/{checksum}/{output_format}/product: 112 | description: | 113 | The product bytestream; e.g. application/pdf, application/ps 114 | parameters: 115 | - $ref: '#/components/parameters/source_id' 116 | - $ref: '#/components/parameters/checksum' 117 | - $ref: '#/components/parameters/output_format' 118 | get: 119 | operationId: getProduct 120 | responses: 121 | '200': 122 | description: OK 123 | -------------------------------------------------------------------------------- /schema/resources/compilationInfo.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Compilation Info", 3 | "description": "Describes the compilations and outputs for the given upload.", 4 | "additionalProperties": false, 5 | "required": ["source_id", "products", "timestamp"], 6 | "type": "object", 7 | "properties": { 8 | "source_id": { 9 | "description": "Identifier of the upload bundle.", 10 | "type": "integer" 11 | }, 12 | "products": { 13 | "additionalProperties": { 14 | "type": "string", 15 | "$ref" : "compilationProduct.json" 16 | } 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /schema/resources/compilationStatus.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Compilation Product", 3 | "description": "Describes the compilation and outputs for an upload, along with times for the process.", 4 | "additionalProperties": false, 5 | "required": ["content", "log", "status", "md5", "create_datetime"], 6 | "type": "object", 7 | "properties": { 8 | "content": { 9 | "type": "string", 10 | "description": "URL of the compiled content" 11 | }, 
12 | "log": { 13 | "type": "string", 14 | "description": "URL of the compilation log" 15 | }, 16 | "status": { 17 | "description": "Current status of the compilation task.", 18 | "type": "string", 19 | "enum": ["FAILED", "SUCCEEDED", "IN_PROGRESS"] 20 | }, 21 | "checksum": { 22 | "description": "The md5 checksum of the upload bundle at time of compilation.", 23 | "type": "string" 24 | }, 25 | "compilation_requested": { 26 | "description": "The date-time that compilation was requested.", 27 | "type": "string", 28 | "format": "datetime" 29 | }, 30 | "compilation_start": { 31 | "description": "The date-time that compilation started.", 32 | "type": "string", 33 | "format": "datetime" 34 | }, 35 | "compilation_end": { 36 | "description": "The date-time that compilation ended.", 37 | "type": "string", 38 | "format": "datetime" 39 | }, 40 | "source_updated": { 41 | "description": "The date-time when the source package was updated.", 42 | "type": "string", 43 | "format": "datetime" 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /schema/resources/requestCompilation.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Compilation Request", 3 | "description": "Describes the compilation request.", 4 | "additionalProperties": false, 5 | "required": ["source_id", "checksum", "format"], 6 | "properties": { 7 | "source_id": {"type": "integer"}, 8 | "checksum": {"type": "string"}, 9 | "format": {"type": "string"}, 10 | "force": {"type": "boolean"}, 11 | "compiler": {"type": "string"}, 12 | "stamp_label": { 13 | "description": "Label to use in PS/PDF stamp/watermark", 14 | "type": "string" 15 | }, 16 | "stamp_link": { 17 | "description": "Link to associate with label in PS/PDF stamp/watermark", 18 | "type": "string" 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /test.py: 
"""Ad-hoc smoke test: submit compilation requests and poll until they finish.

Expects a compiler service listening on ``http://localhost:8000/``. Each
submission is driven by its own coroutine so that the 10-second polling
sleeps of different submissions overlap.
"""

import asyncio
from datetime import datetime
from itertools import chain
import json
import logging
import random
import requests

# Statuses for which polling should continue.
PENDING_STATUSES = ('in_progress', 'pending')


def generate_arxiv_id():
    """Return a random identifier of the form ``YYMM.NNNNN``."""
    year = random.randint(9, 17)
    month = random.randint(1, 12)
    number = random.randint(1, 2500)

    return f"{year:02d}{month:02d}.{number:05d}"


def payload(id):
    """
    Build the request body for a compilation request.

    Parameters
    ----------
    id : str
        Identifier whose first four characters are ``YYMM``.

    Returns
    -------
    dict
        Body with ``source_id``, ``checksum``, ``format`` and ``force``.

    """
    year = int(id[0:2])
    # Two-digit years above 90 belong to the 1900s, the rest to the 2000s.
    year += 1900 if year > 90 else 2000
    month = int(id[2:4])
    date = datetime(year, month, 1, 1, 4, 33)
    return {
        "source_id": id,
        # TODO: Update with actual checksum; a timestamp derived from the
        # identifier is used as a stand-in.
        "checksum": date.timestamp(),
        "format": "pdf",
        "force": True
    }


def check_status(task_url):
    """
    Fetch the status string for a compilation task.

    Parameters
    ----------
    task_url : str
        URL returned in the ``Location`` header of the submission response.

    Returns
    -------
    str or None
        The reported status; ``'pending'`` when the response cannot be
        parsed and the service answered 404; ``None`` for any other
        unparseable response.

    """
    r = requests.get(task_url)
    try:
        return r.json()['status']['status']
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON. KeyError/TypeError: JSON with an
        # unexpected shape. Anything else is a real error and propagates.
        if r.status_code == 404:
            return 'pending'
        logging.warning("unexpected response (%s) from %s",
                        r.status_code, task_url)
        return None


async def test_compilation(arxiv_id=None):
    """
    Submit one compilation request and poll until it leaves the pending set.

    Parameters
    ----------
    arxiv_id : str, optional
        Identifier to compile; a random one is generated when omitted.

    Returns
    -------
    tuple
        ``(arxiv_id, success)``; ``success`` is ``True`` only when the final
        status is ``'completed'``.

    """
    if arxiv_id is None:
        arxiv_id = generate_arxiv_id()
    data = json.dumps(payload(arxiv_id))
    logging.debug("submitting task for %s", arxiv_id)
    r = requests.post("http://localhost:8000/", data=data)
    task_url = r.headers.get('Location')
    if task_url is None:
        # Without a task URL there is nothing to poll; report a failure
        # instead of dying with a bare KeyError.
        logging.error("no Location header for %s (HTTP %s)",
                      arxiv_id, r.status_code)
        return (arxiv_id, False)

    status = 'in_progress'
    while status in PENDING_STATUSES:
        await asyncio.sleep(10)
        status = check_status(task_url)
        print(arxiv_id, status)

    # Any terminal status other than 'completed' (including an unreadable
    # one) counts as a failure, so the caller always receives a 2-tuple.
    return (arxiv_id, status == 'completed')


def main(N=1, ids=None):
    """
    Run compilation checks concurrently and print one result per line.

    Parameters
    ----------
    N : int
        Number of random identifiers to try when ``ids`` is empty.
    ids : list of str, optional
        Explicit identifiers to compile; takes precedence over ``N``.

    """
    if ids:
        coros = [test_compilation(arxiv_id) for arxiv_id in ids]
    else:
        coros = [test_compilation() for _ in range(N)]

    async def _run_all():
        # gather() preserves submission order and tolerates an empty list.
        return await asyncio.gather(*coros)

    # A private loop avoids relying on an implicit "current" event loop.
    loop = asyncio.new_event_loop()
    try:
        results = loop.run_until_complete(_run_all())
    finally:
        loop.close()

    for arxiv_id, success in results:
        print(arxiv_id, success)


if __name__ == '__main__':
    from argparse import ArgumentParser

    parser = ArgumentParser()
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-N', type=int, default=5)
    group.add_argument('--ids', nargs="+")
    args = parser.parse_args()

    main(N=args.N, ids=args.ids)
"$TRAVIS_PULL_REQUEST_SHA" = "" ]; then SHA=$TRAVIS_COMMIT; else SHA=$TRAVIS_PULL_REQUEST_SHA; fi 12 | if [ "$PYLINT_PASS" ]; then PYLINT_STATE="success" && echo "pylint passed with score "$PYLINT_SCORE" for sha "$SHA; else PYLINT_STATE="failure" && echo "pylint failed with score "$PYLINT_SCORE" for sha "$SHA; fi 13 | 14 | if [ -z ${GITHUB_TOKEN} ]; then 15 | echo "Github token not set; will not report results"; 16 | else 17 | curl -u $USERNAME:$GITHUB_TOKEN \ 18 | -d '{"state": "'$PYLINT_STATE'", "target_url": "https://travis-ci.org/'$TRAVIS_REPO_SLUG'/builds/'$TRAVIS_BUILD_ID'", "description": "'$PYLINT_SCORE'/10", "context": "code-quality/pylint"}' \ 19 | -XPOST https://api.github.com/repos/$TRAVIS_REPO_SLUG/statuses/$SHA \ 20 | > /dev/null 2>&1; 21 | fi -------------------------------------------------------------------------------- /tests/static.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | PROJECT=$1 6 | 7 | MYPY_STATUS=$( pipenv run mypy -p ${PROJECT} | tee /dev/tty | grep -v "test.*" | wc -l | tr -d '[:space:]' ) 8 | if [ $MYPY_STATUS -ne 0 ]; then MYPY_STATE="failure" && echo "mypy failed"; else MYPY_STATE="success" && echo "mypy passed"; fi 9 | if [ "$TRAVIS_PULL_REQUEST_SHA" = "" ]; then SHA=$TRAVIS_COMMIT; else SHA=$TRAVIS_PULL_REQUEST_SHA; fi 10 | if [ -z ${GITHUB_TOKEN} ]; then 11 | echo "Github token not set; will not report results"; 12 | else 13 | curl -u $USERNAME:$GITHUB_TOKEN \ 14 | -d '{"state": "'$MYPY_STATE'", "target_url": "https://travis-ci.org/'$TRAVIS_REPO_SLUG'/builds/'$TRAVIS_BUILD_ID'", "description": "", "context": "code-quality/mypy"}' \ 15 | -XPOST https://api.github.com/repos/$TRAVIS_REPO_SLUG/statuses/$SHA \ 16 | > /dev/null 2>&1; 17 | fi -------------------------------------------------------------------------------- /update-docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 
"""Web Server Gateway Interface entry-point."""

import os
from typing import Optional
from flask import Flask
from compiler.factory import create_app

__flask_app__ = create_app()


def application(environ, start_response):
    """
    Handle one WSGI request with the module-level Flask app.

    Every item of the WSGI ``environ`` is copied, stringified, into
    ``os.environ``. Items whose key already exists in the Flask config
    also overwrite that config entry with the raw value, except for
    ``SERVER_NAME``.
    """
    app_config = __flask_app__.config
    for name in environ:
        raw = environ[name]
        os.environ[name] = str(raw)

        # The value for SERVER_NAME will usually be the container ID or some
        # other useless hostname, so it never overrides the app config.
        if name == 'SERVER_NAME' or name not in app_config:
            continue
        app_config[name] = raw

    return __flask_app__(environ, start_response)