├── .coveragerc ├── docs ├── sequence.png ├── architecture.png ├── uris_example.png ├── client_server.png ├── headers_example.png ├── client_server_tg.png ├── code_architecture.png ├── authors.rst ├── changes.rst ├── installation.rst ├── contributing.rst ├── license.rst ├── api.rst ├── index.rst ├── advanced-features.rst ├── http-response-headers.rst ├── memento.rst ├── cache.rst ├── big-picture.rst ├── introduction.rst ├── getting-started.rst ├── configuration.rst ├── handler.rst ├── make.bat ├── Makefile └── conf.py ├── timegate ├── examples │ ├── __init__.py │ ├── es.py │ ├── aueb.py │ ├── si.py │ ├── cat.py │ ├── sg.py │ ├── can.py │ ├── cr.py │ ├── loc.py │ ├── w3c.py │ ├── arxiv.py │ ├── nara.py │ ├── webcite.py │ ├── simple.py │ ├── wikia.py │ ├── orain.py │ ├── mediawiki.py │ ├── pastpages.py │ ├── wikipedia.py │ ├── github.py │ └── gitlab.py ├── version.py ├── __init__.py ├── _compat.py ├── conf │ ├── timegate.ini │ └── config.ini ├── errors.py ├── constants.py ├── config.py ├── handler.py ├── utils.py ├── cache.py └── application.py ├── CHANGES.rst ├── pytest.ini ├── MANIFEST.in ├── setup.cfg ├── run-tests.sh ├── INSTALL.rst ├── RELEASE-NOTES.rst ├── AUTHORS.rst ├── LICENSE ├── .gitignore ├── tests ├── conftest.py └── test_timegate.py ├── README.rst ├── .travis.yml ├── CONTRIBUTING.rst └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = timegate/examples/* 3 | -------------------------------------------------------------------------------- /docs/sequence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/sequence.png -------------------------------------------------------------------------------- /docs/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/architecture.png 
-------------------------------------------------------------------------------- /docs/uris_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/uris_example.png -------------------------------------------------------------------------------- /docs/client_server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/client_server.png -------------------------------------------------------------------------------- /docs/headers_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/headers_example.png -------------------------------------------------------------------------------- /docs/client_server_tg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/client_server_tg.png -------------------------------------------------------------------------------- /docs/code_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/code_architecture.png -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. 2 | This file is part of TimeGate 3 | Copyright (C) 2016 CERN. 4 | 5 | TimeGate is free software; you can redistribute it and/or modify 6 | it under the terms of the Revised BSD License; see LICENSE file for 7 | more details. 8 | 9 | .. 
include:: ../AUTHORS.rst 10 | -------------------------------------------------------------------------------- /docs/changes.rst: -------------------------------------------------------------------------------- 1 | .. 2 | This file is part of TimeGate 3 | Copyright (C) 2016 CERN. 4 | 5 | TimeGate is free software; you can redistribute it and/or modify 6 | it under the terms of the Revised BSD License; see LICENSE file for 7 | more details. 8 | 9 | .. include:: ../CHANGES.rst 10 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. 2 | This file is part of TimeGate 3 | Copyright (C) 2016 CERN. 4 | 5 | TimeGate is free software; you can redistribute it and/or modify 6 | it under the terms of the Revised BSD License; see LICENSE file for 7 | more details. 8 | 9 | .. include:: ../INSTALL.rst 10 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. 2 | This file is part of TimeGate 3 | Copyright (C) 2016 CERN. 4 | 5 | TimeGate is free software; you can redistribute it and/or modify 6 | it under the terms of the Revised BSD License; see LICENSE file for 7 | more details. 8 | 9 | .. include:: ../CONTRIBUTING.rst 10 | -------------------------------------------------------------------------------- /docs/license.rst: -------------------------------------------------------------------------------- 1 | .. 2 | This file is part of TimeGate 3 | Copyright (C) 2016 CERN. 4 | 5 | TimeGate is free software; you can redistribute it and/or modify 6 | it under the terms of the Revised BSD License; see LICENSE file for 7 | more details. 8 | 9 | License 10 | ======= 11 | 12 | .. 
include:: ../LICENSE 13 | :literal: 14 | -------------------------------------------------------------------------------- /timegate/examples/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2016 CERN. 5 | # 6 | # TimeGate is free software; you can redistribute it and/or modify 7 | # it under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """List of examples for TimeGate.""" 11 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | .. 2 | This file is part of TimeGate 3 | Copyright (C) 2016 CERN. 4 | 5 | TimeGate is free software; you can redistribute it and/or modify 6 | it under the terms of the Revised BSD License; see LICENSE file for 7 | more details. 8 | 9 | 10 | Changes 11 | ======= 12 | 13 | Version 0.5.0 (released TBD) 14 | 15 | - Initial public release. 16 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2016 CERN. 5 | # 6 | # TimeGate is free software; you can redistribute it and/or modify 7 | # it under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 
9 | 10 | [pytest] 11 | addopts = --pep8 --ignore=docs --cov=timegate --cov-report=term-missing 12 | pep8ignore = timegate/examples/* ALL 13 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.py 2 | include *.rst 3 | include *.sh 4 | include .coveragerc 5 | include LICENSE 6 | include pytest.ini 7 | include timegate/conf/*.ini 8 | recursive-include conf *.ini 9 | recursive-include docs *.bat 10 | recursive-include docs *.png 11 | recursive-include docs *.py 12 | recursive-include docs *.rst 13 | recursive-include docs Makefile 14 | recursive-include examples *.py 15 | recursive-include tests *.py 16 | 17 | prune docs/_build 18 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2016 CERN. 5 | # 6 | # TimeGate is free software; you can redistribute it and/or modify 7 | # it under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | 11 | [aliases] 12 | test=pytest 13 | 14 | [build_sphinx] 15 | source-dir = docs/ 16 | build-dir = docs/_build 17 | all_files = 1 18 | 19 | [bdist_wheel] 20 | universal = 1 21 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | .. 2 | This file is part of TimeGate 3 | Copyright (C) 2016 CERN. 4 | 5 | TimeGate is free software; you can redistribute it and/or modify 6 | it under the terms of the Revised BSD License; see LICENSE file for 7 | more details. 8 | 9 | 10 | API Docs 11 | ======== 12 | 13 | .. automodule:: timegate.application 14 | 15 | Errors 16 | ------ 17 | 18 | .. 
automodule:: timegate.errors 19 | :members: 20 | 21 | Utilities 22 | --------- 23 | 24 | .. automodule:: timegate.utils 25 | :members: 26 | -------------------------------------------------------------------------------- /timegate/version.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2016 CERN. 5 | # 6 | # TimeGate is free software; you can redistribute it and/or modify 7 | # it under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | """Version information for TimeGate. 11 | 12 | This file is imported by ``timegate.__init__``, and parsed by 13 | ``setup.py``. 14 | 15 | """ 16 | 17 | from __future__ import absolute_import, print_function 18 | 19 | __version__ = "0.5.0.dev20160000" 20 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # -*- coding: utf-8 -*- 3 | # 4 | # This file is part of TimeGate. 5 | # Copyright (C) 2016 CERN. 6 | # 7 | # TimeGate is free software; you can redistribute it and/or modify 8 | # it under the terms of the Revised BSD License; see LICENSE file for 9 | # more details. 10 | 11 | # pydocstyle timegate && \ 12 | isort -rc -c -df **/*.py && \ 13 | check-manifest --ignore ".travis-*" && \ 14 | sphinx-build -qnNW docs docs/_build/html && \ 15 | python setup.py test && \ 16 | sphinx-build -qnNW -b doctest docs docs/_build/doctest 17 | -------------------------------------------------------------------------------- /INSTALL.rst: -------------------------------------------------------------------------------- 1 | .. 2 | This file is part of TimeGate 3 | Copyright (C) 2016 CERN. 
4 | 5 | TimeGate is free software; you can redistribute it and/or modify 6 | it under the terms of the Revised BSD License; see LICENSE file for 7 | more details. 8 | 9 | Installation 10 | ============ 11 | 12 | In this installation guide, we’ll create a basic TimeGate instance. 13 | 14 | .. code-block:: console 15 | 16 | $ pip install -e git+https://github.com/mementoweb/timegate.git#egg=TimeGate 17 | $ uwsgi --http :9999 -s /tmp/mysock.sock --module timegate.application --callable application 18 | -------------------------------------------------------------------------------- /RELEASE-NOTES.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | TimeGate v0.5.0 3 | ================= 4 | 5 | TimeGate v0.5.0 was released on TBD, 2016. 6 | 7 | About 8 | ----- 9 | 10 | A Memento TimeGate. 11 | 12 | What's new 13 | ---------- 14 | 15 | - Initial public release. 16 | 17 | Installation 18 | ------------ 19 | 20 | $ pip install timegate==0.5.0 21 | 22 | Documentation 23 | ------------- 24 | 25 | http://pythonhosted.org/timegate/ 26 | 27 | Happy hacking and thanks for flying TimeGate. 28 | 29 | | TimeGate Development Team 30 | | GitHub: https://github.com/mementoweb/timegate 31 | | URL: http://mementoweb.org 32 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | .. 2 | This file is part of TimeGate 3 | Copyright (C) 2016 CERN. 4 | 5 | TimeGate is free software; you can redistribute it and/or modify 6 | it under the terms of the Revised BSD License; see LICENSE file for 7 | more details. 
8 | 9 | 10 | Authors 11 | ======= 12 | 13 | Memento TimeGate contributors: 14 | 15 | - Christian Pietsch 16 | - Harihar Shankar 17 | - Jiri Kuncar 18 | - Luda171 19 | - Sawood Alam 20 | - Tibor Simko 21 | - Yorick Chollet 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2013, 2 | Yorick Chollet, Harihar Shankar, Herbert Van de Sompel. 3 | -- Los Alamos National Laboratory. 4 | 5 | Licensed under the BSD open source software license. 6 | You may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://mementoweb.github.io/SiteStory/license.html 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | -------------------------------------------------------------------------------- /timegate/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2014 LANL. 5 | # Copyright (C) 2016 CERN. 6 | # 7 | # TimeGate is free software; you can redistribute it and/or modify 8 | # it under the terms of the Revised BSD License; see LICENSE file for 9 | # more details. 10 | 11 | """Make your web resources Memento compliant in a few easy steps. 12 | 13 | The Memento framework enables datetime negotiation for web resources. 14 | Knowing the URI of a Memento-compliant web resource, a user can select a 15 | date and see what it was like around that time. 
"""PY2/PY3 compatibility layer.

Exposes a single set of names (``urlparse``, ``quote``, ``unquote``,
``text_type``, ``string_types``, ``integer_types``) that resolve to the
right objects on the running interpreter.
"""

import sys

# True when the running interpreter is Python 2.
PY2 = sys.version_info[0] == 2

if not PY2:  # pragma: no cover
    from urllib.parse import quote, unquote, urlparse

    text_type = str
    string_types = (str,)
    integer_types = (int,)
else:  # pragma: no cover
    # ``quote``/``unquote`` are documented members of ``urllib`` on
    # Python 2; ``urllib2`` merely happens to re-export them.
    from urllib import quote, unquote
    from urlparse import urlparse

    text_type = unicode  # noqa: F821 - Python 2 builtin
    string_types = (str, unicode)  # noqa: F821 - Python 2 builtin
    integer_types = (int, long)  # noqa: F821 - Python 2 builtin
To stop the server, use uwsgi --stop /data/var/run/timegate/orain/orain.pid 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # Idea software family 6 | .idea/ 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | 49 | # Translations 50 | *.mo 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # Cache 62 | cache/ 63 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2016 CERN. 5 | # 6 | # TimeGate is free software; you can redistribute it and/or modify 7 | # it under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 
@pytest.fixture()
def app(tmpdir):
    """Return a TimeGate application configured for the test run.

    The cache is enabled and points at a fresh ``cache`` directory created
    inside pytest's per-test ``tmpdir``, so tests never share cache state.
    """
    from timegate import application

    return application.TimeGate(config=dict(
        HOST='http://localhost',
        BASE_URI='http://www.example.com/',
        CACHE_USE=True,
        CACHE_FILE=tmpdir.mkdir('cache').strpath,
    ))


@pytest.fixture()
def client(app):
    """Return a Werkzeug test client wrapping the ``app`` fixture."""
    from werkzeug.test import Client
    from werkzeug.wrappers import BaseResponse

    return Client(app, BaseResponse)
16 | 17 | Installation 18 | ------------ 19 | 20 | Memento TimeGate is on PyPI so all you need is: :: 21 | 22 | pip install -e git+https://github.com/mementoweb/timegate.git#egg=TimeGate 23 | uwsgi --http :9999 -s /tmp/mysock.sock --module timegate.application --callable application 24 | 25 | 26 | Documentation 27 | ------------- 28 | 29 | The documentation is readable at http://timegate.readthedocs.io or can be built 30 | using Sphinx: :: 31 | 32 | pip install timegate[docs] 33 | python setup.py build_sphinx 34 | 35 | 36 | Testing 37 | ------- 38 | 39 | Running the test suite is as simple as: :: 40 | 41 | ./run-tests.sh 42 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 2 | This file is part of TimeGate 3 | Copyright (C) 2016 CERN. 4 | 5 | TimeGate is free software; you can redistribute it and/or modify 6 | it under the terms of the Revised BSD License; see LICENSE file for 7 | more details. 8 | 9 | 10 | .. include:: ../README.rst 11 | :end-before: Installation 12 | 13 | User's Guide 14 | ------------ 15 | 16 | This part of the documentation will show you how to get started in using 17 | TimeGate. 18 | 19 | .. toctree:: 20 | :maxdepth: 2 21 | 22 | introduction 23 | installation 24 | big-picture 25 | getting-started 26 | memento 27 | http-response-headers 28 | handler 29 | configuration 30 | cache 31 | advanced-features 32 | 33 | API Reference 34 | ------------- 35 | 36 | If you are looking for information on a specific function, class or method, 37 | this part of the documentation is for you. 38 | 39 | .. toctree:: 40 | :maxdepth: 2 41 | 42 | api 43 | 44 | Additional Notes 45 | ---------------- 46 | 47 | Notes on how to contribute, legal information and changes are here for the 48 | interested. 49 | 50 | .. 
toctree:: 51 | :maxdepth: 1 52 | 53 | contributing 54 | changes 55 | license 56 | authors 57 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2016 CERN. 5 | # 6 | # TimeGate is free software; you can redistribute it and/or modify 7 | # it under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | 11 | notifications: 12 | email: false 13 | 14 | sudo: false 15 | 16 | language: python 17 | 18 | cache: 19 | - pip 20 | 21 | env: 22 | - REQUIREMENTS=lowest 23 | - REQUIREMENTS=release 24 | # - REQUIREMENTS=devel 25 | 26 | python: 27 | - "2.7" 28 | - "3.3" 29 | - "3.4" 30 | - "3.5" 31 | 32 | before_install: 33 | - "travis_retry pip install --upgrade pip setuptools py" 34 | - "travis_retry pip install twine wheel coveralls requirements-builder" 35 | - "requirements-builder --level=min setup.py > .travis-lowest-requirements.txt" 36 | - "requirements-builder --level=pypi setup.py > .travis-release-requirements.txt" 37 | # - "requirements-builder --level=dev --req requirements-devel.txt setup.py > .travis-devel-requirements.txt" 38 | 39 | install: 40 | - "travis_retry pip install -r .travis-${REQUIREMENTS}-requirements.txt" 41 | - "travis_retry pip install -e .[all]" 42 | 43 | script: 44 | - "./run-tests.sh" 45 | 46 | after_success: 47 | - coveralls 48 | 49 | branches: 50 | only: 51 | - master 52 | -------------------------------------------------------------------------------- /timegate/errors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2014 LANL. 5 | # Copyright (C) 2016 CERN. 
class TimegateError(HTTPException):
    """General TimeGate exception, rendered as an HTTP error response.

    :param msg: Description sent to the client.  When omitted, the
        class-level ``description`` is used.
    :param status: Optional HTTP status code that overrides the class-level
        ``code`` for this instance only.
    """

    code = 400
    description = 'Invalid TimeGate request.'

    def __init__(self, msg=None, status=None):
        # HTTPException only overrides ``description`` when it is given a
        # non-None value, so ``msg=None`` falls back to the class default.
        super(TimegateError, self).__init__(description=msg)
        if status:
            self.code = status


# NOTE(review): on Python 3 this name shadows the builtin ``TimeoutError``
# inside any module that imports it; it is kept for backward compatibility.
class TimeoutError(TimegateError):
    """Raise to signal a timeout."""

    # NOTE(review): 416 is "Requested Range Not Satisfiable"; confirm that
    # this status is really the intended one for a timeout.
    code = 416


class URIRequestError(TimegateError):
    """Raise if the request contains an invalid URI."""

    code = 400


class HandlerError(TimegateError):
    """Raise to signal a handler error."""

    code = 503


class DateTimeError(TimegateError):
    """Raise if the server is unable to handle the date time."""

    code = 400


class CacheError(TimegateError):
    """Raise if the cache is not functioning."""

    code = 500
class EsHandler(Handler):
    """Handler listing the mementos exposed by the Estonia Web Archive."""

    def __init__(self):
        Handler.__init__(self)
        # NOTE(review): the pattern is empty as it appears here, so every
        # match is an empty string - confirm the intended expression.
        self.uriRegex = re.compile(r'')

    def get_all_mementos(self, req_url):
        """Return ``(location, datetime_string)`` pairs for ``req_url``.

        The archive listing at ``BASEURI + req_url`` is fetched and scanned
        with ``self.uriRegex``; each match supplies the datetime (first
        item) and the memento location (second item).  ``" GMT"`` is
        appended to every datetime string.

        :raises HandlerError: with status 404 when the listing cannot be
            requested.
        """
        listing_uri = BASEURI + req_url
        try:
            data = self.request(listing_uri).content
        except Exception as e:
            logging.error("Cannot request URI: %s" % e)
            raise HandlerError("Cannot request URI", 404)

        return [
            (found[1], found[0] + " GMT")
            for found in self.uriRegex.findall(data)
        ]
class GreeceHandler(Handler):
    """Handler for the archive listing served at ``83.212.204.92:8080``."""

    def __init__(self):
        self.baseuri = "http://83.212.204.92:8080/*/"
        # NOTE(review): the pattern is empty as it appears here, so every
        # match is an empty string - confirm the intended expression.
        regex = r''
        self.uriRegex = re.compile(regex)
        Handler.__init__(self)

    def get_all_mementos(self, req_url):
        """Return ``(location, datetime_string)`` pairs for ``req_url``.

        The listing at ``self.baseuri + req_url`` is downloaded and scanned
        with ``self.uriRegex``.  ``" GMT"`` is appended to every datetime
        string.

        :returns: a list of ``(location, datetime_string)`` tuples, or
            ``None`` when the listing cannot be opened (the failure is
            logged).
        """
        # ``urllib.urlopen`` exists on Python 2 only; resolve the function
        # locally so the module-level imports stay untouched.
        try:
            from urllib.request import urlopen
        except ImportError:  # Python 2
            from urllib import urlopen

        uri = self.baseuri + req_url
        try:
            fh = urlopen(uri)
        except Exception as e:
            logging.error("Couldn't retrieve data from %s : %s", uri, e)
            return None
        try:
            data = fh.read()
        finally:
            # Release the connection even when reading fails.
            fh.close()

        # Python 3 returns bytes, which a str pattern cannot search.
        # NOTE(review): UTF-8 is assumed for the listing page - confirm.
        if not isinstance(data, str):
            data = data.decode('utf-8', 'replace')

        # The fixed offsets slice the datetime and the location out of the
        # matched text; they depend on the exact shape of the pattern.
        return [
            (u[52:-2], u[27:41] + " GMT")
            for u in self.uriRegex.findall(data)
        ]
"""Important constants of the TimeGate server."""

# Status lines ("<code> <reason phrase>") keyed by numeric HTTP status code.
HTTP_STATUS = {
    code: "%d %s" % (code, phrase)
    for code, phrase in (
        (200, "OK"),
        (302, "Found"),
        (400, "Bad Request"),
        (403, "Forbidden"),
        (404, "Not Found"),
        (405, "Method Not Allowed"),
        (416, "Requested Range Not Satisfiable"),
        (500, "Internal Server Error"),
        (502, "Bad Gateway"),
        (501, "Not Implemented"),
        (503, "Service Unavailable"),
    )
}

# Memento datetime format (RFC 1123).
DATE_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'

# Safeguard: maximum size of a TimeMap, counted in URIs.
TM_MAX_SIZE = 100 * 1000

# --- Server configuration -------------------------------------------------
HOST = None
STRICT_TIME = True
API_TIME_OUT = 6

# --- Handler configuration ------------------------------------------------
HANDLER_MODULE = 'simple'
BASE_URI = ''
RESOURCE_TYPE = 'vcs'
USE_TIMEMAPS = True

# --- Cache ----------------------------------------------------------------
# When False, every cache request is a cache MISS.
CACHE_USE = False
# Time window (24 * 60 * 60 = 86400) in which a cached value is considered
# young enough to be valid.
CACHE_TOLERANCE = 24 * 60 * 60
# Directory holding the cache files.
CACHE_DIRECTORY = 'cache'
# Maximum number of TimeMaps stored in the cache.
CACHE_MAX_VALUES = 250
# Path of the cache files.
CACHE_FILE = CACHE_DIRECTORY  # + '/cache_data'
# Cache expiration (space bound) in seconds: three days.
CACHE_EXP = 3 * 24 * 60 * 60
class SloveniaHandler(Handler):
    """Handler for the wayback listing at ``nukrobi2.nuk.uni-lj.si``."""

    def __init__(self):
        self.baseuri = "http://nukrobi2.nuk.uni-lj.si:8080/wayback/*/"
        # NOTE(review): the pattern is empty as it appears here, so every
        # match is an empty string - confirm the intended expression.
        regex = r''
        self.uriRegex = re.compile(regex)
        Handler.__init__(self)

    def get_all_mementos(self, req_url):
        """Return ``(location, datetime_string)`` pairs for ``req_url``.

        The listing at ``self.baseuri + req_url`` is downloaded and scanned
        with ``self.uriRegex``.  ``" GMT"`` is appended to every datetime
        string.

        :returns: a list of ``(location, datetime_string)`` tuples, or
            ``None`` when the listing cannot be opened (the failure is
            logged).
        """
        # ``urllib.urlopen`` exists on Python 2 only; resolve the function
        # locally so the module-level imports stay untouched.
        try:
            from urllib.request import urlopen
        except ImportError:  # Python 2
            from urllib import urlopen

        uri = self.baseuri + req_url
        try:
            fh = urlopen(uri)
        except Exception as e:
            logging.error("Couldn't retrieve data from %s : %s", uri, e)
            return None
        try:
            data = fh.read()
        finally:
            # Release the connection even when reading fails.
            fh.close()

        # Python 3 returns bytes, which a str pattern cannot search.
        # NOTE(review): UTF-8 is assumed for the listing page - confirm.
        if not isinstance(data, str):
            data = data.decode('utf-8', 'replace')

        # The fixed offsets slice the datetime and the location out of the
        # matched text; they depend on the exact shape of the pattern.
        return [
            (u[52:-2], u[27:41] + " GMT")
            for u in self.uriRegex.findall(data)
        ]
10 | 11 | """Catalonia handler.""" 12 | 13 | from __future__ import absolute_import, print_function 14 | 15 | import logging 16 | import re 17 | import urllib 18 | 19 | from core.handler_baseclass import Handler 20 | from errors.timegateerrors import HandlerError 21 | 22 | 23 | class CataloniaHandler(Handler): 24 | 25 | def __init__(self): 26 | 27 | self.baseuri = "http://www.padi.cat:8080/wayback/*/" 28 | 29 | regex = r''; 30 | self.uriRegex = re.compile(regex) 31 | Handler.__init__(self) 32 | 33 | def get_all_mementos(self, req_url): 34 | # def fetch_changes(self, req, requri, dt=None): 35 | # implement the changes list for this particular proxy 36 | 37 | uri = self.baseuri + req_url 38 | try: 39 | fh = urllib.urlopen(uri) 40 | except Exception as e: 41 | logging.error("Couldn't retrieve data from %s : %s" % (uri, str(e))) 42 | return None 43 | data = fh.read() 44 | fh.close() 45 | 46 | changes = [] 47 | uris = re.findall(self.uriRegex, data) 48 | for u in uris: 49 | dtstr = u[27:41] 50 | loc = u[52:-2] 51 | dtstr += " GMT" 52 | # dtobj = dateparser.parse(dtstr) 53 | changes.append((loc, dtstr)) 54 | 55 | return changes 56 | -------------------------------------------------------------------------------- /timegate/examples/sg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2016 LANL. 5 | # Copyright (C) 2016 CERN. 6 | # 7 | # TimeGate is free software; you can redistribute it and/or modify 8 | # it under the terms of the Revised BSD License; see LICENSE file for 9 | # more details. 
class SingaporeHandler(Handler):
    """TimeGate handler that scrapes the NLB Singapore Wayback listing."""

    def __init__(self):
        # Listing endpoint; the requested URI is appended to it verbatim.
        # (An earlier endpoint, http://was.nl.sg/wayback/*/, was disabled.)
        self.baseuri = "http://eresources.nlb.gov.sg/webarchives/wayback/*/"

        # NOTE(review): the scraping pattern is empty here, so every match is
        # the empty string and the slices below yield nothing useful.
        # Presumably the original anchor pattern was lost -- confirm and
        # restore it before relying on this handler.
        regex = r''
        self.uriRegex = re.compile(regex)
        Handler.__init__(self)

    def get_all_mementos(self, req_url):
        """Return the captures listed by the archive for ``req_url``.

        :param req_url: URI of the Original Resource.
        :return: list of ``(memento_uri, datetime_string)`` tuples, where the
            datetime string is the scraped timestamp followed by ``" GMT"``;
            ``None`` if the listing could not be opened.
        """
        uri = self.baseuri + req_url
        try:
            fh = urllib.urlopen(uri)
        except Exception as e:
            logging.error("Couldn't retrieve data from %s : %s" % (uri, str(e)))
            return None

        # Always release the connection, even when reading fails midway.
        try:
            data = fh.read()
        finally:
            fh.close()

        changes = []
        for u in self.uriRegex.findall(data):
            # Fixed offsets into the matched markup: a 14-character timestamp
            # at [27:41] and the memento location from 52 up to the last two
            # characters. Presumably tied to the archive's anchor layout --
            # verify against a live listing.
            dtstr = u[27:41] + " GMT"
            loc = u[52:-2]
            changes.append((loc, dtstr))

        return changes
class CanHandler(Handler):
    """TimeGate handler for the collectionscanada.gc.ca web archive."""

    def __init__(self):
        Handler.__init__(self)
        # Listing endpoint; the requested URI is appended to it verbatim.
        self.baseuri = "http://www.collectionscanada.gc.ca/webarchives/*/"
        # Captures the digit run that follows ``/webarchives/`` in a memento
        # URI. Raw string so ``\d`` is a regex escape, not a string escape.
        self.dtre = re.compile(
            r"http://www.collectionscanada.gc.ca/webarchives/(\d+)/")

    def get_all_mementos(self, req_url):
        """Return the mementos the archive lists for ``req_url``.

        :param req_url: URI of the Original Resource.
        :return: list of ``(memento_uri, datetime_string)`` tuples; empty if
            the listing contains no matching links.
        :raises HandlerError: if the listing cannot be parsed.
        """
        iauri = self.baseuri + req_url
        dom = self.get_xml(iauri, html=True)

        changes = []
        for a in dom.xpath('//div[@class="inner-content"]//a'):
            # Named anchors are skipped, as before.
            if 'name' in a.attrib:
                continue
            # An anchor may carry neither ``name`` nor ``href``; skip it
            # instead of failing the whole request with a KeyError.
            uri = a.attrib.get('href')
            if uri is None:
                continue
            match = self.dtre.match(uri)
            if match:
                changes.append((uri, match.group(1)))
        return changes

    def get_xml(self, uri, html=False):
        """Fetch ``uri`` and parse the response body with lxml.

        :param uri: the URI to retrieve.
        :param html: when true, parse as HTML; otherwise parse as XML.
        :return: the parsed lxml tree.
        :raises HandlerError: with status 404 if the body cannot be parsed.
        """
        page = self.request(uri)
        try:
            page_data = page.content
            if html:
                parser = etree.HTMLParser(recover=True)
            else:
                parser = etree.XMLParser(recover=True)
            return etree.parse(StringIO.StringIO(page_data), parser)
        except Exception:
            logging.error("Cannot parse XML/HTML from %s" % uri)
            raise HandlerError("Couldn't parse data from %s" % uri, 404)
11 | 12 | 13 | - The ``conf/config.ini`` file must have the variable ``use_timemap = true``. 14 | 15 | Resulting links 16 | --------------- 17 | 18 | Once this setup is in place, the TimeGate responses' ``Link`` header 19 | will contain two new relations, for two different formats (MIME types): 20 | 21 | - ``<HOST/timemap/link/URI-R>; rel="timemap"; type="application/link-format"`` 22 | `Link TimeMaps <https://tools.ietf.org/html/rfc7089#section-5>`_ 23 | 24 | - ``<HOST/timemap/json/URI-R>; rel="timemap"; type="application/json"`` JSON 25 | TimeMaps 26 | 27 | Where ``HOST`` is the base URI of the program and ``URI-R`` is the URI 28 | of the Original Resource. 29 | 30 | Example 31 | ------- 32 | 33 | For example, suppose ``http://www.example.com/resourceA`` is the URI-R 34 | of an Original Resource. And suppose the TimeGate/TimeMap server's 35 | ``host`` configuration is set to ``http://timegate.example.com``. Then, 36 | HTTP responses from the TimeGate will include the following: 37 | 38 | - ``<http://timegate.example.com/timemap/link/http://www.example.com/resourceA>; rel="timemap"; type="application/link-format"`` 39 | - ``<http://timegate.example.com/timemap/json/http://www.example.com/resourceA>; rel="timemap"; type="application/json"`` 40 | 41 | Now a user can request an ``HTTP GET`` on one of those links and the 42 | server's response will have a ``200 OK`` status code and its body will 43 | be the TimeMap. 44 | 45 | HandlerErrors 46 | ============= 47 | 48 | Custom error messages can be sent to the client using the custom 49 | exception module: ``from timegate.errors import HandlerError``. 50 | For instance, a custom message with HTTP status ``400`` and body 51 | ``Custom error message`` can be sent using: 52 | ``raise HandlerError("Custom error message", status=400)``. Raising a 53 | ``HandlerError`` will stop the request and not return any Memento to the 54 | client. 55 | -------------------------------------------------------------------------------- /docs/http-response-headers.rst: -------------------------------------------------------------------------------- 1 | .. 
_http_response_headers: 2 | 3 | Memento and HTTP 4 | ================ 5 | 6 | The Memento framework requires specific HTTP headers in order to work 7 | properly. They must be added to the server's response headers for any 8 | Original Resource or Memento request. 9 | 10 | Intuitively, a user needs to be able to know which server to contact to 11 | do the time negotiation. Hence a link to the TimeGate is needed from 12 | both the Original Resource and the Mementos. Additionally, a Memento is 13 | defined by the Original Resource it is a snapshot of, and the datetime 14 | at which it was created. Thus, it carries a link to its Original 15 | Resource and datetime information. 16 | 17 | Example 18 | ------- 19 | 20 | Let's take the following example: Suppose a server is handling requests 21 | for the following URIs: 22 | 23 | .. image:: uris_example.png 24 | 25 | Each time a server responds to requests for any of these URIs, standard 26 | HTTP headers are returned. With Memento, the following headers are 27 | added: - For the Original Resource, add a "Link" header that points at 28 | its TimeGate - For each Memento, add a "Link" header that points at the 29 | TimeGate - For each Memento, add a "Link" header that points to the 30 | Original Resource - For each Memento, add a Memento-Datetime header that 31 | conveys the snapshot datetime 32 | 33 | Using the previous example, and supposing a TimeGate server is running 34 | at ``http://example.com/timegate/``, Memento HTTP response headers for 35 | the Original Resource and one Memento look as follows: 36 | 37 | .. image:: headers_example.png 38 | 39 | To sum up 40 | --------- 41 | 42 | - The ``Memento-Datetime:`` header is a Memento-specific header whose 43 | value is the `RFC 1123 <https://tools.ietf.org/html/rfc1123>`__ date of 44 | the Memento. 45 | - It must be included in any response to a Memento request. 46 | - It cannot be in an Original Resource response. 47 | - The ``Link:`` header is a standard header to which new values are 48 | added. 
49 | - A link to the TimeGate with relation ``rel="timegate"`` must be 50 | included in all Memento and Original Resource responses. 51 | - A link to the Original Resource with relation ``rel="original"`` must 52 | be included in all Memento responses. 53 | - Link with relation ``rel="original"`` cannot be in an Original 54 | Resource response. 55 | -------------------------------------------------------------------------------- /timegate/examples/cr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2014, 2015 LANL. 5 | # Copyright (C) 2016 CERN. 6 | # 7 | # TimeGate is free software; you can redistribute it and/or modify 8 | # it under the terms of the Revised BSD License; see LICENSE file for 9 | # more details. 10 | 11 | """Croatian web archive proxy.""" 12 | 13 | from __future__ import absolute_import, print_function 14 | 15 | import logging 16 | import re 17 | import urllib 18 | 19 | from timegate.errors import HandlerError 20 | from timegate.handler import Handler 21 | 22 | baseuri = "http://haw.nsk.hr/json.php?" 
class CrHandler(Handler):
    """TimeGate handler for the Croatian web archive (haw.nsk.hr)."""

    def __init__(self):
        Handler.__init__(self)

    def get_all_mementos(self, req_url):
        """Return the archived versions the archive lists for ``req_url``.

        The JSON search API is queried first; the ID of its first hit is then
        used to fetch the publication page, which is scraped for versions.

        :param req_url: URI of the Original Resource.
        :return: list of ``(memento_uri, datetime_string)`` tuples; empty if
            the search reports no hits.
        :raises HandlerError: with status 404 if either request fails or the
            search response is not valid JSON.
        """
        parameters = {'q': req_url, 'subject': 'url'}
        uri = baseuri + urllib.urlencode(parameters)
        try:
            jsonobj = self.request(uri).json()
        except Exception as e:
            # Format the exception: concatenating it to a str raises
            # TypeError and would mask the HandlerError below.
            logging.error(
                "Cannot request API or parse json response: %s" % e)
            raise HandlerError("Cannot get API response.", 404)

        if int(jsonobj['availableHits']) == 0:
            return []

        tmid = jsonobj['hits'][0]['ID']
        tmuri = "http://haw.nsk.hr/publikacija/" + tmid

        try:
            data = self.request(tmuri).content
        except Exception as e:
            logging.error("Error requerying API: %s" % e)
            raise HandlerError("Cannot get API response.", 404)

        # NOTE(review): the fixed offsets used below (45, [4:-5]) suggest
        # these patterns once matched surrounding markup that is no longer
        # present -- confirm both against a live publication page.
        uriRegex = re.compile(r'[\d]*\..*')
        dtregex = re.compile(r'\d\d\.\d\d\.\d\d\d\d[0-9\.:\s]*')

        changes = []
        for u in uriRegex.findall(data):
            d = u.index("title")

            loc = "http://haw.nsk.hr/" + u[45:d - 2].lstrip('/')

            result = dtregex.search(u)
            if result:
                dtstr = result.group(0)
                dtstr = dtstr[4:-5]

                # Reorder day.month.year into year+month+day, then append
                # the time with its colons removed.
                dtstr = dtstr[6:10] + dtstr[3:5] + dtstr[0:2] + \
                    dtstr[11:19].replace(":", "") + " GMT"
                changes.append((loc, dtstr))

        return changes
8 | 9 | The `Memento Framework <http://mementoweb.org>`__ removes the 10 | need for the user to do the search by hand. 11 | 12 | Components 13 | ---------- 14 | 15 | - Suppose a web resource is located at some URI. We call the resource 16 | the **Original Resource** and refer to its URI as the **URI-R**. This 17 | is the resource for which a user wants to find a prior version. 18 | - A prior version of an Original Resource is called a **Memento** and 19 | we refer to its URI as the **URI-M**. There could be many Mementos 20 | for one Original Resource, each having its own URI-Mi and each 21 | encapsulating the state of the Original Resource at a specific point 22 | in time. 23 | - The **TimeGate** is the application which selects the best Memento of 24 | an Original Resource for a given datetime. This is where datetime 25 | negotiation happens. 26 | 27 | Requirements 28 | ------------ 29 | 30 | - The first requirement is that Original Resources and Mementos must 31 | be accessible through their respective and unique URIs. 32 | - Also, the framework relies on HTTP headers to work. Headers of requests 33 | from/to the TimeGate are taken care of. However, Original Resources and 34 | Mementos require the addition of new headers. (See :ref:`http_response_headers`.) 35 | 36 | The Generic TimeGate 37 | -------------------- 38 | 39 | The TimeGate is where most of the Memento magic happens. And its 40 | implementation is likely to be extremely close from one server to 41 | another. In this sense, its processing of HTTP request / response 42 | headers, its algorithms and logic can be abstracted and made generic. 43 | The only thing server-specific is the management of URIs and datetimes. 44 | To do that, this TimeGate can fit any web resource if it is provided a 45 | way to retrieve a history of a specific Original Resource. This is done 46 | using a custom handler. (See :ref:`handler`.) 
47 | 48 | More about Memento 49 | ------------------ 50 | 51 | - Details about Memento are available in the `RFC 52 | 7089 <https://tools.ietf.org/html/rfc7089>`__. 53 | - A `quick intro <http://www.mementoweb.org/guide/quick-intro/>`__ is 54 | available on Memento's website. 55 | -------------------------------------------------------------------------------- /docs/cache.rst: -------------------------------------------------------------------------------- 1 | .. _cache: 2 | 3 | Cache 4 | ===== 5 | 6 | The TimeGate comes with a built-in cache that is activated by default. Change 7 | this behavior by editing the configuration file. See :ref:`configuration`. 8 | 9 | Populating the cache 10 | -------------------- 11 | 12 | The cache stores TimeMaps, which are the return values of the handler 13 | function ``get_all_mementos()`` only: - If the Handler does not have 14 | ``get_all_mementos()`` implemented, the cache will never be filled. - If 15 | the Handler has both the functions ``get_all_mementos()`` and 16 | ``get_memento()``, only TimeMap requests will fill the cache. All 17 | TimeGate requests will use ``get_memento()`` whose result will not be 18 | cached. 19 | 20 | Cache HIT conditions 21 | -------------------- 22 | 23 | - Cached TimeMaps can be used to respond to a TimeMap request from 24 | a client if they are fresh enough. The tolerance for freshness can be 25 | defined in the configuration file. 26 | - Cached TimeMaps can also be used to respond to a TimeGate request 27 | from a client. In this case, it is not the request's time that must 28 | lie within the tolerance bounds, but the requested datetime. 29 | 30 | Force Fresh value 31 | ----------------- 32 | 33 | If the request contains the header ``Cache-Control: no-cache``, then the 34 | TimeGate will not return anything from cache. 35 | 36 | Example 37 | ------- 38 | 39 | Suppose you have a TimeMap that was cached at time ``T``. Suppose you 40 | have a tolerance of ``d`` seconds. A TimeMap request arrives at time 41 | ``R1``. 
A TimeGate request arrives at time ``R2`` with requested 42 | datetime j. This request does **not** contain the header 43 | ``Cache-Control: no-cache``. - A TimeMap request will be served from 44 | cache only if it arrives within the tolerance: ``R1 <= T+d``. - A 45 | TimeGate request will be served from cache only if the requested 46 | datetime happens within the tolerance: ``j <= T+d``, no matter ``R2``. 47 | This means that even if a cached value is old, the cache can still 48 | respond to TimeGate requests for requested datetimes up to time 49 | ``T+d``. - All other requests will be cache misses. 50 | 51 | Cache size 52 | ---------- 53 | 54 | There is no "maximum size" parameter. The reason for this is that the 55 | cache size will depend on the average size of TimeMaps, which itself 56 | depends on the length of each URI-M it contains, and their average 57 | count. These variables will depend on your system. The cache can be 58 | managed using the ``cache_max_values`` parameter which will indirectly 59 | affect its size. 60 | -------------------------------------------------------------------------------- /timegate/examples/loc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2014, 2015 LANL. 5 | # Copyright (C) 2016 CERN. 6 | # 7 | # TimeGate is free software; you can redistribute it and/or modify 8 | # it under the terms of the Revised BSD License; see LICENSE file for 9 | # more details. 
class LocHandler(Handler):
    """TimeGate handler that scrapes Library of Congress web archives."""

    def __init__(self):
        Handler.__init__(self)

        # Captures the digit run that follows the collection segment of a
        # memento URI.
        self.datere = re.compile(
            'http://webarchive.loc.gov/[a-zA-Z0-9]+/([0-9]+)/.+')
        # Collection identifiers queried in turn for every request.
        self.colls = [
            'lcwa0001',
            'lcwa0002',
            'lcwa0003',
            'lcwa0004',
            'lcwa0005',
            'lcwa0006',
            'lcwa0007',
            'lcwa0008',
            'lcwa0009',
            'lcwa0010',
            'lcwa0011',
            'lcwa0012',
            'lcwa0013',
            'lcwa0014',
            'lcwa0015',
            'lcwa0016',
            'lcwa0017',
            'lcwa0018',
            'lcwa0019',
            'lcwa0020',
            'lcwa0029',
            'lcwa0031',
            'lcwa0032',
            'lcwa0033',
            'lcwa0037'
        ]

    def get_all_mementos(self, requri):
        """Collect the mementos of ``requri`` across every collection.

        A collection whose listing cannot be fetched or parsed is logged and
        skipped, so one failing collection does not abort the request.

        :param requri: URI of the Original Resource.
        :return: list of ``(memento_uri, datetime_string)`` tuples.
        """
        changes = []

        for c in self.colls:
            # NOTE(review): the listing is requested from
            # ``webarchives.loc.gov`` while links are matched against
            # ``webarchive.loc.gov`` -- confirm both hostnames are intended.
            iauri = "http://webarchives.loc.gov/%s/*/%s" % (c, requri)

            try:
                data = self.request(iauri).content
            except Exception as e:
                # Best effort: record the failure rather than dropping it
                # silently, then move on to the next collection.
                logging.error(
                    "Exception requesting data in loc handler: %s" % e)
                continue

            try:
                parser = etree.HTMLParser(recover=True)
                dom = etree.parse(StringIO.StringIO(data), parser)
            except Exception as e:
                logging.error("Exception parsing data in loc handler: %s" % e)
                continue

            prefix = 'http://webarchive.loc.gov/%s/' % c
            for a in dom.xpath('//a'):
                loc = a.attrib.get('href', '')
                if not loc.startswith(prefix):
                    continue

                # extract time from link
                m = self.datere.match(loc)
                if m and a.tail:
                    changes.append((loc, m.group(1)))
        return changes
class W3cHandler(Handler):
    """TimeGate handler for W3C specifications, backed by the W3C API."""

    def __init__(self):
        Handler.__init__(self)

        # Local fields
        # Template filled with (specification short name, API key).
        self.api_url = 'https://api.w3.org/specifications/%s/versions?_format=json&apikey=%s&embed=1'

        # Second group captures everything after ``/TR/``. Raw string so the
        # backslashes reach the regex engine unchanged.
        self.re_spec_name = re.compile(r"https?:\/\/(www.)?w3.org\/TR\/(.*)", re.IGNORECASE)

    def get_all_mementos(self, uri):
        """Return the published versions of the specification at ``uri``.

        :param uri: URI of the form ``http://www.w3.org/TR/<name>``.
        :return: list of ``(version_uri, date)`` tuples sorted by date.
        :raises HandlerError: 404 if the URI is not a W3C specification URI
            or the API does not answer with status 200; 502 if the API
            response is not JSON or lacks the embedded version history.
        """
        match_spec_name = self.re_spec_name.match(uri)
        if not match_spec_name:
            raise HandlerError("Unknown W3C specification uri. \n"
                               + ACCEPTABLE_RESOURCE, 404)

        spec_name = match_spec_name.groups()[1]
        if spec_name.endswith("/"):
            spec_name = spec_name[:-1]

        api_response = self.request(self.api_url % (spec_name, APIKEY))

        if api_response.status_code != 200:
            raise HandlerError("No versions were found for the requested specification with shortname: %s" % spec_name, 404)

        try:
            json_response = api_response.json()
        except Exception:
            raise HandlerError("The W3C API returned an unknown response.", 502)

        # Validate the exact path read below. The previous check negated
        # only its first operand, so a missing ``_embedded`` dereferenced
        # None, and it looked for a different key than the one consumed.
        embedded = None
        if isinstance(json_response, dict):
            embedded = json_response.get("_embedded")
        history = None
        if isinstance(embedded, dict):
            history = embedded.get("version-history")
        if history is None:
            raise HandlerError("The W3C API returned an unknown response.", 502)

        versions = [
            (version.get("uri"), version.get("date")) for version in history
        ]
        return sorted(versions, key=lambda version: version[1])
class ArxivHandler(Handler):
    """TimeGate handler that lists arXiv versions via the arXiv OAI API."""

    def __init__(self):
        Handler.__init__(self)

        # Groups: host, ``pdf``/``abs``, numeric identifier, and whatever
        # trails the identifier (query string, version suffix, ...), which
        # is ignored.
        self.rex = re.compile(
            r'(http://arxiv.org)/((?:pdf)|(?:abs))/(\d+\.\d+)(.*)')
        self.api_base = 'http://export.arxiv.org/oai2'

    def get_all_mementos(self, uri_r):
        """Return every version the API reports for the resource ``uri_r``.

        :param uri_r: URI of the Original Resource.
        :return: list of ``(version_uri, date_text)`` tuples, or ``None`` if
            an unexpected error occurred (it is logged).
        :raises HandlerError: with status 404 if the URI is not recognised
            or the API response is falsy.
        """
        try:
            matched = self.rex.match(uri_r)
            if not matched:
                raise HandlerError("URI does not match a valid resource.", 404)

            # Rebuild the URI without anything after the identifier.
            host, kind, identifier = matched.groups()[:3]
            normalized_uri = '%s/%s/%s' % (host, kind, identifier)

            query = {
                'verb': 'GetRecord',
                'identifier': 'oai:arXiv.org:%s' % identifier,
                'metadataPrefix': 'arXivRaw'
            }

            response = self.request(self.api_base, params=query)
            if not response:
                raise HandlerError("API response not 2XX", 404)

            tree = etree.parse(StringIO(response.content),
                               etree.XMLParser(recover=True))

            mementos = []
            for version in tree.findall(
                    './/{http://arxiv.org/OAI/arXivRaw/}version'):
                # The first attribute value of the element is appended to
                # the normalized URI to form the version URI.
                suffix = version.xpath('@*')[0]
                stamp = version.find(
                    './{http://arxiv.org/OAI/arXivRaw/}date').text
                mementos.append((normalized_uri + suffix, stamp))
            return mementos

        except HandlerError as he:
            # Deliberate errors pass through to the caller untouched.
            raise he

        except Exception as e:
            logging.error('Arxiv handler exception: %s returning 404' % e)
            return None
-------------------------------------------------------------------------------- /timegate/conf/config.ini: -------------------------------------------------------------------------------- 1 | [server] 2 | 3 | # host 4 | # TimeGate server base URI 5 | # Example: host = http://timegate.example.com 6 | host = http://localhost 7 | 8 | # strict_datetime 9 | # When set to true, the user must use the RFC 1123 date in 'Accept-Datetime' header 10 | # When set to false, the server will also try to parse other time formats 11 | strict_datetime = true 12 | 13 | # api_time_out 14 | # Timeout for any API request in seconds 15 | api_time_out = 6 16 | 17 | # user-agent 18 | # Provide a user-agent to be added to the requests made by the timegate server 19 | user_agent = Memento TimeGate 20 | 21 | [handler] 22 | # handler_class 23 | # Optional path to handler class. If not provided the program will 24 | # search core extensions for a possible handler. 25 | handler_class = timegate.examples.es.EsHandler 26 | 27 | # use_timemap 28 | # Optional boolean to define whether the program can handle timemap requests. 
29 | use_timemap = true 30 | 31 | 32 | # is_vcs 33 | # When true, the mementos are served from a Version Control System 34 | # When false, the mementos are served from a Snapshot system 35 | # This implies that the best memento to a date d is either, respectively 36 | # The closest to time d, before d 37 | # The absolute closest to time d 38 | is_vcs = true 39 | 40 | # base_uri 41 | # (Optional) String that will be prepended to requested URI if it is not already present 42 | # For example, if the server runs at `http://timegate.example.com` and all original resources begin with `http://example.com/res/{resource ID}`, 43 | # then setting `base_uri = http://example.com/res/` will allow short requests such `http://timegate.example.com/{resource ID}` 44 | base_uri = 45 | 46 | [cache] 47 | 48 | # cache_activated 49 | # When true, the cache stores TimeMaps from API that allows batch (get_all_mementos) requests, except for requests with `Cache-Control: no-cache` header, which will always return fresh Mementos. 50 | # When false, no cache file will be created 51 | # Default true 52 | cache_activated = false 53 | 54 | # cache_refresh_time 55 | # Time in seconds, for which it is assumed that a TimeMap didn't change. Any TimeGate request for a datetime past this period (or any TimeMap request past this period) will trigger a refresh of the cached value. 56 | # Default 86400 (one day) 57 | cache_refresh_time = 86400 58 | 59 | # cache_directory 60 | # Cache directory relative path for data files. Make sure that this directory is empty or else the cache will start deleting random files. 61 | # Default cache/ 62 | cache_directory = cache 63 | 64 | # cache_max_values 65 | # Maximum number of stored TimeMaps in the cache. 
66 | # Tweak this depending on how big your TimeMaps can become (number of elements and length of URIs) 67 | # Default 250 68 | cache_max_values = 250 69 | -------------------------------------------------------------------------------- /docs/big-picture.rst: -------------------------------------------------------------------------------- 1 | .. _big_picture: 2 | 3 | Big picture 4 | =========== 5 | 6 | Definitions 7 | ----------- 8 | 9 | From now on, this documentation will refer to the web server where 10 | resources and archives are as the **web server** and to the Memento 11 | TimeGate datetime negotiation server as the **TimeGate**. 12 | 13 | - Suppose you have a web resource accessible in a web server by some 14 | URI. We call the resource the **Original Resource** and refer to its 15 | URI as **URI-R**. 16 | - Suppose a web server has a snapshot of what this URI-R looked like in 17 | the past. We call such a snapshot a **Memento** and we refer to its 18 | URI as **URI-M**. There could be many snapshots of URI-R, taken at 19 | different moments in time, each with their distinct URI-Ms. The 20 | Mementos do not necessarily need to be in the same web server as the 21 | Original Resources. 22 | 23 | Client, Server and TimeGate 24 | --------------------------- 25 | 26 | This figure represents the current situation: without datetime 27 | negotiation, the client has to find by hand the URIs for the previous 28 | versions of a web resource, if they exist: |client_server.png| To make 29 | this web resource Memento compliant, two things need to be added. The 30 | new components of the system are the TimeGate and Memento HTTP headers 31 | at the web server's side: |client_server_tg.png| With these links, the 32 | client now gets the address of the TimeGate when retrieving an Original 33 | Resource or a Memento. 
Then, he can use datetime negotiation with the 34 | TimeGate to get the URI of an archived version (``URI-M2``) of the 35 | Original Resource at a specific point in time (``T2``): |sequence.png| 36 | 37 | Architecture 38 | ------------ 39 | 40 | The TimeGate will manage the framework's logic in a generic manner. 41 | However, every web server has its specific way to store snapshots and to 42 | construct URI-Ms. Thus, a specific plugin must be written for every web 43 | server. Such a plugin is called a handler. A handler will typically talk 44 | to an API to return the list of URI-Ms given a URI-R, but there are 45 | several alternatives to this setup. 46 | 47 | .. figure:: architecture.png 48 | :alt: architecture.png 49 | 50 | architecture.png 51 | 52 | The system can be seen as three components. 53 | 54 | - The Memento user who wishes to retrieve an older version of a 55 | resource 56 | - The web server where the active version (original URI) and revisions 57 | (mementos) can be accessed. This entity must provide a way to access 58 | these versions. Typically through an API. 59 | - The TimeGate which itself is composed of two main elements: 60 | - One API-specific handler 61 | - The generic TimeGate code 62 | 63 | .. |client_server.png| image:: client_server.png 64 | .. |client_server_tg.png| image:: client_server_tg.png 65 | .. |sequence.png| image:: sequence.png 66 | -------------------------------------------------------------------------------- /timegate/examples/nara.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2014, 2015 LANL. 5 | # Copyright (C) 2016 CERN. 6 | # 7 | # TimeGate is free software; you can redistribute it and/or modify 8 | # it under the terms of the Revised BSD License; see LICENSE file for 9 | # more details. 
class NaraHandler(Handler):
    """TimeGate handler that scrapes the webharvest.gov collections."""

    def __init__(self):
        Handler.__init__(self)
        self.baseuri = "http://webharvest.gov/"
        congress_number = 109
        FIRST_YEAR = 2006
        THIS_YEAR = datetime.utcnow().year
        self.collections = ["peth04"]

        # One ``congressNNNth`` collection per two-year step from FIRST_YEAR
        # up to (not including) the current year, numbered from 109.
        for _ in range(FIRST_YEAR, THIS_YEAR, 2):
            self.collections.append("congress%sth" % congress_number)
            congress_number += 1

    def get_all_mementos(self, requri):
        """Collect the mementos of ``requri`` across every collection.

        :param requri: URI of the Original Resource.
        :return: list of ``(memento_uri, datetime_string)`` tuples, where the
            datetime string is taken from the link's ``onclick`` attribute
            and suffixed with ``" GMT"``.
        :raises HandlerError: if a collection listing cannot be parsed.
        """
        changes = []

        for collection in self.collections:
            uri = self.baseuri + collection + "/*/" + requri
            dom = self.get_xml(uri, html=True)
            if not dom:
                continue

            for td in dom.xpath('//*[@class="mainBody"]'):
                for a in td:
                    if a.tag != 'a':
                        continue

                    loc = a.get('href')
                    onclick = a.get('onclick')
                    # A link lacking either attribute cannot be a memento
                    # entry; skip it instead of failing on ``None``.
                    if loc is None or onclick is None:
                        continue
                    # The datetime is the first single-quoted argument of
                    # the onclick handler; skip links without one.
                    pieces = onclick.split("'")
                    if len(pieces) < 2:
                        continue

                    # Make site-relative links absolute.
                    if not loc.startswith(self.baseuri):
                        if loc.startswith("/"):
                            loc = self.baseuri + loc[1:]
                        else:
                            loc = self.baseuri + loc

                    changes.append((loc, pieces[1] + " GMT"))

        return changes

    def get_xml(self, uri, html=False):
        """Retrieves the resource using the url, parses it as XML or HTML and
        returns the parsed dom object.

        :param uri: [str] The uri to retrieve
        :param html: [bool] optional flag to parse the response
                     as HTML
        :return: [lxml_obj] parsed dom.
        :raises HandlerError: if the response body cannot be parsed.
        """

        page = self.request(uri)
        try:
            page_data = page.content
            if html:
                parser = etree.HTMLParser(recover=True)
            else:
                parser = etree.XMLParser(recover=True)
            return etree.parse(StringIO.StringIO(page_data), parser)
        except Exception:
            logging.error("Cannot parse XML/HTML from %s" % uri)
            raise HandlerError("Couldn't parse data from %s" % uri)
We call such a snapshot a **Memento** and we refer to its 29 | URI as **URI-M**. There could be many snapshots of URI-R, taken at 30 | different moments in time, each Memento i with its distinct URI-Mi. 31 | The Mementos do not necessarily need to be in the same web server as 32 | the Original Resources. 33 | 34 | Example 35 | ------- 36 | 37 | .. figure:: uris_example.png 38 | 39 | There are only two steps to make such a resource Memento compliant. 40 | 41 | Step 1: Setting up TimeGate 42 | --------------------------- 43 | 44 | The first thing to do is to set up the TimeGate for the specific web 45 | server. 46 | 47 | * Run the TimeGate with your custom handler. The handler is the 48 | piece of code that is specific to how the web server manages Original 49 | Resources and Mementos. It needs to implement either one of the 50 | following: 51 | 52 | - Given a URI-R, return the list of URI-Ms along with their respective dates. 53 | - Given a URI-R and a datetime, return one single URI-M along with its date. 54 | 55 | Step 2: Providing the headers 56 | ----------------------------- 57 | 58 | The second thing to do is to provide Memento's HTTP headers at the web 59 | server. 60 | 61 | * Add HTTP headers required by the Memento protocol to responses from the 62 | Original Resource and its Mementos: 63 | 64 | - For the Original Resource, add a "Link" header that points at its TimeGate 65 | - For each Memento, add a "Link" header that points at the TimeGate 66 | - For each Memento, add a "Link" header that points to the Original Resource 67 | - For each Memento, add a Memento-Datetime header that conveys the snapshot datetime 68 | 69 | Using the previous example, and supposing a TimeGate is running at 70 | ``http://example.com/timegate/``, Memento HTTP response headers for the 71 | Original Resource and one Memento look as follows: 72 | 73 | .. image:: headers_example.png 74 | 75 | And that's it! 
class WebCiteHandler(Handler):
    """TimeGate handler for webcitation.org.

    A few hard-coded original URIs are resolved by scraping the WebCite
    top frame; every other URI is resolved through the XML query API.
    """

    def __init__(self):
        """Install a process-wide ``urllib2`` opener that stores cookies."""
        Handler.__init__(self)
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)

    def get_all_mementos(self, requri):
        """Return the list of ``(uri_m, date_string)`` tuples for ``requri``.

        :param requri: The URI of the original resource.
        :return: A list of ``(uri_m, date_string)`` tuples.
        :raises HandlerError: if WebCite cannot be requested (status 404) or
            its HTML cannot be parsed.
        """
        # Local import: ``io.BytesIO`` takes the raw bytes read from the
        # response on both Python 2 and Python 3.
        import io

        known_snapshots = {
            'http://lanlsource.lanl.gov/hello':
                'http://webcitation.org/5jq247bmx',
            'http://lanlsource.lanl.gov/pics/picoftheday.png':
                'http://webcitation.org/5jq24MRo3',
            'http://odusource.cs.odu.edu/pics/picoftheday.png':
                'http://webcitation.org/5k9j4oXPw',
        }
        wcurl = known_snapshots.get(requri)
        if wcurl is None:
            # Cleaner but much slower than screen scraping.
            return self.get_from_xml(requri)

        try:
            # First request: presumably needed so that the cookie installed
            # by the opener selects the snapshot shown in the top frame --
            # TODO confirm against the WebCite site behaviour.
            fh = urllib2.urlopen(urllib2.Request(wcurl, None, {}))
            fh.close()

            fh = urllib2.urlopen(
                urllib2.Request('http://webcitation.org/topframe.php'))
            try:
                data = fh.read()
            finally:
                # Always release the connection, even when read() fails.
                fh.close()
        except Exception as e:
            logging.error('Cannot request page %s: %s', wcurl, e)
            raise HandlerError('Cannot request page', 404)

        try:
            dom = etree.parse(io.BytesIO(data), etree.HTMLParser())
        except Exception as e:
            logging.error('Cannot parse HTML: %s', e)
            raise HandlerError('Cannot parse HTML')

        changes = []
        for option in dom.xpath('//select[@name="id"]/option'):
            fid = option.get('value')
            date = option.text
            # Skip entries without id/date and the failed snapshots.
            if not fid or not date or '(failed)' in date:
                continue
            changes.append(('http://webcitation.org/query?id=' + fid, date))

        return changes

    def get_from_xml(self, requri):
        """Query the WebCite XML API for all snapshots of ``requri``.

        :param requri: The URI of the original resource.
        :return: A list of ``(webcite_url, timestamp)`` tuples taken from the
            ``result`` elements whose status is ``success``.
        :raises HandlerError: if the response cannot be parsed (status 404).
        """
        import io

        api_request = (
            'http://webcitation.org/query.php?returnxml=1&url=' + requri)
        xml = self.request(api_request, timeout=120)

        try:
            parser = etree.XMLParser(recover=True)  # Parses bad XML
            # Parse the raw bytes: converting the decoded text with ``str()``
            # fails on non-ASCII characters under Python 2.
            dom = etree.parse(io.BytesIO(xml.content), parser)
        except Exception as e:
            logging.error('Cannot parse XML: ' + str(e))
            raise HandlerError('Cannot parse XML', 404)

        results = []
        for result in dom.xpath("//result[@status='success']"):
            url = result.findtext('webcite_url')
            date = result.findtext('timestamp')
            if not url or not date:
                # Incomplete result entry: nothing usable to return.
                continue
            results.append((url, date))

        return results
20 | 21 | Fix Bugs 22 | ~~~~~~~~ 23 | 24 | Look through the GitHub issues for bugs. Anything tagged with "bug" 25 | is open to whoever wants to implement it. 26 | 27 | Implement Features 28 | ~~~~~~~~~~~~~~~~~~ 29 | 30 | Look through the GitHub issues for features. Anything tagged with "feature" 31 | is open to whoever wants to implement it. 32 | 33 | Write Documentation 34 | ~~~~~~~~~~~~~~~~~~~ 35 | 36 | TimeGate could always use more documentation, whether as part of the 37 | official TimeGate docs, in docstrings, or even on the web in blog posts, 38 | articles, and such. 39 | 40 | Submit Feedback 41 | ~~~~~~~~~~~~~~~ 42 | 43 | The best way to send feedback is to file an issue at 44 | https://github.com/mementoweb/timegate/issues. 45 | 46 | If you are proposing a feature: 47 | 48 | * Explain in detail how it would work. 49 | * Keep the scope as narrow as possible, to make it easier to implement. 50 | * Remember that this is a volunteer-driven project, and that contributions 51 | are welcome :) 52 | 53 | Get Started! 54 | ------------ 55 | 56 | Ready to contribute? Here's how to set up `timegate` for local development. 57 | 58 | 1. Fork the `timegate` repo on GitHub. 59 | 2. Clone your fork locally: 60 | 61 | .. code-block:: console 62 | 63 | $ git clone git@github.com:your_name_here/timegate.git 64 | 65 | 3. Install your local copy into a virtualenv. Assuming you have 66 | virtualenvwrapper installed, this is how you set up your fork for local 67 | development: 68 | 69 | .. code-block:: console 70 | 71 | $ mkvirtualenv timegate 72 | $ cd timegate/ 73 | $ pip install -e .[all] 74 | 75 | 4. Create a branch for local development: 76 | 77 | .. code-block:: console 78 | 79 | $ git checkout -b name-of-your-bugfix-or-feature 80 | 81 | Now you can make your changes locally. 82 | 83 | 5. When you're done making changes, check that your changes pass tests: 84 | 85 | .. 
code-block:: console 86 | 87 | $ ./run-tests.sh 88 | 89 | The tests will provide you with test coverage and also check PEP8 90 | (code style), PEP257 (documentation), flake8 as well as build the Sphinx 91 | documentation and run doctests. 92 | 93 | 6. Commit your changes and push your branch to GitHub: 94 | 95 | .. code-block:: console 96 | 97 | $ git add . 98 | $ git commit -s -m "Your detailed description of your changes." 99 | $ git push origin name-of-your-bugfix-or-feature 100 | 101 | 7. Submit a pull request through the GitHub website. 102 | 103 | Pull Request Guidelines 104 | ----------------------- 105 | 106 | Before you submit a pull request, check that it meets these guidelines: 107 | 108 | 1. The pull request should include tests and must not decrease test coverage. 109 | 2. If the pull request adds functionality, the docs should be updated. Put 110 | your new functionality into a function with a docstring. 111 | 3. The pull request should work for Python 2.7, 3.3, 3.4 and 3.5. Check 112 | https://travis-ci.com/mementoweb/timegate/pull_requests 113 | and make sure that the tests pass for all supported Python versions. 114 | -------------------------------------------------------------------------------- /timegate/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2016 CERN. 5 | # 6 | # TimeGate is free software; you can redistribute it and/or modify 7 | # it under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 
class Config(dict):
    """Dictionary of configuration values with custom loaders.

    The class is a singleton: every call to ``Config(...)`` returns the same
    object, so values loaded once are visible everywhere.
    """

    _instance = None

    def __new__(cls, root_path, defaults=None):
        """Return the shared instance, creating it on first use.

        :param root_path: Path to which files are read relative from.
        :param defaults: An optional dictionary of default values.
        :return: The unique :class:`Config` instance.
        """
        # Compare with ``None``: an empty Config is a falsy dict, so a plain
        # truth test would create a new object until a value is stored.
        if cls._instance is None:
            cls._instance = super(Config, cls).__new__(cls)
        return cls._instance

    def __init__(self, root_path, defaults=None):
        """Merge ``defaults`` into the config and remember ``root_path``.

        :param root_path: Path to which files are read relative from.
        :param defaults: An optional dictionary of default values.
        """
        # ``dict.__init__`` on an already populated dict behaves like
        # ``update``: existing keys that are not in ``defaults`` are kept.
        dict.__init__(self, defaults or {})
        # Lookups such as ``Config(None)`` must not erase a root path that
        # was set by an earlier call.
        if root_path is not None or not hasattr(self, 'root_path'):
            self.root_path = root_path

    def from_inifile(self, filename, silent=True):
        """Update the values in the config from an INI file.

        :param filename: Path of the INI file to read.
        :param silent: Currently unused; kept for interface compatibility.
        :raises IOError: if ``filename`` cannot be opened.
        """
        conf = ConfigParser()
        with open(filename) as f:
            conf.read_file(f)

        # Server configuration
        self['HOST'] = conf.get('server', 'host').rstrip('/')
        self['USER_AGENT'] = conf.get('server', 'user_agent')
        self['STRICT_TIME'] = conf.getboolean('server', 'strict_datetime')
        if conf.has_option('server', 'api_time_out'):
            self['API_TIME_OUT'] = conf.getfloat('server', 'api_time_out')

        # Handler configuration
        if conf.has_option('handler', 'handler_class'):
            self['HANDLER_MODULE'] = conf.get('handler', 'handler_class')
        if conf.has_option('handler', 'base_uri'):
            self['BASE_URI'] = conf.get('handler', 'base_uri')

        # ``is_vcs`` is optional like the other handler options; a missing
        # value means snapshot semantics.
        is_vcs = (conf.has_option('handler', 'is_vcs') and
                  conf.getboolean('handler', 'is_vcs'))
        self['RESOURCE_TYPE'] = 'vcs' if is_vcs else 'snapshot'

        if conf.has_option('handler', 'use_timemap'):
            self['USE_TIMEMAPS'] = conf.getboolean('handler', 'use_timemap')
        else:
            self['USE_TIMEMAPS'] = False

        # Cache
        # When False, all cache requests will be cache MISS
        self['CACHE_USE'] = conf.getboolean('cache', 'cache_activated')
        # Time window in which the cache value is considered young
        # enough to be valid
        self['CACHE_TOLERANCE'] = conf.getint('cache', 'cache_refresh_time')
        # Cache files paths
        self['CACHE_DIRECTORY'] = conf.get(
            'cache', 'cache_directory').rstrip('/')
        # Maximum number of TimeMaps stored in cache
        self['CACHE_MAX_VALUES'] = conf.getint('cache', 'cache_max_values')
        # Cache files paths
        self['CACHE_FILE'] = self['CACHE_DIRECTORY']

    def from_object(self, obj):
        """Update config with values from given object.

        Only attributes whose name is upper case are copied.

        :param obj: An import name or object.
        :raises ImportError: if ``obj`` is an import name that cannot be
            resolved.
        """
        if isinstance(obj, string_types):
            obj = self._import_string(obj)
        for key in dir(obj):
            if key.isupper():
                self[key] = getattr(obj, key)

    @staticmethod
    def _import_string(import_name):
        """Import a module or attribute from ``pkg.mod`` / ``pkg.mod:attr``.

        :param import_name: Dotted import path; the last component may be
            separated with a colon.
        :return: The imported module or attribute.
        :raises ImportError: if nothing can be imported under that name.
        """
        import importlib

        import_name = str(import_name).replace(':', '.')
        try:
            return importlib.import_module(import_name)
        except ImportError:
            module_name, _, attribute = import_name.rpartition('.')
            if not module_name:
                raise
            module = importlib.import_module(module_name)
            try:
                return getattr(module, attribute)
            except AttributeError:
                raise ImportError('Cannot import %r' % import_name)
19 | 20 | Steps 21 | ----- 22 | 23 | The big picture 24 | ~~~~~~~~~~~~~~~ 25 | 26 | The first thing to do is to understand how the program is 27 | structured. See :ref:`big_picture`. 28 | 29 | Installing the server 30 | ~~~~~~~~~~~~~~~~~~~~~ 31 | 32 | The code can be obtained 33 | `here `__. Download a 34 | zip or tar.gz archive into a directory of your choice. 35 | 36 | Decompress the zip files using: 37 | 38 | .. code:: bash 39 | 40 | $ unzip timegate-.zip 41 | 42 | Decompress tar.gz files using: 43 | 44 | .. code:: bash 45 | 46 | $ tar xvzf timegate-.tar.gz 47 | 48 | Install the dependencies using: 49 | 50 | .. code:: bash 51 | 52 | $ echo 'uWSGI>=2.0.3 ConfigParser>=3.3.0r2 python-dateutil>=2.1 requests>=2.2.1 werkzeug>=0.9.6 lxml>=3.4.1' | xargs pip install 53 | 54 | Running the TimeGate 55 | ~~~~~~~~~~~~~~~~~~~~ 56 | 57 | Then try starting the TimeGate server with one of the handlers that are 58 | already provided. To run it, first navigate to the directory: 59 | 60 | .. code:: bash 61 | 62 | $ cd timegate- 63 | 64 | Then, there are two possibilities: - Either execute 65 | ``uwsgi --http :9999 --wsgi-file core/application.py --master`` to 66 | deploy the TimeGate on ``localhost:9999``. Add the option 67 | ``--pidfile /path/to/file.pid`` to store the process ID in a file. - Or 68 | edit the uWSGI launch configuration in ``conf/timegate.ini`` and then 69 | execute ``uwsgi conf/timegate.ini`` 70 | 71 | To stop the server: - Simply use ``CTRL+C`` if it is running in 72 | foreground. - Or execute ``uwsgi --stop /path/to/file.pid`` if you have 73 | stored the PID to run it in the background. - If by mistake the PID is 74 | not stored but the TimeGate is still running, list all uwsgi processes 75 | using ``ps ux | grep uwsgi``, identify the TimeGate process from the 76 | ``COMMAND`` column and kill it using ``kill -INT ``. 
77 | 78 | Handler 79 | ~~~~~~~ 80 | 81 | Once the server is successfully running with an example handler that was 82 | provided, edit it or create a new one (see :ref:`handler`) that returns the list 83 | of all URI-Ms given a URI-R of an Original Resource you wish to make Memento 84 | compliant. 85 | 86 | Memento Headers 87 | ~~~~~~~~~~~~~~~ 88 | 89 | The Memento protocol mainly works with HTTP headers. Now add the required 90 | headers (see :ref:`http_response_headers`) to your web server's HTTP responses. 91 | 92 | Configuring the TimeGate 93 | ~~~~~~~~~~~~~~~~~~~~~~~~ 94 | 95 | Finally, enter the TimeGate's ``HOST`` location in the ``config.ini`` (see 96 | :ref:`configuration`) file. Also edit the other parameters' default values to 97 | your preferences. 98 | 99 | Memento compliance 100 | ~~~~~~~~~~~~~~~~~~ 101 | 102 | That's it. The basic Memento functionalities are here and your web 103 | server is now Memento compliant. See :ref:`advanced_features`. 104 | -------------------------------------------------------------------------------- /docs/configuration.rst: -------------------------------------------------------------------------------- 1 | .. _configuration: 2 | 3 | Configuring the server 4 | ====================== 5 | 6 | Edit the `config 7 | file `__: 8 | ``conf/config.ini``. 9 | 10 | Mandatory field 11 | --------------- 12 | 13 | ``host`` Is the server's base URI. This is the URI on which the TimeGate 14 | is deployed. No default value. 15 | 16 | Example: - Suppose TimeGate is running at ``http://tg.example.com`` and 17 | ``URI-R`` refers to an Original Resource's URI. 18 | 19 | - The program will respond to TimeGate requests at 20 | ``http://tg.example.com/timegate/URI-R`` 21 | 22 | - The program will respond to ``TimeMap`` requests at 23 | ``http://tg.example.com/timemap/link/URI-R`` and 24 | ``http://tg.example.com/timemap/json/URI-R`` if the feature is enabled. 25 | See :ref:`advanced_features`. 
26 | 27 | Important field 28 | --------------- 29 | 30 | ``is_vcs`` The type of archive affects the best Memento selection 31 | algorithm. Default ``false``. - When ``false``, the history is 32 | considered to be snapshots taken at some points in time, thus the best 33 | memento is the *absolute* closest to the requested date. - When 34 | ``true``, the history the handler returns is considered to be from a 35 | version control system. In other words, the history represents every 36 | change that was made to the Original Resource and the exact datetimes of 37 | the change. In this case, the best Memento for a requested datetime T 38 | will be the closest *before* T. 39 | 40 | Other fields 41 | ------------ 42 | 43 | - ``handler_class`` (Optional) Python module path to a handler class. 44 | This is useful if the handler is composed of several classes or to 45 | quickly switch between handlers. If this parameter is not provided, 46 | the program will search for handler classes in ``core.handler``. For 47 | example: 48 | ``handler_class = core.handler_examples.wikipedia.WikipediaHandler`` 49 | - ``api_time_out`` Time, in seconds, before a request to an API times 50 | out when using the ``Handler.request()`` function. Default 6 seconds 51 | - ``base_uri`` (Optional) String that will be prepended to requested 52 | URI if missing. This can be used to shorten the request URI and to 53 | avoid repeating the base URI that is common to all resources. Default 54 | empty 55 | - For example, suppose the TimeGate is deployed at 56 | ``http://tg.example.com`` 57 | - Suppose every Original Resource ``URI-Ri`` has the following format 58 | ``http://resource.example.com/res/URI-Ri`` 59 | - Then, setting ``base_uri = http://resource.example.com/res/`` will 60 | allow short requests such as for example 61 | ``http://tg.example.com/timegate/URI-Ri`` instead of 62 | ``http://tg.example.com/timegate/http://resource.example.com/res/URI-Ri``. 
63 | - ``use_timemap`` When ``true``, the TimeGate adds TimeMaps links to 64 | its (non error) responses. Default ``false`` 65 | 66 | Cache parameters: 67 | ----------------- 68 | 69 | - ``cache_activated`` When ``true``, the cache stores the entire 70 | history of an Original Resource from handlers that allows batch 71 | ``get_all_mementos(uri_r)`` requests. It can then respond from cache 72 | if the value is fresh enough. If a requests contains the header 73 | ``Cache-Control: no-cache`` the server will not respond from cache. 74 | When ``false`` the cache files are not created. Default ``true``. 75 | - ``cache_refresh_time`` tolerance in seconds, for which it is assumed 76 | that a history didn't change. Any TimeGate request for a datetime 77 | past this (or any TimeMap request past this) will trigger a refresh 78 | of the cached history. Default 86400 seconds (one day). 79 | - ``cache_directory`` Relative path for data files. Do not add any 80 | other file to this directory as they could be deleted. Each file 81 | represents an entire history of an Original Resource. Default 82 | ``cache/``. 83 | - ``cache_max_values`` Maximum number of URI-Rs for which its entire 84 | history is stored. This is then the number of files in the 85 | ``cache_directory``. Default 250. 86 | 87 | See :ref:`cache`. 88 | -------------------------------------------------------------------------------- /timegate/examples/simple.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2015 LANL. 5 | # Copyright (C) 2016 CERN. 6 | # 7 | # TimeGate is free software; you can redistribute it and/or modify 8 | # it under the terms of the Revised BSD License; see LICENSE file for 9 | # more details. 
class ExampleHandler(Handler):
    """Example handler serving a small, static set of Mementos.

    It shows the two functions a handler may implement:
    ``get_all_mementos()`` and ``get_memento()``.
    """

    def __init__(self):
        """Build the static archive. This part is run only once."""
        Handler.__init__(self)
        versions_a = [
            'http://www.example.com/resourceA_v1',
            'http://www.example.com/resourceA_v2',
            'http://www.example.com/resourceA_v3',
        ]
        date_times_a = [
            '1999-09-30T01:50:50Z',
            '2010-10-16T13:27:27Z',
            '2015-01-03T22:00:00Z',
        ]
        versions_b = [
            'http://www.example.com/resourceB_v1',
            'http://www.example.com/resourceB_v2',
        ]
        date_times_b = [
            '1998-07-17T17:47:31Z',
            '2000-11-08T19:05:09Z',
        ]
        # URI-R -> list of URI-Ms
        self.archives = {
            'http://www.example.com/resourceA': versions_a,
            'http://www.example.com/resourceB': versions_b,
            'http://www.example.com/resource%20space': [
                'http://www.example.com/space',
            ],
        }
        # URI-R -> list of datetimes, in the same order as the URI-Ms above
        self.dates = {
            'http://www.example.com/resourceA': date_times_a,
            'http://www.example.com/resourceB': date_times_b,
            'http://www.example.com/resource%20space': [
                '1970-01-01T00:00:00Z'
            ],
        }

    # This is the function to implement.
    def get_all_mementos(self, uri_r):
        """Return all Mementos known for ``uri_r``.

        :param uri_r: The URI of the Original Resource.
        :return: A list of ``(uri_string, date_string)`` tuples; the list is
            empty when the archive has no Memento for ``uri_r``.
        """
        if uri_r not in self.archives:
            # No Memento for this uri was found in archive
            return []

        # In this example everything is static, but this is where a real
        # handler is supposed to contact the versions API to retrieve the
        # URI-Ms for this URI-R along with their datetimes.
        uri_ms = self.archives[uri_r]
        datetimes = self.dates[uri_r]

        # Generate the list of tuples [(uri_string, date_string)]
        return list(zip(uri_ms, datetimes))

    # Implement this function instead to bypass the TimeGate's best Memento
    # selection algorithm. It can also be used if the whole list cannot be
    # accessed easily. If both get_all_mementos() and get_memento() are
    # implemented, get_memento() will always be preferred by the TimeGate.
    def get_memento(self, uri_r, req_datetime):
        """Return the single Memento to serve for ``uri_r``.

        :param uri_r: The URI of the Original Resource.
        :param req_datetime: The requested datetime.
        :return: A ``(uri_m, date_time)`` tuple; in this example it is always
            the last Memento of the list.
        :raises HandlerError: with status 404 when the requested year is
            before 1999 or when no Memento exists for ``uri_r``.
        """
        # Suppose you have a special rule for certain dates
        if req_datetime.year < 1999:
            # In this case, we do not serve anything before 1999.
            # Return a custom Error to the client
            raise HandlerError(
                "Cannot server a Memento before 1999", status=404)

        # Gets all mementos for this URI
        mementos_list = self.get_all_mementos(uri_r)
        if not mementos_list:
            # Unknown resource: answer with a clean 404 instead of letting
            # the index lookup below fail.
            raise HandlerError(
                "No Memento found for %s" % uri_r, status=404)

        # In this example we take the last one
        (uri_m, date_time) = mementos_list[-1]

        return (uri_m, date_time)  # The return value is a tuple here.
# Copyright (C) 2016 CERN.
#
# TimeGate is free software; you can redistribute it and/or modify
# it under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""A Memento TimeGate."""

import os
import sys

from setuptools import find_packages, setup

# Read the long description and close the file right away.
with open('README.rst') as readme_file:
    readme = readme_file.read()

tests_require = [
    'check-manifest>=0.25',
    'coverage>=4.0',
    'isort>=4.2.2',
    'pydocstyle>=1.0.0',
    'pytest-cache>=1.0',
    'pytest-cov>=1.8.0',
    'pytest-pep8>=1.0.6',
    'pytest>=2.8.0',
    'httpretty>=0.8.14',
    'mock>=2.0.0',
]

extras_require = {
    ':python_version<"3.0"': [
        'ConfigParser>=3.3.0r2',
    ],
    'docs': [
        'Sphinx>=1.4.2',
    ],
    'uwsgi': [
        'uWSGI>=2.0.3',
    ],
    'tests': tests_require,
}

# Collect every named extra into 'all'. The list is built before it is
# stored in the dict, so the loop never visits (and re-adds) 'all' itself.
all_requires = []
for key, reqs in extras_require.items():
    if key.startswith(':'):
        # Environment-marker entries are not user selectable extras.
        continue
    all_requires.extend(reqs)
extras_require['all'] = all_requires

setup_requires = [
    'pytest-runner>=2.6.2',
]

install_requires = [
    'LinkHeader>=0.4.3',
    'lxml>=3.4.1',
    'python-dateutil>=2.1',
    'requests>=2.2.1',
    'werkzeug>=0.9.6',
]

packages = find_packages()


# Get the version string. Cannot be done with import!
g = {}
with open(os.path.join('timegate', 'version.py'), 'rt') as fp:
    exec(fp.read(), g)
    version = g['__version__']

setup(
    name='timegate',
    version=version,
    description=__doc__,
    long_description=readme,
    keywords='memento timegate',
    license='BSD',
    author='LANL',
    author_email='yorick.chollet@gmail.com',
    url='https://github.com/mementoweb/timegate',
    packages=packages,
    zip_safe=False,
    include_package_data=True,
    platforms='any',
    entry_points={
        'timegate.handlers': [
            'arxiv = timegate.examples.arxiv:ArxivHandler',
            'aueb = timegate.examples.aueb:AuebHandler',
            'can = timegate.examples.can:CanHandler',
            'cat = timegate.examples.cat:CatHandler',
            'cr = timegate.examples.cr:CrHandler',
            'es = timegate.examples.es:EsHandler',
            'github = timegate.examples.github:GithubHandler',
            'gitlab = timegate.examples.gitlab:GitlabHandler',
            'loc = timegate.examples.loc:LocHandler',
            'mediawiki = timegate.examples.mediawiki:MediawikiHandler',
            'nara = timegate.examples.nara:NaraHandler',
            'orain = timegate.examples.orain:OrainHandler',
            'pastpages = timegate.examples.pastpages:PastpagesHandler',
            'sg = timegate.examples.sg:SgHandler',
            'si = timegate.examples.si:SiHandler',
            'simple = timegate.examples.simple:ExampleHandler',
            'w3c = timegate.examples.w3c:W3cHandler',
            'webcite = timegate.examples.webcite:WebCiteHandler',
            'wikia = timegate.examples.wikia:WikiaHandler',
            'wikipedia = timegate.examples.wikipedia:WikipediaHandler',
        ],
    },
    extras_require=extras_require,
    install_requires=install_requires,
    setup_requires=setup_requires,
    tests_require=tests_require,
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Environment :: Web Environment',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: BSD License',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python',
        'Topic :: Internet :: WWW/HTTP :: Dynamic Content',
        'Topic :: Software Development :: Libraries :: Python Modules'
    ],
)
def test_timegate_response(client):
    """Test timegate responses."""
    resource_a = '/timegate/http://www.example.com/resourceA'
    # (request path, Accept-Datetime header or None, expected Location)
    scenarios = [
        (resource_a, None,
         'http://www.example.com/resourceA_v3'),
        (resource_a, 'Mon, 01 Jan 1999 00:00:00 GMT',
         'http://www.example.com/resourceA_v1'),
        (resource_a, 'Mon, 01 Jan 2010 00:00:00 GMT',
         'http://www.example.com/resourceA_v1'),
        ('/timegate/http://www.example.com/resource%20space', None,
         'http://www.example.com/space'),
    ]

    for path, accept_datetime, expected_location in scenarios:
        if accept_datetime is None:
            response = client.get(path)
        else:
            response = client.get(
                path, headers=[('Accept-Datetime', accept_datetime), ])
        assert response.status_code == 302
        assert response.headers['Location'] == expected_location


def test_closest_match(app):
    """Test closest match."""
    from werkzeug.test import Client
    from werkzeug.wrappers import BaseResponse

    # Snapshot archives pick the absolute closest Memento.
    app.config['RESOURCE_TYPE'] = 'snapshot'
    client = Client(app, BaseResponse)

    expectations = (
        ('Mon, 01 Jan 2010 00:00:00 GMT',
         'http://www.example.com/resourceA_v2'),
        ('Mon, 01 Jan 2100 00:00:00 GMT',
         'http://www.example.com/resourceA_v3'),
    )
    for accept_datetime, expected_location in expectations:
        response = client.get(
            '/timegate/http://www.example.com/resourceA',
            headers=[('Accept-Datetime', accept_datetime), ],
        )
        assert response.status_code == 302
        assert response.headers['Location'] == expected_location


@pytest.mark.parametrize('value,result', [
    ('', ''),
    ('/', '/'),
    ('#', ''),
])
def test_uri_validation(value, result):
    """Test URI validation."""
    from timegate.utils import validate_uristr

    validated = validate_uristr(value)
    assert result == validated


def test_uri_validation_exceptions():
    """Test URI validation exceptions."""
    from timegate.utils import validate_uristr

    # ``None`` is not a URI string and must be rejected.
    with pytest.raises(Exception):
        validate_uristr(None)
class Handler(object):
    """Base class for TimeGate handlers.

    Provides the shared :meth:`request` helper that issues an HTTP GET and
    turns transport failures into :class:`HandlerError`.
    """

    # Disables all 'requests' module event logs that are at least not WARNINGS
    logging.getLogger('requests').setLevel(logging.WARNING)

    def request(self, resource, timeout=API_TIME_OUT, **kwargs):
        """Handler helper function.

        Requests the resource over HTTP. Logs the request and handles
        exceptions.

        :param resource: The resource to get.
        :param timeout: The HTTP Timeout for a single request.
        :param kwargs: The keywords arguments to pass to the request method
            (``params``, ``headers``, ...). ``params`` will have their
            special characters escaped using %-encoding. Do not pass
            already-encoded chars. ``headers`` are merged on top of the
            configured ``User-Agent`` header.
        :return: A requests response object.
        :raises HandlerError: if the requests fails to access the API.
        """
        uri = resource
        config = Config(None)
        user_agent = config.get("USER_AGENT")
        headers = {}
        if user_agent:
            headers["User-Agent"] = user_agent
        # Merge caller headers instead of forwarding them through **kwargs:
        # passing ``headers`` twice to ``requests.get`` raises a TypeError,
        # which used to surface as a 502 for every such call.
        headers.update(kwargs.pop('headers', None) or {})

        # Request logging with params. Logging is best effort: a failure to
        # render the parameters must never prevent the request itself.
        params = kwargs.get('params')
        try:
            if params is None:
                raise KeyError('params')
            logging.info('Sending request for %s?%s' % (
                uri, '&'.join(
                    '%s=%s' % (quote(str(key)), quote(str(value)))
                    for key, value in params.items())))
        except Exception:
            # No 'params', or params that cannot be rendered.
            logging.info('Sending request for %s' % uri)

        try:
            req = requests.get(uri, timeout=timeout, headers=headers, **kwargs)
        except Exception as e:
            logging.error('Cannot request server (%s): %s' % (uri, e))
            raise HandlerError('Cannot request version server.', 502)

        if req is None:
            logging.error('Error requesting server (%s): no response' % uri)
            raise HandlerError('Error requesting version server.', 404)

        if not req:
            # A falsy response means a non-2XX status; it is still returned
            # so that the handler can inspect it.
            logging.info('Response other than 2XX: %s' % req)
        return req


def parsed_request(handler_function, *args, **kwargs):
    """Retrieve and parse the response from the ``Handler``.

    This function is the point of entry to all handler requests.

    :param handler_function: The function to call.
    :param args: Arguments to :handler_function:
    :param kwargs: Keywords arguments to :handler_function:
    :return: A sorted [(URI_str, date_obj),...] list of all Mementos.
        In the response, and all URIs/dates are valid.
    :raise HandlerError: In case of a bad response from the handler.
    """
    try:
        handler_response = handler_function(*args, **kwargs)
    except HandlerError as he:
        logging.info('Handler raised HandlerError %s' % he)
        raise  # HandlerErrors have return data; keep the traceback.
    except Exception as e:
        logging.error('Handler raised exception %s' % e)
        raise HandlerError('Error in Handler', 503)

    # Input check
    if not handler_response:
        raise HandlerError('Not Found: Handler response Empty.', 404)
    elif isinstance(handler_response, tuple):
        # A single (uri, date) pair is normalised to a one-element list.
        handler_response = [handler_response]
    elif not (isinstance(handler_response, list) and
              isinstance(handler_response[0], tuple)):
        logging.error('Bad response from Handler: Not a tuple nor tuple array')
        raise HandlerError('Bad handler response.', 503)
    elif len(handler_response) > TM_MAX_SIZE:
        logging.warning(
            'Bad response from Handler: TimeMap (%d greater than max %d)' %
            (len(handler_response), TM_MAX_SIZE))
        raise HandlerError('Handler response too big and unprocessable.', 502)

    valid_response = [(
        timegate_utils.validate_uristr(url),
        timegate_utils.validate_date(date)
    ) for (url, date) in handler_response]
    # Sort by datetime
    return sorted(valid_response, key=itemgetter(1))
def validate_uristr(uristr):
    """Control and validate the uri string.

    :param uristr: The uri string that needs to be verified.
    :return: The validated uri string.
    :raises ValueError: If ``uristr`` is ``None``.
    """
    if uristr is None:
        raise ValueError('URI can not be None')
    return str(urlparse(uristr).geturl())


def validate_date(datestr):
    """Control and validate the date string.

    :param datestr: The date string representation.
    :return: The datetime object form the parsed date string.
    """
    # NOTE(review): ``replace`` relabels the parsed value as UTC without
    # converting it; a date string carrying another offset keeps its
    # wall-clock fields. Confirm this is intended.
    return parse_datestr(datestr, fuzzy=True).replace(tzinfo=tzutc())


def best(timemap, accept_datetime, timemap_type):
    """Find best memento.

    :param timemap: A sorted, non-empty Timemap of ``(uri, datetime)`` pairs.
    :param accept_datetime: The requested datetime.
    :param timemap_type: ``'vcs'`` selects the closest memento before the
        requested datetime; any other value selects the absolutely closest.
    :return: A tuple with memento URI and its datetime.
    """
    assert timemap
    assert accept_datetime
    if timemap_type == 'vcs':
        return closest_before(timemap, accept_datetime)
    else:
        return closest(timemap, accept_datetime)


def closest(timemap, accept_datetime):
    """Find the absolutely closest memento chronologically to a datetime.

    Details of the requirements at
    http://www.mementoweb.org/guide/rfc/#SpecialCases, point 4.5.3.

    :param timemap: A sorted Timemap
    :param accept_datetime: the time object for which the best memento must
        be found.
    :return: A tuple with memento URI and its datetime; ``(None, None)``
        for an empty timemap.
    """
    delta = timedelta.max
    memento_uri = None
    memento_dt = None

    for (url, dt) in timemap:
        diff = abs(accept_datetime - dt)
        if diff <= delta:  # there can be several with the same datetime.
            memento_uri = url
            memento_dt = dt
            delta = diff
        else:
            # The list is sorted and the distance started growing again.
            # It will only keep growing: return the best one seen so far.
            return (memento_uri, memento_dt)

    return (memento_uri, memento_dt)


def closest_before(timemap, accept_datetime):
    """Find the closest memento in the before a datetime.

    Details of the requirements at
    http://www.mementoweb.org/guide/rfc/#SpecialCases, point 4.5.3.

    :param timemap: A sorted Timemap.
    :param accept_datetime: The time object for which the best memento
        must be found.
    :return: A tuple with memento URI and its datetime; the first memento
        when all of them are after ``accept_datetime`` and ``(None, None)``
        for an empty timemap.
    """
    prev_uri = prev_dt = None

    for (url, dt) in timemap:
        if dt > accept_datetime:
            if prev_uri is not None:
                return (prev_uri, prev_dt)  # We passed 'accept-datetime'
            else:
                # The first of the sorted list is already after the accept
                # datetime
                return (url, dt)
        prev_uri = url
        prev_dt = dt

    return (prev_uri, prev_dt)


def _count_not_after(timemap, moment):
    """Count the leading entries of a sorted timemap dated <= ``moment``."""
    low, high = 0, len(timemap)
    while low < high:
        mid = (low + high) // 2
        if timemap[mid][1] <= moment:
            low = mid + 1
        else:
            high = mid
    return low


def closest_binary(timemap, accept_datetime):
    """Find the chronologically closest memento using binary search.

    Note the timemap **must** be a sorted list. Complexity ``O(log(n))``
    instead of ``O(n)``. Returns the same memento as :func:`closest`.
    Details of the requirements at
    http://www.mementoweb.org/guide/rfc/#SpecialCases, point 4.5.3.

    :param timemap: A sorted Timemap.
    :param accept_datetime: The time object for which the best memento
        must be found.
    :return: A tuple with memento URI and its datetime; ``(None, None)``
        for an empty timemap.
    """
    if not timemap:
        return (None, None)

    split = _count_not_after(timemap, accept_datetime)
    if split == len(timemap):
        # Nothing is dated after the requested datetime: take the last one.
        chosen = split - 1
    elif split == 0 or (timemap[split][1] - accept_datetime <=
                        accept_datetime - timemap[split - 1][1]):
        # The next memento is at least as close as the previous one. Like
        # the linear scan, ties go to the later memento, and among mementos
        # sharing that datetime the last one wins.
        chosen = _count_not_after(timemap, timemap[split][1]) - 1
    else:
        chosen = split - 1

    uri, dt = timemap[chosen]
    return (uri, dt)


def closest_before_binary(timemap, accept_datetime):
    """Find the closest memento in the past of a datetime using binary search.

    Note the timemap **must** be a sorted list. Complexity ``O(log(n))``
    instead of ``O(n)``. Returns the same memento as
    :func:`closest_before`. Details of the requirements at
    http://www.mementoweb.org/guide/rfc/#SpecialCases, point 4.5.3.

    :param timemap: A sorted Timemap.
    :param accept_datetime: The time object for which the best memento
        must be found.
    :return: A tuple with memento URI and its datetime; the first memento
        when all of them are after ``accept_datetime`` and ``(None, None)``
        for an empty timemap.
    """
    if not timemap:
        return (None, None)

    split = _count_not_after(timemap, accept_datetime)
    # split == 0 means every memento is after the requested datetime; the
    # earliest one is then the best available answer.
    uri, dt = timemap[max(split - 1, 0)]
    return (uri, dt)
22 | - If the TimeGate's algorithms that select the best Memento for a 23 | requested date do not apply to the system: Implementing the 24 | alternative function could also be used to bypass these algorithms. 25 | This is particularly useful if there are performance concerns, 26 | special cases or access restrictions for Mementos. 27 | 28 | Requirements 29 | ------------ 30 | 31 | .. image:: code_architecture.png 32 | 33 | A handler is required to have the following: 34 | 35 | - It must be a python file placed in the ``core.handler`` module (which is 36 | the ``core/handler/`` folder). And it must be unique. If several 37 | classes are needed, or to switch quickly between handlers, consider 38 | adding the handler module path manually in the configuration 39 | file. (See :ref:`configuration`.) 40 | - A handler must extend the ``timegate.handler.Handler`` 41 | base-class. 42 | - Implement at least one of the following: 43 | 44 | - ``get_all_mementos(uri_r)`` class function: This function is called 45 | by the TimeGate to retrieve the history of an original resource 46 | ``uri_r``. The parameter ``uri_r`` is a Python string representing 47 | the requested URI-R. The return value must be a list of 2-tuples: 48 | ``[(uri_m1, date1), (uri_m2, date2), ...]`` . Each pair 49 | ``(uri_m, date)`` contains the URI of an archived version of R 50 | ``uri_m``, and the date at which it was archived ``date``. 51 | - ``get_memento(uri_r, requested_date)`` class function (alternative): 52 | This function will be called by the TimeGate to retrieve the best 53 | Memento for ``uri_r`` at the date ``requested_date``. Use it if the API cannot 54 | return the entire history for a resource efficiently or to bypass the 55 | TimeGate's best Memento selection. The parameter ``uri_r`` is a 56 | Python string representing the requested URI-R. The parameter 57 | ``requested_date`` is a Python ``datetime.datetime`` object. 
In this case, the 58 | return value will contain only one 2-tuple: ``(uri_m, date)`` which 59 | is the best memento that the handler could provide taking into 60 | account the limits of the API. 61 | 62 | - Input parameters: 63 | 64 | - All parameter values ``uri_r`` are Python strings representing the 65 | user's requested URI-R. 66 | - All parameter values ``requested_date`` are ``datetime.datetime`` 67 | objects representing the user's requested datetime. 68 | 69 | - Output return values: 70 | 71 | - All return values ``uri_m`` must be strings. 72 | - All return values ``date`` must be strings representing dates. Prefer 73 | the `ISO 8601 `__ format for 74 | the dates. 75 | 76 | - Note that: 77 | 78 | - If both functions are implemented, 79 | ``get_memento(uri_r, requested_date)`` will always be used for 80 | TimeGate requests. 81 | - If the TimeMap advanced feature (see :ref:`advanced_features`) is enabled, 82 | ``get_all_mementos(uri_r)`` must be implemented. 83 | 84 | Example 85 | ------- 86 | 87 | A simple example handler is provided in ``core/handler/`` and can be 88 | edited to match your web server's requirements: 89 | `example.py `__, 90 | which returns static lists. 91 | 92 | Other handler examples are provided for real world APIs in 93 | ``core/handler_examples/``, for instance: 94 | 95 | - `arXiv.py 96 | `__ 97 | Where the Original Resources are the e-prints of http://arxiv.org/ 98 | - `wikipedia.py 99 | `__ 100 | Where the Original Resources are the articles of https://www.wikipedia.org/ 101 | - `github.py 102 | `__ 103 | Where the Original Resources are the repositories, trees (branches and 104 | directories), files and raw files. 
def iso_to_dt(date):
    """Convert an ISO 8601 timestamp string to a ``date_str`` string.

    The fields are read at fixed offsets (``YYYY-MM-DDTHH:MM:SS``) and
    interpreted as UTC. The previous implementation went through
    ``time.mktime``, which interprets the fields as *local* time and so
    shifted every timestamp by the server's UTC offset.

    :param date: The timestamp string, as found in the ``timestamp``
        attribute of a revision.
    :return: Whatever ``date_str`` returns for the UTC datetime.
    """
    return date_str(
        datetime(
            int(date[:4]), int(date[5:7]), int(date[8:10]),
            int(date[11:13]), int(date[14:16]), int(date[17:19]),
            tzinfo=tzutc()))


class WikiaHandler(Handler):
    """Handler for Wikia wikis, queried through their ``api.php``."""

    def __init__(self):
        Handler.__init__(self)

        # Hosts served in addition to every ``*.wikia.com`` host.
        self.hosts = [
            'www.wowwiki.com',
            'en.memory-alpha.org',
            'wiki.ffxiclopedia.org',
            'www.jedipedia.de'
        ]

    def get_memento(self, req_url, dt):
        """Collect the revisions around ``dt`` for the requested article.

        Four API queries are made (revisions at/before ``dt``, at/after
        ``dt``, the last and the first revision) and every revision they
        return is reported.

        :param req_url: The requested URI-R.
        :param dt: The requested ``datetime``.
        :return: A list of ``(uri_m, timestamp_string)`` tuples, or ``None``
            when the host is not a supported wiki.
        :raises HandlerError: If no article title can be found in the URI.
        """
        p = urlparse(req_url)
        host = p[1]
        upath = p[2]

        if host.find('.wikia.com') == -1 and host not in self.hosts:
            return

        exploded_path = upath.rsplit('/', 1)
        if len(exploded_path) < 2:
            raise HandlerError("No article title found in requested URI.", 404)
        (pref, title) = exploded_path
        if pref:
            # look for /wiki
            pref = pref.replace('/wiki', '')

        default_protocol = "http://"
        dt = dt.strftime("%Y%m%d%H%M%S")

        # NOTE(review): the 'newer' query starts at ``dt`` itself, exactly
        # as before; the original computed ``dt + 1s`` but never used it.
        # Confirm which start point is intended.
        urls = [
            # the memento and the one before it
            "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=older&rvstart=%s" % (
                default_protocol, host, title, dt),
            # next
            "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=newer&rvstart=%s" % (
                default_protocol, host, title, dt),
            # last
            "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&titles=%s" % (
                default_protocol, host, title),
            # first
            "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&rvdir=newer&titles=%s" % (
                default_protocol, host, title),
        ]

        base = "%s%s%s/index.php?title=%s&oldid=" % \
            (default_protocol, host, pref, title)

        hdrs = {'Host': host}

        changes = []
        for url in urls:
            dom = self.get_xml(url, headers=hdrs)
            for rev in dom.xpath('//rev'):
                changes.append(
                    (base + rev.attrib['revid'], rev.attrib['timestamp']))

        return changes

    def get_all_mementos(self, req_url):
        """List every revision of the requested article.

        :param req_url: The requested URI-R.
        :return: A list of ``(uri_m, date_string)`` tuples, or ``None`` when
            the host is not a supported wiki.
        :raises HandlerError: If no article title can be found in the URI.
        """
        # http://www.wowwiki.com/Cloth_armor --> /api.php
        # http://dragonage.wikia.com/wiki/Morrigan --> /api.php
        # http://memory-alpha.org/en/wiki/Fraggle_Rock --> /en/api.php

        p = urlparse(req_url)
        host = p[1]
        upath = p[2]

        if host.find('.wikia.com') == -1 and host not in self.hosts:
            return

        exploded_path = upath.rsplit('/', 1)
        if len(exploded_path) < 2:
            # Same answer as get_memento instead of an unpacking ValueError.
            raise HandlerError("No article title found in requested URI.", 404)
        (pref, title) = exploded_path
        if pref:
            # look for /wiki
            pref = pref.replace('/wiki', '')

        url = "http://%s%s/api.php?format=xml&action=query&prop=revisions&meta=siteinfo&rvprop=timestamp|ids&rvlimit=500&redirects=1&titles=%s" % (
            host, pref, title)

        changes = []
        base = "http://%s%s/index.php?oldid=" % (host, pref)

        headers = {}
        dom = self.get_xml(url, headers=headers)
        while dom is not None:
            for rev in dom.xpath('//rev'):
                dtstr = iso_to_dt(rev.attrib['timestamp'])
                changes.append((base + rev.attrib['revid'], dtstr))
            # Follow the continuation marker until the API stops sending one.
            cont = dom.xpath('/api/query-continue/revisions/@rvstartid')
            if cont:
                dom = self.get_xml(url + "&rvstartid=" +
                                   cont[0], headers=headers)
            else:
                dom = None
        return changes

    def get_xml(self, uri, html=False, headers=None):
        """Retrieve the resource and parse it as XML or HTML.

        :param uri: The uri to retrieve.
        :param html: Parse the response as HTML instead of XML.
        :param headers: Optional http headers to send in the request.
        :return: The parsed DOM.
        :raises HandlerError: If the response cannot be parsed.
        """
        page = self.request(uri, headers=headers)
        try:
            page_data = page.content
            if not html:
                parser = etree.XMLParser(recover=True)
            else:
                parser = etree.HTMLParser(recover=True)
            return etree.parse(StringIO.StringIO(page_data), parser)
        except Exception:
            logging.error("Cannot parse XML/HTML from %s" % uri)
            raise HandlerError("Couldn't parse data from %s" % uri, 404)
class OrainHandler(Handler):
    """Handler for wikis hosted on orain.org, queried through ``api.php``."""

    def __init__(self):
        Handler.__init__(self)
        self.TIMESTAMPFMT = '%Y%m%d%H%M%S'
        self.hosts = [".orain.org"]

    def get_memento(self, req_uri, accept_datetime):
        """Find the revision of ``req_uri`` closest before a datetime.

        The API endpoint is discovered by scraping the ``EditURI`` link of
        the requested page.

        :param req_uri: The requested URI-R.
        :param accept_datetime: The requested ``datetime``.
        :return: A list of ``(uri_m, timestamp)`` tuples, or ``None`` when
            the host is not supported or the page cannot be analysed.
        :raises HandlerError: If the title or the API cannot be found.
        """
        logging.debug("Begin Fetching mementos for: %s" % req_uri)

        p = urlparse.urlparse(req_uri)
        host = p[1]

        # Serve the request when the host matches any of the known
        # suffixes (the previous loop required it to match all of them).
        if not any(host.find(h) != -1 for h in self.hosts):
            return

        timestamp = date_str(accept_datetime, self.TIMESTAMPFMT)
        params = {
            'rvlimit': 1,  # Only need one
            'rvstart': timestamp,  # Start listing from here
            'rvdir': 'older'  # List in decreasing order
        }

        # Finds the API and title using scraping
        api_base_uri = None
        try:
            dom = self.get_xml(req_uri, html=True)
            links = dom.xpath("//link")
            for link in links:
                # A <link> without ``rel`` is skipped instead of aborting.
                if link.attrib.get('rel', '').lower() == "edituri":
                    api_base_uri = link.attrib['href'].split("?")[0]
                    if api_base_uri.startswith("//"):
                        # Protocol-relative URL: only add the scheme.
                        api_base_uri = "http:" + api_base_uri
            parsed_url = urlparse.urlparse(req_uri)
            try:
                title = urlparse.parse_qs(parsed_url[4])['title'][0]
            except (KeyError, IndexError):
                # No ?title= parameter: use the last path segment.
                title = parsed_url.path.split('/')[-1]
            logging.debug("Orain handler: API found: %s, page title parsed to: %s " % (api_base_uri, title))
            if not title:
                raise HandlerError("Cannot find Title", 404)
            if not api_base_uri:
                raise HandlerError("Cannot find orain API on page", 404)
            else:
                title = urllib2.unquote(title)

        except HandlerError:
            raise
        except Exception as e:
            logging.error("OrainHandler: querying and parsing page for title/api %s. handler will return empty response" % e)
            return None

        base_uri = api_base_uri.replace("api.php", "index.php")

        return self.query(req_uri, params, title, api_base_uri, base_uri,
                          max_results=1)

    def query(self, req_uri, req_params, title, api_base_uri, base_uri,
              max_results=None):
        """Query the revisions API, following continuations.

        :param req_uri: The requested URI-R.
        :param req_params: Extra API parameters (``rvlimit``, ``rvstart``,
            ``rvdir``). ``rvdir`` is flipped in place from ``'older'`` to
            ``'newer'`` when the first direction yields no revision.
        :param title: The page title.
        :param api_base_uri: The ``api.php`` endpoint.
        :param base_uri: The ``index.php`` endpoint used to build URI-Ms.
        :param max_results: Stop following continuations once this many
            revisions were collected; ``None`` collects everything.
        :return: A list of ``(uri_m, timestamp)`` tuples.
        :raises HandlerError: If the API gives no usable answer.
        """
        params = {
            'action': 'query',
            'format': 'json',
            'prop': 'revisions',
            'rvprop': 'ids|timestamp',
            'indexpageids': '',
            'titles': title
        }
        params.update(req_params)

        # Does sequential queries to get all revisions IDs and Timestamps
        queries_results = []
        continuation = {}
        while True:
            # Clone original request and apply the values of the last
            # 'continue' section. They used to be written to a copy that was
            # thrown away, so the same request was repeated forever.
            newparams = params.copy()
            newparams.update(continuation)
            req = self.request(api_base_uri, params=newparams)
            try:
                result = req.json()
            except Exception:
                logging.error("No JSON can be decoded from API %s" % api_base_uri)
                raise HandlerError("No API answer.", 404)
            if 'error' in result:
                raise HandlerError(result['error'])
            try:
                # The request was successful
                pid = result['query']['pageids'][0]  # the JSON key of the page (only one)
                queries_results += result['query']['pages'][pid]['revisions']
                if ('missing' in result['query']['pages'][pid] or
                        'invalid' in result['query']['pages'][pid]):
                    raise HandlerError("Cannot find resource on version server.", 404)
            except Exception:
                # No revision in that direction: retry once going forward.
                if req_params.get('rvdir') == 'older':
                    req_params['rvdir'] = 'newer'
                    return self.query(req_uri, req_params, title,
                                      api_base_uri, base_uri,
                                      max_results=max_results)
                else:
                    raise HandlerError("No revision returned from API.", 404)
            if 'continue' not in result:
                break
            if max_results is not None and len(queries_results) >= max_results:
                break
            # The response was truncated, the rest can be obtained using
            # the values returned in the 'continue' section.
            continuation = result['continue']

        # Processing list
        def f(rev):
            rev_uri = base_uri + '?title=%s&oldid=%d' % (
                urllib2.quote(title), rev['revid'])
            dt = rev['timestamp']
            return (rev_uri, dt)

        return [f(rev) for rev in queries_results]

    def get_xml(self, uri, html=False):
        """Retrieve the resource using the url and parse it as XML or HTML.

        :param uri: [str] The uri to retrieve
        :param html: [bool] optional flag to parse the response as HTML
        :return: [lxml_obj] parsed dom.
        :raises HandlerError: If the resource cannot be fetched or parsed.
        """
        try:
            page = self.request(uri)
        except HandlerError as he:
            raise HandlerError(he, status=404)

        try:
            page_data = page.content
            if not html:
                parser = etree.XMLParser(recover=True)
            else:
                parser = etree.HTMLParser(recover=True)
            return etree.parse(StringIO.StringIO(page_data), parser)
        except Exception:
            logging.error("Cannot parse XML/HTML from %s" % uri)
            raise HandlerError("Couldn't parse data from %s" % uri, 404)
class MediaWikiHandler(Handler):
    """Generic handler for MediaWiki sites, queried through ``api.php``."""

    def __init__(self):
        Handler.__init__(self)
        self.TIMESTAMPFMT = '%Y%m%d%H%M%S'

    # def getall(self, uri):
    #     params = {
    #         'rvlimit': 500,  # Max allowed
    #         'continue': ''  # The initial continue value is empty
    #     }
    #
    #     return self.query(uri, params)

    def get_memento(self, req_uri, accept_datetime):
        """Find the revision of ``req_uri`` closest before a datetime.

        The API endpoint is discovered by scraping the ``EditURI`` link of
        the requested page.

        :param req_uri: The requested URI-R.
        :param accept_datetime: The requested ``datetime``.
        :return: A list of ``(uri_m, timestamp)`` tuples, or ``None`` when
            the page cannot be analysed.
        :raises HandlerError: If the title or the API cannot be found.
        """
        timestamp = date_str(accept_datetime, self.TIMESTAMPFMT)
        params = {
            'rvlimit': 1,  # Only need one
            'rvstart': timestamp,  # Start listing from here
            'rvdir': 'older'  # List in decreasing order
        }

        # Finds the API and title using scraping
        api_base_uri = None
        try:
            dom = self.get_xml(req_uri, html=True)
            links = dom.xpath("//link")
            for link in links:
                # A <link> without ``rel`` is skipped instead of aborting.
                if link.attrib.get('rel', '').lower() == "edituri":
                    api_base_uri = link.attrib['href'].split("?")[0]
                    if api_base_uri.startswith("//"):
                        # Protocol-relative URL: only add the scheme.
                        api_base_uri = "http:" + api_base_uri
            parsed_url = urlparse.urlparse(req_uri)
            try:
                title = urlparse.parse_qs(parsed_url[4])['title'][0]
            except (KeyError, IndexError):
                # No ?title= parameter: use the last path segment.
                title = parsed_url.path.split('/')[-1]
            logging.debug(
                "Mediawiki handler: API found: %s, page title parsed to: %s " %
                (api_base_uri, title))
            if not title:
                raise HandlerError("Cannot find Title", 404)
            if not api_base_uri:
                raise HandlerError("Cannot find mediawiki API on page", 404)
            else:
                title = urllib2.unquote(title)

        except HandlerError:
            raise
        except Exception as e:
            logging.error(
                "MediaWikiHandler: querying and parsing page for title/api "
                "%s. handler will return empty response" % e
            )
            return None

        base_uri = api_base_uri.replace("api.php", "index.php")

        return self.query(req_uri, params, title, api_base_uri, base_uri,
                          max_results=1)

    def query(self, req_uri, req_params, title, api_base_uri, base_uri,
              max_results=None):
        """Query the revisions API, following continuations.

        :param req_uri: The requested URI-R.
        :param req_params: Extra API parameters (``rvlimit``, ``rvstart``,
            ``rvdir``). ``rvdir`` is flipped in place from ``'older'`` to
            ``'newer'`` when the first direction yields no revision.
        :param title: The page title.
        :param api_base_uri: The ``api.php`` endpoint.
        :param base_uri: The ``index.php`` endpoint used to build URI-Ms.
        :param max_results: Stop following continuations once this many
            revisions were collected; ``None`` collects everything.
        :return: A list of ``(uri_m, timestamp)`` tuples.
        :raises HandlerError: If the API gives no usable answer.
        """
        params = {
            'action': 'query',
            'format': 'json',
            'prop': 'revisions',
            'rvprop': 'ids|timestamp',
            'indexpageids': '',
            'titles': title
        }
        params.update(req_params)

        # Does sequential queries to get all revisions IDs and Timestamps
        queries_results = []
        continuation = {}
        while True:
            # Clone original request and apply the values of the last
            # 'continue' section. They used to be written to a copy that was
            # thrown away, so the same request was repeated forever.
            newparams = params.copy()
            newparams.update(continuation)
            req = self.request(api_base_uri, params=newparams)
            try:
                result = req.json()
            except Exception:
                logging.error("No JSON can be decoded from API %s" %
                              api_base_uri)
                raise HandlerError("No API answer.", 404)
            if 'error' in result:
                raise HandlerError(result['error'])
            try:
                # The request was successful
                # the JSON key of the page (only one)
                pid = result['query']['pageids'][0]
                queries_results += result['query']['pages'][pid]['revisions']
                if ('missing' in result['query']['pages'][pid] or
                        'invalid' in result['query']['pages'][pid]):
                    raise HandlerError(
                        "Cannot find resource on version server.", 404)
            except Exception:
                # No revision in that direction: retry once going forward.
                if req_params.get('rvdir') == 'older':
                    req_params['rvdir'] = 'newer'
                    return self.query(
                        req_uri, req_params, title, api_base_uri, base_uri,
                        max_results=max_results)
                else:
                    raise HandlerError("No revision returned from API.", 404)
            if 'continue' not in result:
                break
            if max_results is not None and len(queries_results) >= max_results:
                break
            # The response was truncated, the rest can be obtained using
            # the values returned in the 'continue' section.
            continuation = result['continue']

        # Processing list
        def f(rev):
            rev_uri = base_uri + '?title=%s&oldid=%d' % (
                urllib2.quote(title), rev['revid'])
            dt = rev['timestamp']
            return (rev_uri, dt)

        return [f(rev) for rev in queries_results]

    def get_xml(self, uri, html=False):
        """Retrieve the resource using the url and parse it as XML or HTML.

        :param uri: [str] The uri to retrieve
        :param html: [bool] optional flag to parse the response as HTML
        :return: [lxml_obj] parsed DOM.
        :raises HandlerError: If the resource cannot be fetched or parsed.
        """
        try:
            page = self.request(uri)
        except HandlerError as he:
            raise HandlerError(he, status=404)

        try:
            page_data = page.content
            if not html:
                parser = etree.XMLParser(recover=True)
            else:
                parser = etree.HTMLParser(recover=True)
            return etree.parse(StringIO.StringIO(page_data), parser)
        except Exception:
            logging.error("Cannot parse XML/HTML from %s" % uri)
            raise HandlerError("Couldn't parse data from %s" % uri, 404)
class PastpagesHandler(Handler):
    """TimeGate handler backed by the pastpages.org JSON API.

    When the handler is created it downloads the list of sites known to the
    API (``/api/beta/sites/``) and keeps it in ``self.pages_list`` as
    ``(url, slug)`` pairs. Memento requests are resolved against that list.
    """

    def __init__(self):
        """Set the API constants and load the list of archived websites.

        Any exception raised while downloading the site list is logged as
        critical and re-raised unchanged.
        """
        Handler.__init__(self)
        self.LIMIT_MAX = 100
        self.BASE = 'http://www.pastpages.org'
        self.API_TIMEFMT = '%Y-%m-%dT%H:%M:%S'
        self.FIRST_DATE = datetime(2012, 4, 27).strftime(self.API_TIMEFMT)

        # List of ('uri', 'slug') pairs, one per website known to the API.
        self.pages_list = []

        try:
            # 'url' and 'slug' are the URI and the short name of a website.
            self.pages_list.extend(
                (obj['url'], obj['slug'])
                for obj in self._iter_objects(
                    '/api/beta/sites/', {'limit': self.LIMIT_MAX})
            )
        except Exception:
            logging.critical("Cannot create the handler's page list:")
            # Bare raise keeps the original traceback.
            raise

        logging.info("Found %s websites on pastpages' API." %
                     len(self.pages_list))

    def _iter_objects(self, path, params):
        """Yield every entry of 'objects' across all result pages.

        :param path: API path of the first page (appended to ``self.BASE``).
        :param params: Query parameters for the first page only.
        """
        while path is not None:
            json_response = self.request(
                self.BASE + path, params=params).json()
            for obj in json_response['objects']:
                yield obj
            # 'meta.next' is non-null while there is a continuation; it
            # already contains &limit and &offset, so no params are re-sent.
            path = json_response['meta']['next']
            params = None

    def _site_slug(self, uri_r):
        """Return the slug of the single archived site that *uri_r* is under.

        A trailing slash is appended before matching so that a bare site
        URI still matches a stored URL that ends with '/'.

        :raises HandlerError: 404 when no site matches; a generic error
            when more than one site matches.
        """
        uri_r = uri_r + '/'
        matches = [page for page in self.pages_list
                   if uri_r.startswith(page[0])]
        if not matches:
            raise HandlerError(
                "Pastpages does not have archives of that website.", 404)
        if len(matches) > 1:
            logging.error("Uri conflict in pastpages' API URI list.")
            raise HandlerError("Error in pastpages API")
        return matches[0][1]

    def get_memento(self, uri_r, req_datetime):
        """Return one screenshot taken at or before *req_datetime*.

        :param uri_r: URI of the original resource.
        :param req_datetime: Requested datetime; formatted with
            ``self.API_TIMEFMT`` and sent as ``timestamp__lte``.
        :return: ``(memento_uri, timestamp)`` tuple, or ``None`` when the
            API reports an error or returns no object.
        :raises HandlerError: when *uri_r* is not a unique archived site.
        """
        params = {
            'limit': 1,
            'site__slug': self._site_slug(uri_r),
            'timestamp__lte': req_datetime.strftime(self.API_TIMEFMT)
        }
        request = '/api/beta/screenshots/'

        json_response = self.request(self.BASE + request, params=params).json()
        if 'error' in json_response:
            logging.error("Error in pastpages response: " +
                          str(json_response['error']))
            return None

        # 'objects' is the list of responses; 'absolute_url' is the
        # screenshot path relative to the API base.
        result_list = [
            (self.BASE + obj['absolute_url'], obj['timestamp'])
            for obj in json_response['objects']
        ]
        if not result_list:
            # No memento at or before the requested date.
            return None
        if len(result_list) > 1:
            logging.error(
                "API returned more than one object. returning the first")
        return result_list[0]

    def get_all_mementos(self, uri_r):
        """Return every screenshot that has an image for *uri_r*.

        This walks all result pages of the screenshots API and is expected
        to be slow; see the warning logged below.

        :param uri_r: URI of the original resource.
        :return: list of ``(memento_uri, timestamp)`` tuples.
        :raises HandlerError: when *uri_r* is not a unique archived site.
        """
        # WILL BE TOO SLOW. TOO MANY WEBSITES
        logging.warning(
            "Get_all_mementos used: Pastpages will probably have too big "
            "timemaps. Expect Timeouts")

        params = {
            'limit': self.LIMIT_MAX,
            'site__slug': self._site_slug(uri_r)
        }
        # Only objects flagged 'has_image' are kept.
        return [
            (self.BASE + obj['absolute_url'], obj['timestamp'])
            for obj in self._iter_objects('/api/beta/screenshots/', params)
            if obj['has_image']
        ]
class Cache(object):
    """Base class for TimeGate caches.

    Values are stored through a ``FileSystemCache`` backend as
    ``(timestamp, timemap)`` pairs keyed by the URI-R of the resource.
    """

    def __init__(self, path, tolerance, expiration, max_values,
                 run_tests=True, max_file_size=0):
        """Constructor method.

        :param path: The path of the cache database directory.
        :param tolerance: The tolerance, in seconds, within which a TimeMap
            is considered young enough to be used as is.
        :param expiration: How long, in seconds, cache entries are kept by
            the backend (passed as its default timeout).
        :param max_values: The maximum number of TimeMaps stored in cache
            before some are deleted.
        :param run_tests: (Optional) Tests the cache at initialization.
        :param max_file_size: (Optional) The maximum size (in Bytes) for a
            TimeMap cache value. When max_file_size=0, there is no limit to
            a cache value. When max_file_size=X > 0, the cache will not
            store TimeMap that require more than X Bytes on disk.
        :raises CacheError: When a numeric parameter is not > 0, or when
            the initial self-test fails.
        """
        # Parameters Check
        if tolerance <= 0 or expiration <= 0 or max_values <= 0:
            raise CacheError('Cannot create cache: all parameters must be > 0')

        self.tolerance = relativedelta(seconds=tolerance)
        self.path = path.rstrip('/')
        self.max_file_size = max(max_file_size, 0)
        self.CHECK_SIZE = self.max_file_size > 0
        self.max_values = max_values
        self.backend = FileSystemCache(path,
                                       threshold=self.max_values,
                                       default_timeout=expiration)

        # Testing cache
        if run_tests:
            self._self_test()

        logging.debug(
            'Cache created. max_files = %d. Expiration = %d. '
            'max_file_size = %d' % (
                self.max_values, expiration, self.max_file_size))

    def _self_test(self):
        """Store, size-check, read back and remove a throw-away value.

        Explicit checks are used instead of ``assert`` so the test still
        runs when Python is started with ``-O``.

        :raises CacheError: When any step of the round trip fails.
        """
        key = b'1'
        val = 1
        try:
            self.backend.set(key, val)
            if self.CHECK_SIZE and not self._check_size(key) > 0:
                raise ValueError(
                    'test value is missing on disk or exceeds max_file_size')
            if self.backend.get(key) != val:
                raise ValueError('test value could not be read back')
            os.remove(os.path.join(self.path, md5(key).hexdigest()))
        except Exception as e:
            raise CacheError('Error testing cache: %s' % e)

    def get_until(self, uri_r, date):
        """Return the cached TimeMap if it is recent enough for *date*.

        :param uri_r: The URI-R of the resource as a string.
        :param date: The target date. It is the accept-datetime for TimeGate
            requests, and the current date for TimeMap requests.
        :return: The cached ``[(memento_uri_string, datetime_obj),...]``
            list when a value exists and *date* is not later than the time
            it was stored plus ``self.tolerance``; None otherwise, including
            when the backend raises while loading.
        """
        try:
            val = self.backend.get(uri_r)
        except Exception as e:
            logging.error('Exception loading cache content: %s' % e)
            return None

        if not val:
            # Cache MISS: No value
            logging.info('Cache MISS: No cached value for %s' % uri_r)
            return None

        # There is a value in the cache
        timestamp, timemap = val
        logging.info('Cached value exists for %s' % uri_r)
        if date > timestamp + self.tolerance:
            logging.info('Cache MISS: value outdated for %s' % uri_r)
            return None

        logging.info('Cache HIT: found value for %s' % uri_r)
        return timemap

    def get_all(self, uri_r):
        """Request the whole TimeMap for that uri.

        :param uri_r: the URI-R of the resource.
        :return: [(memento_uri_string, datetime_obj),...] list if it is in
            cache and if it is within the cache tolerance of the current
            UTC time, None otherwise.
        """
        until = datetime.utcnow().replace(tzinfo=tzutc())
        return self.get_until(uri_r, until)

    def set(self, uri_r, timemap):
        """Set the cached TimeMap for that URI-R.

        The value is stored together with the UTC time of storage. When a
        maximum file size is configured, an oversized value is removed
        again right after being written. Errors are logged, not raised.

        :param uri_r: The URI-R of the original resource.
        :param timemap: The value to cache.
        :return: None.
        """
        logging.info('Updating cache for %s' % uri_r)
        timestamp = datetime.utcnow().replace(tzinfo=tzutc())
        try:
            self.backend.set(uri_r, (timestamp, timemap))
            if self.CHECK_SIZE:
                self._check_size(uri_r)
        except Exception as e:
            logging.error('Error setting cache value: %s' % e)

    def _check_size(self, key, delete=True):
        """Check the size that a specific TimeMap value is using on disk.

        The file is looked up as ``<self.path>/<md5 hex digest of key>``,
        mirroring the "werkzeug key" derivation the original code relied on.

        :param key: The TimeMap original resource (text or bytes).
        :param delete: (Optional) When true, a value bigger than
            ``self.max_file_size`` is deleted. Else only a warning is logged.
        :return: The size of the value on disk; 0 if it was deleted or if
            the size could not be determined.
        """
        try:
            # md5 only accepts bytes on Python 3, so encode text keys first.
            raw_key = key if isinstance(key, bytes) else key.encode('utf-8')
            fpath = os.path.join(self.path, md5(raw_key).hexdigest())
            size = os.path.getsize(fpath)
            if size > self.max_file_size:
                message = ('Cache value too big (%dB, max %dB) '
                           'for the TimeMap of %s')
                # Capture the real size before it is reset for the return.
                details = (size, self.max_file_size, key)
                if delete:
                    message += '. Deleting cached value.'
                    os.remove(fpath)
                    size = 0
                logging.warning(message % details)
            return size
        except Exception as e:
            logging.error(
                'Exception checking cache value size for TimeMap of %s '
                'Exception: %s' % (key, e))
            return 0
devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 2> nul 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 
87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\TimeGate.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\TimeGate.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 
149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 
217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. 
Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make <target>' where <target> is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make
Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/TimeGate.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/TimeGate.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. 
The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/TimeGate" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/TimeGate" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. 
The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
class WikipediaHandler(Handler):
    """TimeGate handler for MediaWiki-based sites such as Wikipedia.

    The MediaWiki API endpoint is discovered by scraping the requested page
    for its ``<link rel="EditURI">`` element. Revisions are then listed
    through the API's ``prop=revisions`` query.
    """

    def __init__(self):
        Handler.__init__(self)
        self.TIMESTAMPFMT = '%Y%m%d%H%M%S'

        # Storing first mementos, keyed by (api_base_uri, title) so that
        # pages with the same title on different wikis do not collide.
        self.inner_cache = {}
        self.max_inner_cache_size = 100000

    def get_memento(self, req_uri, accept_datetime):
        """Return the first memento and the best memento for *req_uri*.

        :param req_uri: URI of the wiki page.
        :param accept_datetime: Requested datetime of the memento.
        :return: ``[first, memento]`` where each item is a
            ``(revision_uri, timestamp)`` tuple, or ``None`` when the page
            could not be fetched or scraped for an unexpected reason.
        :raises HandlerError: 404 when the title or the API endpoint cannot
            be found, or when the API returns no revision.
        """
        timestamp = date_str(accept_datetime, self.TIMESTAMPFMT)
        params = {
            'rvlimit': 1,  # Only need one
            'rvstart': timestamp,  # Start listing from here
            'rvdir': 'older'  # List in decreasing order
        }

        # Finds the API and title using scraping
        try:
            api_base_uri, title = self._find_api_and_title(req_uri)
        except HandlerError:
            raise
        except Exception as e:
            logging.error(
                "MediaWikiHandler: querying and parsing page for title/api %s."
                " Handler will return empty response." % e)
            return None

        base_uri = api_base_uri.replace("api.php", "index.php")

        # The best Memento
        memento = self.query(req_uri, params, title, api_base_uri, base_uri,
                             limit=1)[0]

        # The first Memento
        cache_key = (api_base_uri, title)
        if cache_key in self.inner_cache:
            logging.debug("Wiki Handler: found cached first for " + title)
            first = self.inner_cache[cache_key]
        else:
            logging.debug("Wiki Handler: Querying first for " + title)
            first_params = {
                'rvlimit': 1,  # Only need one
                'rvstart': '19900101000000',  # Start listing from 1990
                'rvdir': 'newer'  # List in increasing order
            }
            first = self.query(req_uri, first_params, title,
                               api_base_uri, base_uri, limit=1)[0]
            # Crude eviction: drop everything once the cache is too big.
            if len(self.inner_cache) > self.max_inner_cache_size:
                self.inner_cache = {}
            self.inner_cache[cache_key] = first

        # This handler returns more than only the best Memento.
        # A Link with rel="first memento" will also be returned to the client.
        return [first, memento]

    def _find_api_and_title(self, req_uri):
        """Scrape *req_uri* for the API endpoint and derive the page title.

        :return: ``(api_base_uri, title)`` with the title URL-unquoted.
        :raises HandlerError: 404 when either value cannot be determined.
        """
        dom = self.get_xml(req_uri, html=True)

        api_base_uri = None
        for link in dom.xpath("//link"):
            # Some <link> elements have no rel attribute; skip them.
            if link.get('rel', '').lower() == "edituri":
                api_base_uri = link.get('href', '').split("?")[0]
                if api_base_uri.startswith("//"):
                    # Protocol-relative URI: only the leading '//' is fixed.
                    api_base_uri = "http:" + api_base_uri

        # The title comes from ?title=... or else the last path segment.
        parsed_url = urlparse.urlparse(req_uri)
        try:
            title = urlparse.parse_qs(parsed_url[4])['title'][0]
        except (KeyError, IndexError):
            title = parsed_url.path.split('/')[-1]

        logging.debug(
            "Mediawiki handler: API found: %s, page title parsed to: %s " %
            (api_base_uri, title))
        if not title:
            raise HandlerError("Cannot find Title", 404)
        if not api_base_uri:
            raise HandlerError("Cannot find mediawiki API on page", 404)
        return api_base_uri, urllib2.unquote(title)

    def query(self, req_uri, req_params, title, api_base_uri, base_uri,
              limit=None):
        """Return the revisions of *title* as ``(uri, timestamp)`` tuples.

        Sequential requests follow the API's 'continue' tokens until the
        listing is complete, or until at least *limit* revisions have been
        collected. Can be used with increased rvlimit.

        :param req_uri: URI of the requested page (currently unused).
        :param req_params: Extra API parameters (rvlimit, rvstart, rvdir).
            The dict is not modified.
        :param title: Unquoted page title.
        :param api_base_uri: URI of the wiki's ``api.php``.
        :param base_uri: URI of the wiki's ``index.php``, used to build the
            revision URIs.
        :param limit: (Optional) Stop following continuations once this many
            revisions have been collected. None means fetch everything.
        :return: list of ``(revision_uri, timestamp_string)`` tuples.
        :raises HandlerError: when the API answer is not JSON, reports an
            error, reports the page as missing/invalid, or has no revision.
        """
        params = {
            'action': 'query',
            'format': 'json',
            'prop': 'revisions',
            'rvprop': 'ids|timestamp',
            'indexpageids': '',
            'titles': title
        }
        params.update(req_params)

        # Does sequential queries to get revisions IDs and Timestamps
        queries_results = []
        while True:
            req = self.request(api_base_uri, params=params)
            try:
                result = req.json()
            except Exception:
                logging.error("No JSON can be decoded from API %s" %
                              api_base_uri)
                raise HandlerError("No API answer.", 404)
            if 'error' in result:
                raise HandlerError(result['error'])

            try:
                # the JSON key of the page (only one)
                pid = result['query']['pageids'][0]
                page = result['query']['pages'][pid]
                if 'missing' in page or 'invalid' in page:
                    raise HandlerError(
                        "Cannot find resource on version server.", 404)
                queries_results += page['revisions']
            except HandlerError:
                raise
            except Exception:
                # Nothing older than the requested date: retry once in the
                # other direction, on a copy so the caller's dict is intact.
                if req_params.get('rvdir') == 'older':
                    return self.query(
                        req_uri, dict(req_params, rvdir='newer'), title,
                        api_base_uri, base_uri, limit)
                raise HandlerError("No revision returned from API.", 404)

            enough = limit is not None and len(queries_results) >= limit
            if 'continue' not in result or enough:
                break
            # The response was truncated: carry the values of the 'continue'
            # section into the next request so it resumes where this ended.
            params.update(result['continue'])

        # Processing list
        quoted_title = urllib2.quote(title)
        return [
            (base_uri + '?title=%s&oldid=%d' % (quoted_title, rev['revid']),
             rev['timestamp'])
            for rev in queries_results
        ]

    def get_xml(self, uri, html=False):
        """Retrieve the resource using the url.

        It parses response as XML or HTML and returns the parsed DOM object.

        :param uri: [str] The uri to retrieve.
        :param html: [bool] Optional flag to parse the response as HTML.
        :return: [lxml_obj] Parsed DOM.
        :raises HandlerError: 404 when the request or the parsing fails.
        """
        try:
            page = self.request(uri)
        except HandlerError as he:
            raise HandlerError(he, status=404)

        try:
            if html:
                parser = etree.HTMLParser(recover=True)
            else:
                parser = etree.XMLParser(recover=True)
            return etree.parse(StringIO.StringIO(page.content), parser)
        except Exception:
            logging.error("Cannot parse XML/HTML from %s" % uri)
            raise HandlerError("Couldn't parse data from %s" % uri, 404)
10 | 11 | from __future__ import absolute_import, print_function 12 | 13 | import re 14 | import time 15 | 16 | import requests 17 | 18 | from timegate.errors import HandlerError 19 | from timegate.handler import Handler 20 | 21 | ACCEPTABLE_RESOURCE = ( 22 | "Acceptable resources URI: repositories (github.com/:user/:repo), " 23 | "folders (github.com/:user/:repo/tree/:branch/:path), " 24 | "files (github.com/:user/:repo/blob/:branch/:path) " 25 | "and raw files (raw.githubusercontent.com/:user/:repo/:branch/:path)" 26 | ) 27 | 28 | 29 | class GitHubHandler(Handler): 30 | 31 | def __init__(self): 32 | Handler.__init__(self) 33 | # Mandatory fields 34 | self.resources = ['https://github.com/.+', 35 | 'https://raw.githubusercontent.com/'] 36 | 37 | # Local fields 38 | self.api = 'https://api.github.com' 39 | 40 | # Precompiles regular expressions 41 | self.rex = re.compile(""" # The format of URI-Rs 42 | (https://) # protocol 43 | ((?:raw.githubusercontent|github).com/) # base 44 | ([^/]+)/ # user 45 | ([^/]+) # repo 46 | (/.*)? # optional path 47 | """, re.X) # verbosed: ignore whitespaces and \n 48 | self.header_rex = re.compile( 49 | '<(.+?)>; rel="next"') # The regex for the query continuation header 50 | self.file_rex = re.compile('(/blob)?/master') # The regex for files 51 | 52 | def get_all_mementos(self, uri): 53 | MAX_TIME = 120 # seconds 54 | 55 | if uri.startswith('http://'): 56 | uri = uri.replace('http://', 'https://', 1) 57 | 58 | # URI deconstruction 59 | match = self.rex.match(uri) 60 | if not bool(match): 61 | raise HandlerError("Github uri does not match a valid resource. 
\n" 62 | + ACCEPTABLE_RESOURCE, 404) 63 | protocol = match.groups()[0] 64 | base = match.groups()[1] 65 | user = match.groups()[2] 66 | repo = match.groups()[3] 67 | req_path = match.groups()[4] 68 | 69 | path = '' 70 | branch = '' 71 | # Processes one result to (memento, datetime) pair 72 | mapper = None 73 | 74 | # Defining Resource type and response handling 75 | # Creates one function for a specific type to map the results to 76 | # memento pairs. 77 | if base == 'github.com/': 78 | # Resource is a repository 79 | if not req_path or req_path == '/': 80 | if req_path: 81 | path = '/' 82 | 83 | def make_pair(commit): 84 | return (commit['html_url'].replace('commit', 'tree'), 85 | commit['commit']['committer']['date']) 86 | mapper = make_pair 87 | 88 | # Resource is a file 89 | elif req_path.startswith('/blob/'): 90 | path = req_path.replace('/blob/', '', 1) 91 | branch_index = path.find('/') 92 | branch = path[:branch_index] 93 | path = path[branch_index:] 94 | if branch == '' or path == '' or path.endswith('/'): 95 | raise HandlerError( 96 | "Not found. Empty path for file in repository", 404) 97 | 98 | def make_pair(commit): 99 | # HTML Resource 100 | memento_path = '/blob/%s%s' % (commit['sha'], path) 101 | uri_m = '%s%s%s/%s%s' % ( 102 | protocol, base, user, repo, memento_path) 103 | return (uri_m, commit['commit']['committer']['date']) 104 | mapper = make_pair 105 | 106 | # Resource is a directory 107 | elif req_path.startswith('/tree/'): 108 | path = req_path.replace('/tree/', '', 1) 109 | branch_index = path.find('/') 110 | if branch_index < 0: 111 | branch_index = len(path) 112 | branch = path[:branch_index] 113 | path = path[branch_index:] 114 | if branch == '': 115 | raise HandlerError("Not found. 
Empty branch path", 404) 116 | 117 | def make_pair(commit): 118 | return ( 119 | commit['html_url'].replace( 120 | 'commit', 121 | 'tree') + path, 122 | commit['commit']['committer']['date']) 123 | mapper = make_pair 124 | 125 | # Resource is a raw file 126 | elif base == 'raw.githubusercontent.com/' and req_path is not None: 127 | path = req_path.replace('/', '', 1) 128 | branch_index = path.find('/') 129 | branch = path[:branch_index] 130 | path = path[branch_index:] 131 | # must be done because API does not make any difference between 132 | # path or files 133 | is_online = bool(requests.head(uri)) 134 | if path == '' or path.endswith('/') or not is_online: 135 | raise HandlerError( 136 | "'%s' not found: Raw resource must be a file." % path, 404) 137 | 138 | def make_pair(commit): 139 | memento_path = '/%s%s' % (commit['sha'], path) 140 | uri_m = '%s%s%s/%s%s' % (protocol, base, 141 | user, repo, memento_path) 142 | return (uri_m, commit['commit']['committer']['date']) 143 | mapper = make_pair 144 | 145 | if mapper is None: 146 | # The resource is not accepcted. 147 | raise HandlerError( 148 | "GitHub resource type not found." + ACCEPTABLE_RESOURCE, 404) 149 | 150 | # Initiating request variables 151 | apibase = '%s/repos/%s/%s/commits' % (self.api, user, repo) 152 | params = { 153 | 'per_page': 100, # Max allowed is 100 154 | 'path': str(path), 155 | 'sha': str(branch) 156 | } 157 | aut_pair = ('MementoTimegate', 'LANLTimeGate14') 158 | cont = apibase # The first continue is the beginning 159 | 160 | # Does sequential queries to get all commits of the particular resource 161 | queries_results = [] 162 | tmax = int(time.time()) + MAX_TIME 163 | while cont is not None: 164 | if int(time.time()) > tmax: 165 | raise HandlerError( 166 | "Resource too big to be served. 
GitHub Handler TimeOut (timeout: %d seconds)" % 167 | MAX_TIME, 502) 168 | req = self.request(cont, params=params, auth=aut_pair) 169 | cont = None 170 | if not req: 171 | # status code different than 2XX 172 | raise HandlerError( 173 | "Cannot find resource on version server. API response %d'd " % 174 | req.status_code, 404) 175 | result = req.json() 176 | if 'message' in result: 177 | # API-specific error 178 | raise HandlerError(result['message']) 179 | if 'errors' in result: 180 | # API-specific error 181 | raise HandlerError(result['errors']) 182 | if len(result) > 0: 183 | # The request was successful 184 | queries_results += result 185 | # Search for possible continue 186 | if 'link' in req.headers: 187 | link_header = req.headers['link'] 188 | headermatch = self.header_rex.search(link_header) 189 | if bool(headermatch): 190 | # The response was truncated, the rest can be obtained using 191 | # the given "next" link 192 | cont = headermatch.groups()[0] 193 | 194 | if queries_results: 195 | # Processes results based on resource type 196 | return map(mapper, queries_results) 197 | else: 198 | # No results found 199 | raise HandlerError( 200 | "Resource not found, empty response from API", 404) 201 | -------------------------------------------------------------------------------- /timegate/examples/gitlab.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2014, 2015 LANL. 5 | # Copyright (C) 2016 CERN. 6 | # 7 | # TimeGate is free software; you can redistribute it and/or modify 8 | # it under the terms of the Revised BSD License; see LICENSE file for 9 | # more details. 
10 | 11 | from __future__ import absolute_import, print_function 12 | 13 | import re 14 | import time 15 | 16 | import requests 17 | 18 | from timegate.errors import HandlerError 19 | from timegate.handler import Handler 20 | 21 | ACCEPTABLE_RESOURCE = ( 22 | "Acceptable resources URI: repositories (/:user/:repo), " 23 | "folders (/:user/:repo/tree/:branch/:path), " 24 | "files (/:user/:repo/blob/:branch/:path) " 25 | "and raw files (/:user/:repo/raw/:branch/:path)" 26 | ) 27 | 28 | # TODO: wiki pages (e.g. https://gitlab.example.com/auser/aproject/wikis/home) 29 | 30 | 31 | class GitLabHandler(Handler): 32 | 33 | def __init__(self): 34 | Handler.__init__(self) 35 | # Mandatory fields 36 | # TODO: move to config file 37 | self.resources = ['https://gitlab.ub.uni-bielefeld.de/.+'] 38 | 39 | # Local fields 40 | self.api = 'https://gitlab.ub.uni-bielefeld.de/api/v3' # TODO: move to config file 41 | self.apikey = 'VqeqaShAw4GWVc3dp7--' # TODO: move to config file 42 | 43 | # Precompiles regular expressions ## TODO: generalize for URLs with 44 | # numeric project ID instead of user/repo!!! 45 | self.rex = re.compile(""" # The format of URI-Rs 46 | (https://) # protocol 47 | ([^/]+)/ # base 48 | ([^/]+)/ # user 49 | ([^/]+) # repo 50 | (/.*)? # optional path 51 | """, re.X) # verbosed: ignore whitespaces and \n 52 | self.header_rex = re.compile( 53 | '<(.+?)>; rel="next"') # The regex for the query continuation header 54 | self.file_rex = re.compile('(/blob)?/master') # The regex for files 55 | 56 | def get_all_mementos(self, uri): 57 | MAX_TIME = 120 # seconds 58 | 59 | # URI deconstruction 60 | match = self.rex.match(uri) 61 | if not bool(match): 62 | raise HandlerError("Github uri does not match a valid resource. 
\n" 63 | + ACCEPTABLE_RESOURCE, 404) 64 | protocol = match.groups()[0] 65 | base = match.groups()[1] 66 | user = match.groups()[2] 67 | repo = match.groups()[3] 68 | req_path = match.groups()[4] 69 | 70 | path = '' 71 | branch = '' 72 | # Processes one result to (memento, datetime) pair 73 | mapper = None 74 | 75 | # Defining Resource type and response handling 76 | # Creates one function for a specific type to map the results to 77 | # memento pairs. 78 | if 1: 79 | # Resource is a repository 80 | if not req_path or req_path == '/': 81 | if req_path: 82 | path = '/' 83 | 84 | def make_pair(commit): 85 | memento_path = '/commit/%s' % commit['id'] 86 | uri_m = '%s%s/%s/%s%s' % ( 87 | protocol, base, user, repo, memento_path) 88 | return (uri_m, commit['created_at']) 89 | mapper = make_pair 90 | 91 | # Resource is a file 92 | elif req_path.startswith('/blob/'): 93 | path = req_path.replace('/blob/', '', 1) 94 | branch_index = path.find('/') 95 | branch = path[:branch_index] 96 | path = path[branch_index:] 97 | if branch == '' or path == '' or path.endswith('/'): 98 | raise HandlerError( 99 | "Not found. Empty path for file in repository", 404) 100 | 101 | def make_pair(commit): 102 | # HTML Resource 103 | memento_path = '/blob/%s%s' % (commit['id'], path) 104 | uri_m = '%s%s/%s/%s%s' % ( 105 | protocol, base, user, repo, memento_path) 106 | return (uri_m, commit['created_at']) 107 | mapper = make_pair 108 | 109 | # Resource is a raw file 110 | elif req_path.startswith('/raw/'): 111 | path = req_path.replace('/raw/', '', 1) 112 | branch_index = path.find('/') 113 | branch = path[:branch_index] 114 | path = path[branch_index:] 115 | is_online = bool(requests.head( 116 | uri, params={'private_token': self.apikey})) 117 | if path == '' or path.endswith('/') or not is_online: 118 | raise HandlerError( 119 | "'%s' not found: Raw resource must be a file." 
% 120 | path, 404) 121 | 122 | def make_pair(commit): 123 | # HTML Resource 124 | memento_path = '/raw/%s%s' % (commit['id'], path) 125 | uri_m = '%s%s/%s/%s%s' % ( 126 | protocol, base, user, repo, memento_path) 127 | return (uri_m, commit['created_at']) 128 | mapper = make_pair 129 | 130 | # Resource is a directory 131 | elif req_path.startswith('/tree/'): 132 | path = req_path.replace('/tree/', '', 1) 133 | branch_index = path.find('/') 134 | if branch_index < 0: 135 | branch_index = len(path) 136 | branch = path[:branch_index] 137 | path = path[branch_index:] 138 | if branch == '': 139 | raise HandlerError("Not found. Empty branch path", 404) 140 | 141 | def make_pair(commit): 142 | memento_path = '/commit/%s' % commit['id'] 143 | uri_m = '%s%s/%s/%s%s' % ( 144 | protocol, base, user, repo, memento_path) 145 | return (uri_m, commit['created_at']) 146 | mapper = make_pair 147 | 148 | # Resource is a wiki entry 149 | # e.g. 150 | # https://gitlab.example.com/opac/cdrom-opac/wikis/home --> 151 | # https://gitlab.example.com/opac/cdrom-opac/wikis/home?version_id=b4a9027e2948a5ce9ecd3a9c1641ed958b9f7728 152 | # API does not seem to support this: getting wrong commit IDs 153 | # elif req_path.startswith('/wikis/'): 154 | # def make_pair(commit): 155 | # # HTML Resource 156 | # memento_path = '%s?version_id=%s' % (req_path, commit['id']) 157 | # uri_m = '%s%s/%s/%s%s' % ( 158 | # protocol, base, user, repo, memento_path) 159 | # return (uri_m, commit['created_at']) 160 | # mapper = make_pair 161 | 162 | if mapper is None: 163 | # The resource is not accepcted. 164 | raise HandlerError( 165 | "GitLab resource type not found." + ACCEPTABLE_RESOURCE, 404) 166 | 167 | # Initiating request variables 168 | # It appears that user/repo can be used instead of a numeric project 169 | # ID. %2f is a urlencoded slash (/). 
170 | apibase = '%s/projects/%s/repository/commits' % ( 171 | self.api, user + '%2f' + repo) 172 | params = { 173 | 'per_page': 100, # Max allowed is 100 174 | 'path': str(path), 175 | 'branches': str(branch), 176 | 'private_token': self.apikey 177 | } 178 | aut_pair = ('MementoTimegate', 'LANLTimeGate14') 179 | cont = apibase # The first continue is the beginning 180 | 181 | # Does sequential queries to get all commits of the particular resource 182 | queries_results = [] 183 | tmax = int(time.time()) + MAX_TIME 184 | while cont is not None: 185 | if int(time.time()) > tmax: 186 | raise HandlerError( 187 | "Resource too big to be served. GitLab Handler TimeOut (timeout: %d seconds)" % 188 | MAX_TIME, 502) 189 | req = self.request(cont, params=params, auth=aut_pair) 190 | cont = None 191 | if not req: 192 | # status code different than 2XX 193 | raise HandlerError( 194 | "Cannot find resource on version server. API response %d'd " % 195 | req.status_code, 404) 196 | result = req.json() 197 | if 'message' in result: 198 | # API-specific error 199 | raise HandlerError(result['message']) 200 | if 'errors' in result: 201 | # API-specific error 202 | raise HandlerError(result['errors']) 203 | if len(result) > 0: 204 | # The request was successful 205 | queries_results += result 206 | # Search for possible continue 207 | if 'link' in req.headers: 208 | link_header = req.headers['link'] 209 | headermatch = self.header_rex.search(link_header) 210 | if bool(headermatch): 211 | # The response was truncated, the rest can be obtained using 212 | # the given "next" link 213 | cont = headermatch.groups()[0] 214 | 215 | if queries_results: 216 | # Processes results based on resource type 217 | return map(mapper, queries_results) 218 | else: 219 | # No results found 220 | raise HandlerError( 221 | "Resource not found, empty response from API", 404) 222 | -------------------------------------------------------------------------------- /docs/conf.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2016 CERN. 5 | # 6 | # TimeGate is free software; you can redistribute it and/or modify 7 | # it under the terms of the Revised BSD License; see LICENSE file for 8 | # more details. 9 | 10 | from __future__ import print_function 11 | 12 | import os 13 | 14 | import sphinx.environment 15 | from docutils.utils import get_source_line 16 | 17 | # -- General configuration ------------------------------------------------ 18 | 19 | # If your documentation needs a minimal Sphinx version, state it here. 20 | #needs_sphinx = '1.0' 21 | 22 | # Do not warn on external images. 23 | suppress_warnings = ['image.nonlocal_uri'] 24 | 25 | # Add any Sphinx extension module names here, as strings. They can be 26 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 27 | # ones. 28 | extensions = [ 29 | 'sphinx.ext.autodoc', 30 | 'sphinx.ext.coverage', 31 | 'sphinx.ext.doctest', 32 | 'sphinx.ext.intersphinx', 33 | 'sphinx.ext.viewcode', 34 | ] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # The suffix(es) of source filenames. 40 | # You can specify multiple suffix as a list of string: 41 | # source_suffix = ['.rst', '.md'] 42 | source_suffix = '.rst' 43 | 44 | # The encoding of source files. 45 | #source_encoding = 'utf-8-sig' 46 | 47 | # The master toctree document. 48 | master_doc = 'index' 49 | 50 | # General information about the project. 51 | project = u'TimeGate' 52 | copyright = u'2016, CERN' 53 | author = u'LANL' 54 | 55 | # The version info for the project you're documenting, acts as replacement for 56 | # |version| and |release|, also used in various other places throughout the 57 | # built documents. 58 | # 59 | # The short X.Y version. 60 | 61 | # Get the version string. Cannot be done with import! 
62 | g = {} 63 | with open(os.path.join('..', 'timegate', 'version.py'), 'rt') as fp: 64 | exec(fp.read(), g) 65 | version = g['__version__'] 66 | 67 | # The full version, including alpha/beta/rc tags. 68 | release = version 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | # 73 | # This is also used if you do content translation via gettext catalogs. 74 | # Usually you set "language" from the command line for these cases. 75 | language = None 76 | 77 | # There are two options for replacing |today|: either, you set today to some 78 | # non-false value, then it is used: 79 | #today = '' 80 | # Else, today_fmt is used as the format for a strftime call. 81 | #today_fmt = '%B %d, %Y' 82 | 83 | # List of patterns, relative to source directory, that match files and 84 | # directories to ignore when looking for source files. 85 | exclude_patterns = [] 86 | 87 | # The reST default role (used for this markup: `text`) to use for all 88 | # documents. 89 | #default_role = None 90 | 91 | # If true, '()' will be appended to :func: etc. cross-reference text. 92 | #add_function_parentheses = True 93 | 94 | # If true, the current module name will be prepended to all description 95 | # unit titles (such as .. function::). 96 | #add_module_names = True 97 | 98 | # If true, sectionauthor and moduleauthor directives will be shown in the 99 | # output. They are ignored by default. 100 | #show_authors = False 101 | 102 | # The name of the Pygments (syntax highlighting) style to use. 103 | pygments_style = 'sphinx' 104 | 105 | # A list of ignored prefixes for module index sorting. 106 | #modindex_common_prefix = [] 107 | 108 | # If true, keep warnings as "system message" paragraphs in the built documents. 109 | #keep_warnings = False 110 | 111 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
112 | todo_include_todos = False 113 | 114 | 115 | # -- Options for HTML output ---------------------------------------------- 116 | html_theme = 'alabaster' 117 | 118 | html_theme_options = { 119 | 'description': 'A Memento TimeGate', 120 | 'github_user': 'mementoweb', 121 | 'github_repo': 'timegate', 122 | 'github_button': False, 123 | 'github_banner': True, 124 | 'show_powered_by': False, 125 | 'extra_nav_links': { 126 | 'timegate@GitHub': 'http://github.com/mementoweb/timegate', 127 | 'timegate@PyPI': 'http://pypi.python.org/pypi/timegate/', 128 | } 129 | } 130 | 131 | # The theme to use for HTML and HTML Help pages. See the documentation for 132 | # a list of builtin themes. 133 | 134 | # Theme options are theme-specific and customize the look and feel of a theme 135 | # further. For a list of options available for each theme, see the 136 | # documentation. 137 | #html_theme_options = {} 138 | 139 | # Add any paths that contain custom themes here, relative to this directory. 140 | #html_theme_path = [] 141 | 142 | # The name for this set of Sphinx documents. If None, it defaults to 143 | # " v documentation". 144 | #html_title = None 145 | 146 | # A shorter title for the navigation bar. Default is the same as html_title. 147 | #html_short_title = None 148 | 149 | # The name of an image file (relative to this directory) to place at the top 150 | # of the sidebar. 151 | #html_logo = None 152 | 153 | # The name of an image file (within the static path) to use as favicon of the 154 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 155 | # pixels large. 156 | #html_favicon = None 157 | 158 | # Add any paths that contain custom static files (such as style sheets) here, 159 | # relative to this directory. They are copied after the builtin static files, 160 | # so a file named "default.css" will overwrite the builtin "default.css". 
161 | #html_static_path = ['_static'] 162 | 163 | # Add any extra paths that contain custom files (such as robots.txt or 164 | # .htaccess) here, relative to this directory. These files are copied 165 | # directly to the root of the documentation. 166 | #html_extra_path = [] 167 | 168 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 169 | # using the given strftime format. 170 | #html_last_updated_fmt = '%b %d, %Y' 171 | 172 | # If true, SmartyPants will be used to convert quotes and dashes to 173 | # typographically correct entities. 174 | #html_use_smartypants = True 175 | 176 | # Custom sidebar templates, maps document names to template names. 177 | html_sidebars = { 178 | '**': [ 179 | 'about.html', 180 | 'navigation.html', 181 | 'relations.html', 182 | 'searchbox.html', 183 | 'donate.html', 184 | ] 185 | } 186 | 187 | # Additional templates that should be rendered to pages, maps page names to 188 | # template names. 189 | #html_additional_pages = {} 190 | 191 | # If false, no module index is generated. 192 | #html_domain_indices = True 193 | 194 | # If false, no index is generated. 195 | #html_use_index = True 196 | 197 | # If true, the index is split into individual pages for each letter. 198 | #html_split_index = False 199 | 200 | # If true, links to the reST sources are added to the pages. 201 | #html_show_sourcelink = True 202 | 203 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 204 | #html_show_sphinx = True 205 | 206 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 207 | #html_show_copyright = True 208 | 209 | # If true, an OpenSearch description file will be output, and all pages will 210 | # contain a tag referring to it. The value of this option must be the 211 | # base URL from which the finished HTML is served. 212 | #html_use_opensearch = '' 213 | 214 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 
215 | #html_file_suffix = None 216 | 217 | # Language to be used for generating the HTML full-text search index. 218 | # Sphinx supports the following languages: 219 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 220 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 221 | #html_search_language = 'en' 222 | 223 | # A dictionary with options for the search language support, empty by default. 224 | # Now only 'ja' uses this config value 225 | #html_search_options = {'type': 'default'} 226 | 227 | # The name of a javascript file (relative to the configuration directory) that 228 | # implements a search results scorer. If empty, the default will be used. 229 | #html_search_scorer = 'scorer.js' 230 | 231 | # Output file base name for HTML help builder. 232 | htmlhelp_basename = 'timegate_namedoc' 233 | 234 | # -- Options for LaTeX output --------------------------------------------- 235 | 236 | latex_elements = { 237 | # The paper size ('letterpaper' or 'a4paper'). 238 | #'papersize': 'letterpaper', 239 | 240 | # The font size ('10pt', '11pt' or '12pt'). 241 | #'pointsize': '10pt', 242 | 243 | # Additional stuff for the LaTeX preamble. 244 | #'preamble': '', 245 | 246 | # Latex figure (float) alignment 247 | #'figure_align': 'htbp', 248 | } 249 | 250 | # Grouping the document tree into LaTeX files. List of tuples 251 | # (source start file, target name, title, 252 | # author, documentclass [howto, manual, or own class]). 253 | latex_documents = [ 254 | (master_doc, 'timegate.tex', u'timegate Documentation', 255 | u'LANL', 'manual'), 256 | ] 257 | 258 | # The name of an image file (relative to this directory) to place at the top of 259 | # the title page. 260 | #latex_logo = None 261 | 262 | # For "manual" documents, if this is true, then toplevel headings are parts, 263 | # not chapters. 264 | #latex_use_parts = False 265 | 266 | # If true, show page references after internal links. 
267 | #latex_show_pagerefs = False 268 | 269 | # If true, show URL addresses after external links. 270 | #latex_show_urls = False 271 | 272 | # Documents to append as an appendix to all manuals. 273 | #latex_appendices = [] 274 | 275 | # If false, no module index is generated. 276 | #latex_domain_indices = True 277 | 278 | 279 | # -- Options for manual page output --------------------------------------- 280 | 281 | # One entry per manual page. List of tuples 282 | # (source start file, name, description, authors, manual section). 283 | man_pages = [ 284 | (master_doc, 'timegate', u'timegate Documentation', 285 | [author], 1) 286 | ] 287 | 288 | # If true, show URL addresses after external links. 289 | #man_show_urls = False 290 | 291 | 292 | # -- Options for Texinfo output ------------------------------------------- 293 | 294 | # Grouping the document tree into Texinfo files. List of tuples 295 | # (source start file, target name, title, author, 296 | # dir menu entry, description, category) 297 | texinfo_documents = [ 298 | (master_doc, 'timegate', u'TimeGate Documentation', 299 | author, 'timegate', 'A Memento TimeGate', 300 | 'Miscellaneous'), 301 | ] 302 | 303 | # Documents to append as an appendix to all manuals. 304 | #texinfo_appendices = [] 305 | 306 | # If false, no module index is generated. 307 | #texinfo_domain_indices = True 308 | 309 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 310 | #texinfo_show_urls = 'footnote' 311 | 312 | # If true, do not generate a @detailmenu in the "Top" node's menu. 313 | #texinfo_no_detailmenu = False 314 | 315 | 316 | # Example configuration for intersphinx: refer to the Python standard library. 
317 | intersphinx_mapping = {'https://docs.python.org/': None} 318 | -------------------------------------------------------------------------------- /timegate/application.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of TimeGate. 4 | # Copyright (C) 2014, 2015, 2016 LANL. 5 | # Copyright (C) 2016 CERN. 6 | # 7 | # TimeGate is free software; you can redistribute it and/or modify 8 | # it under the terms of the Revised BSD License; see LICENSE file for 9 | # more details. 10 | 11 | """Implementation of the TimeGate server.""" 12 | 13 | from __future__ import absolute_import, print_function 14 | 15 | import json 16 | import logging 17 | import os 18 | from datetime import datetime 19 | 20 | from pkg_resources import iter_entry_points 21 | 22 | from dateutil.tz import tzutc 23 | from link_header import Link, LinkHeader 24 | from werkzeug.exceptions import HTTPException, abort 25 | from werkzeug.http import http_date, parse_date 26 | from werkzeug.local import Local, LocalManager 27 | from werkzeug.routing import BaseConverter, Map, Rule, ValidationError 28 | from werkzeug.utils import cached_property, import_string 29 | from werkzeug.wrappers import Request, Response 30 | 31 | from . 
import constants  # NOTE: completes the ``from .`` that ends the preceding line
from .cache import Cache
from .config import Config
from .handler import Handler, parsed_request
from .utils import best

local = Local()
"""Thread safe local data storage."""

local_manager = LocalManager([local])
"""Manager for local data storage."""

request = local('request')
"""Proxy to request object."""

# logging.getLogger(__name__)
# logging.basicConfig(level=logging.DEBUG)


def url_for(*args, **kwargs):
    """Build a URL with the URL map adapter bound to the current request.

    All arguments are forwarded unchanged to ``request.adapter.build``; the
    adapter is attached to the request by ``TimeGate.dispatch_request``.
    """
    return request.adapter.build(*args, **kwargs)


def load_handler(name_or_path):
    """Load handler from entry points or import string.

    :param name_or_path: A ``Handler`` instance (returned as is), the name
        of an entry point in the ``timegate.handlers`` group, or an import
        string pointing to a handler class.
    :return: A handler instance.
    :raises RuntimeError: If several entry points share the given name.
    """
    if isinstance(name_or_path, Handler):
        return name_or_path

    handlers = list(iter_entry_points('timegate.handlers', name=name_or_path))
    number_of_handlers = len(handlers)
    if number_of_handlers > 1:
        raise RuntimeError(
            'Multiple handlers with the same name "{0}" has been found'.format(
                name_or_path
            )
        )
    elif number_of_handlers == 1:
        # Entry point resolves to a class; instantiate it.
        return handlers[0].load()()
    else:
        # No entry point registered under that name: treat it as a dotted
        # import path to the handler class.
        return import_string(name_or_path)()


class URIConverter(BaseConverter):
    """URL converter matching a full URI, optionally relative to a base URI."""

    def __init__(self, url_map, base_uri=None):
        """Store the optional base URI and install the URI regex.

        :param url_map: The URL map this converter belongs to.
        :param base_uri: Optional prefix that is added to incoming values
            and stripped from generated ones.
        """
        super(URIConverter, self).__init__(url_map)
        self.base_uri = base_uri
        # [scheme:][//authority]path[?query][#fragment]
        self.regex = (
            r"([^:/?#]+:)?(//[^/?#]*)?"
            r"[^?#]*(\?[^#]*)?(#.*)?"
        )

    def to_python(self, value):
        """Return value with base URI prefix."""
        value = value.replace(' ', '%20')  # encode
        if self.base_uri and not value.startswith(self.base_uri):
            return self.base_uri + value
        return value

    def to_url(self, value):
        """Return value without base URI if it is defined."""
        value = value.replace('%20', ' ')  # decode
        if self.base_uri and value.startswith(self.base_uri):
            return value[len(self.base_uri):]
        return value


class TimeGate(object):
    """Implementation of Memento protocol with configurable handlers."""

    def __init__(self, config=None, cache=None):
        """Initialize application with handler.

        :param config: Optional mapping of settings overriding the defaults
            taken from the ``constants`` module.
        :param cache: Optional cache object. When omitted, a default cache
            is built if the ``CACHE_USE`` setting is true.
        """
        self.config = Config(None)
        self.config.from_object(constants)
        self.config.update(config or {})
        self.cache = None
        if cache:
            self.cache = cache
        elif self.config['CACHE_USE']:
            self._build_default_cache()

    @cached_property
    def handler(self):
        """Load (once) and sanity-check the configured handler.

        :raises NotImplementedError: If the handler provides neither
            ``get_memento`` nor ``get_all_mementos``.
        """
        handler = load_handler(self.config['HANDLER_MODULE'])
        has_timegate = hasattr(handler, 'get_memento')
        has_timemap = hasattr(handler, 'get_all_mementos')
        if self.config['USE_TIMEMAPS'] and (not has_timemap):
            # Misconfiguration is only logged; the handler may still be
            # usable through ``get_memento``.
            logging.error(
                "Handler has no get_all_mementos() function "
                "but is suppose to serve timemaps.")

        if not (has_timegate or has_timemap):
            raise NotImplementedError(
                "NotImplementedError: Handler has neither `get_memento` "
                "nor `get_all_mementos` method.")
        return handler

    @cached_property
    def url_map(self):
        """Build URL map.

        Two routes are registered: the ``timegate`` endpoint and the
        ``timemap`` endpoint (link or JSON flavour); both accept ``GET`` and
        ``HEAD`` and capture the original resource as ``uri_r``.
        """
        base_uri = self.config['BASE_URI']
        # NOTE(review): the ``<converter:variable>`` placeholders below were
        # missing from the reviewed text (leaving ``.format(base_uri)`` with
        # nothing to fill and the endpoints without their ``uri_r`` /
        # ``response_type`` arguments). They are reconstructed from the
        # ``URIConverter`` signature and the endpoint parameters -- confirm
        # against the upstream file.
        rules = [
            Rule('/timegate/<uri(base_uri="{0}"):uri_r>'.format(base_uri),
                 endpoint='timegate', methods=['GET', 'HEAD']),
            Rule('/timemap/<any(link, json):response_type>/'
                 '<uri(base_uri="{0}"):uri_r>'.format(base_uri),
                 endpoint='timemap', methods=['GET', 'HEAD']),
        ]
        return Map(rules, converters={'uri': URIConverter})

    def _build_default_cache(self):
        """Build default cache object."""
        self.cache = Cache(
            self.config['CACHE_FILE'],
            self.config['CACHE_TOLERANCE'],
            self.config['CACHE_EXP'],
            self.config['CACHE_MAX_VALUES'],
        )

    def __repr__(self):
        """Representation of this class."""
        return '<{0} {1}>'.format(
            self.__class__.__name__, self.handler.__class__.__name__
        )

    def dispatch_request(self, request):
        """Choose correct method.

        Binds the URL map to the request environment, stores the resulting
        adapter on the request (``url_for`` reads it from there) and calls
        the method named after the matched endpoint.

        :param request: The current ``Request`` object.
        :return: The endpoint's response, or the ``HTTPException`` raised
            while matching or handling (it is itself a WSGI application).
        """
        request.adapter = adapter = self.url_map.bind_to_environ(
            request.environ
        )
        try:
            endpoint, values = adapter.match()
            return getattr(self, endpoint)(**values)
        except HTTPException as e:
            return e
        finally:
            # NOTE(review): this resets an attribute on the application, not
            # ``request.adapter`` set above -- confirm which was intended.
            self.adapter = None

    def wsgi_app(self, environ, start_response):
        """Create the request, publish it in ``local`` and dispatch it."""
        local.request = request = Request(environ)
        response = self.dispatch_request(request)
        return response(environ, start_response)

    def __call__(self, environ, start_response):
        """Handle a request."""
        return self.wsgi_app(environ, start_response)

    def get_memento(self, uri_r, accept_datetime):
        """Ask the handler for the memento of an original resource.

        :param uri_r: The original resource to look for.
        :param accept_datetime: Datetime object with requested time.
        :return: Whatever ``parsed_request`` returns for the handler's
            ``get_memento`` call.
        """
        return parsed_request(self.handler.get_memento,
                              uri_r, accept_datetime)

    def get_all_mementos(self, uri_r):
        """Uses the handler to retrieve a TimeMap for an original resource.

        The value is cached if the cache is activated. The cache lookup is
        skipped when the client sent ``Cache-Control: no-cache``; the fresh
        value is still written back to the cache.

        :param uri_r: The URI to retrieve and cache the TimeMap of.
        :return: The retrieved value.
        """
        mementos = None
        # ``request.cache_control`` is a parsed object, not a string, so the
        # directive has to be read through its ``no_cache`` attribute.
        if self.cache and not request.cache_control.no_cache:
            mementos = self.cache.get_all(uri_r)
        if mementos is None:
            mementos = parsed_request(self.handler.get_all_mementos, uri_r)
            if self.cache:
                self.cache.set(uri_r, mementos)
        return mementos

    def timegate(self, uri_r):
        """Handle timegate high-level logic.

        Fetch the Memento for the requested URI at the requested date time.
        Returns a HTTP 302 response if it exists. If the resource handler
        allows batch requests, then the result may be cached.

        :param uri_r: The requested original resource URI.
        :return: The redirect ``Response`` built by ``memento_response``.
        :raises werkzeug.exceptions.BadRequest: If the ``Accept-Datetime``
            header cannot be parsed.
        """
        if 'Accept-Datetime' in request.headers:
            accept_datetime = parse_date(request.headers['Accept-Datetime'])
            if accept_datetime is None:
                # ``parse_date`` returns None for an unparsable value;
                # answer 400 instead of failing on ``None.replace``.
                abort(400)
            accept_datetime = accept_datetime.replace(tzinfo=tzutc())
        else:
            accept_datetime = datetime.utcnow().replace(tzinfo=tzutc())

        # Runs the handler's API request for the Memento
        mementos = first = last = None
        has_timemap = hasattr(self.handler, 'get_all_mementos')
        if has_timemap and self.config['USE_TIMEMAPS']:
            logging.debug('Using multiple-request mode.')
            mementos = self.get_all_mementos(uri_r)

        if mementos:
            # Several Mementos are known: remember the boundaries and let
            # ``best`` select the one to redirect to.
            first = mementos[0]
            last = mementos[-1]
            memento = best(mementos, accept_datetime,
                           self.config['RESOURCE_TYPE'])
        else:
            logging.debug('Using single-request mode.')
            memento = self.get_memento(uri_r, accept_datetime)

        return memento_response(
            memento,
            uri_r,
            first,
            last,
            has_timemap=has_timemap and self.config['USE_TIMEMAPS'],
        )

    def timemap(self, uri_r, response_type='link'):
        """Handle TimeMap high-level logic.

        It fetches all Mementos for an Original Resource and builds the TimeMap
        response. Returns a HTTP 200 response if it exists with the timemap in
        the message body.

        :param uri_r: The requested original resource URI.
        :param response_type: ``'json'`` for a JSON TimeMap; any other value
            yields the link-format TimeMap.
        :return: The ``Response`` object.
        :raises werkzeug.exceptions.Forbidden: If TimeMaps are disabled by
            the ``USE_TIMEMAPS`` setting.
        """
        if not self.config['USE_TIMEMAPS']:
            abort(403)

        mementos = self.get_all_mementos(uri_r)
        # Generates the TimeMap response body and Headers
        if response_type == 'json':
            return timemap_json_response(self, mementos, uri_r)
        else:
            return timemap_link_response(self, mementos, uri_r)


@local_manager.middleware
def application(environ, start_response):
    """WSGI application object.

    This is the start point of the TimeGate server. A ``TimeGate`` instance
    is created for the call and configured from ``conf/config.ini`` located
    next to this module.

    :param environ: Dictionary containing environment variables from
        the client request.
    :param start_response: Callback function used to send HTTP status
        and headers to the server.
    :return: The WSGI response iterable.
    """
    app = TimeGate()
    app.config.from_inifile(
        os.path.join(os.path.dirname(__file__), 'conf', 'config.ini')
    )
    return app(environ, start_response)


def memento_response(
        memento,
        uri_r,
        first=None,
        last=None,
        has_timemap=False):
    """Return a 302 redirection to the best Memento for a resource.

    It includes necessary headers including datetime requested by the user.

    :param memento: (The URI string, dt obj) of the best memento.
    :param uri_r: The original resource's complete URI.
    :param first: (Optional) (URI string, dt obj) of the first memento.
    :param last: (Optional) (URI string, dt obj) of the last memento.
    :param has_timemap: Flag indicating that the handler accepts
        TimeMap requests too. Default False.
    :return: The ``Response`` object.
    """
    # Gather links containing original and if available: TimeMap, first, last
    # TimeGate link not allowed here
    links = [Link(uri_r, rel='original')]
    if has_timemap:
        for response_type, mime in (('link', 'application/link-format'),
                                    ('json', 'application/json'), ):
            links.append(Link(
                url_for('timemap', dict(
                    response_type=response_type, uri_r=uri_r
                ), force_external=True),
                rel='timemap', type=mime
            ))

    (uri_m, dt_m) = memento
    (uri_last, dt_last) = (uri_first, dt_first) = (None, None)
    if last:
        (uri_last, dt_last) = last
    if first:
        (uri_first, dt_first) = first
    if first and last and uri_first == uri_last:
        # There's only one memento (first = best = last)
        assert(uri_last == uri_m)
        links.append(Link(uri_m, rel='first last memento',
                          datetime=http_date(dt_m)))
    else:
        if first:
            # The link must point at the first memento itself, not at the
            # selected one.
            links.append(Link(uri_first, rel='first memento',
                              datetime=http_date(dt_first)))
        if (uri_first != uri_m and uri_last != uri_m):
            # The best memento is neither the first nor the last
            links.append(Link(uri_m, rel='memento',
                              datetime=http_date(dt_m)))
        if last:
            # Same as above for the last memento.
            links.append(Link(uri_last, rel='last memento',
                              datetime=http_date(dt_last)))

    # Builds the response headers
    headers = [
        ('Date', http_date(datetime.utcnow())),
        ('Vary', 'accept-datetime'),
        ('Content-Length', '0'),
        ('Content-Type', 'text/plain; charset=UTF-8'),
        ('Connection', 'close'),
        ('Location', uri_m),
        ('Link', str(LinkHeader(links))),
    ]
    return Response(None, headers=headers, status=302)


def timemap_link_response(app, mementos, uri_r):
    """Return a 200 TimeMap response.

    :param app: The ``TimeGate`` application (not used by this function).
    :param mementos: A sorted (ascending by date) list of (uri_str,
        datetime_obj) tuples representing a TimeMap.
    :param uri_r: The URI-R of the original resource.
    :return: The ``Response`` object.
    :raises werkzeug.exceptions.NotFound: If ``mementos`` is empty.
    """
    if not mementos:
        # Explicit check instead of ``assert`` (which is stripped under -O).
        abort(404)

    # Adds Original, TimeGate and TimeMap links
    original_link = Link(uri_r, rel='original')
    timegate_link = Link(
        url_for('timegate', dict(uri_r=uri_r), force_external=True),
        rel='timegate',
    )
    link_self = Link(
        url_for('timemap', dict(
            response_type='link', uri_r=uri_r
        ), force_external=True),
        rel='self', type='application/link-format',
    )
    json_self = Link(
        url_for('timemap', dict(
            response_type='json', uri_r=uri_r
        ), force_external=True),
        rel='timemap', type='application/json',
    )

    # Sets up first and last relations
    if len(mementos) == 1:
        mementos_links = [Link(mementos[0][0], rel='first last memento',
                               datetime=http_date(mementos[0][1]))]
    else:
        # Browse through Mementos to generate the TimeMap links list
        mementos_links = [
            Link(mementos[0][0], rel='first memento',
                 datetime=http_date(mementos[0][1]))
        ] + [
            Link(uri, rel='memento', datetime=http_date(date))
            for (uri, date) in mementos[1:-1]
        ] + [
            Link(mementos[-1][0], rel='last memento',
                 datetime=http_date(mementos[-1][1]))
        ]

    # Aggregates all link strings and constructs the TimeMap body
    links = [original_link, timegate_link, link_self, json_self]
    links.extend(mementos_links)
    body = ',\n'.join([str(l) for l in links]) + '\n'

    # Builds HTTP Response and WSGI return
    headers = [
        ('Date', http_date(datetime.utcnow())),
        ('Content-Length', str(len(body))),
        ('Content-Type', 'application/link-format'),
        ('Connection', 'close'),
    ]
    return Response(body, headers=headers)


def timemap_json_response(app, mementos, uri_r):
    """Return a 200 TimeMap response serialized as JSON.

    :param app: The ``TimeGate`` application (not used by this function).
    :param mementos: A sorted list of (uri_str, datetime_obj) tuples
        representing a timemap.
    :param uri_r: The URI-R of the original resource.
    :return: The ``Response`` object.
    :raises werkzeug.exceptions.NotFound: If ``mementos`` is empty.
    """
    if not mementos:
        # Explicit check instead of ``assert`` (which is stripped under -O).
        abort(404)

    # Prepares the JSON response by building a dict
    response_dict = {}

    response_dict['original_uri'] = uri_r
    response_dict['timegate_uri'] = url_for(
        'timegate', dict(uri_r=uri_r), force_external=True
    )

    # Browse through Mementos to generate TimeMap links dict list
    mementos_links = [
        {'uri': urlstr, 'datetime': http_date(date)}
        for (urlstr, date) in mementos
    ]

    # Builds up first and last links dict
    firstlink = {'uri': mementos[0][0], 'datetime': http_date(mementos[0][1])}
    lastlink = {'uri': mementos[-1][0], 'datetime': http_date(mementos[-1][1])}

    response_dict['mementos'] = {
        'last': lastlink,
        'first': firstlink,
        'list': mementos_links,
    }

    # Builds self (TimeMap)links dict
    response_dict['timemap_uri'] = {
        'json_format': url_for('timemap', dict(
            response_type='json', uri_r=uri_r
        ), force_external=True),
        'link_format': url_for('timemap', dict(
            response_type='link', uri_r=uri_r
        ), force_external=True),
    }

    # Creates the JSON str from the dict
    response_json = json.dumps(response_dict)

    # Builds HTTP Response and WSGI return
    headers = [
        ('Date', http_date(datetime.utcnow())),
        ('Content-Length', str(len(response_json))),
        ('Content-Type', 'application/json'),
    ]
    return Response(response_json, headers=headers)