├── .coveragerc
├── docs
    ├── sequence.png
    ├── architecture.png
    ├── uris_example.png
    ├── client_server.png
    ├── headers_example.png
    ├── client_server_tg.png
    ├── code_architecture.png
    ├── authors.rst
    ├── changes.rst
    ├── installation.rst
    ├── contributing.rst
    ├── license.rst
    ├── api.rst
    ├── index.rst
    ├── advanced-features.rst
    ├── http-response-headers.rst
    ├── memento.rst
    ├── cache.rst
    ├── big-picture.rst
    ├── introduction.rst
    ├── getting-started.rst
    ├── configuration.rst
    ├── handler.rst
    ├── make.bat
    ├── Makefile
    └── conf.py
├── timegate
    ├── examples
    │   ├── __init__.py
    │   ├── es.py
    │   ├── aueb.py
    │   ├── si.py
    │   ├── cat.py
    │   ├── sg.py
    │   ├── can.py
    │   ├── cr.py
    │   ├── loc.py
    │   ├── w3c.py
    │   ├── arxiv.py
    │   ├── nara.py
    │   ├── webcite.py
    │   ├── simple.py
    │   ├── wikia.py
    │   ├── orain.py
    │   ├── mediawiki.py
    │   ├── pastpages.py
    │   ├── wikipedia.py
    │   ├── github.py
    │   └── gitlab.py
    ├── version.py
    ├── __init__.py
    ├── _compat.py
    ├── conf
    │   ├── timegate.ini
    │   └── config.ini
    ├── errors.py
    ├── constants.py
    ├── config.py
    ├── handler.py
    ├── utils.py
    ├── cache.py
    └── application.py
├── CHANGES.rst
├── pytest.ini
├── MANIFEST.in
├── setup.cfg
├── run-tests.sh
├── INSTALL.rst
├── RELEASE-NOTES.rst
├── AUTHORS.rst
├── LICENSE
├── .gitignore
├── tests
    ├── conftest.py
    └── test_timegate.py
├── README.rst
├── .travis.yml
├── CONTRIBUTING.rst
└── setup.py


/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit = timegate/examples/*
3 | 


--------------------------------------------------------------------------------
/docs/sequence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/sequence.png


--------------------------------------------------------------------------------
/docs/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/architecture.png


--------------------------------------------------------------------------------
/docs/uris_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/uris_example.png


--------------------------------------------------------------------------------
/docs/client_server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/client_server.png


--------------------------------------------------------------------------------
/docs/headers_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/headers_example.png


--------------------------------------------------------------------------------
/docs/client_server_tg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/client_server_tg.png


--------------------------------------------------------------------------------
/docs/code_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mementoweb/timegate/HEAD/docs/code_architecture.png


--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
 1 | ..
 2 |     This file is part of TimeGate
 3 |     Copyright (C) 2016 CERN.
 4 | 
 5 |     TimeGate is free software; you can redistribute it and/or modify
 6 |     it under the terms of the Revised BSD License; see LICENSE file for
 7 |     more details.
 8 | 
 9 | .. include:: ../AUTHORS.rst
10 | 


--------------------------------------------------------------------------------
/docs/changes.rst:
--------------------------------------------------------------------------------
 1 | ..
 2 |     This file is part of TimeGate
 3 |     Copyright (C) 2016 CERN.
 4 | 
 5 |     TimeGate is free software; you can redistribute it and/or modify
 6 |     it under the terms of the Revised BSD License; see LICENSE file for
 7 |     more details.
 8 | 
 9 | .. include:: ../CHANGES.rst
10 | 


--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
 1 | ..
 2 |     This file is part of TimeGate
 3 |     Copyright (C) 2016 CERN.
 4 | 
 5 |     TimeGate is free software; you can redistribute it and/or modify
 6 |     it under the terms of the Revised BSD License; see LICENSE file for
 7 |     more details.
 8 | 
 9 | .. include:: ../INSTALL.rst
10 | 


--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
 1 | ..
 2 |     This file is part of TimeGate
 3 |     Copyright (C) 2016 CERN.
 4 | 
 5 |     TimeGate is free software; you can redistribute it and/or modify
 6 |     it under the terms of the Revised BSD License; see LICENSE file for
 7 |     more details.
 8 | 
 9 | .. include:: ../CONTRIBUTING.rst
10 | 


--------------------------------------------------------------------------------
/docs/license.rst:
--------------------------------------------------------------------------------
 1 | ..
 2 |     This file is part of TimeGate
 3 |     Copyright (C) 2016 CERN.
 4 | 
 5 |     TimeGate is free software; you can redistribute it and/or modify
 6 |     it under the terms of the Revised BSD License; see LICENSE file for
 7 |     more details.
 8 | 
 9 | License
10 | =======
11 | 
12 | .. include:: ../LICENSE
13 |    :literal:
14 | 


--------------------------------------------------------------------------------
/timegate/examples/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2016 CERN.
 5 | #
 6 | # TimeGate is free software; you can redistribute it and/or modify
 7 | # it under the terms of the Revised BSD License; see LICENSE file for
 8 | # more details.
 9 | 
10 | """List of examples for TimeGate."""
11 | 


--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
 1 | ..
 2 |     This file is part of TimeGate
 3 |     Copyright (C) 2016 CERN.
 4 | 
 5 |     TimeGate is free software; you can redistribute it and/or modify
 6 |     it under the terms of the Revised BSD License; see LICENSE file for
 7 |     more details.
 8 | 
 9 | 
10 | Changes
11 | =======
12 | 
13 | Version 0.5.0 (released TBD)
14 | 
15 | - Initial public release.
16 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2016 CERN.
 5 | #
 6 | # TimeGate is free software; you can redistribute it and/or modify
 7 | # it under the terms of the Revised BSD License; see LICENSE file for
 8 | # more details.
 9 | 
10 | [pytest]
11 | addopts = --pep8 --ignore=docs --cov=timegate --cov-report=term-missing
12 | pep8ignore = timegate/examples/* ALL
13 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include *.py
 2 | include *.rst
 3 | include *.sh
 4 | include .coveragerc
 5 | include LICENSE
 6 | include pytest.ini
 7 | include timegate/conf/*.ini
 8 | recursive-include conf *.ini
 9 | recursive-include docs *.bat
10 | recursive-include docs *.png
11 | recursive-include docs *.py
12 | recursive-include docs *.rst
13 | recursive-include docs Makefile
14 | recursive-include examples *.py
15 | recursive-include tests *.py
16 | 
17 | prune docs/_build
18 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2016 CERN.
 5 | #
 6 | # TimeGate is free software; you can redistribute it and/or modify
 7 | # it under the terms of the Revised BSD License; see LICENSE file for
 8 | # more details.
 9 | 
10 | 
11 | [aliases]
12 | test=pytest
13 | 
14 | [build_sphinx]
15 | source-dir = docs/
16 | build-dir = docs/_build
17 | all_files = 1
18 | 
19 | [bdist_wheel]
20 | universal = 1
21 | 


--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
 1 | ..
 2 |     This file is part of TimeGate
 3 |     Copyright (C) 2016 CERN.
 4 | 
 5 |     TimeGate is free software; you can redistribute it and/or modify
 6 |     it under the terms of the Revised BSD License; see LICENSE file for
 7 |     more details.
 8 | 
 9 | 
10 | API Docs
11 | ========
12 | 
13 | .. automodule:: timegate.application
14 | 
15 | Errors
16 | ------
17 | 
18 | .. automodule:: timegate.errors
19 |    :members:
20 | 
21 | Utilities
22 | ---------
23 | 
24 | .. automodule:: timegate.utils
25 |    :members:
26 | 


--------------------------------------------------------------------------------
/timegate/version.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2016 CERN.
 5 | #
 6 | # TimeGate is free software; you can redistribute it and/or modify
 7 | # it under the terms of the Revised BSD License; see LICENSE file for
 8 | # more details.
 9 | 
10 | """Version information for TimeGate.
11 | 
12 | This file is imported by ``timegate.__init__``, and parsed by
13 | ``setup.py``.
14 | 
15 | """
16 | 
17 | from __future__ import absolute_import, print_function
18 | 
19 | __version__ = "0.5.0.dev20160000"
20 | 


--------------------------------------------------------------------------------
/run-tests.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env sh
 2 | # -*- coding: utf-8 -*-
 3 | #
 4 | # This file is part of TimeGate.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | # pydocstyle timegate && \
12 | isort -rc -c -df **/*.py && \
13 | check-manifest --ignore ".travis-*" && \
14 | sphinx-build -qnNW docs docs/_build/html && \
15 | python setup.py test && \
16 | sphinx-build -qnNW -b doctest docs docs/_build/doctest
17 | 


--------------------------------------------------------------------------------
/INSTALL.rst:
--------------------------------------------------------------------------------
 1 | ..
 2 |     This file is part of TimeGate
 3 |     Copyright (C) 2016 CERN.
 4 | 
 5 |     TimeGate is free software; you can redistribute it and/or modify
 6 |     it under the terms of the Revised BSD License; see LICENSE file for
 7 |     more details.
 8 | 
 9 | Installation
10 | ============
11 | 
12 | In this installation guide, we’ll create a basic TimeGate instance.
13 | 
14 | .. code-block:: console
15 | 
16 |    $ pip install -e git+https://github.com/mementoweb/timegate.git#egg=TimeGate
17 |    $ uwsgi --http :9999 -s /tmp/mysock.sock --module timegate.application --callable application
18 | 


--------------------------------------------------------------------------------
/RELEASE-NOTES.rst:
--------------------------------------------------------------------------------
 1 | =================
 2 |  TimeGate v0.5.0
 3 | =================
 4 | 
 5 | TimeGate v0.5.0 was released on TBD, 2016.
 6 | 
 7 | About
 8 | -----
 9 | 
10 | A Memento TimeGate.
11 | 
12 | What's new
13 | ----------
14 | 
15 | - Initial public release.
16 | 
17 | Installation
18 | ------------
19 | 
20 |    $ pip install timegate==0.5.0
21 | 
22 | Documentation
23 | -------------
24 | 
25 |    http://pythonhosted.org/timegate/
26 | 
27 | Happy hacking and thanks for flying TimeGate.
28 | 
29 | | TimeGate Development Team
30 | |   GitHub: https://github.com/mementoweb/timegate
31 | |   URL: http://mementoweb.org
32 | 


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
 1 | ..
 2 |     This file is part of TimeGate
 3 |     Copyright (C) 2016 CERN.
 4 | 
 5 |     TimeGate is free software; you can redistribute it and/or modify
 6 |     it under the terms of the Revised BSD License; see LICENSE file for
 7 |     more details.
 8 | 
 9 | 
10 | Authors
11 | =======
12 | 
 13 | Memento TimeGate contributors:
14 | 
15 | - Christian Pietsch <cpietsch+github@uni-bielefeld.de>
16 | - Harihar Shankar <hariharshankar@gmail.com>
17 | - Jiri Kuncar <jiri.kuncar@gmail.com>
18 | - Luda171 <ludab@lanl.gov>
19 | - Sawood Alam <ibnesayeed@gmail.com>
20 | - Tibor Simko <tibor.simko@cern.ch>
21 | - Yorick Chollet <yorick.chollet@gmail.com>
22 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |  Copyright 2013,
 2 |  Yorick Chollet, Harihar Shankar, Herbert Van de Sompel.
 3 |  -- Los Alamos National Laboratory. 
 4 | 
 5 | Licensed under the BSD open source software license.
 6 | You may not use this file except in compliance with the License.
 7 | You may obtain a copy of the License at
 8 | 
 9 | http://mementoweb.github.io/SiteStory/license.html
10 | 
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 | 


--------------------------------------------------------------------------------
/timegate/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2014 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | """Make your web resources Memento compliant in a few easy steps.
12 | 
13 | The Memento framework enables datetime negotiation for web resources.
14 | Knowing the URI of a Memento-compliant web resource, a user can select a
15 | date and see what it was like around that time.
16 | """
17 | 
18 | from .version import __version__
19 | 
20 | __all__ = (
21 |     '__version__',
22 | )
23 | 


--------------------------------------------------------------------------------
/timegate/_compat.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2016 CERN.
 5 | #
 6 | # TimeGate is free software; you can redistribute it and/or modify
 7 | # it under the terms of the Revised BSD License; see LICENSE file for
 8 | # more details.
 9 | 
10 | """PY2/PY3 compatibility layer."""
11 | 
12 | import sys
13 | 
# True only when the running interpreter is Python 2.
PY2 = sys.version_info.major == 2

# Expose one set of names (urlparse, quote, unquote and the *_types
# aliases) regardless of the interpreter's major version.
if PY2:  # pragma: no cover
    from urllib2 import quote, unquote
    from urlparse import urlparse

    integer_types = (int, long)
    string_types = (str, unicode)
    text_type = unicode
else:  # pragma: no cover
    from urllib.parse import quote, unquote, urlparse

    integer_types = (int,)
    string_types = (str,)
    text_type = str
29 | 


--------------------------------------------------------------------------------
/timegate/conf/timegate.ini:
--------------------------------------------------------------------------------
 1 | # uWSGI launch configuration file
 2 | [uwsgi]
 3 | home = /Users/harihar/venv/timegate/
 4 | #socket = uwsgi.sock
 5 | http = :9000
 6 | #chdir = /data/web/timegate/w3c
 7 | 
 8 | #daemonize = /data/var/logs/timegate/w3c.log
 9 | module = timegate.application
10 | callable = application
11 | master = true
12 | #pidfile = /data/var/run/timegate/w3c/w3c.pid
13 | #harakiri = 120
14 | 
15 | memory-report
16 | processes = 4
17 | threads = 2
18 | listen = 60000
19 | cheaper-algo = spare
20 | cheaper = 3
21 | cheaper-initial = 3
22 | workers = 15
23 | cheaper-step = 2
24 | #cheaper-rss-limit-soft = 134217728
25 | vacuum
26 | max-requests = 500
27 | 
28 | reload-mercy = 8
29 | reload-on-as = 512
30 | evil-reload-on-rss = 96
31 | limit-as = 1024
32 | 
 33 | # To stop the server, enable the pidfile option above and run uwsgi --stop <pidfile>, e.g. uwsgi --stop /data/var/run/timegate/w3c/w3c.pid
34 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # Idea software family
 6 | .idea/
 7 | 
 8 | # C extensions
 9 | *.so
10 | 
11 | # Distribution / packaging
12 | .Python
13 | env/
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | 
29 | # PyInstaller
30 | #  Usually these files are written by a python script from a template
31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 | 
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 | 
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .coverage.*
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 | *,cover
48 | 
49 | # Translations
50 | *.mo
51 | 
52 | # Django stuff:
53 | *.log
54 | 
55 | # Sphinx documentation
56 | docs/_build/
57 | 
58 | # PyBuilder
59 | target/
60 | 
61 | # Cache
62 | cache/
63 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2016 CERN.
 5 | #
 6 | # TimeGate is free software; you can redistribute it and/or modify
 7 | # it under the terms of the Revised BSD License; see LICENSE file for
 8 | # more details.
 9 | 
10 | 
11 | """Pytest configuration."""
12 | 
13 | from __future__ import absolute_import, print_function
14 | 
15 | import pytest
16 | 
17 | 
@pytest.fixture()
def app(tmpdir):
    """Return a TimeGate application configured for testing.

    The cache is enabled and its ``CACHE_FILE`` points at a ``cache``
    directory freshly created under pytest's ``tmpdir``.

    :param tmpdir: pytest's built-in temporary directory fixture.
    :returns: the ``application.TimeGate`` instance built from that config.
    """
    from timegate import application

    return application.TimeGate(config=dict(
        HOST='http://localhost',
        BASE_URI='http://www.example.com/',
        CACHE_USE=True,
        CACHE_FILE=tmpdir.mkdir('cache').strpath,
    ))
29 | 
30 | 
@pytest.fixture()
def client(app):
    """Return a Werkzeug test client bound to the ``app`` fixture.

    Responses are wrapped in ``BaseResponse``.

    :param app: the WSGI application provided by the ``app`` fixture.
    """
    from werkzeug.test import Client
    from werkzeug.wrappers import BaseResponse

    return Client(app, BaseResponse)
38 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | Memento TimeGate
 2 | ================
 3 | 
 4 | .. image:: https://img.shields.io/travis/mementoweb/timegate.svg
 5 |            :target: https://travis-ci.org/mementoweb/timegate
 6 | 
 7 | About
 8 | -----
 9 | 
10 | Make your web resources `Memento <http://www.mementoweb.org>`__ compliant in a
11 | few easy steps.
12 | 
13 | The Memento framework enables datetime negotiation for web resources.
14 | Knowing the URI of a Memento-compliant web resource, a user can select a
15 | date and see what it was like around that time.
16 | 
17 | Installation
18 | ------------
19 | 
 20 | Install Memento TimeGate from its Git repository and serve it with uWSGI: ::
21 | 
22 |   pip install -e git+https://github.com/mementoweb/timegate.git#egg=TimeGate
23 |   uwsgi --http :9999 -s /tmp/mysock.sock --module timegate.application --callable application
24 | 
25 | 
26 | Documentation
27 | -------------
28 | 
29 | The documentation is readable at http://timegate.readthedocs.io or can be built
30 | using Sphinx: ::
31 | 
32 |   pip install timegate[docs]
33 |   python setup.py build_sphinx
34 | 
35 | 
36 | Testing
37 | -------
38 | 
39 | Running the test suite is as simple as: ::
40 | 
41 |   ./run-tests.sh
42 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | ..
 2 |     This file is part of TimeGate
 3 |     Copyright (C) 2016 CERN.
 4 | 
 5 |     TimeGate is free software; you can redistribute it and/or modify
 6 |     it under the terms of the Revised BSD License; see LICENSE file for
 7 |     more details.
 8 | 
 9 | 
10 | .. include:: ../README.rst
11 |    :end-before: Installation
12 | 
13 | User's Guide
14 | ------------
15 | 
16 | This part of the documentation will show you how to get started in using
17 | TimeGate.
18 | 
19 | .. toctree::
20 |    :maxdepth: 2
21 | 
22 |    introduction
23 |    installation
24 |    big-picture
25 |    getting-started
26 |    memento
27 |    http-response-headers
28 |    handler
29 |    configuration
30 |    cache
31 |    advanced-features
32 | 
33 | API Reference
34 | -------------
35 | 
36 | If you are looking for information on a specific function, class or method,
37 | this part of the documentation is for you.
38 | 
39 | .. toctree::
40 |    :maxdepth: 2
41 | 
42 |    api
43 | 
44 | Additional Notes
45 | ----------------
46 | 
47 | Notes on how to contribute, legal information and changes are here for the
48 | interested.
49 | 
50 | .. toctree::
51 |    :maxdepth: 1
52 | 
53 |    contributing
54 |    changes
55 |    license
56 |    authors
57 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2016 CERN.
 5 | #
 6 | # TimeGate is free software; you can redistribute it and/or modify
 7 | # it under the terms of the Revised BSD License; see LICENSE file for
 8 | # more details.
 9 | 
10 | 
11 | notifications:
12 |   email: false
13 | 
14 | sudo: false
15 | 
16 | language: python
17 | 
18 | cache:
19 |   - pip
20 | 
21 | env:
22 |   - REQUIREMENTS=lowest
23 |   - REQUIREMENTS=release
24 |   # - REQUIREMENTS=devel
25 | 
26 | python:
27 |   - "2.7"
28 |   - "3.3"
29 |   - "3.4"
30 |   - "3.5"
31 | 
32 | before_install:
33 |   - "travis_retry pip install --upgrade pip setuptools py"
34 |   - "travis_retry pip install twine wheel coveralls requirements-builder"
35 |   - "requirements-builder --level=min setup.py > .travis-lowest-requirements.txt"
36 |   - "requirements-builder --level=pypi setup.py > .travis-release-requirements.txt"
37 |   # - "requirements-builder --level=dev --req requirements-devel.txt setup.py > .travis-devel-requirements.txt"
38 | 
39 | install:
40 |   - "travis_retry pip install -r .travis-${REQUIREMENTS}-requirements.txt"
41 |   - "travis_retry pip install -e .[all]"
42 | 
43 | script:
44 |   - "./run-tests.sh"
45 | 
46 | after_success:
47 |   - coveralls
48 |   
49 | branches:
50 |   only:
51 |     - master
52 | 


--------------------------------------------------------------------------------
/timegate/errors.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2014 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | """Custom TimeGate errors."""
12 | 
13 | from __future__ import absolute_import, print_function
14 | 
15 | from werkzeug.exceptions import HTTPException
16 | 
17 | 
class TimegateError(HTTPException):
    """General TimeGate exception.

    :param msg: Description of the error.  When omitted (``None``) the
        class-level ``description`` is used instead.
    :param status: Optional HTTP status code that replaces the class-level
        ``code`` for this instance.
    """

    code = 400
    description = 'Invalid TimeGate request.'

    def __init__(self, msg=None, status=None):
        # HTTPException keeps the class-level description when it is given
        # ``None``, so the default above is reachable when ``msg`` is omitted.
        super(TimegateError, self).__init__(description=msg)
        # Falsy values (None, 0) leave the class-level code in place.
        if status:
            self.code = status
28 | 
29 | 
class TimeoutError(TimegateError):
    """Raise to signal a timeout.

    NOTE(review): this name shadows Python 3's built-in ``TimeoutError``
    inside this module and in any module that imports it by name.
    NOTE(review): 416 is the "Requested Range Not Satisfiable" status;
    confirm that it is the intended code for timeouts.
    """

    code = 416
34 | 
35 | 
class URIRequestError(TimegateError):
    """Raise if the request contains an invalid URI (HTTP 400 by default)."""

    # Same value as the base class, restated explicitly.
    code = 400
40 | 
41 | 
class HandlerError(TimegateError):
    """Raise to signal a handler error (HTTP 503 by default).

    A different status can be given per instance through the ``status``
    argument of ``TimegateError.__init__``.
    """

    code = 503
46 | 
47 | 
class DateTimeError(TimegateError):
    """Raise if the server is unable to handle the date time (HTTP 400)."""

    # Same value as the base class, restated explicitly.
    code = 400
52 | 
53 | 
class CacheError(TimegateError):
    """Raise if the cache is not functioning (HTTP 500 by default)."""

    code = 500
58 | 


--------------------------------------------------------------------------------
/timegate/examples/es.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2014, 2015 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | """Memento proxy for Estonia Web Archive.
12 | 
13 | TODO: rewrite regex html parsing(?) with lxml.
14 | """
15 | 
16 | from __future__ import absolute_import, print_function
17 | 
18 | import logging
19 | import re
20 | 
21 | from timegate.errors import HandlerError
22 | from timegate.handler import Handler
23 | 
24 | BASEURI = "http://veebiarhiiv.digar.ee/a/*/"
25 | 
26 | 
class EsHandler(Handler):
    """Handler that lists mementos by scraping the Estonia Web Archive.

    The listing page for a resource is requested from ``BASEURI`` followed
    by the resource URL, and its ``SetAnchorDate`` anchors are extracted
    with a regular expression.
    """

    def __init__(self):
        Handler.__init__(self)
        # Group 1 is the argument of SetAnchorDate(...), group 2 the href.
        # NOTE(review): both groups are greedy, so two anchors on one line
        # would be captured as a single match -- confirm the page layout.
        regex = r'<a onclick="SetAnchorDate\(\'(.*)\'\);" href="(.*)">'
        self.uriRegex = re.compile(regex)

    def get_all_mementos(self, req_url):
        """Return the mementos found on the archive page for ``req_url``.

        :param req_url: URL of the original resource; it is appended to
            ``BASEURI`` as-is.
        :returns: list of ``(memento_uri, datetime_string)`` tuples, where
            the datetime string is the anchor's date with ``" GMT"``
            appended.
        :raises HandlerError: with status 404 if requesting the page fails.
        """
        # implement the changes list for this particular proxy

        uri = BASEURI + req_url
        try:
            resp = self.request(uri)
            data = resp.content
        except Exception as e:
            logging.error("Cannot request URI: %s", e)
            raise HandlerError("Cannot request URI", 404)

        # The pattern is a text pattern; matching it against a bytes body
        # raises TypeError on Python 3.  On Python 2 ``bytes`` is ``str``,
        # so the body is left untouched there.
        # NOTE(review): UTF-8 is assumed for the page -- confirm.
        if isinstance(data, bytes) and not isinstance(data, str):
            data = data.decode('utf-8', 'replace')

        return [(loc, dtstr + " GMT")
                for dtstr, loc in self.uriRegex.findall(data)]
54 | 


--------------------------------------------------------------------------------
/timegate/examples/aueb.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2016 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | """Greece handler."""
12 | 
13 | import logging
14 | import re
15 | import urllib
16 | 
17 | from timegate.errors import HandlerError
18 | from timegate.handler import Handler
19 | 
20 | 
class GreeceHandler(Handler):
    """Handler that lists mementos by scraping the archive at ``baseuri``."""

    def __init__(self):

        self.baseuri = "http://83.212.204.92:8080/*/"

        regex = r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://83.212.204.92:8080/[\S]*">'
        self.uriRegex = re.compile(regex)
        Handler.__init__(self)

    def get_all_mementos(self, req_url):
        """Return the mementos found on the archive page for ``req_url``.

        :param req_url: URL of the original resource; it is appended to
            ``self.baseuri`` as-is.
        :returns: list of ``(memento_uri, datetime_string)`` tuples, or
            ``None`` if the archive page could not be opened (the failure
            is logged).
        """
        # ``urllib.urlopen`` exists only on Python 2.  On Python 3 the
        # lookup failed inside the ``try`` below, so every request was
        # logged as an error and answered with ``None``.
        try:
            from urllib.request import urlopen
        except ImportError:  # Python 2
            from urllib import urlopen

        uri = self.baseuri + req_url
        try:
            fh = urlopen(uri)
        except Exception as e:
            logging.error("Couldn't retrieve data from %s : %s" % (uri, str(e)))
            return None
        # Close the handle even if reading the body fails.
        try:
            data = fh.read()
        finally:
            fh.close()

        # The pattern is a text pattern; decode a bytes body (Python 3)
        # before matching.  On Python 2 ``bytes`` is ``str``: no change.
        # NOTE(review): UTF-8 is assumed for the page -- confirm.
        if isinstance(data, bytes) and not isinstance(data, str):
            data = data.decode('utf-8', 'replace')

        changes = []
        for u in self.uriRegex.findall(data):
            # Fixed offsets into the matched anchor text: the 27-character
            # prefix ``<a onclick="SetAnchorDate('`` is followed by a date
            # assumed to be 14 characters long (27:41); the href value
            # starts at 52 and the trailing ``">`` is dropped.
            dtstr = u[27:41] + " GMT"
            loc = u[52:-2]
            changes.append((loc, dtstr))

        return changes
54 | 


--------------------------------------------------------------------------------
/timegate/constants.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2014, 2015 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | """Important constants of the TimeGate server."""
12 | 
# Code constants
# HTTP status lines keyed by their numeric status code.
HTTP_STATUS = dict(
    (code, '%d %s' % (code, reason)) for code, reason in (
        (200, 'OK'),
        (302, 'Found'),
        (400, 'Bad Request'),
        (403, 'Forbidden'),
        (404, 'Not Found'),
        (405, 'Method Not Allowed'),
        (416, 'Requested Range Not Satisfiable'),
        (500, 'Internal Server Error'),
        (502, 'Bad Gateway'),
        (501, 'Not Implemented'),
        (503, 'Service Unavailable'),
    )
)

# Date format used for Memento datetimes (RFC 1123).
DATE_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'

# Safeguard: upper bound on the size of a TimeMap, counted in URIs.
TM_MAX_SIZE = 100 * 1000

# Server configuration
HOST = None
STRICT_TIME = True
API_TIME_OUT = 6

# Handler configuration
HANDLER_MODULE = 'simple'
BASE_URI = ''
RESOURCE_TYPE = 'vcs'
USE_TIMEMAPS = True

# Cache
# With CACHE_USE set to False every cache request is a cache MISS.
CACHE_USE = False
# Window during which a cached value is still considered young enough to be
# valid (one day).
CACHE_TOLERANCE = 24 * 60 * 60
# Location of the cache files.
CACHE_DIRECTORY = 'cache'
# Upper bound on the number of TimeMaps kept in the cache.
CACHE_MAX_VALUES = 250
# Cache file path; currently the cache directory itself.
CACHE_FILE = CACHE_DIRECTORY  # + '/cache_data'
# Cache expiration (space bound) in seconds: three days.
CACHE_EXP = 3 * 24 * 60 * 60
58 | 


--------------------------------------------------------------------------------
/timegate/examples/si.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2015 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | """TimeGate proxy for uni-lj.si."""
12 | 
13 | from __future__ import absolute_import, print_function
14 | 
15 | import logging
16 | import re
17 | import urllib
18 | 
19 | from timegate.handler import Handler
20 | 
21 | 
class SloveniaHandler(Handler):
    """Handler that scrapes the Wayback listing page of nukrobi2.nuk.uni-lj.si."""

    def __init__(self):
        # Listing page prefix; the requested URI is appended to it.
        self.baseuri = "http://nukrobi2.nuk.uni-lj.si:8080/wayback/*/"
        # One match per capture anchor found in the listing page.
        regex = r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://nukrobi2.nuk.uni-lj.si:8080/wayback/[\S]*">'
        self.uriRegex = re.compile(regex)
        Handler.__init__(self)

    def get_all_mementos(self, req_url):
        """Return the captures listed by the archive for ``req_url``.

        :param req_url: URI of the original resource; appended to
            ``self.baseuri`` to build the listing page URI.
        :return: A list of ``(memento_uri, datetime_string)`` tuples, or
            ``None`` when the listing page could not be opened.
        """
        uri = self.baseuri + req_url
        try:
            fh = urllib.urlopen(uri)
        except Exception as e:
            logging.error("Couldn't retrieve data from %s : %s", uri, e)
            return None
        # Always release the connection, even if reading the body fails.
        try:
            data = fh.read()
        finally:
            fh.close()

        changes = []
        for u in self.uriRegex.findall(data):
            # The matched text starts with ``<a onclick="SetAnchorDate('``
            # (27 characters), so the date argument begins at offset 27.
            # NOTE(review): assumes a 14-character timestamp -- confirm.
            dtstr = u[27:41] + " GMT"
            # ``');" href="`` follows the timestamp (11 characters), hence
            # offset 52; the final two characters are the closing ``">``.
            loc = u[52:-2]
            changes.append((loc, dtstr))

        return changes
54 | 


--------------------------------------------------------------------------------
/timegate/examples/cat.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2016 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | """Catalonia handler."""
12 | 
13 | from __future__ import absolute_import, print_function
14 | 
15 | import logging
16 | import re
17 | import urllib
18 | 
19 | from core.handler_baseclass import Handler
20 | from errors.timegateerrors import HandlerError
21 | 
22 | 
class CataloniaHandler(Handler):
    """Handler that scrapes the Wayback listing page of www.padi.cat."""

    def __init__(self):
        # Listing page prefix; the requested URI is appended to it.
        self.baseuri = "http://www.padi.cat:8080/wayback/*/"
        # One match per capture anchor found in the listing page.
        regex = r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://www.padi.cat:8080/wayback/[\S]*">'
        self.uriRegex = re.compile(regex)
        Handler.__init__(self)

    def get_all_mementos(self, req_url):
        """Return the captures listed by the archive for ``req_url``.

        :param req_url: URI of the original resource; appended to
            ``self.baseuri`` to build the listing page URI.
        :return: A list of ``(memento_uri, datetime_string)`` tuples, or
            ``None`` when the listing page could not be opened.
        """
        uri = self.baseuri + req_url
        try:
            fh = urllib.urlopen(uri)
        except Exception as e:
            logging.error("Couldn't retrieve data from %s : %s", uri, e)
            return None
        # Always release the connection, even if reading the body fails.
        try:
            data = fh.read()
        finally:
            fh.close()

        changes = []
        for u in self.uriRegex.findall(data):
            # The matched text starts with ``<a onclick="SetAnchorDate('``
            # (27 characters), so the date argument begins at offset 27.
            # NOTE(review): assumes a 14-character timestamp -- confirm.
            dtstr = u[27:41] + " GMT"
            # ``');" href="`` follows the timestamp (11 characters), hence
            # offset 52; the final two characters are the closing ``">``.
            loc = u[52:-2]
            changes.append((loc, dtstr))

        return changes
56 | 


--------------------------------------------------------------------------------
/timegate/examples/sg.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2016 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | """Singapore handler."""
12 | 
13 | from __future__ import absolute_import, print_function
14 | 
15 | import logging
16 | import re
17 | import urllib
18 | 
19 | from core.handler_baseclass import Handler
20 | from errors.timegateerrors import HandlerError
21 | 
22 | 
class SingaporeHandler(Handler):
    """Handler that scrapes the Wayback listing page of eresources.nlb.gov.sg."""

    def __init__(self):
        # Previously used endpoint: http://was.nl.sg/wayback/*/
        # Listing page prefix; the requested URI is appended to it.
        self.baseuri = "http://eresources.nlb.gov.sg/webarchives/wayback/*/"
        # One match per capture anchor found in the listing page.
        regex = r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://eresources.nlb.gov.sg/webarchives/wayback/[\S]*">'
        self.uriRegex = re.compile(regex)
        Handler.__init__(self)

    def get_all_mementos(self, req_url):
        """Return the captures listed by the archive for ``req_url``.

        :param req_url: URI of the original resource; appended to
            ``self.baseuri`` to build the listing page URI.
        :return: A list of ``(memento_uri, datetime_string)`` tuples, or
            ``None`` when the listing page could not be opened.
        """
        uri = self.baseuri + req_url
        try:
            fh = urllib.urlopen(uri)
        except Exception as e:
            logging.error("Couldn't retrieve data from %s : %s", uri, e)
            return None
        # Always release the connection, even if reading the body fails.
        try:
            data = fh.read()
        finally:
            fh.close()

        changes = []
        for u in self.uriRegex.findall(data):
            # The matched text starts with ``<a onclick="SetAnchorDate('``
            # (27 characters), so the date argument begins at offset 27.
            # NOTE(review): assumes a 14-character timestamp -- confirm.
            dtstr = u[27:41] + " GMT"
            # ``');" href="`` follows the timestamp (11 characters), hence
            # offset 52; the final two characters are the closing ``">``.
            loc = u[52:-2]
            changes.append((loc, dtstr))

        return changes
56 | 


--------------------------------------------------------------------------------
/timegate/examples/can.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2014, 2015 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | """Canadian archive proxy."""
12 | 
13 | from __future__ import absolute_import, print_function
14 | 
15 | import logging
16 | import re
17 | import StringIO
18 | 
19 | from lxml import etree
20 | 
21 | from timegate.errors import HandlerError
22 | from timegate.handler import Handler
23 | 
24 | 
class CanHandler(Handler):
    """Handler that scrapes the collectionscanada.gc.ca web archive listing."""

    def __init__(self):
        Handler.__init__(self)
        # Listing page prefix; the requested URI is appended to it.
        self.baseuri = "http://www.collectionscanada.gc.ca/webarchives/*/"
        # Captures the run of digits that follows ``/webarchives/`` in a
        # memento URI. Raw string so ``\d`` reaches the regex engine as-is.
        self.dtre = re.compile(
            r"http://www.collectionscanada.gc.ca/webarchives/(\d+)/")

    def get_all_mementos(self, req_url):
        """Return the mementos listed by the archive for ``req_url``.

        :param req_url: URI of the original resource.
        :return: A list of ``(memento_uri, datetime_digits)`` tuples; empty
            when the listing contains no matching anchor.
        :raises HandlerError: Propagated from :meth:`get_xml` when the
            listing page cannot be parsed.
        """
        iauri = self.baseuri + req_url
        dom = self.get_xml(iauri, html=True)

        changes = []
        for a in dom.xpath('//div[@class="inner-content"]//a'):
            # Anchors carrying a ``name`` attribute are skipped.
            if 'name' in a.attrib:
                continue
            # An anchor without ``href`` cannot be a memento link; skipping
            # it avoids a KeyError that would abort the whole request.
            uri = a.attrib.get('href')
            if uri is None:
                continue
            match = self.dtre.match(uri)
            if match:
                changes.append((uri, match.group(1)))
        return changes

    def get_xml(self, uri, html=False):
        """Fetch ``uri`` and parse the response body with lxml.

        :param uri: URI to request through ``self.request``.
        :param html: When true an ``HTMLParser`` is used, otherwise an
            ``XMLParser``; both are created with ``recover=True``.
        :return: The parsed lxml tree.
        :raises HandlerError: With status 404 when reading or parsing the
            response fails.
        """
        page = self.request(uri)
        try:
            page_data = page.content
            if html:
                parser = etree.HTMLParser(recover=True)
            else:
                parser = etree.XMLParser(recover=True)
            return etree.parse(StringIO.StringIO(page_data), parser)
        except Exception:
            logging.error("Cannot parse XML/HTML from %s" % uri)
            raise HandlerError("Couldn't parse data from %s" % uri, 404)
63 | 


--------------------------------------------------------------------------------
/docs/advanced-features.rst:
--------------------------------------------------------------------------------
 1 | .. _advanced_features:
 2 | 
 3 | TimeMaps
 4 | ========
 5 | 
The TimeGate can easily be used as a TimeMap server too.

Requirements
------------

For that there are two requirements:
 8 | 
 9 | - The Handler must implement the ``get_all_mementos(uri_r)`` function to return
10 |   the entire history of an Original Resource.
11 | 
12 | 
13 | - The ``conf/config.ini`` file must have the variable ``use_timemap = true``.
14 | 
15 | Resulting links
16 | ---------------
17 | 
18 | Once this setup is in place, the TimeGate responses' ``Link`` header
19 | will contain two new relations, for two different formats (MIME types):
20 | 
21 | - ``<HOST/timemap/link/URI-R>; rel="timemap"; type="application/link-format"``
22 |   `Link TimeMaps <http://www.mementoweb.org/guide/rfc/#Pattern6>`_
23 | 
24 | - ``<HOST/timemap/json/URI-R>; rel="timemap"; type="application/json"`` JSON
25 |   TimeMaps
26 | 
27 | Where ``HOST`` is the base URI of the program and ``URI-R`` is the URI
28 | of the Original Resource.
29 | 
30 | Example
31 | -------
32 | 
33 | For example, suppose ``http://www.example.com/resourceA`` is the URI-R
34 | of an Original Resource. And suppose the TimeGate/TimeMap server's
``host`` configuration is set to ``http://timegate.example.com``. Then,
36 | HTTP responses from the TimeGate will include the following:
37 | 
38 | - ``<http://timegate.example.com/timemap/link/http://www.example.com/resourceA>; rel="timemap"; type="application/link-format"``
39 | - ``<http://timegate.example.com/timemap/json/http://www.example.com/resourceA>; rel="timemap"; type="application/json"``
40 | 
Now a user can request an ``HTTP GET`` on one of those links and the
42 | server's response will have a ``200 OK`` status code and its body will
43 | be the TimeMap.
44 | 
45 | HandlerErrors
46 | =============
47 | 
48 | Custom error messages can be sent to the client using the custom
exception module: ``from timegate.errors import HandlerError``.
50 | For instance, a custom message with HTTP status ``400`` and body
51 | ``Custom error message`` can be sent using:
52 | ``raise HandlerError("Custom error message", status=400)``. Raising a
53 | ``HandlerError`` will stop the request and not return any Memento to the
54 | client.
55 | 


--------------------------------------------------------------------------------
/docs/http-response-headers.rst:
--------------------------------------------------------------------------------
 1 | .. _http_response_headers:
 2 | 
 3 | Memento and HTTP
 4 | ================
 5 | 
 6 | The Memento framework requires specific HTTP headers in order to work
 7 | properly. They must be added to the server's response headers for any
 8 | Original Resources or Mementos request.
 9 | 
10 | Intuitively, a user needs to be able to know which server to contact to
11 | do the time negotiation. Hence a link to the TimeGate is needed from
12 | both the Original Resource and the Mementos. Additionally, a Memento is
13 | defined by an Original Resource it is the snapshot of, and the date time
14 | at which it was created. Thus, it carries a link to its Original
15 | Resource and a datetime information.
16 | 
17 | Example
18 | -------
19 | 
20 | Let's take the following example: Suppose a server is handling requests
21 | for the following URIs:
22 | 
23 | .. image:: uris_example.png
24 | 
Each time a server responds to requests for any of these URIs, standard
HTTP headers are returned. With Memento, the following headers are
added:

- For the Original Resource, add a "Link" header that points at
  its TimeGate
- For each Memento, add a "Link" header that points at the
  TimeGate
- For each Memento, add a "Link" header that points to the
  Original Resource
- For each Memento, add a Memento-Datetime header that
  conveys the snapshot datetime
32 | 
33 | Using the previous example, and supposing a TimeGate server is running
34 | at ``http://example.com/timegate/``, Memento HTTP response headers for
35 | the Original Resource and one Memento look as follows:
36 | 
.. image:: headers_example.png
38 | 
39 | To sum up
40 | ---------
41 | 
-  The ``Memento-Datetime:`` header is a Memento-specific header whose
43 |    value is the `rfc1123 <http://tools.ietf.org/html/rfc1123>`__-date of
44 |    the Memento.
45 | -  It must be included in any response to a Memento request.
46 | -  It cannot be in an Original Resource response.
47 | -  The ``Link:`` header is a standard header to which new values are
48 |    added.
49 | -  A link to the TimeGate with relation ``rel="timegate"`` must be
50 |    included in all Memento and Original Resource responses.
51 | -  A link to the Original Resource with relation ``rel="original"`` must
52 |    be included in all Memento responses.
53 | -  Link with relation ``rel="original"`` cannot be in an Original
54 |    Resource response.
55 | 


--------------------------------------------------------------------------------
/timegate/examples/cr.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2014, 2015 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | """Croatian web archive proxy."""
12 | 
13 | from __future__ import absolute_import, print_function
14 | 
15 | import logging
16 | import re
17 | import urllib
18 | 
19 | from timegate.errors import HandlerError
20 | from timegate.handler import Handler
21 | 
# Search endpoint of the Croatian web archive; query parameters are appended.
baseuri = "http://haw.nsk.hr/json.php?"


class CrHandler(Handler):
    """Handler for the Croatian web archive (haw.nsk.hr)."""

    def __init__(self):
        Handler.__init__(self)

    def get_all_mementos(self, req_url):
        """Return the archived versions listed for ``req_url``.

        The JSON search endpoint is queried first; the publication page of
        the first hit is then fetched and its table rows are scraped.

        :param req_url: URI of the original resource.
        :return: A list of ``(memento_uri, datetime_string)`` tuples; empty
            when the search reports no hit.
        :raises HandlerError: With status 404 when either request fails or
            the search response cannot be decoded as JSON.
        """
        parameters = {'q': req_url, 'subject': 'url'}

        uri = baseuri + urllib.urlencode(parameters)
        try:
            jsonobj = self.request(uri).json()
        except Exception as e:
            # Let logging format the exception: concatenating it to a str
            # raises TypeError and would mask the HandlerError below.
            logging.error("Cannot request API or parse json response: %s", e)
            raise HandlerError("Cannot get API response.", 404)

        if int(jsonobj['availableHits']) == 0:
            return []

        tmid = jsonobj['hits'][0]['ID']
        tmuri = "http://haw.nsk.hr/publikacija/" + tmid

        try:
            data = self.request(tmuri).content
        except Exception as e:
            logging.error("Error requerying API: %s", e)
            raise HandlerError("Cannot get API response.", 404)

        uriRegex = re.compile(r'<tr><td>[\d]*\.</td>.*</tr>')
        dtregex = re.compile(r'<td>\d\d\.\d\d\.\d\d\d\d[0-9\.:\s]*</td>')

        changes = []
        for u in uriRegex.findall(data):
            d = u.find("title")
            result = dtregex.search(u)
            # Skip rows lacking a title marker or a date cell instead of
            # crashing or silently reusing the previous row's date.
            if d == -1 or result is None:
                continue

            # NOTE(review): offset 45 presumably skips the fixed markup that
            # precedes the link target in each row -- confirm against the
            # live page.
            loc = "http://haw.nsk.hr/" + u[45:d - 2].lstrip('/')

            # Strip the surrounding ``<td>`` (4 chars) and ``</td>`` (5 chars).
            dtstr = result.group(0)[4:-5]
            # Reorder ``dd.mm.yyyy hh:mm:ss`` into ``yyyymmddhhmmss GMT``.
            dtstr = dtstr[6:10] + dtstr[3:5] + dtstr[0:2] + \
                dtstr[11:19].replace(":", "") + " GMT"
            changes.append((loc, dtstr))

        return changes
77 | 


--------------------------------------------------------------------------------
/docs/memento.rst:
--------------------------------------------------------------------------------
 1 | Memento Framework
 2 | =================
 3 | 
Resources on the web change over time. While many servers keep archives
 5 | of what these resources looked like in the past, it is often difficult
 6 | for the user to retrieve the URI of such an archive for a specific point
 7 | in time.
 8 | 
The `Memento Framework <http://www.mementoweb.org/>`__ removes the
need for the user to do the search by hand.
11 | 
12 | Components
13 | ----------
14 | 
15 | -  Suppose a web resource is located at some URI. We call the resource
16 |    the **Original Resource** and refer to its URI as the **URI-R**. This
17 |    is the resource for which a user wants to find a prior version.
18 | -  A prior version of an Original Resource is called a **Memento** and
19 |    we refer to its URI as the **URI-M**. There could be many Mementos
20 |    for one Original Resource. Each having its own URI-Mi and each
21 |    encapsulating the state of the Original Resource at a specific point
22 |    in time.
23 | -  The **TimeGate** is the application which selects the best Memento of
24 |    an Original Resource for a given datetime. This is where datetime
25 |    negotiation happens.
26 | 
27 | Requirements
28 | ------------
29 | 
-  The first requirement is that Original Resources and Mementos must
31 |    be accessible through their respective and unique URIs.
-  Also, the framework operates using HTTP headers. Headers of requests
   from/to the TimeGate are taken care of. However, Original Resources and
   Mementos require the addition of new headers. (See :ref:`http_response_headers`.)
35 | 
36 | The Generic TimeGate
37 | --------------------
38 | 
39 | The TimeGate is where most of the Memento magic happens. And its
40 | implementation is likely to be extremely close from one server to
41 | another. In this sense, its processing of HTTP requests / responses
42 | headers, its algorithms and logic can be abstracted and made generic.
43 | The only thing server-specific is the management of URIs and datetimes.
44 | To do that, this TimeGate can fit any web resource if it is provided a
45 | way to retrieve a history of a specific Original Resource. This is made
46 | using a custom handler.  (See :ref:`handler`.)
47 | 
48 | More about Memento
49 | ------------------
50 | 
51 | -  Details about Memento are available in the `RFC
52 |    7089 <http://www.mementoweb.org/guide/rfc/>`__.
53 | -  A `quick intro <http://www.mementoweb.org/guide/quick-intro/>`__ is
54 |    available on Memento's website.
55 | 


--------------------------------------------------------------------------------
/docs/cache.rst:
--------------------------------------------------------------------------------
 1 | .. _cache:
 2 | 
 3 | Cache
 4 | =====
 5 | 
The TimeGate comes with a built-in cache that is activated by default. Change
this behavior by editing the configuration file. See :ref:`configuration`.
 8 | 
 9 | Populating the cache
10 | --------------------
11 | 
The cache stores TimeMaps, which are the return values of the handler
function ``get_all_mementos()`` only:

- If the Handler does not have ``get_all_mementos()`` implemented, the
  cache will never be filled.
- If the Handler has both the functions ``get_all_mementos()`` and
  ``get_memento()``, only TimeMap requests will fill the cache. All
  TimeGate requests will use ``get_memento()``, whose result will not be
  cached.
19 | 
20 | Cache HIT conditions
21 | --------------------
22 | 
-  Cached TimeMaps can be used to respond to a TimeMap request from
   a client if it is fresh enough. The tolerance for freshness can be
   defined in the configuration file.
-  Cached TimeMaps can also be used to respond to TimeGate requests
   from a client. In this case, it is not the request's time that must
   lie within the tolerance bounds, but the requested datetime.
29 | 
30 | Force Fresh value
31 | -----------------
32 | 
If the request contains the header ``Cache-Control: no-cache``, then the
TimeGate will not return anything from cache.
35 | 
36 | Example
37 | -------
38 | 
Suppose you have a TimeMap that was cached at time ``T``. Suppose you
have a tolerance of ``d`` seconds. A TimeMap request arrives at time
``R1``. A TimeGate request arrives at time ``R2`` with requested
datetime ``j``. This request does **not** contain the header
``Cache-Control: no-cache``.

- A TimeMap request will be served from cache only if it arrives within
  the tolerance: ``R1 <= T+d``.
- A TimeGate request will be served from cache only if the requested
  datetime happens within the tolerance: ``j <= T+d``, no matter ``R2``.
  This means that even if a cached value is old, the cache can still
  respond to TimeGate requests for requested datetimes up to time
  ``T+d``.
- All other requests will be cache misses.
50 | 
51 | Cache size
52 | ----------
53 | 
54 | There is no "maximum size" parameter. The reason for this is that the
55 | cache size will depend on the average size of TimeMaps, which itself
56 | depends on the length of each URI-Ms it contains, and their average
57 | count. These variables will depend on your system. The cache can be
58 | managed using the ``cache_max_values`` parameter which will affect
59 | indirectly its size.
60 | 


--------------------------------------------------------------------------------
/timegate/examples/loc.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2014, 2015 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | from __future__ import absolute_import, print_function
12 | 
13 | import logging
14 | import re
15 | import StringIO
16 | 
17 | from lxml import etree
18 | 
19 | from timegate.handler import Handler
20 | 
21 | 
class LocHandler(Handler):
    """Handler that scrapes the Library of Congress web archive collections."""

    def __init__(self):
        Handler.__init__(self)

        # Captures the digit run that follows the collection name in a
        # memento URI.
        self.datere = re.compile(
            'http://webarchive.loc.gov/[a-zA-Z0-9]+/([0-9]+)/.+')
        # Collection identifiers queried one after the other:
        # lcwa0001 .. lcwa0020, then lcwa0029, 0031, 0032, 0033 and 0037.
        self.colls = [
            'lcwa%04d' % number
            for number in list(range(1, 21)) + [29, 31, 32, 33, 37]
        ]

    def get_all_mementos(self, requri):
        """Return the mementos of ``requri`` found across all collections.

        A collection whose listing cannot be fetched or parsed is logged
        and skipped; the remaining collections are still queried.

        :param requri: URI of the original resource.
        :return: A list of ``(memento_uri, datetime_digits)`` tuples.
        """
        changes = []

        for c in self.colls:
            # NOTE(review): the listing is requested from ``webarchives``
            # (plural) while the links are matched against ``webarchive``
            # (singular) -- confirm that both hostnames are intended.
            iauri = "http://webarchives.loc.gov/%s/*/%s" % (c, requri)

            try:
                data = self.request(iauri).content
            except Exception as e:
                # Best effort: record the failure rather than dropping it
                # silently, then move on to the next collection.
                logging.warning(
                    "Could not fetch %s in loc handler: %s", iauri, e)
                continue

            try:
                parser = etree.HTMLParser(recover=True)
                dom = etree.parse(StringIO.StringIO(data), parser)
            except Exception as e:
                logging.error("Exception parsing data in loc handler: %s" % e)
                continue

            prefix = 'http://webarchive.loc.gov/%s/' % c
            for a in dom.xpath('//a'):
                loc = a.attrib.get('href', '')
                if not loc.startswith(prefix):
                    continue

                # Extract the time from the link; anchors that are not
                # followed by any text (empty ``tail``) are ignored.
                m = self.datere.match(loc)
                if m and a.tail:
                    changes.append((loc, m.group(1)))
        return changes
88 | 


--------------------------------------------------------------------------------
/timegate/examples/w3c.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2015, 2016 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | """TimeGate proxy to W3C pages."""
12 | 
13 | import re
14 | import time
15 | 
16 | import requests
17 | 
18 | from timegate.errors import HandlerError
19 | from timegate.handler import Handler
20 | 
# Hint appended to the error raised for URIs this handler cannot serve.
ACCEPTABLE_RESOURCE = (
    "This TimeGate understands W3C specification uri of "
    "the format: http://www.w3.org/TR/<spec_name>"
)

# NOTE: Add API Key here
APIKEY = ""


class W3cHandler(Handler):
    """Handler that lists W3C specification versions through the W3C API."""

    def __init__(self):
        Handler.__init__(self)

        # Local fields
        # Template filled with (specification shortname, API key).
        self.api_url = 'https://api.w3.org/specifications/%s/versions?_format=json&apikey=%s&embed=1'

        # Second group captures everything that follows ``/TR/``.
        self.re_spec_name = re.compile(
            r"https?:\/\/(www.)?w3.org\/TR\/(.*)", re.IGNORECASE)

    def get_all_mementos(self, uri):
        """Return the published versions of the specification at ``uri``.

        :param uri: URI of the form ``http://www.w3.org/TR/<spec_name>``.
        :return: A list of ``(version_uri, date)`` tuples sorted by date.
        :raises HandlerError: 404 when ``uri`` is not a W3C ``/TR/`` URI or
            the API does not answer with status 200; 502 when the API
            response is not JSON or lacks the embedded version history.
        """
        match_spec_name = self.re_spec_name.match(uri)
        if not match_spec_name:
            raise HandlerError("Unknown W3C specification uri. \n"
                               + ACCEPTABLE_RESOURCE, 404)

        spec_name = match_spec_name.groups()[1]
        if spec_name.endswith("/"):
            spec_name = spec_name[:-1]

        api_response = self.request(self.api_url % (spec_name, APIKEY))

        if api_response.status_code != 200:
            raise HandlerError("No versions were found for the requested specification with shortname: %s" % spec_name, 404)

        try:
            json_response = api_response.json()
        except Exception:
            raise HandlerError("The W3C API returned an unknown response.", 502)

        # Validate the exact path that is read below. The previous check
        # (``not A and A.get(...)``) dereferenced ``None`` when ``_embedded``
        # was missing and tested a different key than the one consumed.
        embedded = None
        if isinstance(json_response, dict):
            embedded = json_response.get("_embedded")
        history = embedded.get("version-history") if embedded else None
        if history is None:
            raise HandlerError("The W3C API returned an unknown response.", 502)

        versions = [
            (version.get("uri"), version.get("date")) for version in history
        ]
        return sorted(versions, key=lambda version: version[1])
74 | 


--------------------------------------------------------------------------------
/timegate/examples/arxiv.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2014, 2015 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | """Arxiv handler."""
12 | 
13 | from __future__ import absolute_import, print_function
14 | 
15 | import logging
16 | import re
17 | from StringIO import StringIO
18 | 
19 | from lxml import etree
20 | 
21 | from timegate.errors import HandlerError
22 | from timegate.handler import Handler
23 | 
24 | 
class ArxivHandler(Handler):
    """Handler that lists arXiv paper versions through the OAI2 export API."""

    def __init__(self):
        Handler.__init__(self)

        # Resources
        # Ignores all that trails the identifier (? params, vX version,...info)
        self.rex = re.compile(
            r'(http://arxiv.org)/((?:pdf)|(?:abs))/(\d+\.\d+)(.*)')
        self.api_base = 'http://export.arxiv.org/oai2'

    def get_all_mementos(self, uri_r):
        """Return every version of the arXiv resource at ``uri_r``.

        :param uri_r: URI of the form ``http://arxiv.org/(pdf|abs)/<id>``;
            anything trailing the identifier is ignored.
        :return: A list of ``(version_uri, date_text)`` tuples, or ``None``
            when an unexpected error occurs while querying or parsing.
        :raises HandlerError: With status 404 when ``uri_r`` does not match
            the expected pattern or the API response is falsy.
        """
        try:
            # Extract the resource ID
            match = self.rex.match(uri_r)
            if not match:
                raise HandlerError("URI does not match a valid resource.", 404)
            base, kind, resource = match.groups()[:3]
            normalized_uri = '%s/%s/%s' % (base, kind, resource)

            # Prepare the API call
            params = {
                'verb': 'GetRecord',
                'identifier': 'oai:arXiv.org:%s' % resource,
                'metadataPrefix': 'arXivRaw'
            }

            # Query the API and extract the values
            response = self.request(self.api_base, params=params)
            if not response:
                raise HandlerError("API response not 2XX", 404)
            root = etree.parse(StringIO(response.content),
                               etree.XMLParser(recover=True))
            versions = root.findall(
                './/{http://arxiv.org/OAI/arXivRaw/}version')

            def mapper(version):
                # The first attribute value of the element is appended to
                # the normalized URI to form the version URI.
                v = version.xpath('@*')[0]
                date = version.find(
                    './{http://arxiv.org/OAI/arXivRaw/}date').text
                return (normalized_uri + v, date)

            # Build the list eagerly so that mapper failures are raised
            # inside this ``try`` block on Python 3 as well, where ``map``
            # is lazy.
            return [mapper(version) for version in versions]

        except HandlerError:
            raise

        except Exception as e:
            # NOTE(review): the message announces a 404 but ``None`` is
            # returned; kept as-is because callers may rely on it.
            logging.error('Arxiv handler exception: %s returning 404' % e)
            return None
79 | 


--------------------------------------------------------------------------------
/timegate/conf/config.ini:
--------------------------------------------------------------------------------
 1 | [server]
 2 | 
 3 | # host
 4 | # TimeGate server base URI
 5 | # Example: host = http://timegate.example.com
 6 | host = http://localhost
 7 | 
 8 | # strict_datetime
 9 | # When set to true, the user must use the RFC 1123 date in 'Accept-Datetime' header
10 | # When set to false, the server will also try to parse other time formats
11 | strict_datetime = true
12 | 
13 | # api_time_out
14 | # Timeout for any API request in seconds
15 | api_time_out = 6
16 | 
17 | # user-agent
18 | # Provide a user-agent to be added to the requests made by the timegate server
19 | user_agent = Memento TimeGate
20 | 
21 | [handler]
22 | # handler_class
23 | # Optional path to handler class. If not provided the program will
24 | # search core extensions for a possible handler.
25 | handler_class = timegate.examples.es.EsHandler
26 | 
27 | # use_timemap
# Optional boolean to define whether the program can handle timemap requests.
29 | use_timemap = true
30 | 
31 | 
32 | # is_vcs
33 | # When true, the mementos are served from a Version Control System
34 | # When false, the mementos are served from a Snapshot system
35 | # This implies that the best memento to a date d is either, respectively
36 | # The closest to time d, before d
37 | # The absolute closest to time d
38 | is_vcs = true
39 | 
40 | # base_uri
41 | # (Optional) String that will be prepended to requested URI if it is not already present
42 | # For example, if the server runs at `http://timegate.example.com` and all original resources begin with `http://example.com/res/{resource ID}`,
43 | # then setting `base_uri = http://example.com/res/` will allow short requests such as `http://timegate.example.com/{resource ID}`
44 | base_uri =
45 | 
46 | [cache]
47 | 
48 | # cache_activated
49 | # When true, the cache stores TimeMaps from APIs that allow batch (get_all_mementos) requests, except for requests with a `Cache-Control: no-cache` header, which will always return fresh Mementos.
50 | # When false, no cache file will be created
51 | # Default true
52 | cache_activated = false
53 | 
54 | # cache_refresh_time
55 | # Time in seconds, for which it is assumed that a TimeMap didn't change. Any TimeGate request for a datetime past this period (or any TimeMap request past this period) will trigger a refresh of the cached value.
56 | # Default 86400 (one day)
57 | cache_refresh_time = 86400
58 | 
59 | # cache_directory
60 | # Cache directory relative path for data files. Make sure that this directory is empty or else the cache will start deleting random files.
61 | # Default cache/
62 | cache_directory = cache
63 | 
64 | # cache_max_values
65 | # Maximum number of stored TimeMaps in the cache.
66 | # Tweak this depending on how big your TimeMaps can become (number of elements and length of URIs)
67 | # Default 250
68 | cache_max_values = 250
69 | 


--------------------------------------------------------------------------------
/docs/big-picture.rst:
--------------------------------------------------------------------------------
 1 | .. _big_picture:
 2 | 
 3 | Big picture
 4 | ===========
 5 | 
 6 | Definitions
 7 | -----------
 8 | 
 9 | From now on, this documentation will refer to the web server where
10 | resources and archives are as the **web server** and to the Memento
11 | TimeGate datetime negotiation server as the **TimeGate**.
12 | 
13 | -  Suppose you have a web resource accessible in a web server by some
14 |    URI. We call the resource the **Original Resource** and refer to its
15 |    URI as **URI-R**.
16 | -  Suppose a web server has a snapshot of what this URI-R looked like in
17 |    the past. We call such a snapshot a **Memento** and we refer to its
18 |    URI as **URI-M**. There could be many snapshots of URI-R, taken at
19 |    different moments in time, each with their distinct URI-Ms. The
20 |    Mementos do not necessarily need to be in the same web server as the
21 |    Original Resources.
22 | 
23 | Client, Server and TimeGate
24 | ---------------------------
25 | 
26 | This figure represents the current situation: without datetime
27 | negotiation, the client has to find by hand the URIs for the previous
28 | versions of a web resource, if they exist: |client_server.png| To make
29 | these web resources Memento compliant, two things need to be added. The
30 | new components of the system are the TimeGate and Memento HTTP headers
31 | at the web server's side: |client_server_tg.png| With these links, the
32 | client now gets the address of the TimeGate when retrieving an Original
33 | Resource or a Memento. Then, he can use datetime negotiation with the
34 | TimeGate to get the URI of an archived version (``URI-M2``) of the
35 | Original Resource at a specific point in time (``T2``): |sequence.png|
36 | 
37 | Architecture
38 | ------------
39 | 
40 | The TimeGate will manage the framework's logic in a generic manner.
41 | However, every web server has its specific way to store snapshots and to
42 | construct URI-Ms. Thus, a specific plugin must be written for every web
43 | server. Such a plugin is called a handler. A handler will typically talk
44 | to an API to return the list of URI-Ms given a URI-R, but there are
45 | several alternatives to this setup.
46 | 
47 | .. figure:: architecture.png
48 |    :alt: architecture.png
49 | 
50 |    architecture.png
51 | 
52 | The system can be seen as three components.
53 | 
54 | -  The Memento user who wishes to retrieve an older version of a
55 |    resource
56 | -  The web server where the active version (original URI) and revisions
57 |    (mementos) can be accessed. This entity must provide a way to access
58 |    these versions. Typically through an API.
59 | -  The TimeGate, which itself is composed of two main elements:
60 |    one API-specific handler and
61 |    the generic TimeGate code.
62 | 
63 | .. |client_server.png| image:: client_server.png
64 | .. |client_server_tg.png| image:: client_server_tg.png
65 | .. |sequence.png| image:: sequence.png
66 | 


--------------------------------------------------------------------------------
/timegate/examples/nara.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2014, 2015 LANL.
 5 | # Copyright (C) 2016 CERN.
 6 | #
 7 | # TimeGate is free software; you can redistribute it and/or modify
 8 | # it under the terms of the Revised BSD License; see LICENSE file for
 9 | # more details.
10 | 
11 | from __future__ import absolute_import, print_function
12 | 
13 | import logging
14 | import StringIO
15 | from datetime import datetime
16 | 
17 | from lxml import etree
18 | 
19 | from timegate.errors import HandlerError
20 | from timegate.handler import Handler
21 | 
22 | 
class NaraHandler(Handler):
    """Handler listing the mementos found in the webharvest.gov collections.

    Every collection is queried at ``<baseuri><collection>/*/<URI-R>`` and
    the anchors found inside the ``mainBody`` elements of the returned HTML
    page are converted to ``(URI-M, datetime string)`` pairs.
    """

    def __init__(self):
        """Build the list of collection names that will be queried."""
        Handler.__init__(self)
        self.baseuri = "http://webharvest.gov/"
        first_year = 2006
        first_congress = 109
        this_year = datetime.utcnow().year

        # "peth04" plus one "congress<N>th" entry for every two-year step
        # between first_year and the current year, numbered consecutively
        # from first_congress.
        self.collections = ["peth04"]
        for offset, _ in enumerate(range(first_year, this_year, 2)):
            self.collections.append(
                "congress%sth" % (first_congress + offset))

    def get_all_mementos(self, requri):
        """Return every memento of ``requri`` found in the collections.

        :param requri: [str] URI of the Original Resource.
        :return: [list((str, str))] ``(URI-M, date string)`` pairs; the date
            string is the first single-quoted argument of the anchor's
            ``onclick`` attribute with ``" GMT"`` appended.
        :raises HandlerError: if a collection page cannot be parsed.
        """
        changes = []

        for collection in self.collections:
            uri = self.baseuri + collection + "/*/" + requri
            dom = self.get_xml(uri, html=True)
            if not dom:
                continue

            for td in dom.xpath('//*[@class="mainBody"]'):
                for a in td:
                    if a.tag != 'a':
                        continue

                    loc = a.get('href')
                    onclick = a.get('onclick')
                    # Anchors lacking either attribute carry no usable
                    # memento information; skip them instead of crashing.
                    if loc is None or onclick is None:
                        continue

                    quoted = onclick.split("'")
                    if len(quoted) < 2:
                        # No quoted argument to read the datetime from.
                        continue

                    # Make relative links absolute against the base URI.
                    if not loc.startswith(self.baseuri):
                        if loc.startswith("/"):
                            loc = self.baseuri + loc[1:]
                        else:
                            loc = self.baseuri + loc

                    changes.append((loc, quoted[1] + " GMT"))

        return changes

    def get_xml(self, uri, html=False):
        """Retrieve ``uri`` and return its content parsed as XML or HTML.

        :param uri: [str] The uri to retrieve.
        :param html: [bool] optional flag; when true the response is parsed
            with the HTML parser instead of the XML parser.
        :return: [lxml_obj] parsed dom.
        :raises HandlerError: if the response cannot be parsed.
        """
        # Imported locally: BytesIO accepts the raw bytes of the response
        # body on both Python 2 and Python 3.
        from io import BytesIO

        page = self.request(uri)
        try:
            if html:
                parser = etree.HTMLParser(recover=True)
            else:
                parser = etree.XMLParser(recover=True)
            return etree.parse(BytesIO(page.content), parser)
        except Exception:
            logging.error("Cannot parse XML/HTML from %s", uri)
            raise HandlerError("Couldn't parse data from %s" % uri)


--------------------------------------------------------------------------------
/docs/introduction.rst:
--------------------------------------------------------------------------------
 1 | Introduction
 2 | ============
 3 | 
 4 | Introduction
 5 | ------------
 6 | 
 7 | In order to support Memento, a web server must obviously have accessible
 8 | archives of its online resources. And it must also have a piece of
 9 | software that handles the datetime negotiation according to the Memento
10 | protocol for those resources.
11 | 
12 | But in such a datetime negotiation server, only a small proportion of the
13 | code is specific to the particular web resources it handles. The main
14 | part of logic will be very similar throughout many implementations.
15 | TimeGate isolates the core components and functionality. With it,
16 | there's no need to implement, or to re-implement the same logic and
17 | algorithms over and over again. Its architecture is designed to accept
18 | easy-to-code plugins to match any web resources.
19 | 
20 | From now on, this documentation will refer to the web server where
21 | resources and archives are as the **web server** and to the Memento
22 | TimeGate datetime negotiation server as the **TimeGate**.
23 | 
24 | -  Suppose you have a web resource accessible in a web server by some
25 |    URI. We call the resource the **Original Resource** and refer to its
26 |    URI as **URI-R**.
27 | -  Suppose a web server has a snapshot of what this URI-R looked like in
28 |    the past. We call such a snapshot a **Memento** and we refer to its
29 |    URI as **URI-M**. There could be many snapshots of URI-R, taken at
30 |    different moments in time, each Memento i with its distinct URI-Mi.
31 |    The Mementos do not necessarily need to be in the same web server as
32 |    the Original Resources.
33 | 
34 | Example
35 | -------
36 | 
37 | .. figure:: uris_example.png
38 | 
39 | There are only two steps to make such a resource Memento compliant.
40 | 
41 | Step 1: Setting up TimeGate
42 | ---------------------------
43 | 
44 | The first thing to do is to set up the TimeGate for the specific web
45 | server.
46 | 
47 | * Run the TimeGate with your custom handler. The handler is the
48 |   piece of code that is specific to how the web server manages Original
49 |   Resources and Mementos. It needs to implement either one of the
50 |   following:
51 | 
52 |   - Given a URI-R, return the list of URI-Ms along with their respective dates.
53 |   - Given a URI-R and a datetime, return one single URI-M along with its date.
54 | 
55 | Step 2: Providing the headers
56 | -----------------------------
57 | 
58 | The second thing to do is to provide Memento's HTTP headers at the web
59 | server.
60 | 
61 | * Add HTTP headers required by the Memento protocol to responses from the
62 |   Original Resource and its Mementos:
63 | 
64 |   - For the Original Resource, add a "Link" header that points at its TimeGate
65 |   - For each Memento, add a "Link" header that points at the TimeGate
66 |   - For each Memento, add a "Link" header that points to the Original Resource
67 |   - For each Memento, add a Memento-Datetime header that conveys the snapshot datetime
68 | 
69 | Using the previous example, and supposing a TimeGate is running at
70 | ``http://example.com/timegate/``, Memento HTTP response headers for the
71 | Original Resource and one Memento look as follows:
72 | 
73 | .. image:: headers_example.png
74 | 
75 | And that's it! With the TimeGate, datetime negotiation is now possible
76 | for these resources.
77 | 


--------------------------------------------------------------------------------
/timegate/examples/webcite.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2014, 2015 LANL.
  5 | # Copyright (C) 2016 CERN.
  6 | #
  7 | # TimeGate is free software; you can redistribute it and/or modify
  8 | # it under the terms of the Revised BSD License; see LICENSE file for
  9 | # more details.
 10 | 
 11 | """WebCitation proxy."""
 12 | 
 13 | from __future__ import absolute_import, print_function
 14 | 
 15 | import cookielib
 16 | import logging
 17 | import StringIO
 18 | import urllib2
 19 | 
 20 | from lxml import etree
 21 | 
 22 | from timegate.errors import HandlerError
 23 | from timegate.handler import Handler
 24 | 
 25 | 
 26 | class WebCiteHandler(Handler):
 27 | 
 28 |     def __init__(self):
 29 |         Handler.__init__(self)
 30 |         cj = cookielib.LWPCookieJar()
 31 |         opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
 32 |         urllib2.install_opener(opener)
 33 | 
 34 |     def get_all_mementos(self, requri):
 35 | 
 36 |         if requri == 'http://lanlsource.lanl.gov/hello':
 37 |             wcurl = 'http://webcitation.org/5jq247bmx'
 38 |         elif requri == 'http://lanlsource.lanl.gov/pics/picoftheday.png':
 39 |             wcurl = 'http://webcitation.org/5jq24MRo3'
 40 |         elif requri == 'http://odusource.cs.odu.edu/pics/picoftheday.png':
 41 |             wcurl = 'http://webcitation.org/5k9j4oXPw'
 42 |         else:
 43 |             return self.get_from_xml(requri)  # Cleaner but much slower
 44 |             # wcurl = 'http://webcitation.org/query.php?url=' + requri  # Fast
 45 |             # screen scraping
 46 | 
 47 |         txheaders = {}
 48 | 
 49 |         try:
 50 |             req = urllib2.Request(wcurl, None, txheaders)
 51 |             fh = urllib2.urlopen(req)
 52 |             fh.close()
 53 | 
 54 |             req = urllib2.Request('http://webcitation.org/topframe.php')
 55 |             fh = urllib2.urlopen(req)
 56 |             data = fh.read()
 57 |             fh.close()
 58 |         except Exception as e:
 59 |             raise HandlerError('Cannot request page', 404)
 60 | 
 61 |         changes = []
 62 | 
 63 |         try:
 64 |             parser = etree.HTMLParser()
 65 |             dom = etree.parse(StringIO.StringIO(data), parser)
 66 |         except:
 67 |             raise HandlerError('Cannot parse HTML')
 68 | 
 69 |         opts = dom.xpath('//select[@name="id"]/option')
 70 |         for o in opts:
 71 |             fid = o.attrib['value']
 72 |             date = o.text
 73 |             if date.find('(failed)') > -1:
 74 |                 continue
 75 | 
 76 |             changes.append(('http://webcitation.org/query?id=' + fid, date))
 77 | 
 78 |         return changes
 79 | 
 80 |     def get_from_xml(self, requri):
 81 |         api_request = 'http://webcitation.org/query.php?returnxml=1&url=' + requri
 82 |         xml = self.request(api_request, timeout=120)
 83 | 
 84 |         try:
 85 |             parser = etree.XMLParser(recover=True)  # Parses bad XML
 86 |             dom = etree.parse(StringIO.StringIO(str(xml.text)), parser)
 87 |         except Exception as e:
 88 |             logging.error('Cannot parse XML: ' + str(e))
 89 |             raise HandlerError('Cannot parse XML', 404)
 90 | 
 91 |         results = []
 92 |         succes = dom.xpath("//result[@status='success']")
 93 |         for s in succes:
 94 |             url = s.find('webcite_url').text
 95 |             date = s.find('timestamp').text
 96 | 
 97 |             results.append((url, date))
 98 | 
 99 |         return results
100 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
  1 | Contributing
  2 | ============
  3 | 
  4 | Contributions are welcome, and they are greatly appreciated! Every
  5 | little bit helps, and credit will always be given.
  6 | 
  7 | Types of Contributions
  8 | ----------------------
  9 | 
 10 | Report Bugs
 11 | ~~~~~~~~~~~
 12 | 
 13 | Report bugs at https://github.com/mementoweb/timegate/issues.
 14 | 
 15 | If you are reporting a bug, please include:
 16 | 
 17 | * Your operating system name and version.
 18 | * Any details about your local setup that might be helpful in troubleshooting.
 19 | * Detailed steps to reproduce the bug.
 20 | 
 21 | Fix Bugs
 22 | ~~~~~~~~
 23 | 
 24 | Look through the GitHub issues for bugs. Anything tagged with "bug"
 25 | is open to whoever wants to implement it.
 26 | 
 27 | Implement Features
 28 | ~~~~~~~~~~~~~~~~~~
 29 | 
 30 | Look through the GitHub issues for features. Anything tagged with "feature"
 31 | is open to whoever wants to implement it.
 32 | 
 33 | Write Documentation
 34 | ~~~~~~~~~~~~~~~~~~~
 35 | 
 36 | TimeGate could always use more documentation, whether as part of the
 37 | official TimeGate docs, in docstrings, or even on the web in blog posts,
 38 | articles, and such.
 39 | 
 40 | Submit Feedback
 41 | ~~~~~~~~~~~~~~~
 42 | 
 43 | The best way to send feedback is to file an issue at
 44 | https://github.com/mementoweb/timegate/issues.
 45 | 
 46 | If you are proposing a feature:
 47 | 
 48 | * Explain in detail how it would work.
 49 | * Keep the scope as narrow as possible, to make it easier to implement.
 50 | * Remember that this is a volunteer-driven project, and that contributions
 51 |   are welcome :)
 52 | 
 53 | Get Started!
 54 | ------------
 55 | 
 56 | Ready to contribute? Here's how to set up `timegate` for local development.
 57 | 
 58 | 1. Fork the `timegate` repo on GitHub.
 59 | 2. Clone your fork locally:
 60 | 
 61 |    .. code-block:: console
 62 | 
 63 |       $ git clone git@github.com:your_name_here/timegate.git
 64 | 
 65 | 3. Install your local copy into a virtualenv. Assuming you have
 66 |    virtualenvwrapper installed, this is how you set up your fork for local
 67 |    development:
 68 | 
 69 |    .. code-block:: console
 70 | 
 71 |       $ mkvirtualenv timegate
 72 |       $ cd timegate/
 73 |       $ pip install -e .[all]
 74 | 
 75 | 4. Create a branch for local development:
 76 | 
 77 |    .. code-block:: console
 78 | 
 79 |       $ git checkout -b name-of-your-bugfix-or-feature
 80 | 
 81 |    Now you can make your changes locally.
 82 | 
 83 | 5. When you're done making changes, check that your changes pass tests:
 84 | 
 85 |    .. code-block:: console
 86 | 
 87 |       $ ./run-tests.sh
 88 | 
 89 |    The tests will provide you with test coverage and also check PEP8
 90 |    (code style), PEP257 (documentation), flake8 as well as build the Sphinx
 91 |    documentation and run doctests.
 92 | 
 93 | 6. Commit your changes and push your branch to GitHub:
 94 | 
 95 |    .. code-block:: console
 96 | 
 97 |       $ git add .
 98 |       $ git commit -s -m "Your detailed description of your changes."
 99 |       $ git push origin name-of-your-bugfix-or-feature
100 | 
101 | 7. Submit a pull request through the GitHub website.
102 | 
103 | Pull Request Guidelines
104 | -----------------------
105 | 
106 | Before you submit a pull request, check that it meets these guidelines:
107 | 
108 | 1. The pull request should include tests and must not decrease test coverage.
109 | 2. If the pull request adds functionality, the docs should be updated. Put
110 |    your new functionality into a function with a docstring.
111 | 3. The pull request should work for Python 2.7, 3.3, 3.4 and 3.5. Check
112 |    https://travis-ci.com/mementoweb/timegate/pull_requests
113 |    and make sure that the tests pass for all supported Python versions.
114 | 


--------------------------------------------------------------------------------
/timegate/config.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of TimeGate.
 4 | # Copyright (C) 2016 CERN.
 5 | #
 6 | # TimeGate is free software; you can redistribute it and/or modify
 7 | # it under the terms of the Revised BSD License; see LICENSE file for
 8 | # more details.
 9 | 
10 | """Implement default configuration and custom loaders."""
11 | 
12 | from __future__ import absolute_import, print_function
13 | 
14 | from configparser import ConfigParser
15 | 
16 | from ._compat import string_types
17 | 
18 | 
class Config(dict):
    """Dictionary of settings with custom loaders to populate it.

    The class is a singleton: every call to ``Config(...)`` returns the same
    object, and ``__init__`` runs again on it for each call.
    """

    _instance = None

    def __new__(cls, root_path, defaults=None):
        """Return the shared instance, creating it on first use.

        :param root_path: Unused here; consumed by ``__init__``.
        :param defaults: Unused here; consumed by ``__init__``.
        :return: The single shared ``Config`` instance.
        """
        # Compare against None explicitly: an empty Config is a falsy dict,
        # so a truthiness test would create a new instance on every call
        # until the configuration has been populated.
        if cls._instance is None:
            cls._instance = super(Config, cls).__new__(cls)
        return cls._instance

    def __init__(self, root_path, defaults=None):
        """Merge ``defaults`` into the config and record the root path.

        :param root_path: Path to which files are read relative from.
        :param defaults: An optional dictionary of default values.
        """
        dict.__init__(self, defaults or {})
        self.root_path = root_path

    def from_inifile(self, filename, silent=True):
        """Update the values in the config from an INI file.

        :param filename: Path of the INI file to read.
        :param silent: Accepted but not used by this method.
        """
        conf = ConfigParser()
        with open(filename) as f:
            conf.read_file(f)

        # Server configuration
        self['HOST'] = conf.get('server', 'host').rstrip('/')
        self['USER_AGENT'] = conf.get('server', 'user_agent')
        self['STRICT_TIME'] = conf.getboolean('server', 'strict_datetime')
        if conf.has_option('server', 'api_time_out'):
            self['API_TIME_OUT'] = conf.getfloat('server', 'api_time_out')

        # Handler configuration
        if conf.has_option('handler', 'handler_class'):
            self['HANDLER_MODULE'] = conf.get('handler', 'handler_class')
        if conf.has_option('handler', 'base_uri'):
            self['BASE_URI'] = conf.get('handler', 'base_uri')
        if conf.getboolean('handler', 'is_vcs'):
            self['RESOURCE_TYPE'] = 'vcs'
        else:
            self['RESOURCE_TYPE'] = 'snapshot'

        if conf.has_option('handler', 'use_timemap'):
            self['USE_TIMEMAPS'] = conf.getboolean('handler', 'use_timemap')
        else:
            self['USE_TIMEMAPS'] = False

        # Cache
        # When False, all cache requests will be cache MISS
        self['CACHE_USE'] = conf.getboolean('cache', 'cache_activated')
        # Time window in which the cache value is considered young
        # enough to be valid
        self['CACHE_TOLERANCE'] = conf.getint('cache', 'cache_refresh_time')
        # Cache files paths
        self['CACHE_DIRECTORY'] = conf.get(
            'cache', 'cache_directory').rstrip('/')
        # Maximum number of TimeMaps stored in cache
        self['CACHE_MAX_VALUES'] = conf.getint('cache', 'cache_max_values')
        # Cache files paths
        self['CACHE_FILE'] = self['CACHE_DIRECTORY']  # + '/cache_data'

    @staticmethod
    def _import_string(import_name):
        """Import and return the module or attribute named ``import_name``.

        :param import_name: Dotted path such as ``pkg.module`` or
            ``pkg.module.attr``; ``pkg.module:attr`` is accepted as well.
        :return: The imported module or attribute.
        :raises ImportError: if nothing can be imported under that name.
        """
        import importlib

        import_name = import_name.replace(':', '.')
        try:
            return importlib.import_module(import_name)
        except ImportError:
            if '.' not in import_name:
                raise

        # Not importable as a module: treat the last part as an attribute.
        module_name, attr_name = import_name.rsplit('.', 1)
        module = importlib.import_module(module_name)
        try:
            return getattr(module, attr_name)
        except AttributeError:
            raise ImportError(
                'Cannot import %r from %r' % (attr_name, module_name))

    def from_object(self, obj):
        """Update config with values from given object.

        Only the attributes whose name is entirely upper case are copied.

        :param obj: An import name or object.
        :raises ImportError: if ``obj`` is a name that cannot be imported.
        """
        if isinstance(obj, string_types):
            obj = self._import_string(obj)
        for key in dir(obj):
            if key.isupper():
                self[key] = getattr(obj, key)
97 | 


--------------------------------------------------------------------------------
/docs/getting-started.rst:
--------------------------------------------------------------------------------
  1 | Getting Started
  2 | ===============
  3 | 
  4 | Memento TimeGate
  5 | ----------------
  6 | 
  7 | TimeGate is a `WSGI <http://wsgi.readthedocs.org/en/latest/>`__
  8 | application server that allows simple implementation of
  9 | `Memento <http://mementoweb.org>`__ capabilities for web resources
 10 | having accessible revisions. It manages all the content negotiation
 11 | logic, from request processing, best memento query and selection to HTTP
 12 | response.
 13 | 
 14 | To make a web resource that is accessible on a web server fully Memento
 15 | compliant, two things need to be done:
 16 | 
 17 | - TimeGate is generic: a custom handler must be plugged in to match the specific web server.
 18 | - The Memento framework uses specific HTTP headers: they must be added to the resource's web server responses.
 19 | 
 20 | Steps
 21 | -----
 22 | 
 23 | The big picture
 24 | ~~~~~~~~~~~~~~~
 25 | 
 26 | The first thing to do is to understand how the program is
 27 | structured.  See :ref:`big_picture`.
 28 | 
 29 | Installing the server
 30 | ~~~~~~~~~~~~~~~~~~~~~
 31 | 
 32 | The code can be obtained
 33 | `here <https://github.com/mementoweb/timegate/releases>`__. Download a
 34 | zip or tar.gz archive into a directory of your choice.
 35 | 
 36 | Decompress the zip files using:
 37 | 
 38 | .. code:: bash
 39 | 
 40 |     $ unzip timegate-<version>.zip
 41 | 
 42 | Decompress tar.gz files using:
 43 | 
 44 | .. code:: bash
 45 | 
 46 |     $ tar xvzf timegate-<version>.tar.gz
 47 | 
 48 | Install the dependencies using:
 49 | 
 50 | .. code:: bash
 51 | 
 52 |     $ echo 'uWSGI>=2.0.3 ConfigParser>=3.3.0r2 python-dateutil>=2.1 requests>=2.2.1 werkzeug>=0.9.6 lxml>=3.4.1' | xargs pip install
 53 | 
 54 | Running the TimeGate
 55 | ~~~~~~~~~~~~~~~~~~~~
 56 | 
 57 | Then try starting the TimeGate server with one of the handlers that are
 58 | already provided. To run it, first navigate to the directory:
 59 | 
 60 | .. code:: bash
 61 | 
 62 |     $ cd timegate-<version>
 63 | 
 64 | Then, there are two possibilities:
 65 | 
 66 | - Either execute ``uwsgi --http :9999 --wsgi-file core/application.py --master`` to
 67 |   deploy the TimeGate on ``localhost:9999``. Add the option
 68 |   ``--pidfile /path/to/file.pid`` to store the process ID in a file.
 69 | - Or edit the uWSGI launch configuration in ``conf/timegate.ini`` and then execute ``uwsgi conf/timegate.ini``
 70 | 
 71 | To stop the server:
 72 | 
 73 | - Simply use ``CTRL+C`` if it is running in foreground.
 74 | - Or execute ``uwsgi --stop /path/to/file.pid`` if you have stored the PID to run it in the background.
 75 | - If by mistake the PID is not stored but the TimeGate is still running, list all uwsgi processes using ``ps ux | grep uwsgi``,
 76 |   identify the TimeGate process from the ``COMMAND`` column and kill it using ``kill -INT  <PID>``.
 77 | 
 78 | Handler
 79 | ~~~~~~~
 80 | 
 81 | Once the server is successfully running with an example handler that was
 82 | provided, edit it or create a new one (see :ref:`handler`) that returns the list
 83 | of all URI-Ms given a URI-R of an Original Resource you wish to make Memento
 84 | compliant.
 85 | 
 86 | Memento Headers
 87 | ~~~~~~~~~~~~~~~
 88 | 
 89 | The Memento protocol mainly works with HTTP headers. Now add the required
 90 | headers (see :ref:`http_response_headers`) to your web server's HTTP responses.
 91 | 
 92 | Configuring the TimeGate
 93 | ~~~~~~~~~~~~~~~~~~~~~~~~
 94 | 
 95 | Finally, enter the TimeGate's ``HOST`` location in the ``config.ini`` (see
 96 | :ref:`configuration`) file. Also edit the other parameters' default values to
 97 | your preferences.
 98 | 
 99 | Memento compliance
100 | ~~~~~~~~~~~~~~~~~~
101 | 
102 | That's it. The basic Memento functionalities are here and your web
103 | server is now Memento compliant. See :ref:`advanced_features`.
104 | 


--------------------------------------------------------------------------------
/docs/configuration.rst:
--------------------------------------------------------------------------------
 1 | .. _configuration:
 2 | 
 3 | Configuring the server
 4 | ======================
 5 | 
 6 | Edit the `config
 7 | file <https://github.com/mementoweb/timegate/blob/master/conf/config.ini>`__:
 8 | ``conf/config.ini``.
 9 | 
10 | Mandatory field
11 | ---------------
12 | 
13 | ``host`` Is the server's base URI. This is the URI on which the TimeGate
14 | is deployed. No default value.
15 | 
16 | Example: Suppose TimeGate is running at ``http://tg.example.com`` and
17 | ``URI-R`` refers to an Original Resource's URI.
18 | 
19 | - The program will respond to TimeGate requests at
20 |   ``http://tg.example.com/timegate/URI-R``
21 | 
22 | - The program will respond to ``TimeMap`` requests at
23 |   ``http://tg.example.com/timemap/link/URI-R`` and
24 |   ``http://tg.example.com/timemap/json/URI-R`` if the feature is enabled.
25 |   See :ref:`advanced_features`.
26 | 
27 | Important field
28 | ---------------
29 | 
30 | ``is_vcs`` The type of archive affects the best Memento selection
31 | algorithm. Default ``false``.
32 | 
33 | - When ``false``, the history is considered to be snapshots taken at some
34 |   points in time, thus the best memento is the *absolute* closest to the
35 |   requested date.
36 | - When ``true``, the history the handler returns is considered to be from a version control system. In other words, the history represents every
37 |   change that was made to the Original Resource and the exact datetimes of the change. In this case, the best Memento for a requested datetime T
38 |   will be the closest *before* T.
39 | 
40 | Other fields
41 | ------------
42 | 
43 | -  ``handler_class`` (Optional) Python module path to a handler class.
44 |    This is useful if the handler is composed of several classes or to
45 |    quickly switch between handlers. If this parameter is not provided,
46 |    the program will search for handler classes in ``core.handler``. For
47 |    example:
48 |    ``handler_class = core.handler_examples.wikipedia.WikipediaHandler``
49 | -  ``api_time_out`` Time, in seconds, before a request to an API times
50 |    out when using the ``Handler.request()`` function. Default 6 seconds
51 | -  ``base_uri`` (Optional) String that will be prepended to requested
52 |    URI if missing. This can be used to shorten the request URI and to
53 |    avoid repeating the base URI that is common to all resources. Default
54 |    empty
55 | -  For example, suppose the TimeGate is deployed at
56 |    ``http://tg.example.com``
57 | -  Suppose every Original Resources ``URI-Ri`` has the following format
58 |    ``http://resource.example.com/res/URI-Ri``
59 | -  Then, Setting ``base_uri = http://resource.example.com/res/`` will
60 |    allow short requests such as for example
61 |    ``http://tg.example.com/timegate/URI-Ri`` instead of
62 |    ``http://tg.example.com/timegate/http://resource.example.com/res/URI-Ri``.
63 | -  ``use_timemap`` When ``true``, the TimeGate adds TimeMaps links to
64 |    its (non error) responses. Default ``false``
65 | 
66 | Cache parameters:
67 | -----------------
68 | 
69 | -  ``cache_activated`` When ``true``, the cache stores the entire
70 |    history of an Original Resource from handlers that allows batch
71 |    ``get_all_mementos(uri_r)`` requests. It can then respond from cache
72 |    if the value is fresh enough. If a requests contains the header
73 |    ``Cache-Control: no-cache`` the server will not respond from cache.
74 |    When ``false`` the cache files are not created. Default ``true``.
75 | -  ``cache_refresh_time`` tolerance in seconds, for which it is assumed
76 |    that a history didn't change. Any TimeGate request for a datetime
77 |    past this (or any TimeMap request past this) will trigger a refresh
78 |    of the cached history. Default 86400 seconds (one day).
79 | -  ``cache_directory`` Relative path for data files. Do not add any
80 |    other file to this directory as they could be deleted. Each file
81 |    represents an entire history of an Original Resource. Default
82 |    ``cache/``.
83 | -  ``cache_max_values`` Maximum number of URI-Rs for which its entire
84 |    history is stored. This is then the number of files in the
85 |    ``cache_directory``. Default 250.
86 | 
87 | See :ref:`cache`.
88 | 


--------------------------------------------------------------------------------
/timegate/examples/simple.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2015 LANL.
  5 | # Copyright (C) 2016 CERN.
  6 | #
  7 | # TimeGate is free software; you can redistribute it and/or modify
  8 | # it under the terms of the Revised BSD License; see LICENSE file for
  9 | # more details.
 10 | 
 11 | """Example handler."""
 12 | 
 13 | from __future__ import absolute_import, print_function
 14 | 
 15 | # For get_memento() date parameter
 16 | import datetime
 17 | 
 18 | # For custom errors sent to client
 19 | from timegate.errors import HandlerError
 20 | # Mandatory
 21 | from timegate.handler import Handler
 22 | 
 23 | 
 24 | class ExampleHandler(Handler):
 25 | 
 26 |     def __init__(self):
 27 |         Handler.__init__(self)
 28 |         # Initialization code here. This part is run only once
 29 |         versions_a = [
 30 |             'http://www.example.com/resourceA_v1',
 31 |             'http://www.example.com/resourceA_v2',
 32 |             'http://www.example.com/resourceA_v3'
 33 | 
 34 |         ]
 35 |         date_times_a = [
 36 |             '1999-09-30T01:50:50Z',
 37 |             '2010-10-16T13:27:27Z',
 38 |             '2015-01-03T22:00:00Z'
 39 |         ]
 40 |         versions_b = [
 41 |             'http://www.example.com/resourceB_v1',
 42 |             'http://www.example.com/resourceB_v2',
 43 | 
 44 |         ]
 45 |         date_times_b = [
 46 |             '1998-07-17T17:47:31Z',
 47 |             '2000-11-08T19:05:09Z'
 48 |         ]
 49 |         self.archives = {
 50 |             'http://www.example.com/resourceA': versions_a,
 51 |             'http://www.example.com/resourceB': versions_b,
 52 |             'http://www.example.com/resource%20space': [
 53 |                 'http://www.example.com/space',
 54 |             ],
 55 |         }
 56 |         self.dates = {
 57 |             'http://www.example.com/resourceA': date_times_a,
 58 |             'http://www.example.com/resourceB': date_times_b,
 59 |             'http://www.example.com/resource%20space': [
 60 |                 '1970-01-01T00:00:00Z'
 61 |             ],
 62 |         }
 63 | 
 64 |     # This is the function to implement.
 65 |     def get_all_mementos(self, uri_r):
 66 |         # Verifies and processes the requested URI
 67 |         archived_uris = self.archives.keys()
 68 |         if uri_r in archived_uris:
 69 |             # Contact the API to retrieve the list of URI-Ms for this URI-R
 70 |             # along with their datetimes
 71 | 
 72 |             # In this example, everything is done in a statically
 73 |             # But this is where the handler is supposed to access the versions
 74 |             # API
 75 |             uri_ms = self.archives[uri_r]
 76 |             datetimes = self.dates[uri_r]
 77 | 
 78 |             # Generate the list of tuples [(uri_string, date_string)]
 79 |             tuple_list = list(zip(uri_ms, datetimes))
 80 |             return tuple_list  # A list of tuple containing all Mementos is returned
 81 |         else:
 82 |             # No Memento for this uri was found in archive
 83 |             return []
 84 | 
 85 |     # Implement this function instead to bypass the TimeGate's best Memento selection algorithm.
 86 |     # Also, it can be used if the whole list cannot be accessed easily.
 87 |     # If both get_all_mementos() and get_memento() are implemented.
 88 |     # get_memento() will always be preferred by the TimeGate.
 89 |     def get_memento(self, uri_r, req_datetime):
 90 |         # Suppose you have a special rule for certain dates
 91 |         if req_datetime.year < 1999:
 92 |             # In this case, we do not serve anything before 2001
 93 |             # Return a custom Error to the client
 94 |             raise HandlerError(
 95 |                 "Cannot server a Memento before 1999", status=404)
 96 |         else:
 97 |             # Gets all mementos for this URI
 98 |             mementos_list = self.get_all_mementos(uri_r)
 99 | 
100 |             # Find the best single memento is returned for this uri_r and this
101 |             # date
102 |             (uri_m, date_time) = mementos_list[-1]
103 |             # In this example we take the last one
104 | 
105 |             return (uri_m, date_time)  # The return value is a tuple here.
106 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2016 CERN.
  5 | #
  6 | # TimeGate is free software; you can redistribute it and/or modify
  7 | # it under the terms of the Revised BSD License; see LICENSE file for
  8 | # more details.
  9 | 
 10 | """A Memento TimeGate."""
 11 | 
 12 | import os
 13 | import sys
 14 | 
 15 | from setuptools import find_packages, setup
 16 | 
 17 | readme = open('README.rst').read()
 18 | 
 19 | tests_require = [
 20 |     'check-manifest>=0.25',
 21 |     'coverage>=4.0',
 22 |     'isort>=4.2.2',
 23 |     'pydocstyle>=1.0.0',
 24 |     'pytest-cache>=1.0',
 25 |     'pytest-cov>=1.8.0',
 26 |     'pytest-pep8>=1.0.6',
 27 |     'pytest>=2.8.0',
 28 |     'httpretty>=0.8.14',
 29 |     'mock>=2.0.0',
 30 | ]
 31 | 
 32 | extras_require = {
 33 |     ':python_version<"3.0"': [
 34 |         'ConfigParser>=3.3.0r2',
 35 |     ],
 36 |     'docs': [
 37 |         'Sphinx>=1.4.2',
 38 |     ],
 39 |     'uwsgi': [
 40 |         'uWSGI>=2.0.3',
 41 |     ],
 42 |     'tests': tests_require,
 43 | }
 44 | 
 45 | extras_require['all'] = []
 46 | for key, reqs in extras_require.items():
 47 |     if key[0] == ':':
 48 |         continue
 49 |     extras_require['all'].extend(reqs)
 50 | 
 51 | setup_requires = [
 52 |     'pytest-runner>=2.6.2',
 53 | ]
 54 | 
 55 | install_requires = [
 56 |     'LinkHeader>=0.4.3',
 57 |     'lxml>=3.4.1',
 58 |     'python-dateutil>=2.1',
 59 |     'requests>=2.2.1',
 60 |     'werkzeug>=0.9.6',
 61 | ]
 62 | 
 63 | packages = find_packages()
 64 | 
 65 | 
 66 | # Get the version string. Cannot be done with import!
 67 | g = {}
 68 | with open(os.path.join('timegate', 'version.py'), 'rt') as fp:
 69 |     exec(fp.read(), g)
 70 |     version = g['__version__']
 71 | 
 72 | setup(
 73 |     name='timegate',
 74 |     version=version,
 75 |     description=__doc__,
 76 |     long_description=readme,
 77 |     keywords='memento timegate',
 78 |     license='BSD',
 79 |     author='LANL',
 80 |     author_email='yorick.chollet@gmail.com',
 81 |     url='https://github.com/mementoweb/timegate',
 82 |     packages=packages,
 83 |     zip_safe=False,
 84 |     include_package_data=True,
 85 |     platforms='any',
 86 |     entry_points={
 87 |         'timegate.handlers': [
 88 |             'arxiv = timegate.examples.arxiv:ArxivHandler',
 89 |             'aueb = timegate.examples.aueb:AuebHandler',
 90 |             'can = timegate.examples.can:CanHandler',
 91 |             'cat = timegate.examples.cat:CatHandler',
 92 |             'cr = timegate.examples.cr:CrHandler',
 93 |             'es = timegate.examples.es:EsHandler',
 94 |             'github = timegate.examples.github:GithubHandler',
 95 |             'gitlab = timegate.examples.gitlab:GitlabHandler',
 96 |             'loc = timegate.examples.loc:LocHandler',
 97 |             'mediawiki = timegate.examples.mediawiki:MediawikiHandler',
 98 |             'nara = timegate.examples.nara:NaraHandler',
 99 |             'orain = timegate.examples.orain:OrainHandler',
100 |             'pastpages = timegate.examples.pastpages:PastpagesHandler',
101 |             'sg = timegate.examples.sg:SgHandler',
102 |             'si = timegate.examples.si:SiHandler',
103 |             'simple = timegate.examples.simple:ExampleHandler',
104 |             'w3c = timegate.examples.w3c:W3cHandler',
105 |             'webcite = timegate.examples.webcite:WebCiteHandler',
106 |             'wikia = timegate.examples.wikia:WikiaHandler',
107 |             'wikipedia = timegate.examples.wikipedia:WikipediaHandler',
108 |         ],
109 |     },
110 |     extras_require=extras_require,
111 |     install_requires=install_requires,
112 |     setup_requires=setup_requires,
113 |     tests_require=tests_require,
114 |     classifiers=[
115 |         'Development Status :: 5 - Production/Stable',
116 |         'Environment :: Web Environment',
117 |         'Intended Audience :: Developers',
118 |         'License :: OSI Approved :: BSD License',
119 |         'Operating System :: OS Independent',
120 |         'Programming Language :: Python :: 2',
121 |         'Programming Language :: Python :: 2.7',
122 |         'Programming Language :: Python :: 3',
123 |         'Programming Language :: Python :: 3.3',
124 |         'Programming Language :: Python :: 3.4',
125 |         'Programming Language :: Python :: 3.5',
126 |         'Programming Language :: Python',
127 |         'Topic :: Internet :: WWW/HTTP :: Dynamic Content',
128 |         'Topic :: Software Development :: Libraries :: Python Modules'
129 |     ],
130 | )
131 | 


--------------------------------------------------------------------------------
/tests/test_timegate.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2016 CERN.
  5 | #
  6 | # TimeGate is free software; you can redistribute it and/or modify
  7 | # it under the terms of the Revised BSD License; see LICENSE file for
  8 | # more details.
  9 | 
 10 | 
 11 | """Module tests."""
 12 | 
 13 | from __future__ import absolute_import, print_function
 14 | 
 15 | import json
 16 | 
 17 | import pytest
 18 | 
 19 | 
 20 | def test_version():
 21 |     """Test version import."""
 22 |     from timegate import __version__
 23 |     assert __version__
 24 | 
 25 | 
 26 | def test_initialization():
 27 |     """Test TimeGate initialization."""
 28 |     from timegate.application import TimeGate
 29 |     from timegate.examples.simple import ExampleHandler
 30 |     handler = ExampleHandler()
 31 |     app = TimeGate(config=dict(HANDLER_MODULE=handler))
 32 |     assert handler == app.handler
 33 | 
 34 | 
 35 | def test_application():
 36 |     """Test simple request."""
 37 |     from timegate import application
 38 |     from werkzeug.test import Client
 39 |     from werkzeug.wrappers import BaseResponse
 40 |     client = Client(application.application, BaseResponse)
 41 | 
 42 |     assert client.get('/').status_code == 404
 43 | 
 44 | 
 45 | def test_timemap_response(client):
 46 |     """Test timemap responses."""
 47 |     response = client.get(
 48 |         '/timemap/json/http://www.example.com/resourceBad'
 49 |     )
 50 |     assert response.status_code == 404
 51 | 
 52 |     response = client.get(
 53 |         '/timemap/json/http://www.example.com/resourceA'
 54 |     )
 55 |     assert response.status_code == 200
 56 | 
 57 |     response = client.get(
 58 |         '/timemap/json/resourceA'
 59 |     )
 60 |     assert response.status_code == 200
 61 |     data = json.loads(response.data.decode('utf-8'))
 62 |     assert 3 == len(data['mementos']['list'])
 63 | 
 64 |     response = client.get(
 65 |         '/timemap/link/http://www.example.com/resourceA'
 66 |     )
 67 |     assert response.status_code == 200
 68 |     mementos = response.data.split(b'\n')
 69 |     assert 8 == len(mementos)
 70 | 
 71 | 
 72 | def test_timegate_response(client):
 73 |     """Test timegate responses."""
 74 |     response = client.get(
 75 |         '/timegate/http://www.example.com/resourceA'
 76 |     )
 77 |     assert response.status_code == 302
 78 |     assert response.headers['Location'] == (
 79 |         'http://www.example.com/resourceA_v3'
 80 |     )
 81 | 
 82 |     response = client.get(
 83 |         '/timegate/http://www.example.com/resourceA',
 84 |         headers=[('Accept-Datetime', 'Mon, 01 Jan 1999 00:00:00 GMT'), ],
 85 |     )
 86 |     assert response.status_code == 302
 87 |     assert response.headers['Location'] == (
 88 |         'http://www.example.com/resourceA_v1'
 89 |     )
 90 | 
 91 |     response = client.get(
 92 |         '/timegate/http://www.example.com/resourceA',
 93 |         headers=[('Accept-Datetime', 'Mon, 01 Jan 2010 00:00:00 GMT'), ],
 94 |     )
 95 |     assert response.status_code == 302
 96 |     assert response.headers['Location'] == (
 97 |         'http://www.example.com/resourceA_v1'
 98 |     )
 99 | 
100 |     response = client.get(
101 |         '/timegate/http://www.example.com/resource%20space'
102 |     )
103 |     assert response.status_code == 302
104 |     assert response.headers['Location'] == (
105 |         'http://www.example.com/space'
106 |     )
107 | 
108 | 
def test_closest_match(app):
    """Test closest match with the ``snapshot`` resource type."""
    from werkzeug.test import Client
    from werkzeug.wrappers import BaseResponse

    app.config['RESOURCE_TYPE'] = 'snapshot'
    client = Client(app, BaseResponse)

    # (Accept-Datetime header, expected Location)
    scenarios = [
        ('Mon, 01 Jan 2010 00:00:00 GMT',
         'http://www.example.com/resourceA_v2'),
        ('Mon, 01 Jan 2100 00:00:00 GMT',
         'http://www.example.com/resourceA_v3'),
    ]

    for accept_datetime, expected_location in scenarios:
        response = client.get(
            '/timegate/http://www.example.com/resourceA',
            headers=[('Accept-Datetime', accept_datetime), ],
        )
        assert response.status_code == 302
        assert response.headers['Location'] == expected_location
134 | 
135 | 
@pytest.mark.parametrize('value,result', [
    ('', ''),
    ('/', '/'),
    ('#', ''),
])
def test_uri_validation(value, result):
    """Check the string returned by ``validate_uristr`` for simple inputs."""
    from timegate.utils import validate_uristr

    validated = validate_uristr(value)
    assert result == validated
143 | 
144 | 
def test_uri_validation_exceptions():
    """Test that ``validate_uristr`` rejects ``None`` with a ``ValueError``."""
    from timegate.utils import validate_uristr
    # Expect the specific exception type: a bare ``Exception`` would also
    # accept unrelated failures and hide regressions.
    with pytest.raises(ValueError):
        validate_uristr(None)
150 | 


--------------------------------------------------------------------------------
/timegate/handler.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2014, 2015 LANL.
  5 | # Copyright (C) 2016 CERN.
  6 | #
  7 | # TimeGate is free software; you can redistribute it and/or modify
  8 | # it under the terms of the Revised BSD License; see LICENSE file for
  9 | # more details.
 10 | 
 11 | """Base class TimeGate handlers."""
 12 | 
 13 | from __future__ import absolute_import, print_function
 14 | 
 15 | import logging
 16 | from operator import itemgetter
 17 | 
 18 | import requests
 19 | 
 20 | from . import utils as timegate_utils
 21 | from ._compat import quote
 22 | from .config import Config
 23 | from .constants import API_TIME_OUT, TM_MAX_SIZE
 24 | from .errors import HandlerError
 25 | 
 26 | 
 27 | class Handler(object):
 28 | 
 29 |     # Disables all 'requests' module event logs that are at least not WARNINGS
 30 |     logging.getLogger('requests').setLevel(logging.WARNING)
 31 | 
 32 |     def request(self, resource, timeout=API_TIME_OUT, **kwargs):
 33 |         """Handler helper function.
 34 | 
 35 |         Requests the resource over HTTP. Logs the request and handles
 36 |         exceptions.
 37 | 
 38 |         :param resource: The resource to get.
 39 |         :param timeout: The HTTP Timeout for a single request.
 40 |         :param kwargs: The keywords arguments to pass to the request method
 41 |             (``params``). These keywords will have their special character
 42 |             escaped using %-encoding. Do not pass already-encoded chars.
 43 |         :return: A requests response object.
 44 |         :raises HandlerError: if the requests fails to access the API.
 45 |         """
 46 |         uri = resource
 47 |         config = Config(None)
 48 |         user_agent = config.get("USER_AGENT")
 49 |         headers = {}
 50 |         if user_agent:
 51 |             headers["User-Agent"] = user_agent
 52 | 
 53 |         # Request logging with params
 54 |         try:
 55 |             logging.info('Sending request for %s?%s' % (
 56 |                 uri, '&'.join(map(lambda k_v: '%s=%s' % (
 57 |                     quote(str(k_v[0])), quote(str(k_v[1]))
 58 |                 ), kwargs['params'].items()))))
 59 |         except Exception:
 60 |             # Key errors on 'params'
 61 |             logging.info('Sending request for %s' % uri)
 62 | 
 63 |         try:
 64 |             req = requests.get(uri, timeout=timeout, headers=headers, **kwargs)
 65 |         except Exception as e:
 66 |             logging.error('Cannot request server (%s): %s' % (uri, e))
 67 |             raise HandlerError('Cannot request version server.', 502)
 68 | 
 69 |         if req is None:
 70 |             logging.error('Error requesting server (%s): %s' % uri)
 71 |             raise HandlerError('Error requesting version server.', 404)
 72 | 
 73 |         if not req:
 74 |             logging.info('Response other than 2XX: %s' % req)
 75 |             # raise HandlerError('API response not 2XX', 404)
 76 |         return req
 77 | 
 78 | 
 79 | def parsed_request(handler_function, *args, **kwargs):
 80 |     """Retrieve and parse the response from the ``Handler``.
 81 | 
 82 |     This function is the point of entry to all handler requests.
 83 | 
 84 |     :param handler_function: The function to call.
 85 |     :param args: Arguments to :handler_function:
 86 |     :param kwargs: Keywords arguments to :handler_function:
 87 |     :return: A sorted [(URI_str, date_obj),...] list of all Mementos.
 88 |         In the response, and all URIs/dates are valid.
 89 |     :raise HandlerError: In case of a bad response from the handler.
 90 |     """
 91 |     try:
 92 |         handler_response = handler_function(*args, **kwargs)
 93 |     except HandlerError as he:
 94 |         logging.info('Handler raised HandlerError %s' % he)
 95 |         raise he  # HandlerErrors have return data.
 96 |     except Exception as e:
 97 |         logging.error('Handler raised exception %s' % e)
 98 |         raise HandlerError('Error in Handler', 503)
 99 | 
100 |     # Input check
101 |     if not handler_response:
102 |         raise HandlerError('Not Found: Handler response Empty.', 404)
103 |     elif isinstance(handler_response, tuple):
104 |         handler_response = [handler_response]
105 |     elif not (isinstance(handler_response, list) and
106 |               isinstance(handler_response[0], tuple)):
107 |         logging.error('Bad response from Handler: Not a tuple nor tuple array')
108 |         raise HandlerError('Bad handler response.', 503)
109 |     elif len(handler_response) > TM_MAX_SIZE:
110 |         logging.warning(
111 |             'Bad response from Handler: TimeMap (%d  greater than max %d)' %
112 |             (len(handler_response), TM_MAX_SIZE))
113 |         raise HandlerError('Handler response too big and unprocessable.', 502)
114 | 
115 |     valid_response = [(
116 |         timegate_utils.validate_uristr(url),
117 |         timegate_utils.validate_date(date)
118 |     ) for (url, date) in handler_response or []]
119 |     # Sort by datetime
120 |     return sorted(valid_response, key=itemgetter(1))
121 | 


--------------------------------------------------------------------------------
/timegate/utils.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2014, 2015 LANL.
  5 | # Copyright (C) 2016 CERN.
  6 | #
  7 | # TimeGate is free software; you can redistribute it and/or modify
  8 | # it under the terms of the Revised BSD License; see LICENSE file for
  9 | # more details.
 10 | 
 11 | """Various helper functions."""
 12 | 
 13 | from __future__ import absolute_import, print_function
 14 | 
 15 | import logging
 16 | from datetime import datetime, timedelta
 17 | 
 18 | from dateutil.parser import parse as parse_datestr
 19 | from dateutil.tz import tzutc
 20 | 
 21 | from ._compat import urlparse
 22 | from .errors import DateTimeError, URIRequestError
 23 | 
 24 | 
 25 | def validate_uristr(uristr):
 26 |     """Control and validate the uri string.
 27 | 
 28 |     Raises an ``Exception`` if it is not valid.
 29 | 
 30 |     :param uristr: The uri string that needs to be verified.
 31 |     :return: The validated uri string.
 32 |     """
 33 |     if uristr is None:
 34 |         raise ValueError('URI can not be None')
 35 |     return str(urlparse(uristr).geturl())
 36 | 
 37 | 
 38 | def validate_date(datestr):
 39 |     """Control and validate the date string.
 40 | 
 41 |     :param datestr: The date string representation.
 42 |     :return: The datetime object form the parsed date string.
 43 |     """
 44 |     return parse_datestr(datestr, fuzzy=True).replace(tzinfo=tzutc())
 45 | 
 46 | 
 47 | def best(timemap, accept_datetime, timemap_type):
 48 |     """Find best memento."""
 49 |     assert(timemap)
 50 |     assert(accept_datetime)
 51 |     if timemap_type == 'vcs':
 52 |         return closest_before(timemap, accept_datetime)
 53 |     else:
 54 |         return closest(timemap, accept_datetime)
 55 | 
 56 | 
 57 | def closest(timemap, accept_datetime):
 58 |     """Find the absolutely closest memento chronologically to a datetime.
 59 | 
 60 |     Details of the requirements at
 61 |     http://www.mementoweb.org/guide/rfc/#SpecialCases, point 4.5.3.
 62 | 
 63 |     :param timemap: A sorted Timemap
 64 |     :param accept_datetime: the time object for which the best memento must
 65 |         be found.
 66 |     :return: A tuple with memento URI and its datetime.
 67 |     """
 68 | 
 69 |     delta = timedelta.max
 70 |     memento_uri = None
 71 |     memento_dt = None
 72 | 
 73 |     for (url, dt) in timemap:
 74 |         diff = abs(accept_datetime - dt)
 75 |         if diff <= delta:  # there can be several with the same datetime.
 76 |             memento_uri = url
 77 |             memento_dt = dt
 78 |             delta = diff
 79 |         else:
 80 |             # The list is sorted and the delta didn't increase this time.
 81 |             # It will not increase anymore: Return the Memento (best one).
 82 |             return (memento_uri, memento_dt)
 83 | 
 84 |     return (memento_uri, memento_dt)
 85 | 
 86 | 
 87 | def closest_before(timemap, accept_datetime):
 88 |     """Find the closest memento in the before a datetime.
 89 | 
 90 |     Details of the requirements at
 91 |     http://www.mementoweb.org/guide/rfc/#SpecialCases, point 4.5.3.
 92 | 
 93 |     :param timemap: A sorted Timemap.
 94 |     :param accept_datetime: The time object for which the best memento
 95 |         must be found.
 96 |     :return: The uri_m string of the closest memento.
 97 |     """
 98 |     prev_uri = prev_dt = None
 99 | 
100 |     for (url, dt) in timemap:
101 |         diff = abs(accept_datetime - dt)
102 |         if dt > accept_datetime:
103 |             if prev_uri is not None:
104 |                 return (prev_uri, prev_dt)  # We passed 'accept-datetime'
105 |             else:
106 |                 # The first of the sorted list is already after the accept
107 |                 # datetime
108 |                 return (url, dt)
109 |         prev_uri = url
110 |         prev_dt = dt
111 | 
112 |     return (prev_uri, prev_dt)
113 | 
114 | 
def closest_binary(timemap, accept_datetime):
    """Find the chronologically closest memento using binary search.

    Note the timemap **must** be a sorted, indexable list. Complexity
    ``O(log(n))`` instead of ``O(n)``. Details of the requirements at
    http://www.mementoweb.org/guide/rfc/#SpecialCases, point 4.5.3.

    When two Mementos are equally close, or share the same datetime, the
    later entry of the list is returned.

    :param timemap: A sorted Timemap.
    :param accept_datetime: The time object for which the best memento
        must be found.
    :return: A tuple with the memento URI and its datetime, or
        ``(None, None)`` for an empty Timemap.
    """
    def count_not_after(moment):
        """Return how many entries have a datetime ``<= moment``."""
        low, high = 0, len(timemap)
        while low < high:
            middle = (low + high) // 2
            if timemap[middle][1] <= moment:
                low = middle + 1
            else:
                high = middle
        return low

    if not timemap:
        return (None, None)

    # Entries [0, split) are at or before the requested datetime,
    # entries [split, len) are strictly after it.
    split = count_not_after(accept_datetime)

    if split == len(timemap):
        index = split - 1
    else:
        after_dt = timemap[split][1]
        pick_after = split == 0 or (
            after_dt - accept_datetime <=
            accept_datetime - timemap[split - 1][1])
        if pick_after:
            # Last entry sharing the datetime of the first "after" one.
            index = count_not_after(after_dt) - 1
        else:
            index = split - 1

    (memento_uri, memento_dt) = timemap[index]
    return (memento_uri, memento_dt)
127 | 
128 | 
def closest_before_binary(timemap, accept_datetime):
    """Find the closest memento in the past of a datetime using binary search.

    Note the timemap **must** be a sorted, indexable list. Complexity
    ``O(log(n))`` instead of ``O(n)``. Details of the requirements at
    http://www.mementoweb.org/guide/rfc/#SpecialCases, point 4.5.3.

    :param timemap: A sorted Timemap.
    :param accept_datetime: The time object for which the best memento
        must be found.
    :return: A tuple with the memento URI and its datetime. When every
        memento is after ``accept_datetime`` the first one is returned;
        ``(None, None)`` is returned for an empty Timemap.
    """
    if not timemap:
        return (None, None)

    # Find the number of entries whose datetime is <= accept_datetime.
    low, high = 0, len(timemap)
    while low < high:
        middle = (low + high) // 2
        if timemap[middle][1] > accept_datetime:
            high = middle
        else:
            low = middle + 1

    # ``low - 1`` is the last entry at or before the requested datetime;
    # fall back to the first entry when there is none.
    (memento_uri, memento_dt) = timemap[max(low - 1, 0)]
    return (memento_uri, memento_dt)
142 | 


--------------------------------------------------------------------------------
/docs/handler.rst:
--------------------------------------------------------------------------------
  1 | .. _handler:
  2 | 
  3 | Resources-specific Handler
  4 | ==========================
  5 | 
  6 | A handler is a python class that is plugged into the generic TimeGate to
  7 | fit any specific technique a web server has to manage its Original
  8 | Resources and Mementos. Its role is simple: to retrieve the list of
  9 | URI-Ms (with their archival dates) given a URI-R. It typically does so
 10 | by connecting to an API.
 11 | 
 12 | Alternatives
 13 | ------------
 14 | 
 15 | -  If no API is present: The list can be retrieved in many different
 16 |    ways: page scraping, rule-based or even in a static manner. Anything
 17 |    will do.
 18 | -  If the history cannot be retrieved entirely: The handler can
 19 |    implement an alternative function that returns one single URI-M and
 20 |    its archival datetime given both URI-R and the datetime the user
 21 |    requested.
 22 | -  If the TimeGate's algorithms that select the best Memento for a
 23 |    requested date do not apply to the system: Implementing the
 24 |    alternative function could also be used to bypass these algorithms.
 25 |    This is particularly useful if there are performance concerns,
 26 |    special cases or access restriction for Mementos.
 27 | 
 28 | Requirements
 29 | ------------
 30 | 
 31 | .. image:: code_architecture.png
 32 | 
 33 | A handler is required to have the following:
 34 | 
 35 | -  It must be a python file placed in the ``core.handler`` module (which is
 36 |    the ``core/handler/`` folder). And it must be unique. If several
 37 |    classes are needed, or to switch quickly between handlers, consider
 38 |    adding the handler module path manually in the configuration
 39 |    file.  (See :ref:`configuration`.)
 40 | -  A handler must extend the ``core.handler_baseclass.Handler``
 41 |    base-class.
 42 | -  Implement at least one of the following:
 43 | 
 44 |    - ``get_all_mementos(uri_r)`` class function: This function is called
 45 |      by the TimeGate to retrieve the history of an original resource
 46 |      ``uri_r``. The parameter ``uri_r`` is a Python string representing
 47 |      the requested URI-R. The return value must be a list of 2-tuples:
 48 |      ``[(uri_m1, date1), (uri_m2, date2), ...]`` . Each pair
 49 |      ``(uri_m, date)`` contains the URI of an archived version of R
 50 |      ``uri_m``, and the date at which it was archived ``date``.
 51 |    - ``get_memento(uri_r, requested_date)`` class function (alternative):
 52 |      This function will be called by the TimeGate to retrieve the best
 53 |      Memento for ``uri_r`` at the date ``requested_date``. Use it if the
 54 |      API cannot return the entire history for a resource efficiently or to
 55 |      bypass the TimeGate's best Memento selection. The parameter ``uri_r``
 56 |      is a Python string representing the requested URI-R. The parameter
 57 |      ``requested_date`` is a Python ``datetime.datetime`` object. In this
 58 |      case, the return value will contain only one 2-tuple: ``(uri_m, date)``
 59 |      which is the best memento that the handler could provide taking into
 60 |      account the limits of the API.
 61 | 
 62 | -  Input parameters:
 63 | 
 64 |    -  All parameter values ``uri_r`` are Python strings representing the
 65 |       user's requested URI-R.
 66 |    -  All parameter values ``requested_date`` are ``datetime.datetime``
 67 |       objects representing the user's requested datetime.
 68 | 
 69 | -  Output return values:
 70 | 
 71 |    -  All return values ``uri_m`` must be strings.
 72 |    -  All return values ``date`` must be strings representing dates. Prefer
 73 |       the `ISO 8601 <http://en.wikipedia.org/wiki/ISO_8601>`__ format for
 74 |       the dates.
 75 | 
 76 | -  Note that:
 77 | 
 78 |    - If both functions are implemented,
 79 |      ``get_memento(uri_r, requested_date)`` will always be used for
 80 |      TimeGate requests.
 81 |    - If the TimeMap advanced feature (see :ref:`advanced_features`) is enabled,
 82 |      ``get_all_mementos(uri_r)`` must be implemented.
 83 | 
 84 | Example
 85 | -------
 86 | 
 87 | A simple example handler is provided in ``core/handler/`` and can be
 88 | edited to match your web server's requirements. See
 89 | `example.py <https://github.com/mementoweb/timegate/blob/master/core/handler/example.py>`__,
 90 | which returns static lists.
 91 | 
 92 | Other handlers examples are provided for real world APIs in
 93 | ``core/handler_examples/`` for instance:
 94 | 
 95 | - `arXiv.py
 96 |   <https://github.com/mementoweb/timegate/blob/master/core/handler_examples/arxiv.py>`__
 97 |   Where the Original Resources are the e-prints of http://arxiv.org/
 98 | - `wikipedia.py
 99 |   <https://github.com/mementoweb/timegate/blob/master/core/handler_examples/wikipedia.py>`__
100 |   Where the Original Resources are the articles of https://www.wikipedia.org/
101 | - `github.py
102 |   <https://github.com/mementoweb/timegate/blob/master/core/handler_examples/github.py>`__
103 |   Where the Original Resources are the repositories, trees (branches and
104 |   directories), files and raw files.
105 | 
106 | Other scraping Handlers examples are provided for real world resources
107 | without any API:
108 | 
109 | - `can.py
110 |   <https://github.com/mementoweb/timegate/blob/master/core/handler_examples/can.py>`__
111 |   Where the Original Resources are the archives stored in
112 |   http://www.collectionscanada.gc.ca/webarchives/
113 | 


--------------------------------------------------------------------------------
/timegate/examples/wikia.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2014, 2015 LANL.
  5 | # Copyright (C) 2016 CERN.
  6 | #
  7 | # TimeGate is free software; you can redistribute it and/or modify
  8 | # it under the terms of the Revised BSD License; see LICENSE file for
  9 | # more details.
 10 | 
 11 | """Wikia TimeGate proxy."""
 12 | 
 13 | from __future__ import absolute_import, print_function
 14 | 
 15 | import logging
 16 | import StringIO
 17 | import time
 18 | from datetime import datetime, timedelta
 19 | from urlparse import urlparse
 20 | 
 21 | from dateutil import parser as dateparser
 22 | from dateutil.tz import tzutc
 23 | from lxml import etree
 24 | 
 25 | from timegate.errors import HandlerError
 26 | from timegate.handler import Handler
 27 | from timegate.utils import date_str
 28 | 
 29 | 
 30 | def iso_to_dt(date):
 31 |     seq = (int(date[:4]), int(date[5:7]), int(date[8:10]), int(date[11:13]),
 32 |            int(date[14:16]), int(date[17:19]), 0, 1, -1)
 33 |     return date_str(
 34 |         datetime.fromtimestamp(
 35 |             time.mktime(
 36 |                 time.struct_time(seq)),
 37 |             tzutc()))
 38 | 
 39 | 
 40 | class WikiaHandler(Handler):
 41 | 
 42 |     def __init__(self):
 43 |         Handler.__init__(self)
 44 | 
 45 |         self.hosts = [
 46 |             'www.wowwiki.com',
 47 |             'en.memory-alpha.org',
 48 |             'wiki.ffxiclopedia.org',
 49 |             'www.jedipedia.de'
 50 |         ]
 51 | 
 52 |     def get_memento(self, req_url, dt):
 53 |         p = urlparse(req_url)
 54 |         host = p[1]
 55 |         upath = p[2]
 56 | 
 57 |         if host.find('.wikia.com') == -1 and not host in self.hosts:
 58 |             return
 59 | 
 60 |         exploded_path = upath.rsplit('/', 1)
 61 | 
 62 |         if len(exploded_path) > 1:
 63 |             (pref, title) = upath.rsplit('/', 1)
 64 |             if pref:
 65 |                 # look for /wiki
 66 |                 pref = pref.replace('/wiki', '')
 67 |         else:
 68 |             raise HandlerError("No article title found in requested URI.", 404)
 69 | 
 70 |         changes = []
 71 |         defaultProtocol = "http://"
 72 | 
 73 |         dtfmstr = "%Y%m%d%H%M%S"
 74 | 
 75 |         dt_del = timedelta(seconds=1)
 76 |         dt_next = dt + dt_del
 77 |         dt_next = dt_next.strftime(dtfmstr)
 78 |         dt = dt.strftime(dtfmstr)
 79 | 
 80 |         url_list = []
 81 | 
 82 |         # url for getting the memento, prev
 83 |         mem_prev = "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=older&rvstart=%s" % (
 84 |             defaultProtocol, host, title, dt)
 85 |         url_list.append('mem_prev')
 86 | 
 87 |         # url for next
 88 |         if dt_next:
 89 |             next = "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=newer&rvstart=%s" % (
 90 |                 defaultProtocol, host, title, dt)
 91 |             url_list.append('next')
 92 | 
 93 |         # url for last
 94 |         last = "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&titles=%s" % (
 95 |             defaultProtocol, host, title)
 96 |         url_list.append('last')
 97 | 
 98 |         # url for first
 99 |         first = "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&rvdir=newer&titles=%s" % (
100 |             defaultProtocol, host, title)
101 |         url_list.append('first')
102 | 
103 |         #url = url % (title, dt)
104 |         base = "%s%s%s/index.php?title=%s&oldid=" % \
105 |                (defaultProtocol, host, pref, title)
106 |         dtobj = None
107 | 
108 |         hdrs = {}
109 |         hdrs['Host'] = host
110 | 
111 |         for url in url_list:
112 | 
113 |             dom = self.get_xml(vars()[url], headers=hdrs)
114 |             revs = dom.xpath('//rev')
115 |             for r in revs:
116 |                 dt = r.attrib['timestamp']
117 |                 dtobj = dateparser.parse(r.attrib['timestamp'])
118 |                 changes.append((base + r.attrib['revid'], dt))
119 | 
120 |         return changes
121 | 
122 |     def get_all_mementos(self, req_url):
123 | 
124 |         # http://www.wowwiki.com/Cloth_armor              --> /api.php
125 |         # http://dragonage.wikia.com/wiki/Morrigan        --> /api.php
126 |         # http://memory-alpha.org/en/wiki/Fraggle_Rock    --> /en/api.php
127 | 
128 |         p = urlparse(req_url)
129 |         host = p[1]
130 |         upath = p[2]
131 | 
132 |         if host.find('.wikia.com') == -1 and not host in self.hosts:
133 |             return
134 | 
135 |         (pref, title) = upath.rsplit('/', 1)
136 |         if pref:
137 |             # look for /wiki
138 |             pref = pref.replace('/wiki', '')
139 | 
140 |         url = "http://%s%s/api.php?format=xml&action=query&prop=revisions&meta=siteinfo&rvprop=timestamp|ids&rvlimit=500&redirects=1&titles=%s" % (
141 |             host, pref, title)
142 | 
143 |         changes = []
144 |         base = "http://%s%s/index.php?oldid=" % (host, pref)
145 | 
146 |         headers = {}
147 |         # headers['Host'] = host
148 |         dom = self.get_xml(url, headers=headers)
149 |         while dom is not None:
150 |             revs = dom.xpath('//rev')
151 |             for r in revs:
152 |                 dtstr = iso_to_dt(r.attrib['timestamp'])
153 |                 changes.append((base + r.attrib['revid'], dtstr))
154 |             cont = dom.xpath('/api/query-continue/revisions/@rvstartid')
155 |             if cont:
156 |                 dom = self.get_xml(url + "&rvstartid=" +
157 |                                    cont[0], headers=headers)
158 |             else:
159 |                 dom = None
160 |         return changes
161 | 
162 |     def get_xml(self, uri, html=False, headers=None):
163 | 
164 |         page = self.request(uri, headers=headers)
165 |         try:
166 |             page_data = page.content
167 |             if not html:
168 |                 parser = etree.XMLParser(recover=True)
169 |             else:
170 |                 parser = etree.HTMLParser(recover=True)
171 |             return etree.parse(StringIO.StringIO(page_data), parser)
172 |         except Exception as e:
173 |             logging.error("Cannot parse XML/HTML from %s" % uri)
174 |             raise HandlerError("Couldn't parse data from %s" % uri, 404)
175 | 


--------------------------------------------------------------------------------
/timegate/examples/orain.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2014, 2015 LANL.
  5 | # Copyright (C) 2016 CERN.
  6 | #
  7 | # TimeGate is free software; you can redistribute it and/or modify
  8 | # it under the terms of the Revised BSD License; see LICENSE file for
  9 | # more details.
 10 | #
 11 | import logging
 12 | import StringIO
 13 | import urllib2
 14 | import urlparse
 15 | 
 16 | from lxml import etree
 17 | 
 18 | from core.timegate_utils import date_str
 19 | from timegate.errors import HandlerError
 20 | from timegate.handler import Handler
 21 | 
 22 | 
 23 | class OrainHandler(Handler):
 24 | 
 25 |     def __init__(self):
 26 |         Handler.__init__(self)
 27 |         self.TIMESTAMPFMT = '%Y%m%d%H%M%S'
 28 |         self.hosts = [".orain.org"]
 29 | 
 30 |     def get_memento(self, req_uri, accept_datetime):
 31 | 
 32 |         logging.debug("Begin Fetching mementos for: %s" % req_uri)
 33 | 
 34 |         p = urlparse.urlparse(req_uri)
 35 |         host = p[1]
 36 | 
 37 |         for h in self.hosts:
 38 |             if host.find(h) == -1:
 39 |                 return
 40 | 
 41 |         timestamp = date_str(accept_datetime, self.TIMESTAMPFMT)
 42 |         params = {
 43 |             'rvlimit': 1,  # Only need one
 44 |             'rvstart': timestamp,  # Start listing from here
 45 |             'rvdir': 'older'  # List in decreasing order
 46 |         }
 47 | 
 48 | 
 49 |         # Finds the API and title using scraping
 50 |         api_base_uri = None
 51 |         try:
 52 |             dom = self.get_xml(req_uri, html=True)
 53 |             links = dom.xpath("//link")
 54 |             for link in links:
 55 |                 if link.attrib['rel'].lower() == "edituri":
 56 |                     api_base_uri = link.attrib['href'].split("?")[0]
 57 |                     if api_base_uri.startswith("//"):
 58 |                         api_base_uri = api_base_uri.replace("//", "http://")
 59 |             parsed_url = urlparse.urlparse(req_uri)
 60 |             try:
 61 |                 title = urlparse.parse_qs(parsed_url[4])['title'][0]
 62 |             except Exception as e:
 63 |                 title = parsed_url.path.split('/')[-1]
 64 |             logging.debug("Orain handler: API found: %s, page title parsed to: %s " % (api_base_uri, title) )
 65 |             if not title:
 66 |                 raise HandlerError("Cannot find Title", 404)
 67 |             if not api_base_uri:
 68 |                 raise HandlerError("Cannot find orain API on page", 404)
 69 |             else:
 70 |                 title = urllib2.unquote(title)
 71 | 
 72 |         except HandlerError as he:
 73 |             raise he
 74 |         except Exception as e:
 75 |             logging.error("OrainHandler: querying and parsing page for title/api %s. handler will return empty response" % e)
 76 |             return None
 77 | 
 78 |         base_uri = api_base_uri.replace("api.php", "index.php")
 79 | 
 80 |         return self.query(req_uri, params, title, api_base_uri, base_uri)
 81 | 
 82 |     def query(self, req_uri, req_params, title, api_base_uri, base_uri):
 83 | 
 84 |         params = {
 85 |             'action': 'query',
 86 |             'format': 'json',
 87 |             'prop': 'revisions',
 88 |             'rvprop': 'ids|timestamp',
 89 |             'indexpageids': '',
 90 |             'titles': title
 91 |         }
 92 |         params.update(req_params)
 93 | 
 94 |         # Does sequential queries to get all revisions IDs and Timestamps
 95 |         queries_results = []
 96 |         condition = True
 97 |         while condition:
 98 |             # Clone original request
 99 |             newparams = params.copy()
100 |             req = self.request(api_base_uri, params=newparams)
101 |             try:
102 |                 result = req.json()
103 |             except Exception as e:
104 |                 logging.error("No JSON can be decoded from API %s" % api_base_uri)
105 |                 raise HandlerError("No API answer.", 404)
106 |             if 'error' in result:
107 |                 raise HandlerError(result['error'])
108 |             if 'warnings' in result:
109 |                 # logging.warn(result['warnings'])
110 |                 pass
111 |             try:
112 |                 # The request was successful
113 |                 pid = result['query']['pageids'][0]  # the JSON key of the page (only one)
114 |                 queries_results += result['query']['pages'][pid]['revisions']
115 |                 if ('missing' in result['query']['pages'][pid] or
116 |                                 'invalid' in result['query']['pages'][pid]):
117 |                     raise HandlerError("Cannot find resource on version server.", 404)
118 |             except Exception as e:
119 |                 if req_params['rvdir'] == 'older':
120 |                     req_params['rvdir'] = 'newer'
121 |                     return self.query(req_uri, req_params, title, api_base_uri, base_uri)
122 |                 else:
123 |                     raise HandlerError("No revision returned from API.", 404)
124 |             if 'continue' in result:
125 |                 # The response was truncated, the rest can be obtained using
126 |                 # &rvcontinue=ID
127 |                 cont = result['continue']
128 |                 # Modify it with the values returned in the 'continue' section of the last result.
129 |                 newparams.update(cont)
130 |                 condition = True
131 |             else:
132 |                 condition = False
133 | 
134 |         # Processing list
135 |         def f(rev):
136 |             rev_uri = base_uri + '?title=%s&oldid=%d' % (
137 |                 urllib2.quote(title), rev['revid'])
138 |             dt = rev['timestamp']
139 |             return (rev_uri, dt)
140 | 
141 | 
142 |         # logging.debug("Returning API results of size %d" % len(queries_results))
143 |         return map(f, queries_results)
144 | 
145 |     def get_xml(self, uri, html=False):
146 |         """
147 |         Retrieves the resource using the url, parses it as XML or HTML
148 |         and returns the parsed dom object.
149 |         :param uri: [str] The uri to retrieve
150 |         :param headers: [dict(header_name: value)] optional http headers to send in the request
151 |         :param html: [bool] optional flag to parse the response as HTML
152 |         :return: [lxml_obj] parsed dom.
153 |         """
154 | 
155 |         try:
156 |             page = self.request(uri)
157 |         except HandlerError as he:
158 |             raise HandlerError(he, status=404)
159 | 
160 |         try:
161 |             page_data = page.content
162 |             if not html:
163 |                 parser = etree.XMLParser(recover=True)
164 |             else:
165 |                 parser = etree.HTMLParser(recover=True)
166 |             return etree.parse(StringIO.StringIO(page_data), parser)
167 |         except Exception as e:
168 |             logging.error("Cannot parse XML/HTML from %s" % uri)
169 |             raise HandlerError("Couldn't parse data from %s" % uri, 404)
170 | 


--------------------------------------------------------------------------------
/timegate/examples/mediawiki.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2014, 2015 LANL.
  5 | # Copyright (C) 2016 CERN.
  6 | #
  7 | # TimeGate is free software; you can redistribute it and/or modify
  8 | # it under the terms of the Revised BSD License; see LICENSE file for
  9 | # more details.
 10 | 
 11 | from __future__ import absolute_import, print_function
 12 | 
 13 | import logging
 14 | import StringIO
 15 | import urllib2
 16 | import urlparse
 17 | 
 18 | from lxml import etree
 19 | 
 20 | from timegate.errors import HandlerError
 21 | from timegate.handler import Handler
 22 | from timegate.utils import date_str
 23 | 
 24 | 
 25 | class MediaWikiHandler(Handler):
 26 | 
 27 |     def __init__(self):
 28 |         Handler.__init__(self)
 29 |         self.TIMESTAMPFMT = '%Y%m%d%H%M%S'
 30 | 
 31 |     # def getall(self, uri):
 32 |     #     params = {
 33 |     #         'rvlimit': 500,  # Max allowed
 34 |     #         'continue': ''  # The initial continue value is empty
 35 |     #     }
 36 |     #
 37 |     #     return self.query(uri, params)
 38 | 
 39 |     def get_memento(self, req_uri, accept_datetime):
 40 |         timestamp = date_str(accept_datetime, self.TIMESTAMPFMT)
 41 |         params = {
 42 |             'rvlimit': 1,  # Only need one
 43 |             'rvstart': timestamp,  # Start listing from here
 44 |             'rvdir': 'older'  # List in decreasing order
 45 |         }
 46 | 
 47 |         # Finds the API and title using scraping
 48 |         api_base_uri = None
 49 |         try:
 50 |             dom = self.get_xml(req_uri, html=True)
 51 |             links = dom.xpath("//link")
 52 |             for link in links:
 53 |                 if link.attrib['rel'].lower() == "edituri":
 54 |                     api_base_uri = link.attrib['href'].split("?")[0]
 55 |                     if api_base_uri.startswith("//"):
 56 |                         api_base_uri = api_base_uri.replace("//", "http://")
 57 |             parsed_url = urlparse.urlparse(req_uri)
 58 |             try:
 59 |                 title = urlparse.parse_qs(parsed_url[4])['title'][0]
 60 |             except Exception as e:
 61 |                 title = parsed_url.path.split('/')[-1]
 62 |             logging.debug(
 63 |                 "Mediawiki handler: API found: %s, page title parsed to: %s " %
 64 |                 (api_base_uri, title))
 65 |             if not title:
 66 |                 raise HandlerError("Cannot find Title", 404)
 67 |             if not api_base_uri:
 68 |                 raise HandlerError("Cannot find mediawiki API on page", 404)
 69 |             else:
 70 |                 title = urllib2.unquote(title)
 71 | 
 72 |         except HandlerError as he:
 73 |             raise he
 74 |         except Exception as e:
 75 |             logging.error(
 76 |                 "MediaWikiHandler: querying and parsing page for title/api "
 77 |                 "%s. handler will return empty response" % e
 78 |             )
 79 |             return None
 80 | 
 81 |         base_uri = api_base_uri.replace("api.php", "index.php")
 82 | 
 83 |         return self.query(req_uri, params, title, api_base_uri, base_uri)
 84 | 
 85 |     def query(self, req_uri, req_params, title, api_base_uri, base_uri):
 86 | 
 87 |         params = {
 88 |             'action': 'query',
 89 |             'format': 'json',
 90 |             'prop': 'revisions',
 91 |             'rvprop': 'ids|timestamp',
 92 |             'indexpageids': '',
 93 |             'titles': title
 94 |         }
 95 |         params.update(req_params)
 96 | 
 97 |         # Does sequential queries to get all revisions IDs and Timestamps
 98 |         queries_results = []
 99 |         condition = True
100 |         while condition:
101 |             # Clone original request
102 |             newparams = params.copy()
103 |             req = self.request(api_base_uri, params=newparams)
104 |             try:
105 |                 result = req.json()
106 |             except Exception as e:
107 |                 logging.error("No JSON can be decoded from API %s" %
108 |                               api_base_uri)
109 |                 raise HandlerError("No API answer.", 404)
110 |             if 'error' in result:
111 |                 raise HandlerError(result['error'])
112 |             if 'warnings' in result:
113 |                 # logging.warn(result['warnings'])
114 |                 pass
115 |             try:
116 |                 # The request was successful
117 |                 # the JSON key of the page (only one)
118 |                 pid = result['query']['pageids'][0]
119 |                 queries_results += result['query']['pages'][pid]['revisions']
120 |                 if ('missing' in result['query']['pages'][pid] or
121 |                         'invalid' in result['query']['pages'][pid]):
122 |                     raise HandlerError(
123 |                         "Cannot find resource on version server.", 404)
124 |             except Exception as e:
125 |                 if req_params['rvdir'] == 'older':
126 |                     req_params['rvdir'] = 'newer'
127 |                     return self.query(
128 |                         req_uri, req_params, title, api_base_uri, base_uri)
129 |                 else:
130 |                     raise HandlerError("No revision returned from API.", 404)
131 |             if 'continue' in result:
132 |                 # The response was truncated, the rest can be obtained using
133 |                 # &rvcontinue=ID
134 |                 cont = result['continue']
135 |                 # Modify it with the values returned in the 'continue' section
136 |                 # of the last result.
137 |                 newparams.update(cont)
138 |                 condition = True
139 |             else:
140 |                 condition = False
141 | 
142 |         # Processing list
143 |         def f(rev):
144 |             rev_uri = base_uri + '?title=%s&oldid=%d' % (
145 |                 urllib2.quote(title), rev['revid'])
146 |             dt = rev['timestamp']
147 |             return (rev_uri, dt)
148 | 
149 |         # logging.debug("Returning API results of size %d" % len(queries_results))
150 |         return map(f, queries_results)
151 | 
152 |     def get_xml(self, uri, html=False):
153 |         """Retrieve the resource using the url and parse it as XML or HTML.
154 | 
155 |         :param uri: [str] The uri to retrieve
156 |         :param headers: [dict(header_name: value)] optional http headers
157 |             to send in the request.
158 |         :param html: [bool] optional flag to parse the response as HTML
159 |         :return: [lxml_obj] parsed DOM.
160 |         """
161 |         try:
162 |             page = self.request(uri)
163 |         except HandlerError as he:
164 |             raise HandlerError(he, status=404)
165 | 
166 |         try:
167 |             page_data = page.content
168 |             if not html:
169 |                 parser = etree.XMLParser(recover=True)
170 |             else:
171 |                 parser = etree.HTMLParser(recover=True)
172 |             return etree.parse(StringIO.StringIO(page_data), parser)
173 |         except Exception as e:
174 |             logging.error("Cannot parse XML/HTML from %s" % uri)
175 |             raise HandlerError("Couldn't parse data from %s" % uri, 404)
176 | 


--------------------------------------------------------------------------------
/timegate/examples/pastpages.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2014, 2015 LANL.
  5 | # Copyright (C) 2016 CERN.
  6 | #
  7 | # TimeGate is free software; you can redistribute it and/or modify
  8 | # it under the terms of the Revised BSD License; see LICENSE file for
  9 | # more details.
 10 | 
 11 | """Implementation of TimeGate handler for pastpages.org."""
 12 | 
 13 | from __future__ import absolute_import, print_function
 14 | 
 15 | import logging
 16 | from datetime import datetime
 17 | 
 18 | from timegate.errors import HandlerError
 19 | from timegate.handler import Handler
 20 | 
 21 | 
 22 | class PastpagesHandler(Handler):
 23 | 
 24 |     def __init__(self):
 25 |         Handler.__init__(self)
 26 |         self.LIMIT_MAX = 100
 27 |         self.BASE = 'http://www.pastpages.org'
 28 |         self.API_TIMEFMT = '%Y-%m-%dT%H:%M:%S'
 29 |         self.FIRST_DATE = datetime(2012, 0o4, 27).strftime(self.API_TIMEFMT)
 30 | 
 31 |         # Building pages list of ('uri', 'slug') pairs
 32 |         self.pages_list = []
 33 | 
 34 |         try:
 35 |             params = {
 36 |                 'limit': self.LIMIT_MAX
 37 |             }
 38 |             request = '/api/beta/sites/'
 39 |             has_next = True
 40 | 
 41 |             # Keep while there are still result pages
 42 |             while has_next:
 43 |                 json_response = self.request(
 44 |                     self.BASE + request, params=params).json()
 45 | 
 46 |                 self.pages_list.extend([
 47 |                     # 'objects' is the list of responses
 48 |                     # 'objects.url' and 'objects.slug' are the URI and the website's short name respectively
 49 |                     (obj['url'], obj['slug'])
 50 |                     for obj in json_response['objects']
 51 |                 ])
 52 | 
 53 |                 request = json_response['meta']['next']
 54 |                 params = None  # the request already contains &limit and &offset
 55 |                 # Each response has a non null 'meta.next' value if it has a
 56 |                 # continuation
 57 |                 has_next = request is not None
 58 | 
 59 |         except Exception as e:
 60 |             logging.critical("Cannot create the handler's page list:")
 61 |             raise e
 62 | 
 63 |         logging.info("Found %s websites on pastpages' API." %
 64 |                      len(self.pages_list))
 65 | 
 66 |     def get_memento(self, uri_r, req_datetime):
 67 |         uri_r = uri_r + '/'
 68 |         # Check if the URI is one archived website
 69 |         matches = [x for x in self.pages_list if uri_r.startswith(x[0])]
 70 |         if len(matches) == 0:
 71 |             raise HandlerError(
 72 |                 "Pastpages does not have archives of that website.", 404)
 73 |         if len(matches) > 1:
 74 |             logging.error("Uri conflict in pastpages' API URI list.")
 75 |             raise HandlerError("Error in pastpages API")
 76 | 
 77 |         site_slug = matches[0][1]
 78 |         params = {
 79 |             'limit': 1,
 80 |             'site__slug': site_slug,
 81 |             'timestamp__lte': req_datetime.strftime(self.API_TIMEFMT)
 82 |         }
 83 | 
 84 |         request = '/api/beta/screenshots/'
 85 | 
 86 |         json_response = self.request(self.BASE + request, params=params).json()
 87 |         if 'error' in json_response:
 88 |             logging.error("Error in pastpages response: " +
 89 |                           str(json_response['error']))
 90 |             return
 91 | 
 92 |         result_list = [
 93 |             # 'objects' is the list of responses
 94 |             # 'objects.absolute_url' is the URI. It exists if 'objects.has_image'
 95 |             (self.BASE + obj['absolute_url'], obj['timestamp'])
 96 |             for obj in json_response['objects']
 97 |         ]
 98 |         if result_list:
 99 |             if len(result_list) > 1:
100 |                 logging.error(
101 |                     "API returned more than one object. returning the first")
102 |             return result_list[0]
103 | 
104 |         # No Memento Found, Trying the first
105 |         else:
106 |             return
107 |             # last_offset = json_response['meta']['total_count'] - 1
108 |             # params = {
109 |             #     'limit': 1,
110 |             #     'site__slug': site_slug,
111 |             #     'timestamp__gte': self.FIRST_DATE,  # Greater here
112 |             #     'offset': last_offset
113 |             # }
114 |             #
115 |             # request = '/api/beta/screenshots/'
116 |             #
117 |             # json_response = self.request(self.BASE+request, params=params).json()
118 |             # if json_response.has_key('error'):
119 |             #     logging.error("Error in pastpages response: "+str(json_response['error']))
120 |             #     return
121 |             #
122 |             # result_list = [
123 |             #     # 'objects' is the list of responses
124 |             #     # 'objects.absolute_url' is the URI. It exists if 'objects.has_image'
125 |             #     (self.BASE+obj['absolute_url'], obj['timestamp'])
126 |             #         for obj in json_response['objects']
127 |             # ]
128 |             # if result_list:
129 |             #     if len(result_list) > 1:
130 |             #         logging.error("API returned more than one object. returning the first")
131 |             #     return result_list[0]
132 | 
133 |     def get_all_mementos(self, uri_r):
134 |         # WILL BE TOO SLOW. TOO MANY WEBSITES'
135 |         # Deactivate TimeMaps
136 |         logging.warning(
137 |             "Get_all_mementos used: Pastpages will probably have too big timemaps. Expect Timeouts")
138 | 
139 |         matches = [x for x in self.pages_list if uri_r.startswith(x[0])]
140 |         if len(matches) == 0:
141 |             raise HandlerError(
142 |                 "Pastpages does not have archives of that website.", 404)
143 |         if len(matches) > 1:
144 |             logging.error("Uri conflict in pastpages' API URI list.")
145 |             raise HandlerError("Error in pastpages API")
146 | 
147 |         site_slug = matches[0][1]
148 |         params = {
149 |             'limit': self.LIMIT_MAX,
150 |             'site__slug': site_slug
151 |         }
152 |         request = '/api/beta/screenshots/'
153 |         has_next = True
154 | 
155 |         image_list = []
156 |         # Keep while there are still result pages
157 |         while has_next:
158 |             json_response = self.request(
159 |                 self.BASE + request, params=params).json()
160 | 
161 |             image_list.extend([
162 |                 # 'objects' is the list of responses
163 |                 # 'objects.image' is the URI of the memento. It exists if 'objects.has_image'
164 |                 (self.BASE + obj['absolute_url'], obj['timestamp'])
165 |                 for obj in json_response['objects'] if obj['has_image']
166 |             ])
167 | 
168 |             request = json_response['meta']['next']
169 |             params = None  # the request already contains &limit and &offset
170 |             # Each response has a non null 'meta.next' value if it has a
171 |             # continuation
172 |             has_next = request is not None
173 | 
174 |         return image_list
175 | 


--------------------------------------------------------------------------------
/timegate/cache.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2014, 2015 LANL.
  5 | # Copyright (C) 2016 CERN.
  6 | #
  7 | # TimeGate is free software; you can redistribute it and/or modify
  8 | # it under the terms of the Revised BSD License; see LICENSE file for
  9 | # more details.
 10 | 
 11 | """Implementation of the TimeGate caches."""
 12 | 
 13 | from __future__ import absolute_import, print_function
 14 | 
 15 | import logging
 16 | import os
 17 | from datetime import datetime
 18 | 
 19 | from dateutil.relativedelta import relativedelta
 20 | from dateutil.tz import tzutc
 21 | from werkzeug.contrib.cache import FileSystemCache, md5
 22 | 
 23 | from . import utils as timegate_utils
 24 | from .errors import CacheError
 25 | 
 26 | 
 27 | class Cache(object):
 28 |     """Base class for TimeGate caches."""
 29 | 
 30 |     def __init__(self, path, tolerance, expiration, max_values,
 31 |                  run_tests=True, max_file_size=0):
 32 |         """Constructor method.
 33 | 
 34 |         :param path: The path of the cache database file.
 35 |         :param tolerance: The tolerance, in seconds to which a TimeMap is
 36 |         considered young enough to be used as is.
 37 |         :param expiration: How long, in seconds, the cache entries are stored
 38 |         every get will be a CACHE MISS.
 39 |         :param max_values: The maximum number of TimeMaps stored in cache
 40 |         before some are deleted
 41 |         :param run_tests: (Optional) Tests the cache at initialization.
 42 |         :param max_file_size: (Optional) The maximum size (in Bytes) for a
 43 |         TimeMap cache value. When max_file_size=0, there is no limit to
 44 |         a cache value. When max_file_size=X > 0, the cache will not
 45 |         store TimeMap that require more than X Bytes on disk.
 46 |         """
 47 |         # Parameters Check
 48 |         if tolerance <= 0 or expiration <= 0 or max_values <= 0:
 49 |             raise CacheError('Cannot create cache: all parameters must be > 0')
 50 | 
 51 |         self.tolerance = relativedelta(seconds=tolerance)
 52 |         self.path = path.rstrip('/')
 53 |         self.max_file_size = max(max_file_size, 0)
 54 |         self.CHECK_SIZE = self.max_file_size > 0
 55 |         self.max_values = max_values
 56 |         self.backend = FileSystemCache(path,
 57 |                                        threshold=self.max_values,
 58 |                                        default_timeout=expiration)
 59 | 
 60 |         # Testing cache
 61 |         if run_tests:
 62 |             try:
 63 |                 key = b'1'
 64 |                 val = 1
 65 |                 self.backend.set(key, val)
 66 |                 assert (not self.CHECK_SIZE) or self._check_size(key) > 0
 67 |                 assert self.backend.get(key) == val
 68 |                 os.remove(os.path.join(self.path, md5(key).hexdigest()))
 69 |             except Exception as e:
 70 |                 raise CacheError('Error testing cache: %s' % e)
 71 | 
 72 |         logging.debug(
 73 |             'Cache created. max_files = %d. Expiration = %d. '
 74 |             'max_file_size = %d' % (
 75 |                 self.max_values, expiration, self.max_file_size))
 76 | 
 77 |     def get_until(self, uri_r, date):
 78 |         """Returns the TimeMap (memento,datetime)-list for the requested
 79 |         Memento. The TimeMap is guaranteed to span at least until the 'date'
 80 |         parameter, within the tolerance.
 81 | 
 82 |         :param uri_r: The URI-R of the resource as a string.
 83 |         :param date: The target date. It is the accept-datetime for TimeGate
 84 |         requests, and the current date. The cache will return all
 85 |         Mementos prior to this date (within cache.tolerance parameter)
 86 |         :return: [(memento_uri_string, datetime_obj),...] list if it is
 87 |         in cache and if it is within the cache tolerance for *date*,
 88 |         None otherwise.
 89 |         """
 90 |         # Query the backend for stored cache values to that memento
 91 |         key = uri_r
 92 |         try:
 93 |             val = self.backend.get(key)
 94 |         except Exception as e:
 95 |             logging.error('Exception loading cache content: %s' % e)
 96 |             return None
 97 | 
 98 |         if val:
 99 |             # There is a value in the cache
100 |             timestamp, timemap = val
101 |             logging.info('Cached value exists for %s' % uri_r)
102 |             if date > timestamp + self.tolerance:
103 |                 logging.info('Cache MISS: value outdated for %s' % uri_r)
104 |                 timemap = None
105 |             else:
106 |                 logging.info('Cache HIT: found value for %s' % uri_r)
107 |         else:
108 |             # Cache MISS: No value
109 |             logging.info('Cache MISS: No cached value for %s' % uri_r)
110 |             timemap = None
111 | 
112 |         return timemap
113 | 
114 |     def get_all(self, uri_r):
115 |         """Request the whole TimeMap for that uri.
116 | 
117 |         :param uri_r: the URI-R of the resource.
118 |         :return: [(memento_uri_string, datetime_obj),...] list if it is in
119 |         cache and if it is within the cache tolerance, None otherwise.
120 |         """
121 |         until = datetime.utcnow().replace(tzinfo=tzutc())
122 |         return self.get_until(uri_r, until)
123 | 
124 |     def set(self, uri_r, timemap):
125 |         """Set the cached TimeMap for that URI-R.
126 | 
127 |         It appends it with a timestamp of when it is stored.
128 | 
129 |         :param uri_r: The URI-R of the original resource.
130 |         :param timemap: The value to cache.
131 |         :return: The backend setter method return value.
132 |         """
133 |         logging.info('Updating cache for %s' % uri_r)
134 |         timestamp = datetime.utcnow().replace(tzinfo=tzutc())
135 |         val = (timestamp, timemap)
136 |         key = uri_r
137 |         try:
138 |             self.backend.set(key, val)
139 |             if self.CHECK_SIZE:
140 |                 self._check_size(uri_r)
141 |         except Exception as e:
142 |             logging.error('Error setting cache value: %s' % e)
143 | 
144 |     def _check_size(self, key, delete=True):
145 |         """Check the size that a specific TimeMap value is using on disk.
146 | 
147 |         It deletes if it is more than the maximum size.
148 | 
149 |         :param key: The TimeMap original resource.
150 |         :param delete: (Optional) When true, the value is deleted.
151 |         Else only a warning is raised.
152 |         :return: The size of the value on disk (0 if it was deleted).
153 |         """
154 |         try:
155 |             fname = md5(key).hexdigest()  # werkzeug key
156 |             fpath = self.path + '/' + fname
157 |             size = os.path.getsize(fpath)
158 |             if size > self.max_file_size and delete:
159 |                 message = ('Cache value too big (%dB, max %dB) '
160 |                            'for the TimeMap of %s')
161 |                 if delete:
162 |                     message += '. Deleting cached value.'
163 |                     os.remove(fpath)
164 |                     size = 0
165 |                 logging.warning(message % (size, self.max_file_size, key))
166 |             return size
167 |         except Exception as e:
168 |             logging.error(
169 |                 'Exception checking cache value size for TimeMap of %s '
170 |                 'Exception: %s' % (key, e))
171 |             return 0
172 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
  1 | @ECHO OFF
  2 | 
  3 | REM Command file for Sphinx documentation
  4 | 
  5 | if "%SPHINXBUILD%" == "" (
  6 | 	set SPHINXBUILD=sphinx-build
  7 | )
  8 | set BUILDDIR=_build
  9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
 10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
 11 | if NOT "%PAPER%" == "" (
 12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
 13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
 14 | )
 15 | 
 16 | if "%1" == "" goto help
 17 | 
 18 | if "%1" == "help" (
 19 | 	:help
 20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
 21 | 	echo.  html       to make standalone HTML files
 22 | 	echo.  dirhtml    to make HTML files named index.html in directories
 23 | 	echo.  singlehtml to make a single large HTML file
 24 | 	echo.  pickle     to make pickle files
 25 | 	echo.  json       to make JSON files
 26 | 	echo.  htmlhelp   to make HTML files and a HTML help project
 27 | 	echo.  qthelp     to make HTML files and a qthelp project
 28 | 	echo.  devhelp    to make HTML files and a Devhelp project
 29 | 	echo.  epub       to make an epub
 30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
 31 | 	echo.  text       to make text files
 32 | 	echo.  man        to make manual pages
 33 | 	echo.  texinfo    to make Texinfo files
 34 | 	echo.  gettext    to make PO message catalogs
 35 | 	echo.  changes    to make an overview over all changed/added/deprecated items
 36 | 	echo.  xml        to make Docutils-native XML files
 37 | 	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
 38 | 	echo.  linkcheck  to check all external links for integrity
 39 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
 40 | 	echo.  coverage   to run coverage check of the documentation if enabled
 41 | 	goto end
 42 | )
 43 | 
 44 | if "%1" == "clean" (
 45 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
 46 | 	del /q /s %BUILDDIR%\*
 47 | 	goto end
 48 | )
 49 | 
 50 | 
 51 | REM Check if sphinx-build is available and fallback to Python version if any
 52 | %SPHINXBUILD% 2> nul
 53 | if errorlevel 9009 goto sphinx_python
 54 | goto sphinx_ok
 55 | 
 56 | :sphinx_python
 57 | 
 58 | set SPHINXBUILD=python -m sphinx.__init__
 59 | %SPHINXBUILD% 2> nul
 60 | if errorlevel 9009 (
 61 | 	echo.
 62 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
 63 | 	echo.installed, then set the SPHINXBUILD environment variable to point
 64 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
 65 | 	echo.may add the Sphinx directory to PATH.
 66 | 	echo.
 67 | 	echo.If you don't have Sphinx installed, grab it from
 68 | 	echo.http://sphinx-doc.org/
 69 | 	exit /b 1
 70 | )
 71 | 
 72 | :sphinx_ok
 73 | 
 74 | 
 75 | if "%1" == "html" (
 76 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
 77 | 	if errorlevel 1 exit /b 1
 78 | 	echo.
 79 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
 80 | 	goto end
 81 | )
 82 | 
 83 | if "%1" == "dirhtml" (
 84 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
 85 | 	if errorlevel 1 exit /b 1
 86 | 	echo.
 87 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
 88 | 	goto end
 89 | )
 90 | 
 91 | if "%1" == "singlehtml" (
 92 | 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
 93 | 	if errorlevel 1 exit /b 1
 94 | 	echo.
 95 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
 96 | 	goto end
 97 | )
 98 | 
 99 | if "%1" == "pickle" (
100 | 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
101 | 	if errorlevel 1 exit /b 1
102 | 	echo.
103 | 	echo.Build finished; now you can process the pickle files.
104 | 	goto end
105 | )
106 | 
107 | if "%1" == "json" (
108 | 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
109 | 	if errorlevel 1 exit /b 1
110 | 	echo.
111 | 	echo.Build finished; now you can process the JSON files.
112 | 	goto end
113 | )
114 | 
115 | if "%1" == "htmlhelp" (
116 | 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
117 | 	if errorlevel 1 exit /b 1
118 | 	echo.
119 | 	echo.Build finished; now you can run HTML Help Workshop with the ^
120 | .hhp project file in %BUILDDIR%/htmlhelp.
121 | 	goto end
122 | )
123 | 
REM qthelp: build the Qt help project and print the follow-up commands.
if "%1" == "qthelp" (
	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\TimeGate.qhcp
	echo.To view the help file:
	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\TimeGate.qhc
	goto end
)
135 | 
136 | if "%1" == "devhelp" (
137 | 	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
138 | 	if errorlevel 1 exit /b 1
139 | 	echo.
140 | 	echo.Build finished.
141 | 	goto end
142 | )
143 | 
144 | if "%1" == "epub" (
145 | 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
146 | 	if errorlevel 1 exit /b 1
147 | 	echo.
148 | 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
149 | 	goto end
150 | )
151 | 
152 | if "%1" == "latex" (
153 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
154 | 	if errorlevel 1 exit /b 1
155 | 	echo.
156 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
157 | 	goto end
158 | )
159 | 
REM latexpdf: build the LaTeX sources, then run them through pdflatex.
if "%1" == "latexpdf" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	REM Stop here when the Sphinx build failed, like the other targets do.
	if errorlevel 1 exit /b 1
	cd %BUILDDIR%/latex
	make all-pdf
	cd %~dp0
	echo.
	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
	goto end
)
169 | 
REM latexpdfja: build the LaTeX sources, then run the all-pdf-ja target.
if "%1" == "latexpdfja" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	REM Stop here when the Sphinx build failed, like the other targets do.
	if errorlevel 1 exit /b 1
	cd %BUILDDIR%/latex
	make all-pdf-ja
	cd %~dp0
	echo.
	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
	goto end
)
179 | 
180 | if "%1" == "text" (
181 | 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
182 | 	if errorlevel 1 exit /b 1
183 | 	echo.
184 | 	echo.Build finished. The text files are in %BUILDDIR%/text.
185 | 	goto end
186 | )
187 | 
188 | if "%1" == "man" (
189 | 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
190 | 	if errorlevel 1 exit /b 1
191 | 	echo.
192 | 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
193 | 	goto end
194 | )
195 | 
196 | if "%1" == "texinfo" (
197 | 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
198 | 	if errorlevel 1 exit /b 1
199 | 	echo.
200 | 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
201 | 	goto end
202 | )
203 | 
204 | if "%1" == "gettext" (
205 | 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
206 | 	if errorlevel 1 exit /b 1
207 | 	echo.
208 | 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
209 | 	goto end
210 | )
211 | 
212 | if "%1" == "changes" (
213 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
214 | 	if errorlevel 1 exit /b 1
215 | 	echo.
216 | 	echo.The overview file is in %BUILDDIR%/changes.
217 | 	goto end
218 | )
219 | 
220 | if "%1" == "linkcheck" (
221 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
222 | 	if errorlevel 1 exit /b 1
223 | 	echo.
224 | 	echo.Link check complete; look for any errors in the above output ^
225 | or in %BUILDDIR%/linkcheck/output.txt.
226 | 	goto end
227 | )
228 | 
229 | if "%1" == "doctest" (
230 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
231 | 	if errorlevel 1 exit /b 1
232 | 	echo.
233 | 	echo.Testing of doctests in the sources finished, look at the ^
234 | results in %BUILDDIR%/doctest/output.txt.
235 | 	goto end
236 | )
237 | 
238 | if "%1" == "coverage" (
239 | 	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
240 | 	if errorlevel 1 exit /b 1
241 | 	echo.
242 | 	echo.Testing of coverage in the sources finished, look at the ^
243 | results in %BUILDDIR%/coverage/python.txt.
244 | 	goto end
245 | )
246 | 
247 | if "%1" == "xml" (
248 | 	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
249 | 	if errorlevel 1 exit /b 1
250 | 	echo.
251 | 	echo.Build finished. The XML files are in %BUILDDIR%/xml.
252 | 	goto end
253 | )
254 | 
255 | if "%1" == "pseudoxml" (
256 | 	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
257 | 	if errorlevel 1 exit /b 1
258 | 	echo.
259 | 	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
260 | 	goto end
261 | )
262 | 
263 | :end
264 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # User-friendly check for sphinx-build
 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
 13 | endif
 14 | 
 15 | # Internal variables.
 16 | PAPEROPT_a4     = -D latex_paper_size=a4
 17 | PAPEROPT_letter = -D latex_paper_size=letter
 18 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 19 | # the i18n builder cannot share the environment and doctrees with the others
 20 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 21 | 
 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
 23 | 
 24 | help:
 25 | 	@echo "Please use \`make <target>' where <target> is one of"
 26 | 	@echo "  html       to make standalone HTML files"
 27 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 28 | 	@echo "  singlehtml to make a single large HTML file"
 29 | 	@echo "  pickle     to make pickle files"
 30 | 	@echo "  json       to make JSON files"
 31 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 32 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 33 | 	@echo "  applehelp  to make an Apple Help Book"
 34 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 35 | 	@echo "  epub       to make an epub"
 36 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 37 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 38 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
 39 | 	@echo "  text       to make text files"
 40 | 	@echo "  man        to make manual pages"
 41 | 	@echo "  texinfo    to make Texinfo files"
 42 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 43 | 	@echo "  gettext    to make PO message catalogs"
 44 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 45 | 	@echo "  xml        to make Docutils-native XML files"
 46 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
 47 | 	@echo "  linkcheck  to check all external links for integrity"
 48 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 49 | 	@echo "  coverage   to run coverage check of the documentation (if enabled)"
 50 | 
 51 | clean:
 52 | 	rm -rf $(BUILDDIR)/*
 53 | 
 54 | html:
 55 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 56 | 	@echo
 57 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 58 | 
 59 | dirhtml:
 60 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 61 | 	@echo
 62 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 63 | 
 64 | singlehtml:
 65 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 66 | 	@echo
 67 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 68 | 
 69 | pickle:
 70 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 71 | 	@echo
 72 | 	@echo "Build finished; now you can process the pickle files."
 73 | 
 74 | json:
 75 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 76 | 	@echo
 77 | 	@echo "Build finished; now you can process the JSON files."
 78 | 
 79 | htmlhelp:
 80 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 81 | 	@echo
 82 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 83 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 84 | 
 85 | qthelp:
 86 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 87 | 	@echo
 88 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 89 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 90 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/TimeGate.qhcp"
 91 | 	@echo "To view the help file:"
 92 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/TimeGate.qhc"
 93 | 
 94 | applehelp:
 95 | 	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
 96 | 	@echo
 97 | 	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
 98 | 	@echo "N.B. You won't be able to view it unless you put it in" \
 99 | 	      "~/Library/Documentation/Help or install it in your application" \
100 | 	      "bundle."
101 | 
102 | devhelp:
103 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
104 | 	@echo
105 | 	@echo "Build finished."
106 | 	@echo "To view the help file:"
107 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/TimeGate"
108 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/TimeGate"
109 | 	@echo "# devhelp"
110 | 
111 | epub:
112 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
113 | 	@echo
114 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
115 | 
116 | latex:
117 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
118 | 	@echo
119 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
120 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
121 | 	      "(use \`make latexpdf' here to do that automatically)."
122 | 
123 | latexpdf:
124 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
125 | 	@echo "Running LaTeX files through pdflatex..."
126 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
127 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
128 | 
129 | latexpdfja:
130 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
131 | 	@echo "Running LaTeX files through platex and dvipdfmx..."
132 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
133 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
134 | 
135 | text:
136 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
137 | 	@echo
138 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
139 | 
140 | man:
141 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
142 | 	@echo
143 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
144 | 
145 | texinfo:
146 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
147 | 	@echo
148 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
149 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
150 | 	      "(use \`make info' here to do that automatically)."
151 | 
# info: build the Texinfo sources, then run them through makeinfo.
# The sub-make is invoked through $(MAKE), as in latexpdf, so command line
# flags and the make program in use are passed on to it.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
157 | 
158 | gettext:
159 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
160 | 	@echo
161 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
162 | 
163 | changes:
164 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
165 | 	@echo
166 | 	@echo "The overview file is in $(BUILDDIR)/changes."
167 | 
168 | linkcheck:
169 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
170 | 	@echo
171 | 	@echo "Link check complete; look for any errors in the above output " \
172 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
173 | 
174 | doctest:
175 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
176 | 	@echo "Testing of doctests in the sources finished, look at the " \
177 | 	      "results in $(BUILDDIR)/doctest/output.txt."
178 | 
179 | coverage:
180 | 	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
181 | 	@echo "Testing of coverage in the sources finished, look at the " \
182 | 	      "results in $(BUILDDIR)/coverage/python.txt."
183 | 
184 | xml:
185 | 	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
186 | 	@echo
187 | 	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
188 | 
189 | pseudoxml:
190 | 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
191 | 	@echo
192 | 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
193 | 


--------------------------------------------------------------------------------
/timegate/examples/wikipedia.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2014, 2015 LANL.
  5 | # Copyright (C) 2016 CERN.
  6 | #
  7 | # TimeGate is free software; you can redistribute it and/or modify
  8 | # it under the terms of the Revised BSD License; see LICENSE file for
  9 | # more details.
 10 | 
 11 | """Implementation of Wikipedia TimeGate handler."""
 12 | 
 13 | from __future__ import absolute_import, print_function
 14 | 
 15 | import logging
 16 | import StringIO
 17 | import urllib2
 18 | import urlparse
 19 | 
 20 | from lxml import etree
 21 | 
 22 | from timegate.errors import HandlerError
 23 | from timegate.handler import Handler
 24 | from timegate.utils import date_str
 25 | 
 26 | 
 27 | class WikipediaHandler(Handler):
 28 | 
 29 |     def __init__(self):
 30 |         Handler.__init__(self)
 31 |         self.TIMESTAMPFMT = '%Y%m%d%H%M%S'
 32 | 
 33 |         # Storing first mementos
 34 |         self.inner_cache = {}
 35 |         self.max_inner_cache_size = 100000
 36 | 
 37 |     def get_memento(self, req_uri, accept_datetime):
 38 |         timestamp = date_str(accept_datetime, self.TIMESTAMPFMT)
 39 |         params = {
 40 |             'rvlimit': 1,  # Only need one
 41 |             'rvstart': timestamp,  # Start listing from here
 42 |             'rvdir': 'older'  # List in decreasing order
 43 |         }
 44 | 
 45 |         # Finds the API and title using scraping
 46 |         api_base_uri = None
 47 |         try:
 48 |             dom = self.get_xml(req_uri, html=True)
 49 |             links = dom.xpath("//link")
 50 |             for link in links:
 51 |                 if link.attrib['rel'].lower() == "edituri":
 52 |                     api_base_uri = link.attrib['href'].split("?")[0]
 53 |                     if api_base_uri.startswith("//"):
 54 |                         api_base_uri = api_base_uri.replace("//", "http://")
 55 |             parsed_url = urlparse.urlparse(req_uri)
 56 |             try:
 57 |                 title = urlparse.parse_qs(parsed_url[4])['title'][0]
 58 |             except Exception as e:
 59 |                 title = parsed_url.path.split('/')[-1]
 60 |             logging.debug(
 61 |                 "Mediawiki handler: API found: %s, page title parsed to: %s " %
 62 |                 (api_base_uri, title))
 63 |             if not title:
 64 |                 raise HandlerError("Cannot find Title", 404)
 65 |             if not api_base_uri:
 66 |                 raise HandlerError("Cannot find mediawiki API on page", 404)
 67 |             else:
 68 |                 title = urllib2.unquote(title)
 69 | 
 70 |         except HandlerError as he:
 71 |             raise he
 72 |         except Exception as e:
 73 |             logging.error(
 74 |                 "MediaWikiHandler: querying and parsing page for title/api %s."
 75 |                 " Handler will return empty response." % e)
 76 |             return None
 77 | 
 78 |         base_uri = api_base_uri.replace("api.php", "index.php")
 79 | 
 80 |         # The best Memento
 81 |         memento = self.query(req_uri, params, title, api_base_uri, base_uri)[0]
 82 | 
 83 |         # The first Memento
 84 |         if title in self.inner_cache and memento:
 85 |             logging.debug("Wiki Handler: found cached first for " + title)
 86 |             first = self.inner_cache[title]
 87 |         else:
 88 |             logging.debug("Wiki Handler: Querying first for " + title)
 89 |             first_params = {
 90 |                 'rvlimit': 1,  # Only need one
 91 |                 'rvstart': '19900101000000',  # Start listing from 1990
 92 |                 'rvdir': 'newer'  # List in increasing order
 93 |             }
 94 |             first = self.query(req_uri, first_params, title,
 95 |                                api_base_uri, base_uri)[0]
 96 |             if len(self.inner_cache) > self.max_inner_cache_size:
 97 |                 self.inner_cache = {}
 98 |             self.inner_cache[title] = first
 99 | 
100 |         # This handler returns more than only the best Memento.
101 |         # A Link with rel="first memento" will also be returned to the client.
102 |         return [first, memento]
103 | 
104 |     def query(self, req_uri, req_params, title, api_base_uri, base_uri):
105 |         """Returns a processed list of tuple. Can be used with increased
106 |         rvlimit.
107 | 
108 |         :param req_uri: :param req_params: :param title: :param
109 |         api_base_uri: :param base_uri: :return:
110 | 
111 |         """
112 | 
113 |         params = {
114 |             'action': 'query',
115 |             'format': 'json',
116 |             'prop': 'revisions',
117 |             'rvprop': 'ids|timestamp',
118 |             'indexpageids': '',
119 |             'titles': title
120 |         }
121 |         params.update(req_params)
122 | 
123 |         # Does sequential queries to get all revisions IDs and Timestamps
124 |         queries_results = []
125 |         condition = True
126 |         while condition:
127 |             # Clone original request
128 |             newparams = params.copy()
129 |             req = self.request(api_base_uri, params=newparams)
130 |             try:
131 |                 result = req.json()
132 |             except Exception as e:
133 |                 logging.error("No JSON can be decoded from API %s" %
134 |                               api_base_uri)
135 |                 raise HandlerError("No API answer.", 404)
136 |             if 'error' in result:
137 |                 raise HandlerError(result['error'])
138 |             if 'warnings' in result:
139 |                 # logging.warn(result['warnings'])
140 |                 pass
141 |             try:
142 |                 # The request was successful
143 |                 # the JSON key of the page (only one)
144 |                 pid = result['query']['pageids'][0]
145 |                 queries_results += result['query']['pages'][pid]['revisions']
146 |                 if ('missing' in result['query']['pages'][pid] or
147 |                         'invalid' in result['query']['pages'][pid]):
148 |                     raise HandlerError(
149 |                         "Cannot find resource on version server.", 404)
150 |             except Exception as e:
151 |                 if req_params['rvdir'] == 'older':
152 |                     req_params['rvdir'] = 'newer'
153 |                     return self.query(
154 |                         req_uri, req_params, title, api_base_uri, base_uri)
155 |                 else:
156 |                     raise HandlerError("No revision returned from API.", 404)
157 |             if 'continue' in result:
158 |                 # The response was truncated, the rest can be obtained using
159 |                 # &rvcontinue=ID
160 |                 cont = result['continue']
161 |                 # Modify it with the values returned in the 'continue' section
162 |                 # of the last result.
163 |                 newparams.update(cont)
164 |                 condition = True
165 |             else:
166 |                 condition = False
167 | 
168 |         # Processing list
169 |         def f(rev):
170 |             rev_uri = base_uri + '?title=%s&oldid=%d' % (
171 |                 urllib2.quote(title), rev['revid'])
172 |             dt = rev['timestamp']
173 |             return (rev_uri, dt)
174 | 
175 |         # logging.debug("Returning API results of size %d" % len(
176 |         #    queries_results))
177 |         return map(f, queries_results)
178 | 
179 |     def get_xml(self, uri, html=False):
180 |         """Retrieve the resource using the url.
181 | 
182 |         It parses response as XML or HTML and returns the parsed DOM object.
183 | 
184 |         :param uri: [str] The uri to retrieve.
185 |         :param headers: [dict(header_name: value)] Optional HTTP headers to
186 |         send in the request.
187 |         :param html: [bool] Optional flag to parse the response as HTML.
188 |         :return: [lxml_obj] Parsed DOM.
189 |         """
190 |         try:
191 |             page = self.request(uri)
192 |         except HandlerError as he:
193 |             raise HandlerError(he, status=404)
194 | 
195 |         try:
196 |             page_data = page.content
197 |             if not html:
198 |                 parser = etree.XMLParser(recover=True)
199 |             else:
200 |                 parser = etree.HTMLParser(recover=True)
201 |             return etree.parse(StringIO.StringIO(page_data), parser)
202 |         except Exception as e:
203 |             logging.error("Cannot parse XML/HTML from %s" % uri)
204 |             raise HandlerError("Couldn't parse data from %s" % uri, 404)
205 | 


--------------------------------------------------------------------------------
/timegate/examples/github.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2014, 2015 LANL.
  5 | # Copyright (C) 2016 CERN.
  6 | #
  7 | # TimeGate is free software; you can redistribute it and/or modify
  8 | # it under the terms of the Revised BSD License; see LICENSE file for
  9 | # more details.
 10 | 
 11 | from __future__ import absolute_import, print_function
 12 | 
 13 | import re
 14 | import time
 15 | 
 16 | import requests
 17 | 
 18 | from timegate.errors import HandlerError
 19 | from timegate.handler import Handler
 20 | 
 21 | ACCEPTABLE_RESOURCE = (
 22 |     "Acceptable resources URI: repositories (github.com/:user/:repo), "
 23 |     "folders (github.com/:user/:repo/tree/:branch/:path), "
 24 |     "files (github.com/:user/:repo/blob/:branch/:path) "
 25 |     "and raw files (raw.githubusercontent.com/:user/:repo/:branch/:path)"
 26 | )
 27 | 
 28 | 
 29 | class GitHubHandler(Handler):
 30 | 
 31 |     def __init__(self):
 32 |         Handler.__init__(self)
 33 |         # Mandatory fields
 34 |         self.resources = ['https://github.com/.+',
 35 |                           'https://raw.githubusercontent.com/']
 36 | 
 37 |         # Local fields
 38 |         self.api = 'https://api.github.com'
 39 | 
 40 |         # Precompiles regular expressions
 41 |         self.rex = re.compile("""  # The format of URI-Rs
 42 |                               (https://)  # protocol
 43 |                               ((?:raw.githubusercontent|github).com/)  # base
 44 |                               ([^/]+)/  # user
 45 |                               ([^/]+)  # repo
 46 |                               (/.*)?  # optional path
 47 |                               """, re.X)  # verbosed: ignore whitespaces and \n
 48 |         self.header_rex = re.compile(
 49 |             '<(.+?)>; rel="next"')  # The regex for the query continuation header
 50 |         self.file_rex = re.compile('(/blob)?/master')  # The regex for files
 51 | 
 52 |     def get_all_mementos(self, uri):
 53 |         MAX_TIME = 120  # seconds
 54 | 
 55 |         if uri.startswith('http://'):
 56 |             uri = uri.replace('http://', 'https://', 1)
 57 | 
 58 |         # URI deconstruction
 59 |         match = self.rex.match(uri)
 60 |         if not bool(match):
 61 |             raise HandlerError("Github uri does not match a valid resource. \n"
 62 |                                + ACCEPTABLE_RESOURCE, 404)
 63 |         protocol = match.groups()[0]
 64 |         base = match.groups()[1]
 65 |         user = match.groups()[2]
 66 |         repo = match.groups()[3]
 67 |         req_path = match.groups()[4]
 68 | 
 69 |         path = ''
 70 |         branch = ''
 71 |         # Processes one result to (memento, datetime) pair
 72 |         mapper = None
 73 | 
 74 |         # Defining Resource type and response handling
 75 |         # Creates one function for a specific type to map the results to
 76 |         # memento pairs.
 77 |         if base == 'github.com/':
 78 |             # Resource is a repository
 79 |             if not req_path or req_path == '/':
 80 |                 if req_path:
 81 |                     path = '/'
 82 | 
 83 |                 def make_pair(commit):
 84 |                     return (commit['html_url'].replace('commit', 'tree'),
 85 |                             commit['commit']['committer']['date'])
 86 |                 mapper = make_pair
 87 | 
 88 |             # Resource is a file
 89 |             elif req_path.startswith('/blob/'):
 90 |                 path = req_path.replace('/blob/', '', 1)
 91 |                 branch_index = path.find('/')
 92 |                 branch = path[:branch_index]
 93 |                 path = path[branch_index:]
 94 |                 if branch == '' or path == '' or path.endswith('/'):
 95 |                     raise HandlerError(
 96 |                         "Not found. Empty path for file in repository", 404)
 97 | 
 98 |                 def make_pair(commit):
 99 |                     # HTML Resource
100 |                     memento_path = '/blob/%s%s' % (commit['sha'], path)
101 |                     uri_m = '%s%s%s/%s%s' % (
102 |                         protocol, base, user, repo, memento_path)
103 |                     return (uri_m, commit['commit']['committer']['date'])
104 |                 mapper = make_pair
105 | 
106 |             # Resource is a directory
107 |             elif req_path.startswith('/tree/'):
108 |                 path = req_path.replace('/tree/', '', 1)
109 |                 branch_index = path.find('/')
110 |                 if branch_index < 0:
111 |                     branch_index = len(path)
112 |                 branch = path[:branch_index]
113 |                 path = path[branch_index:]
114 |                 if branch == '':
115 |                     raise HandlerError("Not found. Empty branch path", 404)
116 | 
117 |                 def make_pair(commit):
118 |                     return (
119 |                         commit['html_url'].replace(
120 |                             'commit',
121 |                             'tree') + path,
122 |                         commit['commit']['committer']['date'])
123 |                 mapper = make_pair
124 | 
125 |         # Resource is a raw file
126 |         elif base == 'raw.githubusercontent.com/' and req_path is not None:
127 |             path = req_path.replace('/', '', 1)
128 |             branch_index = path.find('/')
129 |             branch = path[:branch_index]
130 |             path = path[branch_index:]
131 |             # must be done because API does not make any difference between
132 |             # path or files
133 |             is_online = bool(requests.head(uri))
134 |             if path == '' or path.endswith('/') or not is_online:
135 |                 raise HandlerError(
136 |                     "'%s' not found: Raw resource must be a file." % path, 404)
137 | 
138 |             def make_pair(commit):
139 |                 memento_path = '/%s%s' % (commit['sha'], path)
140 |                 uri_m = '%s%s%s/%s%s' % (protocol, base,
141 |                                          user, repo, memento_path)
142 |                 return (uri_m, commit['commit']['committer']['date'])
143 |             mapper = make_pair
144 | 
145 |         if mapper is None:
146 |             # The resource is not accepcted.
147 |             raise HandlerError(
148 |                 "GitHub resource type not found." + ACCEPTABLE_RESOURCE, 404)
149 | 
150 |         # Initiating request variables
151 |         apibase = '%s/repos/%s/%s/commits' % (self.api, user, repo)
152 |         params = {
153 |             'per_page': 100,  # Max allowed is 100
154 |             'path': str(path),
155 |             'sha': str(branch)
156 |         }
157 |         aut_pair = ('MementoTimegate', 'LANLTimeGate14')
158 |         cont = apibase  # The first continue is the beginning
159 | 
160 |         # Does sequential queries to get all commits of the particular resource
161 |         queries_results = []
162 |         tmax = int(time.time()) + MAX_TIME
163 |         while cont is not None:
164 |             if int(time.time()) > tmax:
165 |                 raise HandlerError(
166 |                     "Resource too big to be served. GitHub Handler TimeOut (timeout: %d seconds)" %
167 |                     MAX_TIME, 502)
168 |             req = self.request(cont, params=params, auth=aut_pair)
169 |             cont = None
170 |             if not req:
171 |                 # status code different than 2XX
172 |                 raise HandlerError(
173 |                     "Cannot find resource on version server. API response %d'd " %
174 |                     req.status_code, 404)
175 |             result = req.json()
176 |             if 'message' in result:
177 |                 # API-specific error
178 |                 raise HandlerError(result['message'])
179 |             if 'errors' in result:
180 |                 # API-specific error
181 |                 raise HandlerError(result['errors'])
182 |             if len(result) > 0:
183 |                 # The request was successful
184 |                 queries_results += result
185 |                 # Search for possible continue
186 |                 if 'link' in req.headers:
187 |                     link_header = req.headers['link']
188 |                     headermatch = self.header_rex.search(link_header)
189 |                     if bool(headermatch):
190 |                         # The response was truncated, the rest can be obtained using
191 |                         # the given "next" link
192 |                         cont = headermatch.groups()[0]
193 | 
194 |         if queries_results:
195 |             # Processes results based on resource type
196 |             return map(mapper, queries_results)
197 |         else:
198 |             # No results found
199 |             raise HandlerError(
200 |                 "Resource not found, empty response from API", 404)
201 | 


--------------------------------------------------------------------------------
/timegate/examples/gitlab.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2014, 2015 LANL.
  5 | # Copyright (C) 2016 CERN.
  6 | #
  7 | # TimeGate is free software; you can redistribute it and/or modify
  8 | # it under the terms of the Revised BSD License; see LICENSE file for
  9 | # more details.
 10 | 
 11 | from __future__ import absolute_import, print_function
 12 | 
 13 | import re
 14 | import time
 15 | 
 16 | import requests
 17 | 
 18 | from timegate.errors import HandlerError
 19 | from timegate.handler import Handler
 20 | 
 21 | ACCEPTABLE_RESOURCE = (
 22 |     "Acceptable resources URI: repositories (/:user/:repo), "
 23 |     "folders (/:user/:repo/tree/:branch/:path), "
 24 |     "files (/:user/:repo/blob/:branch/:path) "
 25 |     "and raw files (/:user/:repo/raw/:branch/:path)"
 26 | )
 27 | 
 28 | # TODO: wiki pages (e.g. https://gitlab.example.com/auser/aproject/wikis/home)
 29 | 
 30 | 
 31 | class GitLabHandler(Handler):
 32 | 
 33 |     def __init__(self):
 34 |         Handler.__init__(self)
 35 |         # Mandatory fields
 36 |         # TODO: move to config file
 37 |         self.resources = ['https://gitlab.ub.uni-bielefeld.de/.+']
 38 | 
 39 |         # Local fields
 40 |         self.api = 'https://gitlab.ub.uni-bielefeld.de/api/v3'  # TODO: move to config file
 41 |         self.apikey = 'VqeqaShAw4GWVc3dp7--'  # TODO: move to config file
 42 | 
 43 |         # Precompiles regular expressions  ## TODO: generalize for URLs with
 44 |         # numeric project ID instead of user/repo!!!
 45 |         self.rex = re.compile("""  # The format of URI-Rs
 46 |                               (https://)  # protocol
 47 |                               ([^/]+)/  # base
 48 |                               ([^/]+)/  # user
 49 |                               ([^/]+)  # repo
 50 |                               (/.*)?  # optional path
 51 |                               """, re.X)  # verbosed: ignore whitespaces and \n
 52 |         self.header_rex = re.compile(
 53 |             '<(.+?)>; rel="next"')  # The regex for the query continuation header
 54 |         self.file_rex = re.compile('(/blob)?/master')  # The regex for files
 55 | 
 56 |     def get_all_mementos(self, uri):
 57 |         MAX_TIME = 120  # seconds
 58 | 
 59 |         # URI deconstruction
 60 |         match = self.rex.match(uri)
 61 |         if not bool(match):
 62 |             raise HandlerError("Github uri does not match a valid resource. \n"
 63 |                                + ACCEPTABLE_RESOURCE, 404)
 64 |         protocol = match.groups()[0]
 65 |         base = match.groups()[1]
 66 |         user = match.groups()[2]
 67 |         repo = match.groups()[3]
 68 |         req_path = match.groups()[4]
 69 | 
 70 |         path = ''
 71 |         branch = ''
 72 |         # Processes one result to (memento, datetime) pair
 73 |         mapper = None
 74 | 
 75 |         # Defining Resource type and response handling
 76 |         # Creates one function for a specific type to map the results to
 77 |         # memento pairs.
 78 |         if 1:
 79 |             # Resource is a repository
 80 |             if not req_path or req_path == '/':
 81 |                 if req_path:
 82 |                     path = '/'
 83 | 
 84 |                 def make_pair(commit):
 85 |                     memento_path = '/commit/%s' % commit['id']
 86 |                     uri_m = '%s%s/%s/%s%s' % (
 87 |                         protocol, base, user, repo, memento_path)
 88 |                     return (uri_m, commit['created_at'])
 89 |                 mapper = make_pair
 90 | 
 91 |             # Resource is a file
 92 |             elif req_path.startswith('/blob/'):
 93 |                 path = req_path.replace('/blob/', '', 1)
 94 |                 branch_index = path.find('/')
 95 |                 branch = path[:branch_index]
 96 |                 path = path[branch_index:]
 97 |                 if branch == '' or path == '' or path.endswith('/'):
 98 |                     raise HandlerError(
 99 |                         "Not found. Empty path for file in repository", 404)
100 | 
101 |                 def make_pair(commit):
102 |                     # HTML Resource
103 |                     memento_path = '/blob/%s%s' % (commit['id'], path)
104 |                     uri_m = '%s%s/%s/%s%s' % (
105 |                         protocol, base, user, repo, memento_path)
106 |                     return (uri_m, commit['created_at'])
107 |                 mapper = make_pair
108 | 
109 |             # Resource is a raw file
110 |             elif req_path.startswith('/raw/'):
111 |                 path = req_path.replace('/raw/', '', 1)
112 |                 branch_index = path.find('/')
113 |                 branch = path[:branch_index]
114 |                 path = path[branch_index:]
115 |                 is_online = bool(requests.head(
116 |                     uri, params={'private_token': self.apikey}))
117 |                 if path == '' or path.endswith('/') or not is_online:
118 |                     raise HandlerError(
119 |                         "'%s' not found: Raw resource must be a file." %
120 |                         path, 404)
121 | 
122 |                 def make_pair(commit):
123 |                     # HTML Resource
124 |                     memento_path = '/raw/%s%s' % (commit['id'], path)
125 |                     uri_m = '%s%s/%s/%s%s' % (
126 |                         protocol, base, user, repo, memento_path)
127 |                     return (uri_m, commit['created_at'])
128 |                 mapper = make_pair
129 | 
130 |             # Resource is a directory
131 |             elif req_path.startswith('/tree/'):
132 |                 path = req_path.replace('/tree/', '', 1)
133 |                 branch_index = path.find('/')
134 |                 if branch_index < 0:
135 |                     branch_index = len(path)
136 |                 branch = path[:branch_index]
137 |                 path = path[branch_index:]
138 |                 if branch == '':
139 |                     raise HandlerError("Not found. Empty branch path", 404)
140 | 
141 |                 def make_pair(commit):
142 |                     memento_path = '/commit/%s' % commit['id']
143 |                     uri_m = '%s%s/%s/%s%s' % (
144 |                         protocol, base, user, repo, memento_path)
145 |                     return (uri_m, commit['created_at'])
146 |                 mapper = make_pair
147 | 
148 |             # Resource is a wiki entry
149 |             # e.g.
150 |             # https://gitlab.example.com/opac/cdrom-opac/wikis/home -->
151 |             # https://gitlab.example.com/opac/cdrom-opac/wikis/home?version_id=b4a9027e2948a5ce9ecd3a9c1641ed958b9f7728
152 |             # API does not seem to support this: getting wrong commit IDs
153 |             # elif req_path.startswith('/wikis/'):
154 |             #     def make_pair(commit):
155 |             #         # HTML Resource
156 |             #         memento_path = '%s?version_id=%s' % (req_path, commit['id'])
157 |             #         uri_m = '%s%s/%s/%s%s' % (
158 |             #             protocol, base, user, repo, memento_path)
159 |             #         return (uri_m, commit['created_at'])
160 |             #     mapper = make_pair
161 | 
162 |         if mapper is None:
163 |             # The resource is not accepcted.
164 |             raise HandlerError(
165 |                 "GitLab resource type not found." + ACCEPTABLE_RESOURCE, 404)
166 | 
167 |         # Initiating request variables
168 |         # It appears that user/repo can be used instead of a numeric project
169 |         # ID. %2f is a urlencoded slash (/).
170 |         apibase = '%s/projects/%s/repository/commits' % (
171 |             self.api, user + '%2f' + repo)
172 |         params = {
173 |             'per_page': 100,  # Max allowed is 100
174 |             'path': str(path),
175 |             'branches': str(branch),
176 |             'private_token': self.apikey
177 |         }
178 |         aut_pair = ('MementoTimegate', 'LANLTimeGate14')
179 |         cont = apibase  # The first continue is the beginning
180 | 
181 |         # Does sequential queries to get all commits of the particular resource
182 |         queries_results = []
183 |         tmax = int(time.time()) + MAX_TIME
184 |         while cont is not None:
185 |             if int(time.time()) > tmax:
186 |                 raise HandlerError(
187 |                     "Resource too big to be served. GitLab Handler TimeOut (timeout: %d seconds)" %
188 |                     MAX_TIME, 502)
189 |             req = self.request(cont, params=params, auth=aut_pair)
190 |             cont = None
191 |             if not req:
192 |                 # status code different than 2XX
193 |                 raise HandlerError(
194 |                     "Cannot find resource on version server. API response %d'd " %
195 |                     req.status_code, 404)
196 |             result = req.json()
197 |             if 'message' in result:
198 |                 # API-specific error
199 |                 raise HandlerError(result['message'])
200 |             if 'errors' in result:
201 |                 # API-specific error
202 |                 raise HandlerError(result['errors'])
203 |             if len(result) > 0:
204 |                 # The request was successful
205 |                 queries_results += result
206 |                 # Search for possible continue
207 |                 if 'link' in req.headers:
208 |                     link_header = req.headers['link']
209 |                     headermatch = self.header_rex.search(link_header)
210 |                     if bool(headermatch):
211 |                         # The response was truncated, the rest can be obtained using
212 |                         # the given "next" link
213 |                         cont = headermatch.groups()[0]
214 | 
215 |         if queries_results:
216 |             # Processes results based on resource type
217 |             return map(mapper, queries_results)
218 |         else:
219 |             # No results found
220 |             raise HandlerError(
221 |                 "Resource not found, empty response from API", 404)
222 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2016 CERN.
  5 | #
  6 | # TimeGate is free software; you can redistribute it and/or modify
  7 | # it under the terms of the Revised BSD License; see LICENSE file for
  8 | # more details.
  9 | 
 10 | from __future__ import print_function
 11 | 
 12 | import os
 13 | 
 14 | import sphinx.environment
 15 | from docutils.utils import get_source_line
 16 | 
 17 | # -- General configuration ------------------------------------------------
 18 | 
 19 | # If your documentation needs a minimal Sphinx version, state it here.
 20 | #needs_sphinx = '1.0'
 21 | 
 22 | # Do not warn on external images.
 23 | suppress_warnings = ['image.nonlocal_uri']
 24 | 
 25 | # Add any Sphinx extension module names here, as strings. They can be
 26 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 27 | # ones.
 28 | extensions = [
 29 |     'sphinx.ext.autodoc',
 30 |     'sphinx.ext.coverage',
 31 |     'sphinx.ext.doctest',
 32 |     'sphinx.ext.intersphinx',
 33 |     'sphinx.ext.viewcode',
 34 | ]
 35 | 
 36 | # Add any paths that contain templates here, relative to this directory.
 37 | templates_path = ['_templates']
 38 | 
 39 | # The suffix(es) of source filenames.
 40 | # You can specify multiple suffix as a list of string:
 41 | # source_suffix = ['.rst', '.md']
 42 | source_suffix = '.rst'
 43 | 
 44 | # The encoding of source files.
 45 | #source_encoding = 'utf-8-sig'
 46 | 
 47 | # The master toctree document.
 48 | master_doc = 'index'
 49 | 
 50 | # General information about the project.
 51 | project = u'TimeGate'
 52 | copyright = u'2016, CERN'
 53 | author = u'LANL'
 54 | 
 55 | # The version info for the project you're documenting, acts as replacement for
 56 | # |version| and |release|, also used in various other places throughout the
 57 | # built documents.
 58 | #
 59 | # The short X.Y version.
 60 | 
 61 | # Get the version string. Cannot be done with import!
 62 | g = {}
 63 | with open(os.path.join('..', 'timegate', 'version.py'), 'rt') as fp:
 64 |     exec(fp.read(), g)
 65 |     version = g['__version__']
 66 | 
 67 | # The full version, including alpha/beta/rc tags.
 68 | release = version
 69 | 
 70 | # The language for content autogenerated by Sphinx. Refer to documentation
 71 | # for a list of supported languages.
 72 | #
 73 | # This is also used if you do content translation via gettext catalogs.
 74 | # Usually you set "language" from the command line for these cases.
 75 | language = None
 76 | 
 77 | # There are two options for replacing |today|: either, you set today to some
 78 | # non-false value, then it is used:
 79 | #today = ''
 80 | # Else, today_fmt is used as the format for a strftime call.
 81 | #today_fmt = '%B %d, %Y'
 82 | 
 83 | # List of patterns, relative to source directory, that match files and
 84 | # directories to ignore when looking for source files.
 85 | exclude_patterns = []
 86 | 
 87 | # The reST default role (used for this markup: `text`) to use for all
 88 | # documents.
 89 | #default_role = None
 90 | 
 91 | # If true, '()' will be appended to :func: etc. cross-reference text.
 92 | #add_function_parentheses = True
 93 | 
 94 | # If true, the current module name will be prepended to all description
 95 | # unit titles (such as .. function::).
 96 | #add_module_names = True
 97 | 
 98 | # If true, sectionauthor and moduleauthor directives will be shown in the
 99 | # output. They are ignored by default.
100 | #show_authors = False
101 | 
102 | # The name of the Pygments (syntax highlighting) style to use.
103 | pygments_style = 'sphinx'
104 | 
105 | # A list of ignored prefixes for module index sorting.
106 | #modindex_common_prefix = []
107 | 
108 | # If true, keep warnings as "system message" paragraphs in the built documents.
109 | #keep_warnings = False
110 | 
111 | # If true, `todo` and `todoList` produce output, else they produce nothing.
112 | todo_include_todos = False
113 | 
114 | 
115 | # -- Options for HTML output ----------------------------------------------
116 | html_theme = 'alabaster'
117 | 
118 | html_theme_options = {
119 |     'description': 'A Memento TimeGate',
120 |     'github_user': 'mementoweb',
121 |     'github_repo': 'timegate',
122 |     'github_button': False,
123 |     'github_banner': True,
124 |     'show_powered_by': False,
125 |     'extra_nav_links': {
126 |         'timegate@GitHub': 'http://github.com/mementoweb/timegate',
127 |         'timegate@PyPI': 'http://pypi.python.org/pypi/timegate/',
128 |     }
129 | }
130 | 
131 | # The theme to use for HTML and HTML Help pages.  See the documentation for
132 | # a list of builtin themes.
133 | 
134 | # Theme options are theme-specific and customize the look and feel of a theme
135 | # further.  For a list of options available for each theme, see the
136 | # documentation.
137 | #html_theme_options = {}
138 | 
139 | # Add any paths that contain custom themes here, relative to this directory.
140 | #html_theme_path = []
141 | 
142 | # The name for this set of Sphinx documents.  If None, it defaults to
143 | # "<project> v<release> documentation".
144 | #html_title = None
145 | 
146 | # A shorter title for the navigation bar.  Default is the same as html_title.
147 | #html_short_title = None
148 | 
149 | # The name of an image file (relative to this directory) to place at the top
150 | # of the sidebar.
151 | #html_logo = None
152 | 
153 | # The name of an image file (within the static path) to use as favicon of the
154 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
155 | # pixels large.
156 | #html_favicon = None
157 | 
158 | # Add any paths that contain custom static files (such as style sheets) here,
159 | # relative to this directory. They are copied after the builtin static files,
160 | # so a file named "default.css" will overwrite the builtin "default.css".
161 | #html_static_path = ['_static']
162 | 
163 | # Add any extra paths that contain custom files (such as robots.txt or
164 | # .htaccess) here, relative to this directory. These files are copied
165 | # directly to the root of the documentation.
166 | #html_extra_path = []
167 | 
168 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
169 | # using the given strftime format.
170 | #html_last_updated_fmt = '%b %d, %Y'
171 | 
172 | # If true, SmartyPants will be used to convert quotes and dashes to
173 | # typographically correct entities.
174 | #html_use_smartypants = True
175 | 
176 | # Custom sidebar templates, maps document names to template names.
177 | html_sidebars = {
178 |     '**': [
179 |         'about.html',
180 |         'navigation.html',
181 |         'relations.html',
182 |         'searchbox.html',
183 |         'donate.html',
184 |     ]
185 | }
186 | 
187 | # Additional templates that should be rendered to pages, maps page names to
188 | # template names.
189 | #html_additional_pages = {}
190 | 
191 | # If false, no module index is generated.
192 | #html_domain_indices = True
193 | 
194 | # If false, no index is generated.
195 | #html_use_index = True
196 | 
197 | # If true, the index is split into individual pages for each letter.
198 | #html_split_index = False
199 | 
200 | # If true, links to the reST sources are added to the pages.
201 | #html_show_sourcelink = True
202 | 
203 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
204 | #html_show_sphinx = True
205 | 
206 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
207 | #html_show_copyright = True
208 | 
209 | # If true, an OpenSearch description file will be output, and all pages will
210 | # contain a <link> tag referring to it.  The value of this option must be the
211 | # base URL from which the finished HTML is served.
212 | #html_use_opensearch = ''
213 | 
214 | # This is the file name suffix for HTML files (e.g. ".xhtml").
215 | #html_file_suffix = None
216 | 
217 | # Language to be used for generating the HTML full-text search index.
218 | # Sphinx supports the following languages:
219 | #   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
220 | #   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
221 | #html_search_language = 'en'
222 | 
223 | # A dictionary with options for the search language support, empty by default.
224 | # Now only 'ja' uses this config value
225 | #html_search_options = {'type': 'default'}
226 | 
227 | # The name of a javascript file (relative to the configuration directory) that
228 | # implements a search results scorer. If empty, the default will be used.
229 | #html_search_scorer = 'scorer.js'
230 | 
231 | # Output file base name for HTML help builder.
232 | htmlhelp_basename = 'timegate_namedoc'
233 | 
234 | # -- Options for LaTeX output ---------------------------------------------
235 | 
236 | latex_elements = {
237 | # The paper size ('letterpaper' or 'a4paper').
238 | #'papersize': 'letterpaper',
239 | 
240 | # The font size ('10pt', '11pt' or '12pt').
241 | #'pointsize': '10pt',
242 | 
243 | # Additional stuff for the LaTeX preamble.
244 | #'preamble': '',
245 | 
246 | # Latex figure (float) alignment
247 | #'figure_align': 'htbp',
248 | }
249 | 
250 | # Grouping the document tree into LaTeX files. List of tuples
251 | # (source start file, target name, title,
252 | #  author, documentclass [howto, manual, or own class]).
253 | latex_documents = [
254 |   (master_doc, 'timegate.tex', u'timegate Documentation',
255 |    u'LANL', 'manual'),
256 | ]
257 | 
258 | # The name of an image file (relative to this directory) to place at the top of
259 | # the title page.
260 | #latex_logo = None
261 | 
262 | # For "manual" documents, if this is true, then toplevel headings are parts,
263 | # not chapters.
264 | #latex_use_parts = False
265 | 
266 | # If true, show page references after internal links.
267 | #latex_show_pagerefs = False
268 | 
269 | # If true, show URL addresses after external links.
270 | #latex_show_urls = False
271 | 
272 | # Documents to append as an appendix to all manuals.
273 | #latex_appendices = []
274 | 
275 | # If false, no module index is generated.
276 | #latex_domain_indices = True
277 | 
278 | 
279 | # -- Options for manual page output ---------------------------------------
280 | 
281 | # One entry per manual page. List of tuples
282 | # (source start file, name, description, authors, manual section).
283 | man_pages = [
284 |     (master_doc, 'timegate', u'timegate Documentation',
285 |      [author], 1)
286 | ]
287 | 
288 | # If true, show URL addresses after external links.
289 | #man_show_urls = False
290 | 
291 | 
292 | # -- Options for Texinfo output -------------------------------------------
293 | 
294 | # Grouping the document tree into Texinfo files. List of tuples
295 | # (source start file, target name, title, author,
296 | #  dir menu entry, description, category)
297 | texinfo_documents = [
298 |   (master_doc, 'timegate', u'TimeGate Documentation',
299 |    author, 'timegate', 'A Memento TimeGate',
300 |    'Miscellaneous'),
301 | ]
302 | 
303 | # Documents to append as an appendix to all manuals.
304 | #texinfo_appendices = []
305 | 
306 | # If false, no module index is generated.
307 | #texinfo_domain_indices = True
308 | 
309 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
310 | #texinfo_show_urls = 'footnote'
311 | 
312 | # If true, do not generate a @detailmenu in the "Top" node's menu.
313 | #texinfo_no_detailmenu = False
314 | 
315 | 
316 | # Example configuration for intersphinx: refer to the Python standard library.
317 | intersphinx_mapping = {'https://docs.python.org/': None}
318 | 


--------------------------------------------------------------------------------
/timegate/application.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of TimeGate.
  4 | # Copyright (C) 2014, 2015, 2016 LANL.
  5 | # Copyright (C) 2016 CERN.
  6 | #
  7 | # TimeGate is free software; you can redistribute it and/or modify
  8 | # it under the terms of the Revised BSD License; see LICENSE file for
  9 | # more details.
 10 | 
 11 | """Implementation of the TimeGate server."""
 12 | 
 13 | from __future__ import absolute_import, print_function
 14 | 
 15 | import json
 16 | import logging
 17 | import os
 18 | from datetime import datetime
 19 | 
 20 | from pkg_resources import iter_entry_points
 21 | 
 22 | from dateutil.tz import tzutc
 23 | from link_header import Link, LinkHeader
 24 | from werkzeug.exceptions import HTTPException, abort
 25 | from werkzeug.http import http_date, parse_date
 26 | from werkzeug.local import Local, LocalManager
 27 | from werkzeug.routing import BaseConverter, Map, Rule, ValidationError
 28 | from werkzeug.utils import cached_property, import_string
 29 | from werkzeug.wrappers import Request, Response
 30 | 
 31 | from . import constants
 32 | from .cache import Cache
 33 | from .config import Config
 34 | from .handler import Handler, parsed_request
 35 | from .utils import best
 36 | 
# Werkzeug ``Local`` namespace; ``TimeGate.wsgi_app`` stores the current
# request on it as ``local.request``.
local = Local()
"""Thread safe local data storage."""

# Manager wrapping ``local``; its ``middleware`` decorator is applied to the
# module-level ``application`` callable.
local_manager = LocalManager([local])
"""Manager for local data storage."""

# Proxy created from ``local`` for the name ``'request'``; ``url_for`` and
# the ``TimeGate`` methods read the current request through it.
request = local('request')
"""Proxy to request object."""
 45 | 
 46 | # logging.getLogger(__name__)
 47 | # logging.basicConfig(level=logging.DEBUG)
 48 | 
 49 | 
 50 | def url_for(*args, **kwargs):
 51 |     """Proxy to URL Map adapter builder."""
 52 |     return request.adapter.build(*args, **kwargs)
 53 | 
 54 | 
 55 | def load_handler(name_or_path):
 56 |     """Load handler from entry points or import string."""
 57 |     if isinstance(name_or_path, Handler):
 58 |         return name_or_path
 59 | 
 60 |     handlers = list(iter_entry_points('timegate.handlers', name=name_or_path))
 61 |     number_of_handlers = len(handlers)
 62 |     if number_of_handlers > 1:
 63 |         raise RuntimeError(
 64 |             'Multiple handlers with the same name "{0}" has been found'.format(
 65 |                 name_or_path
 66 |             )
 67 |         )
 68 |     elif number_of_handlers == 1:
 69 |         return handlers[0].load()()
 70 |     else:
 71 |         return import_string(name_or_path)()
 72 | 
 73 | 
 74 | class URIConverter(BaseConverter):
 75 |     """URI Converter."""
 76 | 
 77 |     def __init__(self, url_map, base_uri=None):
 78 |         super(URIConverter, self).__init__(url_map)
 79 |         self.base_uri = base_uri
 80 |         self.regex = (
 81 |             r"([^:/?#]+:)?(//[^/?#]*)?"
 82 |             r"[^?#]*(\?[^#]*)?(#.*)?"
 83 |         )
 84 | 
 85 |     def to_python(self, value):
 86 |         """Return value with base URI prefix."""
 87 |         value = value.replace(' ', '%20')  # encode
 88 |         if self.base_uri and not value.startswith(self.base_uri):
 89 |             return self.base_uri + value
 90 |         return value
 91 | 
 92 |     def to_url(self, value):
 93 |         """Return value without base URI if it is defined."""
 94 |         value = value.replace('%20', ' ')  # decode
 95 |         if self.base_uri and value.startswith(self.base_uri):
 96 |             return value[len(self.base_uri):]
 97 |         return value
 98 | 
 99 | 
class TimeGate(object):
    """Implementation of Memento protocol with configurable handlers.

    Instances are WSGI callables: a request is matched against
    :attr:`url_map` and dispatched to :meth:`timegate` or :meth:`timemap`.
    """

    def __init__(self, config=None, cache=None):
        """Initialize application configuration and cache.

        :param config: Optional mapping of settings applied on top of the
            defaults loaded from the ``constants`` module.
        :param cache: Optional cache object.  When it is not given, a default
            ``Cache`` is built if the ``CACHE_USE`` setting is enabled.
        """
        self.config = Config(None)
        self.config.from_object(constants)
        self.config.update(config or {})
        self.cache = None
        if cache:
            self.cache = cache
        elif self.config['CACHE_USE']:
            self._build_default_cache()

    @cached_property
    def handler(self):
        """Load the handler configured under ``HANDLER_MODULE``.

        An error is logged when TimeMaps are enabled but the handler has no
        ``get_all_mementos`` method.

        :return: The handler object.
        :raises NotImplementedError: If the handler defines neither
            ``get_memento`` nor ``get_all_mementos``.
        """
        handler = load_handler(self.config['HANDLER_MODULE'])
        has_timegate = hasattr(handler, 'get_memento')
        has_timemap = hasattr(handler, 'get_all_mementos')
        if self.config['USE_TIMEMAPS'] and (not has_timemap):
            logging.error(
                "Handler has no get_all_mementos() function "
                "but is suppose to serve timemaps.")

        if not (has_timegate or has_timemap):
            raise NotImplementedError(
                "NotImplementedError: Handler has neither `get_memento` "
                "nor `get_all_mementos` method.")
        return handler

    @cached_property
    def url_map(self):
        """Build URL map with the ``timegate`` and ``timemap`` endpoints."""
        base_uri = self.config['BASE_URI']
        rules = [
            Rule('/timegate/<uri(base_uri="{0}"):uri_r>'.format(base_uri),
                 endpoint='timegate', methods=['GET', 'HEAD']),
            Rule('/timemap/<any(json, link):response_type>/'
                 '<uri(base_uri="{0}"):uri_r>'.format(base_uri),
                 endpoint='timemap', methods=['GET', 'HEAD']),
        ]
        return Map(rules, converters={'uri': URIConverter})

    def _build_default_cache(self):
        """Build default cache object from the ``CACHE_*`` settings."""
        self.cache = Cache(
            self.config['CACHE_FILE'],
            self.config['CACHE_TOLERANCE'],
            self.config['CACHE_EXP'],
            self.config['CACHE_MAX_VALUES'],
        )

    def __repr__(self):
        """Representation of this class."""
        return '<{0} {1}>'.format(
            self.__class__.__name__, self.handler.__class__.__name__
        )

    def dispatch_request(self, request):
        """Match the request and call the method named after the endpoint.

        :param request: The current ``Request`` object.
        :return: The response of the endpoint method, or the raised
            ``HTTPException`` (which is itself a WSGI application).
        """
        request.adapter = adapter = self.url_map.bind_to_environ(
            request.environ
        )
        try:
            endpoint, values = adapter.match()
            return getattr(self, endpoint)(**values)
        except HTTPException as e:
            return e
        finally:
            # The adapter was stored on the request, so release it there
            # instead of writing an attribute on the shared application.
            request.adapter = None

    def wsgi_app(self, environ, start_response):
        """Create the request, publish it on ``local`` and dispatch it."""
        local.request = request = Request(environ)
        response = self.dispatch_request(request)
        return response(environ, start_response)

    def __call__(self, environ, start_response):
        """Handle a request."""
        return self.wsgi_app(environ, start_response)

    def get_memento(self, uri_r, accept_datetime):
        """Ask the handler for the Memento of an original resource.

        :param uri_r: The original resource to look for.
        :param accept_datetime: Datetime object with requested time.
        :return: The result of ``parsed_request`` applied to the handler's
            ``get_memento`` method.
        """
        return parsed_request(self.handler.get_memento,
                              uri_r, accept_datetime)

    def get_all_mementos(self, uri_r):
        """Uses the handler to retrieve a TimeMap for an original resource.

        The value is read from the cache when the cache is activated and the
        request does not carry a ``Cache-Control: no-cache`` directive.  A
        freshly retrieved value is stored in the cache.

        :param uri_r: The URI to retrieve and cache the TimeMap of.
        :return: The retrieved value.
        """
        mementos = None
        # ``cache_control`` is a parsed header object, not a string: the
        # directive has to be read through its ``no_cache`` attribute,
        # otherwise a client request for fresh data is silently ignored.
        if self.cache and not request.cache_control.no_cache:
            mementos = self.cache.get_all(uri_r)
        if mementos is None:
            mementos = parsed_request(self.handler.get_all_mementos, uri_r)
            if self.cache:
                self.cache.set(uri_r, mementos)
        return mementos

    def timegate(self, uri_r):
        """Handle timegate high-level logic.

        Fetch the Memento for the requested URI at the requested date time.
        Returns a HTTP 302 response if it exists.  If the resource handler
        allows batch requests, then the result may be cached.

        An ``Accept-Datetime`` header that cannot be parsed aborts the
        request with HTTP 400; without the header the current time is used.

        :param uri_r: The requested original resource URI.
        :return: The ``Response`` object.
        """
        if 'Accept-Datetime' in request.headers:
            parsed = parse_date(request.headers['Accept-Datetime'])
            if parsed is None:
                # ``parse_date`` returns None for values it cannot parse.
                abort(400)
            accept_datetime = parsed.replace(tzinfo=tzutc())
        else:
            accept_datetime = datetime.now(tzutc())

        # Runs the handler's API request for the Memento
        mementos = first = last = None
        has_timemap = (
            hasattr(self.handler, 'get_all_mementos') and
            self.config['USE_TIMEMAPS']
        )
        if has_timemap:
            logging.debug('Using multiple-request mode.')
            mementos = self.get_all_mementos(uri_r)

        if mementos:
            # If the handler returned several Mementos, take the closest
            first = mementos[0]
            last = mementos[-1]
            memento = best(mementos, accept_datetime,
                           self.config['RESOURCE_TYPE'])
        else:
            logging.debug('Using single-request mode.')
            memento = self.get_memento(uri_r, accept_datetime)

        return memento_response(
            memento,
            uri_r,
            first,
            last,
            has_timemap=has_timemap,
        )

    def timemap(self, uri_r, response_type='link'):
        """Handle TimeMap high-level logic.

        It fetches all Mementos for an Original Resource and builds the TimeMap
        response. Returns a HTTP 200 response if it exists with the timemap in
        the message body.  The request is aborted with HTTP 403 when the
        ``USE_TIMEMAPS`` setting is disabled.

        :param uri_r: The requested original resource URI.
        :param response_type: ``'json'`` for a JSON TimeMap; any other value
            produces the link-format TimeMap.
        :return: The ``Response`` object.
        """
        if not self.config['USE_TIMEMAPS']:
            abort(403)

        mementos = self.get_all_mementos(uri_r)
        # Generates the TimeMap response body and Headers
        if response_type == 'json':
            return timemap_json_response(self, mementos, uri_r)
        else:
            return timemap_link_response(self, mementos, uri_r)
270 | 
271 | 
@local_manager.middleware
def application(environ, start_response):
    """WSGI entry point of the TimeGate server.

    For every call a ``TimeGate`` instance is created, configured from the
    ``conf/config.ini`` file located next to this module and then invoked
    with the WSGI arguments.

    :param environ: Dictionary containing environment variables from
        the client request.
    :param start_response: Callback function used to send HTTP status
        and headers to the server.
    :return: Whatever the ``TimeGate`` instance returns for this call.
    """
    timegate_app = TimeGate()
    package_dir = os.path.dirname(__file__)
    ini_path = os.path.join(package_dir, 'conf', 'config.ini')
    timegate_app.config.from_inifile(ini_path)
    return timegate_app(environ, start_response)
291 | 
292 | 
def memento_response(
        memento,
        uri_r,
        first=None,
        last=None,
        has_timemap=False):
    """Return a 302 redirection to the best Memento for a resource.

    The ``Link`` header carries the original resource, optionally the
    TimeMaps, and the first, selected and last Mementos.

    :param memento: (The URI string, dt obj) of the best memento.
    :param uri_r: The original resource's complete URI.
    :param first: (Optional) (URI string, dt obj) of the first memento.
    :param last: (Optional) (URI string, dt obj) of the last memento.
    :param has_timemap: Flag indicating that the handler accepts
        TimeMap requests too. Default False.
    :return: The ``Response`` object.
    """
    # Gather links containing original and if available: TimeMap, first, last
    # TimeGate link not allowed here
    links = [Link(uri_r, rel='original')]
    if has_timemap:
        for response_type, mime in (('link', 'application/link-format'),
                                    ('json', 'application/json'), ):
            links.append(Link(
                url_for('timemap', dict(
                    response_type=response_type, uri_r=uri_r
                ), force_external=True),
                rel='timemap', type=mime
            ))

    (uri_m, dt_m) = memento
    (uri_first, dt_first) = first if first else (None, None)
    (uri_last, dt_last) = last if last else (None, None)

    if first and last and uri_first == uri_last:
        # There's only one memento (first = best = last)
        assert uri_last == uri_m
        links.append(Link(uri_m, rel='first last memento',
                          datetime=http_date(dt_m)))
    else:
        # Each relation must point at its own URI: the first/last links use
        # the first/last Memento URIs, not the URI of the selected Memento.
        if first:
            links.append(Link(uri_first, rel='first memento',
                              datetime=http_date(dt_first)))
        if uri_first != uri_m and uri_last != uri_m:
            # The best memento is neither the first nor the last
            links.append(Link(uri_m, rel='memento',
                              datetime=http_date(dt_m)))
        if last:
            links.append(Link(uri_last, rel='last memento',
                              datetime=http_date(dt_last)))

    # Builds the response headers
    headers = [
        ('Date', http_date(datetime.utcnow())),
        ('Vary', 'accept-datetime'),
        ('Content-Length', '0'),
        ('Content-Type', 'text/plain; charset=UTF-8'),
        ('Connection', 'close'),
        ('Location', uri_m),
        ('Link', str(LinkHeader(links))),
    ]
    return Response(None, headers=headers, status=302)
358 | 
359 | 
def timemap_link_response(app, mementos, uri_r):
    """Build the link-format TimeMap response.

    :param app: The application object (not used by this function).
    :param mementos: A sorted (ascending by date) list of (uri_str,
        datetime_obj) tuples representing a TimeMap; must not be empty.
    :param uri_r: The URI-R of the original resource.
    :return: The ``Response`` object.
    """
    assert len(mementos) >= 1

    # Original, TimeGate and both TimeMap links come first.
    links = [
        Link(uri_r, rel='original'),
        Link(
            url_for('timegate', dict(uri_r=uri_r), force_external=True),
            rel='timegate',
        ),
        Link(
            url_for('timemap', dict(
                response_type='link', uri_r=uri_r
            ), force_external=True),
            rel='self', type='application/link-format',
        ),
        Link(
            url_for('timemap', dict(
                response_type='json', uri_r=uri_r
            ), force_external=True),
            rel='timemap', type='application/json',
        ),
    ]

    # Memento links follow, with first/last relations on the boundaries.
    oldest = mementos[0]
    if len(mementos) == 1:
        links.append(Link(oldest[0], rel='first last memento',
                          datetime=http_date(oldest[1])))
    else:
        newest = mementos[-1]
        links.append(Link(oldest[0], rel='first memento',
                          datetime=http_date(oldest[1])))
        for (uri, date) in mementos[1:-1]:
            links.append(Link(uri, rel='memento', datetime=http_date(date)))
        links.append(Link(newest[0], rel='last memento',
                          datetime=http_date(newest[1])))

    # One link per line, separated by commas, closed by a newline.
    body = ',\n'.join(str(link) for link in links) + '\n'

    headers = [
        ('Date', http_date(datetime.utcnow())),
        ('Content-Length', str(len(body))),
        ('Content-Type', 'application/link-format'),
        ('Connection', 'close'),
    ]
    return Response(body, headers=headers)
419 | 
420 | 
def timemap_json_response(app, mementos, uri_r):
    """Build the JSON TimeMap response.

    :param app: The application object (not used by this function).
    :param mementos: A sorted list of (uri_str, datetime_obj) tuples
        representing a timemap; must not be empty.
    :param uri_r: The URI-R of the original resource.
    :return: The ``Response`` object.
    """
    assert len(mementos) >= 1

    def entry(uri, date):
        # JSON description of a single Memento.
        return {'uri': uri, 'datetime': http_date(date)}

    oldest = mementos[0]
    newest = mementos[-1]

    # Keys are inserted in the order they appear in the serialized document.
    payload = {
        'original_uri': uri_r,
        'timegate_uri': url_for(
            'timegate', dict(uri_r=uri_r), force_external=True
        ),
        'mementos': {
            'last': entry(newest[0], newest[1]),
            'first': entry(oldest[0], oldest[1]),
            'list': [entry(urlstr, date) for (urlstr, date) in mementos],
        },
        'timemap_uri': {
            'json_format': url_for('timemap', dict(
                response_type='json', uri_r=uri_r
            ), force_external=True),
            'link_format': url_for('timemap', dict(
                response_type='link', uri_r=uri_r
            ), force_external=True),
        },
    }
    response_json = json.dumps(payload)

    headers = [
        ('Date', http_date(datetime.utcnow())),
        ('Content-Length', str(len(response_json))),
        ('Content-Type', 'application/json'),
    ]
    return Response(response_json, headers=headers)
476 | 


--------------------------------------------------------------------------------