├── mdshare
├── test
│ ├── __init__.py
│ ├── test_api.py
│ ├── test_utils.py
│ └── test_repository.py
├── data
│ ├── mdshare-catalogue.md5
│ ├── template.yaml
│ └── mdshare-catalogue.yaml
├── __init__.py
├── repository.py
├── utils.py
└── api.py
├── .gitattributes
├── .git_archival.txt
├── requirements.txt
├── setup.cfg
├── MANIFEST.in
├── CHANGELOG.md
├── .circleci
└── config.yml
├── .gitignore
├── setup.py
├── README.md
├── bin
└── mdshare-index-maker.py
└── LICENSE
/mdshare/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | .git_archival.txt export-subst
--------------------------------------------------------------------------------
/.git_archival.txt:
--------------------------------------------------------------------------------
1 | ref-names: HEAD -> master, tag: 0.4.2
--------------------------------------------------------------------------------
/mdshare/data/mdshare-catalogue.md5:
--------------------------------------------------------------------------------
1 | 3465007cf4a866ac4316b4e8afba4673
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools
2 | setuptools_scm
3 | setuptools_scm_git_archive
4 | pytest
5 | humanfriendly
6 | requests
7 | pyyaml
8 | tqdm
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [alias]
2 | test=pytest
3 |
4 | [tool:pytest]
5 | filterwarnings =
6 | once::DeprecationWarning
7 | once::PendingDeprecationWarning
8 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 | include CHANGELOG.md
4 | include mdshare/data/mdshare-catalogue.yaml
5 | include mdshare/data/mdshare-catalogue.md5
6 | include mdshare/data/template.md5
7 |
8 | # exclude compiled bytecode
9 | global-exclude *.pyc
10 | # exclude git backup files
11 | global-exclude *.orig
12 |
13 | # do not include eventually present eggs (installed during setup runtime)
14 | prune .eggs
15 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # mdshare — CHANGELOG
2 |
3 | ## 0.1.0
4 | - provides `mdshare.load()`
5 |
6 | ## 0.2.0
7 | - switching to LGPL
8 | - adding tests and CI
9 |
10 | ### 0.2.1
11 | - removing numpy dependency
12 | - code cleanup
13 |
14 | ## 0.3.0
15 | - provides `mdshare.catalogue()` and `mdshare.search()`
16 | - deprecates `mdshare.load()`
17 | - provides `mdshare.fetch()` as successor to `mdshare.load()`
18 |
19 | ### 0.3.1
20 | - updates to documentation and package
21 |
22 | ### 0.3.2
23 | - replaces FTP usage
24 |
25 | ## 0.4.0
26 | - added progressbars
27 | - major refactoring to have an offline catalogue available
28 | - allow to download/extract .tar.gz containers
29 |
30 | ## 0.4.1
31 | - updated file catalogue
32 |
33 | ### upcoming
34 | - removed obsolete/unused imports/variables/...
35 | - replaced assert statements
36 |
--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | # Python CircleCI 2.0 configuration file
2 | #
3 | # Check https://circleci.com/docs/2.0/language-python/ for more details
4 | #
5 | version: 2
6 | jobs:
7 | build:
8 | docker:
9 | # specify the version you desire here
10 | # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers`
11 | - image: circleci/python:3.6.1
12 |
13 | # Specify service dependencies here if necessary
14 | # CircleCI maintains a library of pre-built images
15 | # documented at https://circleci.com/docs/2.0/circleci-images/
16 | # - image: circleci/postgres:9.4
17 |
18 | working_directory: ~/repo
19 |
20 | steps:
21 | - checkout
22 |
23 | # Download and cache dependencies
24 | - restore_cache:
25 | keys:
26 | - v1-dependencies-{{ checksum "requirements.txt" }}
27 | # fallback to using the latest cache if no exact match is found
28 | - v1-dependencies-
29 |
30 | - run:
31 | name: install dependencies
32 | command: |
33 | python3 -m venv venv
34 | . venv/bin/activate
35 | pip install -r requirements.txt
36 |
37 | - save_cache:
38 | paths:
39 | - ./venv
40 | key: v1-dependencies-{{ checksum "requirements.txt" }}
41 |
42 | # run tests!
43 | - run:
44 | name: run tests
45 | command: |
46 | . venv/bin/activate
47 | python setup.py test
48 |
49 | - store_artifacts:
50 | path: test-reports
51 | destination: test-reports
52 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | # pytest
104 | .pytest_cache/
105 |
--------------------------------------------------------------------------------
/mdshare/data/template.yaml:
--------------------------------------------------------------------------------
1 | # MDSHARE TEMPLATE FILE
2 | #
3 | # This is the template to compile the current catalogue file.
4 | # Navigate to the directory where the data files are and run
5 | #
6 | # python path/to/mdshare-index-maker.py build path/to/template.yaml
7 | #
8 | # This will create a yaml file (NAME.yaml) and a corresponding MD5
9 | # checksum file (NAME.md5) where NAME corresponds to the 'name' entry
10 | # in the template.
11 | #
12 | # The 'url' entry points to the directory's URL.
13 | #
14 | # 'include' denotes all files in the current directory which should
15 | # be indexed; you can use unix-style wildcard patterns.
16 | #
17 | # 'containers' denotes which files should be grouped in .tar.gz
18 | # archives; again, you can use unix-style wildcard patterns. The
19 | # files must be part of 'include'.
20 |
21 | name: mdshare-catalogue
22 | url: 'http://ftp.imp.fu-berlin.de/pub/cmb-data/'
23 | include:
24 | - alanine-dipeptide-*.npz
25 | - alanine-dipeptide-*-nowater.xtc
26 | - alanine-dipeptide-nowater.pdb
27 | - pentapeptide-*-500ns-impl-solv.xtc
28 | - pentapeptide-impl-solv.pdb
29 | - pyemma-tutorial-mt-data.npz
30 | - pyemma-tutorial-tpt-data.npz
31 | - pyemma-tutorial-us-data.npz
32 | - pyemma-tutorial-us-nacl.npz
33 | - methane-dimer-umbrella-sampling.npz
34 | - doublewell_disconnected.npy
35 | - doublewell_oneway.npy
36 | - hmm-doublewell-2d-100k.npz
37 | - mdshare-test-00.txt
38 | - imd_channel_transitionmatrix.npy
39 | - imd_full_system_trajectory.npy
40 | - alanine_dipeptide_parallel_tempering_energies.npz
41 | - alanine_dipeptide_parallel_tempering_dihedrals.npz
42 | containers:
43 | pyemma-tutorial-livecoms.tar.gz:
44 | - alanine-dipeptide-*-nowater.xtc
45 | - alanine-dipeptide-nowater.pdb
46 | - pentapeptide-*-500ns-impl-solv.xtc
47 | - pentapeptide-impl-solv.pdb
48 | - doublewell_disconnected.npy
49 | - doublewell_oneway.npy
50 | - hmm-doublewell-2d-100k.npz
51 | mdshare-test.tar.gz:
52 | - mdshare-test-00.txt
53 |
--------------------------------------------------------------------------------
/mdshare/__init__.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/mdshare project.
2 | # Copyright (C) 2017-2019 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 |
# Package metadata.
__author__ = 'Christoph Wehmeyer'
__email__ = 'christoph.wehmeyer@fu-berlin.de'
# A plain list of contributor names; no trailing comma, which would
# silently turn this into a one-element tuple containing the list.
__credits__ = ['Guillermo Pérez-Hernández', 'Martin K. Scherer']
22 |
23 |
# Resolve the installed package version; fall back to 'unknown' when the
# package is not installed as a distribution (e.g. run from a checkout).
import pkg_resources
try:
    __version__ = pkg_resources.get_distribution(__name__).version
except pkg_resources.DistributionNotFound:
    __version__ = 'unknown'
# keep the helper module out of the package namespace
del pkg_resources
30 |
31 |
from .repository import Repository
from os.path import dirname, join
from warnings import warn

# Build the default repository from the catalogue and checksum files in
# the package's 'data' directory. Any failure is downgraded to a warning
# and leaves default_repository set to None.
_data_directory = join(dirname(__file__), 'data')
try:
    default_repository = Repository(
        join(_data_directory, 'mdshare-catalogue.yaml'),
        join(_data_directory, 'mdshare-catalogue.md5'))
except FileNotFoundError:
    default_repository = None
    warn('Cannot build the default repository: missing file(s)!')
except RuntimeError as error:
    default_repository = None
    warn(f'Cannot build the default repository: {error.args[0]}')
# remove the helper names from the package namespace
del _data_directory, dirname, join, warn
46 |
47 |
48 | from .api import load_repository, search, catalogue, fetch
49 | from .utils import LoadError
50 |
51 |
def load(*args, **kwargs):
    """Placeholder for the former download function; use fetch() instead.

    All positional and keyword arguments are accepted and ignored.

    Raises
    ------
    NotImplementedError
        Always.
    """
    message = 'use fetch'
    raise NotImplementedError(message)
54 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/mdshare project.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | from setuptools import setup, find_packages
19 | from setuptools.command.test import test as TestCommand
20 | import sys
21 |
class PyTest(TestCommand):
    """setuptools ``test`` command that runs the test suite through pytest."""

    user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")]

    def initialize_options(self):
        """Set the default pytest arguments (the ``mdshare`` package)."""
        super().initialize_options()
        self.pytest_args = ['mdshare']

    def run_tests(self):
        """Run pytest and exit the process with its return code."""
        # imported here because pytest is only needed when tests are run
        import pytest
        sys.exit(pytest.main(self.pytest_args))
31 |
# Package definition. The version is derived from SCM metadata via
# setuptools_scm (with setuptools_scm_git_archive for archive exports).
setup(
    cmdclass={'test': PyTest},
    use_scm_version=True,
    name='mdshare',
    author='Christoph Wehmeyer',
    author_email='christoph.wehmeyer@fu-berlin.de',
    url='https://github.com/markovmodel/mdshare',
    description='Get easy access to our public data files.',
    packages=find_packages(),
    include_package_data=True,
    setup_requires=['setuptools_scm', 'setuptools_scm_git_archive'],
    # Runtime dependencies. 'pyyaml' is required because
    # mdshare/repository.py imports yaml when the package is imported.
    # NOTE(review): requirements.txt also lists tqdm -- confirm whether
    # the package imports it at runtime and add it here if so.
    install_requires=[
        'humanfriendly',
        'requests',
        'pyyaml',
    ],
    tests_require=['pytest'],
    zip_safe=False,
    scripts=['bin/mdshare-index-maker.py'],
    classifiers=[
        'Development Status :: 4 - Beta',
        'Environment :: Console',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)',
        'Natural Language :: English',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 3 :: Only',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
        'Topic :: Scientific/Engineering :: Chemistry',
        'Topic :: Scientific/Engineering :: Mathematics',
        'Topic :: Scientific/Engineering :: Physics'])
61 |
--------------------------------------------------------------------------------
/mdshare/test/test_api.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/mdshare project.
2 | # Copyright (C) 2017-2019 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import pytest
19 | import os
20 | from ..utils import LoadError
21 | from ..utils import file_hash
22 | from ..api import load_repository
23 | from ..api import search
24 | from ..api import catalogue
25 | from ..api import fetch
26 | from .. import default_repository
27 |
28 | FILE = 'mdshare-test-00.txt'
29 | HASH = '5cbb04531c2e9fa7cc1e5d83195a2f81'
30 |
31 |
def file_check(file):
    """Check a downloaded test file against the reference hash and delete it.

    The file is removed before the comparison result is acted upon, so a
    failed check does not leave the download behind in the working
    directory.

    Parameters
    ----------
    file : str
        Path of the local file to verify and remove.

    Raises
    ------
    AssertionError
        If the file's hash differs from HASH.
    """
    checksum = file_hash(file)
    os.remove(file)
    if checksum != HASH:
        raise AssertionError()
36 |
37 |
def test_load_repository_break():
    """load_repository must raise for None and for 'not-a-repository'."""
    expectations = (
        (TypeError, None),
        (FileNotFoundError, 'not-a-repository'))
    for error, argument in expectations:
        with pytest.raises(error):
            load_repository(argument)
43 |
44 |
def test_search():
    """search must find exactly the test file and nothing for a fragment."""
    # the full name yields a single hit, which is the name itself
    if len(search(FILE)) != 1 or search(FILE)[0] != FILE:
        raise AssertionError()
    # the name without its first and last character has no wildcard
    # and must therefore not match anything
    fragment = FILE[1:-1]
    if len(search(fragment)) > 0:
        raise AssertionError()
52 |
53 |
def test_search_break():
    """search must raise TypeError for a bad repository or a None pattern."""
    bad_calls = (
        (FILE, 'not-a-repository'),
        (None,))
    for arguments in bad_calls:
        with pytest.raises(TypeError):
            search(*arguments)
59 |
60 |
def test_catalogue(capsys):
    """catalogue() must print the default repository followed by a newline."""
    catalogue()
    printed = capsys.readouterr().out
    expected = f'{str(default_repository)}\n'
    if printed != expected:
        raise AssertionError()
66 |
67 |
def test_catalogue_break():
    """catalogue must raise TypeError when given a string as repository."""
    not_a_repository = 'not-a-repository'
    with pytest.raises(TypeError):
        catalogue(not_a_repository)
71 |
72 |
def test_fetch():
    """fetch must deliver the test file by name, by pattern and by repository."""
    wildcard = f'*{FILE[1:-1]}*'
    requests_to_make = (
        (FILE, {}),
        (wildcard, {}),
        (FILE, {'repository': default_repository}))
    # each download is verified and removed before the next one starts
    for pattern, options in requests_to_make:
        file_check(fetch(pattern, **options))
77 |
78 |
def test_fetch_break():
    """fetch must raise the expected errors for invalid arguments."""
    # an existing regular file cannot serve as working directory
    local_path = fetch(FILE)
    with pytest.raises(FileExistsError):
        fetch(FILE, working_directory=local_path)
    os.remove(local_path)

    bad_calls = (
        (TypeError, None, {}),
        (LoadError, 'not-an-existing-file-or-pattern', {}),
        (TypeError, FILE, {'repository': 'not-a-repository'}))
    for error, pattern, options in bad_calls:
        with pytest.raises(error):
            fetch(pattern, **options)
90 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mdshare
2 | Get access to our MD data files.
3 |
4 | [CircleCI build status](https://circleci.com/gh/markovmodel/mdshare/tree/master)
5 | [Codacy code quality](https://www.codacy.com/app/cwehmeyer/mdshare?utm_source=github.com&utm_medium=referral&utm_content=markovmodel/mdshare&utm_campaign=Badge_Grade)
6 |
7 | This is a downloader for molecular dynamics (MD) data from a public FTP server at FU Berlin. See [here](https://markovmodel.github.io/mdshare/) for a full list of available datasets and terms of use.
8 |
9 | ## Example
10 | This code will download a file (if it does not already exist locally) with a featurized set of three alanine dipeptide MD trajectories and store its content of three `numpy.ndarray` objects (each of `shape=[250000, 2], dtype=numpy.float32`) in the list `trajs`:
11 |
12 | ```python
13 | import mdshare
14 | import numpy as np
15 |
16 | local_filename = mdshare.fetch('alanine-dipeptide-3x250ns-backbone-dihedrals.npz')
17 | with np.load(local_filename) as fh:
18 | trajs = [fh[key] for key in sorted(fh.keys())]
19 | ```
20 |
21 | By default, the `mdshare.fetch()` function will look in and download to the current directory (function parameter `working_directory='.'`). If you instead set this parameter to `None` ...
22 |
23 | ```python
24 | local_filename = mdshare.fetch(
25 | 'alanine-dipeptide-3x250ns-backbone-dihedrals.npz',
26 | working_directory=None)
27 | ```
28 |
29 | ... the file will be downloaded to a temporary directory. In both cases, the function will return the path to the downloaded file.
30 |
31 | Should the requested file already be present in the `working_directory`, the download is skipped.
32 |
33 | Using `mdshare.catalogue()` to view the files and filesizes of the available trajectories ...
34 |
35 | ```python
36 | mdshare.catalogue()
37 | ```
38 |
39 | ... produces the output:
40 |
41 | ```text
42 | Repository: http://ftp.imp.fu-berlin.de/pub/cmb-data/
43 | Files:
44 | alanine-dipeptide-0-250ns-nowater.xtc 42.9 MB
45 | alanine-dipeptide-1-250ns-nowater.xtc 42.9 MB
46 | alanine-dipeptide-2-250ns-nowater.xtc 42.9 MB
47 | alanine-dipeptide-3x250ns-backbone-dihedrals.npz 6.0 MB
48 | alanine-dipeptide-3x250ns-heavy-atom-distances.npz 135.0 MB
49 | [...]
50 | Containers:
51 | mdshare-test.tar.gz 193.0 bytes
52 | pyemma-tutorial-livecoms.tar.gz 123.9 MB
53 | ```
54 |
55 | Using `mdshare.search(filename_pattern)` to select for a given group of files ...
56 |
57 | ```python
58 | pentapeptide_xtcs = mdshare.search('penta*xtc')
59 | print(pentapeptide_xtcs)
60 | ```
61 |
62 | ... produces the output:
63 |
64 | ```python
65 | ['pentapeptide-00-500ns-impl-solv.xtc',
66 | 'pentapeptide-01-500ns-impl-solv.xtc',
67 | 'pentapeptide-02-500ns-impl-solv.xtc',
68 | ...
69 | 'pentapeptide-22-500ns-impl-solv.xtc',
70 | 'pentapeptide-23-500ns-impl-solv.xtc',
71 | 'pentapeptide-24-500ns-impl-solv.xtc']
72 | ```
73 |
--------------------------------------------------------------------------------
/mdshare/repository.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/mdshare project.
2 | # Copyright (C) 2017-2019 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | from humanfriendly import format_size
19 | from yaml import safe_load
20 | import requests
21 | import fnmatch
22 | from .utils import LoadError, file_hash
23 |
24 |
class Category(dict):
    """Mapping from file name to its metadata record.

    ``__str__`` reads the 'size' entry of every record; ``search`` only
    looks at the keys.
    """

    def __init__(self, data):
        super().__init__(data)

    def search(self, pattern):
        """Return the keys matching the unix-style wildcard ``pattern``."""
        return [name for name in self if fnmatch.fnmatch(name, pattern)]

    def __str__(self):
        """Return one line per entry, sorted by name, with a readable size."""
        rows = []
        for name in sorted(self):
            size, unit = format_size(self[name]['size']).split(' ')
            rows.append(f'{name:50s} {float(size):6.1f} {unit}')
        return '\n'.join(rows)
38 |
39 |
class Repository(object):
    """Catalogue of the files and containers available at a remote URL.

    Parameters
    ----------
    catalogue_file : str
        Path to a YAML file with the top-level keys 'url', 'index' and
        'containers'.
    checksum_file : str, optional
        Path to a file holding the expected hash of ``catalogue_file``
        (as computed by ``file_hash``). When given, the catalogue is
        verified before it is parsed.

    Raises
    ------
    FileNotFoundError
        If one of the given files cannot be opened.
    RuntimeError
        If the checksum does not match, the catalogue is not a mapping,
        or a required key is missing.
    """

    def __init__(self, catalogue_file, checksum_file=None):
        if checksum_file is not None:
            with open(checksum_file, 'r') as fh:
                # tolerate surrounding whitespace (e.g. a trailing
                # newline added by an editor) in the checksum file
                expected = fh.read().strip()
                if file_hash(catalogue_file) != expected:
                    raise RuntimeError(
                        'Checksums do not match, check your catalogue files!')
        self.catalogue_file = catalogue_file
        with open(self.catalogue_file, 'r') as fh:
            data = safe_load(fh)
        # an empty or malformed YAML file does not parse to a mapping;
        # report it with the same error type as the other catalogue
        # problems instead of failing on the membership test below
        if not isinstance(data, dict):
            raise RuntimeError(
                'Cannot build repository catalogue: '
                'the catalogue file does not contain a mapping')
        for key in ('url', 'index', 'containers'):
            if key not in data:
                raise RuntimeError(
                    f'Cannot build repository catalogue without the {key} key')
        self.url = data['url']
        self.index = Category(data['index'])
        self.containers = Category(data['containers'])
        # session object, created on first use by _get_connection()
        self._connection = None

    def lookup(self, key):
        """Return ``(location, metadata)`` for ``key``.

        ``location`` is 'index' or 'containers'; the index takes
        precedence. Raises LoadError if the key is in neither.
        """
        if key in self.index:
            return 'index', self.index[key]
        if key in self.containers:
            return 'containers', self.containers[key]
        raise LoadError(key, 'file not in repository catalogue')

    def size(self, key):
        """Return the 'size' entry recorded for ``key``."""
        _, data = self.lookup(key)
        return data['size']

    def hash(self, key):
        """Return the 'hash' entry recorded for ``key``."""
        _, data = self.lookup(key)
        return data['hash']

    def search(self, pattern):
        """Return the sorted names of all files/containers matching ``pattern``."""
        index = set(self.index.search(pattern))
        containers = set(self.containers.search(pattern))
        return sorted(index | containers)

    def stack(self, pattern):
        """Return one dict (file, size, unpack) per match of ``pattern``.

        ``unpack`` is True for entries found in the containers category.
        """
        stack = []
        for file in self.search(pattern):
            location, data = self.lookup(file)
            stack.append(
                dict(
                    file=file,
                    size=data['size'],
                    unpack=location == 'containers'))
        return stack

    def _get_connection(self):
        """Return the requests session, creating it on first use."""
        if self._connection is None:
            self._connection = requests.session()
        return self._connection

    def __str__(self):
        string = f'Repository: {self.url}\n'
        string += f'Files:\n{self.index}\n'
        string += f'Containers:\n{self.containers}'
        return string
98 |
--------------------------------------------------------------------------------
/bin/mdshare-index-maker.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # This file is part of the markovmodel/mdshare project.
4 | # Copyright (C) 2017-2019 Computational Molecular Biology Group,
5 | # Freie Universitaet Berlin (GER)
6 | #
7 | # This program is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Lesser General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 |
20 | from mdshare import fetch, Repository
21 | from mdshare.utils import file_hash
22 | from argparse import ArgumentParser
23 | from yaml import load, dump
24 | import fnmatch
25 | import tarfile
26 | import os
27 |
28 |
def filter_files(files, patterns):
    """Keep only those files which match at least one pattern.

    Returns a sorted list without duplicates; ``patterns`` are
    unix-style wildcards as understood by ``fnmatch``.
    """
    matched = set()
    for pattern in patterns:
        matched.update(fnmatch.filter(files, pattern))
    return sorted(matched)
36 |
37 |
def get_metadata(file):
    """Return a dict with the 'hash' and the 'size' (in bytes) of a file."""
    return {
        'hash': file_hash(file),
        'size': os.path.getsize(file),
    }
43 |
44 |
def make_container(container, files):
    """Write the given files into a gzip-compressed tar archive.

    ``container`` is the path of the archive to create; every entry of
    ``files`` is added under the path it was given as.
    """
    with tarfile.open(container, mode='w:gz') as archive:
        for member in files:
            archive.add(member)
50 |
51 |
def build(template_file):
    """Build the catalogue and checksum files from the given template.

    The template is a YAML mapping with the keys 'name', 'url',
    'include' and 'containers'. Files in the current directory matching
    'include' are indexed, the containers are created as .tar.gz
    archives in the current directory, and the results are written to
    NAME.yaml and NAME.md5.

    Raises
    ------
    RuntimeError
        If the template is not a mapping or lacks a required key.
    """
    # Calling yaml.load() without an explicit Loader is deprecated and
    # rejected by recent PyYAML releases; the template is plain data, so
    # the safe loader is sufficient.
    from yaml import safe_load

    with open(template_file, 'r') as fh:
        template = safe_load(fh)

    if not isinstance(template, dict):
        raise RuntimeError('Cannot build without a template mapping')
    # 'name' is validated up front so that a missing entry is reported
    # before any container has been written
    for key in ('name', 'url', 'include', 'containers'):
        if key not in template:
            raise RuntimeError(f'Cannot build without {key} key')

    db = dict(
        url=template['url'],
        index=dict(),
        containers=dict())

    files = filter_files(os.listdir(), template['include'])
    for file in files:
        db['index'][file] = get_metadata(file)

    for container, patterns in template['containers'].items():
        make_container(container, filter_files(files, patterns))
        db['containers'][container] = get_metadata(container)

    catalogue = f'{template["name"]}.yaml'
    with open(catalogue, 'w') as fh:
        fh.write(dump(db))

    # the checksum is computed on the catalogue exactly as written above
    checksum = f'{template["name"]}.md5'
    with open(checksum, 'w') as fh:
        fh.write(file_hash(catalogue))

    print(f'catalogue written to: {catalogue}')
    print(f'checksum written to: {checksum}')
84 |
85 |
def test(catalogue_file, checksum_file):
    """Fetch every file and container listed in a catalogue, then clean up.

    Downloads go into a freshly created 'mdshare-testing-area'
    directory, which is removed again at the end.
    """
    repo = Repository(catalogue_file, checksum_file)
    sandbox = 'mdshare-testing-area'
    os.mkdir(sandbox)
    fetch_options = dict(working_directory=sandbox, repository=repo)

    for name in repo.index:
        os.remove(fetch(name, **fetch_options))

    for name in repo.containers:
        fetched = fetch(name, **fetch_options)
        try:
            os.remove(fetched)
        except TypeError:
            # not a single path: treat the result as a collection of paths
            for member in fetched:
                os.remove(member)

    os.rmdir(sandbox)
107 |
108 |
if __name__ == '__main__':
    # Command line: MODE FILE [FILE]
    parser = ArgumentParser()
    argument_specs = (
        ('mode', dict(
            help='action to take [ build | test ]',
            metavar='MODE')),
        ('yaml', dict(
            help='yaml file with catalogue or catalogue template',
            metavar='FILE')),
        ('md5', dict(
            help='md5 checksum file of the catalogue',
            metavar='FILE',
            nargs='?')))
    for name, options in argument_specs:
        parser.add_argument(name, **options)
    args = parser.parse_args()

    # the mode is matched case-insensitively
    mode = args.mode.lower()
    if mode == 'build':
        build(args.yaml)
    elif mode == 'test':
        test(args.yaml, args.md5)
    else:
        raise ValueError(f'Unsupported mode: {args.mode}')
132 |
--------------------------------------------------------------------------------
/mdshare/test/test_utils.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/mdshare project.
2 | # Copyright (C) 2017-2019 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import string
19 | import random
20 | import pytest
21 | import os
22 | from .. import default_repository as REPO
23 | from ..utils import LoadError
24 | from ..utils import file_hash
25 | from ..utils import url_join
26 | from ..utils import download_file
27 | from ..utils import attempt_to_download_file
28 | from ..utils import download_wrapper
29 |
30 |
31 | REPO_URL = REPO.url.rstrip('/')
32 | FILE = 'mdshare-test-00.txt'
33 | HASH = '5cbb04531c2e9fa7cc1e5d83195a2f81'
34 |
35 |
def local_file():
    """Return './<name>' where <name> is 20 random ASCII letters."""
    letters = (random.choice(string.ascii_letters) for _ in range(20))
    return os.path.join('.', ''.join(letters))
40 |
41 |
def file_check(file):
    """Delete ``file`` and raise AssertionError if its hash was not HASH."""
    # hash first, then remove, so the file is gone even when the check fails
    matches = file_hash(file) == HASH
    os.remove(file)
    if not matches:
        raise AssertionError()
47 |
48 |
def test_file_hash():
    """file_hash of the LICENSE file must equal the known reference value."""
    # 'LICENSE' is resolved relative to the current working directory
    expected = 'bb3ca60759f3202f1ae42e3519cd06bc'
    if file_hash('LICENSE') != expected:
        raise AssertionError()
52 |
53 |
def test_file_hash_break():
    """file_hash must raise for None and for 'THIS IS NOT A FILE'."""
    expectations = (
        (TypeError, None),
        (FileNotFoundError, 'THIS IS NOT A FILE'))
    for error, argument in expectations:
        with pytest.raises(error):
            file_hash(argument)
59 |
60 |
def test_url_join():
    """url_join must give the same URL regardless of redundant slashes.

    Covers no slash, single and double trailing slashes on the base,
    and single and double leading slashes on the file part.
    """
    url = f'{REPO_URL}/{FILE}'
    cases = (
        (REPO_URL, FILE),
        # single trailing slash on the base only
        (f'{REPO_URL}/', FILE),
        (REPO_URL, f'/{FILE}'),
        (f'{REPO_URL}/', f'/{FILE}'),
        (f'{REPO_URL}//', FILE),
        (REPO_URL, f'//{FILE}'))
    for base, path in cases:
        if url_join(base, path) != url:
            raise AssertionError()
75 |
76 |
def test_url_join_break():
    """url_join must raise AttributeError when either part is not a string."""
    bad_calls = (
        (REPO_URL, None),
        (None, FILE),
        (REPO_URL, 1),
        (1, FILE))
    for base, path in bad_calls:
        with pytest.raises(AttributeError):
            url_join(base, path)
86 |
87 |
def test_download_file():
    """download_file must deliver the test file with the expected hash."""
    target = local_file()
    downloaded = download_file(REPO, FILE, target)
    file_check(downloaded)
90 |
91 |
def test_download_file_break():
    """download_file must raise for bad file names and bad repositories."""
    bad_calls = (
        (LoadError, REPO, None),
        (LoadError, REPO, 'not-an-existing-file'),
        (AttributeError, None, FILE),
        (AttributeError, 'not-a-repository', FILE))
    for error, repository, file in bad_calls:
        with pytest.raises(error):
            download_file(repository, file, local_file())
101 |
102 |
def test_attempt_to_download_file():
    """attempt_to_download_file must work with default and custom max_attempts."""
    for options in ({}, {'max_attempts': 10}):
        downloaded = attempt_to_download_file(
            REPO, FILE, local_file(), **options)
        file_check(downloaded)
108 |
109 |
def test_attempt_to_download_file_break():
    """attempt_to_download_file must raise the expected error per bad call."""
    bad_calls = (
        (LoadError, (REPO, None, local_file()), {}),
        (AttributeError, (None, FILE, local_file()), {}),
        # the current directory is not a valid target file
        (IsADirectoryError, (REPO, FILE, '.'), {}),
        (LoadError, (REPO, FILE, local_file()), {'max_attempts': 0}),
        (LoadError, (REPO, 'not-an-existing-file', local_file()), {}))
    for error, arguments, options in bad_calls:
        with pytest.raises(error):
            attempt_to_download_file(*arguments, **options)
123 |
124 |
def test_download_wrapper():
    """Exercise download_wrapper with defaults, retries, a stale file and force."""
    for options in ({}, {'max_attempts': 10}):
        file_check(download_wrapper(REPO, FILE, **options))
    # Overwrite the local copy: without force the wrapper must hand back the
    # existing (now wrong) file instead of downloading it again.
    with open(FILE, 'w') as stale:
        stale.write('nonsense content')
    stale_path = download_wrapper(REPO, FILE)
    if file_hash(stale_path) == HASH:
        raise AssertionError()
    refreshed_path = download_wrapper(REPO, FILE, force=True)
    file_check(refreshed_path)
133 |
134 |
def test_download_wrapper_break():
    """Check which error download_wrapper raises for each bad argument."""
    failing_calls = (
        (TypeError, (REPO, None), {}),
        (LoadError, (REPO, 'not-an-existing-file'), {}),
        (RuntimeError, (REPO, FILE), {'working_directory': None}),
        (AttributeError, (None, FILE), {}),
        (AttributeError, ('not-a-repository', FILE), {}),
        (LoadError, (REPO, FILE), {'max_attempts': 0}),
    )
    for expected_error, args, options in failing_calls:
        with pytest.raises(expected_error):
            download_wrapper(*args, **options)
148 |
--------------------------------------------------------------------------------
/mdshare/data/mdshare-catalogue.yaml:
--------------------------------------------------------------------------------
1 | containers:
2 | mdshare-test.tar.gz:
3 | hash: 8eda06f1af3760ee788101ecb59dba69
4 | size: 232
5 | pyemma-tutorial-livecoms.tar.gz:
6 | hash: 71eebc44c37825fbedb87b35d3b76587
7 | size: 123939712
8 | index:
9 | alanine-dipeptide-0-250ns-nowater.xtc:
10 | hash: e82ba584d8e64491f30bd1d9dd019687
11 | size: 42909936
12 | alanine-dipeptide-1-250ns-nowater.xtc:
13 | hash: 96b7686aa28a459d51a0d77c2dd0316e
14 | size: 42911308
15 | alanine-dipeptide-1Mx1ps-with-force.npz:
16 | hash: c35a55fb5e5cec0d49e4275d62493ab9
17 | size: 463525559
18 | alanine-dipeptide-2-250ns-nowater.xtc:
19 | hash: 6d3a8d9aecb3aa0e0a1ea9b195b7dfe0
20 | size: 42907500
21 | alanine-dipeptide-3x250ns-backbone-dihedrals.npz:
22 | hash: 3fa7c72ba512213a3860f9691602cdc7
23 | size: 6000544
24 | alanine-dipeptide-3x250ns-heavy-atom-distances.npz:
25 | hash: c8d108087fdce3ccb0f8e30e76858925
26 | size: 135000544
27 | alanine-dipeptide-3x250ns-heavy-atom-positions.npz:
28 | hash: 55bd66fe4ee49849dead7faf6bf8b71e
29 | size: 90000544
30 | alanine-dipeptide-nowater.pdb:
31 | hash: 728635667ed4937cf4a0e5b7c801d9ea
32 | size: 1813
33 | alanine_dipeptide_parallel_tempering_dihedrals.npz:
34 | hash: db21a645b6aa39db4095fa0a5a61c8cc
35 | size: 3365062
36 | alanine_dipeptide_parallel_tempering_energies.npz:
37 | hash: b7b58af1dbd92168d63fac2376edf9cb
38 | size: 1685062
39 | doublewell_disconnected.npy:
40 | hash: 26717c09a92cf96cda412a8ab0119360
41 | size: 160096
42 | doublewell_oneway.npy:
43 | hash: 2843dc5108ffc5f2adf9d3d1c8cce804
44 | size: 160096
45 | hmm-doublewell-2d-100k.npz:
46 | hash: aaf37fb708a0f8f82f70d3aa06da205f
47 | size: 2000638
48 | imd_channel_transitionmatrix.npy:
49 | hash: fdbbd2376541e4be13c503cfa13e789c
50 | size: 8320
51 | imd_full_system_trajectory.npy:
52 | hash: 1d77481bc2d6527016e4791a01349347
53 | size: 8000128
54 | mdshare-test-00.txt:
55 | hash: 5cbb04531c2e9fa7cc1e5d83195a2f81
56 | size: 33
57 | methane-dimer-umbrella-sampling.npz:
58 | hash: e494f8bf0da3283c2d6a6ce9ce10c989
59 | size: 193346
60 | pentapeptide-00-500ns-impl-solv.xtc:
61 | hash: 16967d0bb09d24dc66de4d4885f953a4
62 | size: 2221296
63 | pentapeptide-01-500ns-impl-solv.xtc:
64 | hash: 9db9b87ecafb2d5eb0085adc01b97680
65 | size: 2221268
66 | pentapeptide-02-500ns-impl-solv.xtc:
67 | hash: 5ec89b72fa1e48c3a5e1b1ec707f66fb
68 | size: 2221392
69 | pentapeptide-03-500ns-impl-solv.xtc:
70 | hash: 726208c33f9c9ce0eb2688e5788b8d57
71 | size: 2221596
72 | pentapeptide-04-500ns-impl-solv.xtc:
73 | hash: 934d06ed03744c8cada2123a2cfd6fbf
74 | size: 2221604
75 | pentapeptide-05-500ns-impl-solv.xtc:
76 | hash: 395f614244f3d484db8d4a50f35252fd
77 | size: 2221020
78 | pentapeptide-06-500ns-impl-solv.xtc:
79 | hash: 3a94a06657b5bd7cadf6c3767fc4ca18
80 | size: 2221088
81 | pentapeptide-07-500ns-impl-solv.xtc:
82 | hash: 82c633ed92112bf62bb7070f4393ac2a
83 | size: 2221376
84 | pentapeptide-08-500ns-impl-solv.xtc:
85 | hash: c2f49e03f4c8ef6c8315d844d5d1be0e
86 | size: 2220668
87 | pentapeptide-09-500ns-impl-solv.xtc:
88 | hash: ea74fa65dcb80a086a15c34a907495f3
89 | size: 2221668
90 | pentapeptide-10-500ns-impl-solv.xtc:
91 | hash: 7b374e2402a37f139817be3137e6509e
92 | size: 2221300
93 | pentapeptide-11-500ns-impl-solv.xtc:
94 | hash: 0fe84f969978a492fe26dbb3bd39c6ee
95 | size: 2221672
96 | pentapeptide-12-500ns-impl-solv.xtc:
97 | hash: 8ddaaf213a4b4d34e92cbfbf6f3daad5
98 | size: 2221012
99 | pentapeptide-13-500ns-impl-solv.xtc:
100 | hash: 21ced9d0791a4330c2714a408c6b7b63
101 | size: 2222168
102 | pentapeptide-14-500ns-impl-solv.xtc:
103 | hash: ecd2213ac5de68ef0fc37e9723269d34
104 | size: 2221316
105 | pentapeptide-15-500ns-impl-solv.xtc:
106 | hash: f41d2a6e283d3b43fdf14aa3e95eff03
107 | size: 2221780
108 | pentapeptide-16-500ns-impl-solv.xtc:
109 | hash: 30c8f0805bfa248934714bfb663c4196
110 | size: 2221404
111 | pentapeptide-17-500ns-impl-solv.xtc:
112 | hash: 82c52922cb585962a0b1cab70cc32645
113 | size: 2221296
114 | pentapeptide-18-500ns-impl-solv.xtc:
115 | hash: 6906793b17978370a59025b6c2080ae1
116 | size: 2220924
117 | pentapeptide-19-500ns-impl-solv.xtc:
118 | hash: 4f7948b2f6a5515666b495a1054326c4
119 | size: 2220900
120 | pentapeptide-20-500ns-impl-solv.xtc:
121 | hash: f4f3c376e90826d3dd43bcdf08910863
122 | size: 2221344
123 | pentapeptide-21-500ns-impl-solv.xtc:
124 | hash: e5d2467545d04778d6005c2c904e6df3
125 | size: 2221000
126 | pentapeptide-22-500ns-impl-solv.xtc:
127 | hash: b2cdf3759057ef1a9ccbf9ab41044efa
128 | size: 2220708
129 | pentapeptide-23-500ns-impl-solv.xtc:
130 | hash: 1169c0f9efd028155519fa9bf2a85971
131 | size: 2221592
132 | pentapeptide-24-500ns-impl-solv.xtc:
133 | hash: 8d016a9ea6a5b63843fca58b82b4573c
134 | size: 2220908
135 | pentapeptide-impl-solv.pdb:
136 | hash: c52f482024e0ec7dcd64f2b925b53c2b
137 | size: 7501
138 | pyemma-tutorial-mt-data.npz:
139 | hash: 02f7b91ea2cac71b762f85bdbc4086e9
140 | size: 4763982
141 | pyemma-tutorial-tpt-data.npz:
142 | hash: e3d95283e915cc7992aaefd30cea5462
143 | size: 512952
144 | pyemma-tutorial-us-data.npz:
145 | hash: eb36cfe14a61a5d9a7d5b57ef7f829e8
146 | size: 1622382
147 | pyemma-tutorial-us-nacl.npz:
148 | hash: fca286bd0ffb8f30315e27bff2c1a772
149 | size: 241062
150 | url: http://ftp.imp.fu-berlin.de/pub/cmb-data/
151 |
--------------------------------------------------------------------------------
/mdshare/utils.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/mdshare project.
2 | # Copyright (C) 2017-2019 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import os
19 | import sys
20 | import logging
21 | from requests import HTTPError
22 | from hashlib import md5
23 |
24 |
class LoadError(KeyError):
    """Error raised when a file cannot be looked up or loaded.

    Arguments:
        file (str): name of the affected file
        message (str): short description of what went wrong
    """

    def __init__(self, file, message, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.file = file
        self.message = message

    def __str__(self):
        return '{} [{}]'.format(self.file, self.message)
33 |
34 |
def file_hash(file, chunk_size=65536):
    """Return the hexadecimal MD5 digest of a file's content.

    Arguments:
        file (str): path of the file to be hashed
        chunk_size (int): size of chunks to read
    """
    digest = md5()
    with open(file, 'rb') as stream:
        # iter() with a sentinel keeps reading until read() yields b'' (EOF).
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
50 |
51 |
def url_join(repository_url, file):
    """Join a repository URL and a file name with exactly one slash.

    Trailing slashes of the URL and leading slashes of the file name are
    dropped before joining.

    Arguments:
        repository_url (str): url of the repository
        file (str): name of the file in the repository
    """
    base = repository_url.rstrip("/")
    name = file.lstrip("/")
    return f'{base}/{name}'
60 |
61 |
def download_file(repository, file, local_path, callback=None):
    """Download a file and verify its checksum.

    Arguments:
        repository (Repository): repository object
        file (str): name of the file in the repository
        local_path (str): local path where the file should be saved
        callback (callable): callback function, called after each written
            block with the block index and the block size

    Returns local_path; raises LoadError if the MD5 checksum of the written
    file differs from the hash recorded in the repository.
    """
    location, metadata = repository.lookup(file)
    logging.debug(
        f'Repository::{location}::{file} has checksum {metadata["hash"]}'
        f' and size {metadata["size"]}')
    logging.debug(
        f'From <{repository.url}> download <{file}> to <{local_path}>')
    connection = repository._get_connection()
    response = connection.get(url_join(repository.url, file), stream=True)
    blocksize = 8 * 1024
    with open(local_path, 'wb') as target:
        blocks = response.iter_content(blocksize)
        for block_index, block in enumerate(blocks):
            target.write(block)
            if callback is None:
                continue
            callback(block_index, blocksize)
    checksum = file_hash(local_path)
    logging.debug(f'Loaded file {local_path} has {checksum}')
    if checksum == metadata['hash']:
        return local_path
    raise LoadError(file, 'checksum test failed')
91 |
92 |
def attempt_to_download_file(
        repository,
        file,
        local_path,
        max_attempts=3,
        callback=None):
    """Download a file and clean up after a failed attempt.

    Arguments:
        repository (Repository): repository object
        file (str): name of the file in the repository
        local_path (str): local path where the file should be saved
        max_attempts (int): number of download attempts
        callback (callable): callback function

    Returns the path reported by download_file.

    Raises:
        LoadError: no download was attempted (max_attempts < 1); errors
            raised by download_file itself propagate unchanged
        HTTPError, IOError, KeyboardInterrupt: re-raised after the error was
            reported on stderr and the faulty local file was removed
    """
    attempt = 0
    filename = None

    def discard_faulty_file():
        # Remove whatever the failed attempt left at local_path. The path
        # returned by download_file is not available when it raised, so the
        # requested target path has to be used here.
        try:
            os.unlink(local_path)
        except FileNotFoundError:
            # nothing was written, so there is nothing to clean up
            pass
        except OSError:
            print(
                f'warning: could not remove file {local_path}',
                file=sys.stderr)

    # NOTE: every handled error is re-raised, so a failed attempt is not
    # retried; callers rely on the original exception type reaching them.
    while attempt < max_attempts:
        attempt += 1
        logging.debug(f'download attempt {attempt}/{max_attempts} ...')
        try:
            filename = download_file(
                repository,
                file,
                local_path,
                callback=callback)
            break
        except (HTTPError, IOError, KeyboardInterrupt) as error:
            print(f'error: {error}', file=sys.stderr)
            discard_faulty_file()
            raise
    if filename is None:
        raise LoadError(file, 'download failed')
    return filename
135 |
136 |
def download_wrapper(
        repository, file, working_directory='.',
        max_attempts=3, force=False, callback=None):
    """Return the local path of a file, downloading it when required.

    An already existing local file is returned as it is unless force is set.

    Arguments:
        repository (Repository): repository object
        file (str): name of the file in the repository
        working_directory (str): directory where the file should be saved
        max_attempts (int): number of download attempts
        force (boolean): enforce download even if file exists
        callback (callable): callback function
    """
    logging.debug(
        f'download_wrapper({repository.url}, {file},'
        f' working_directory="{working_directory}",'
        f' max_attempts={max_attempts}, force={force})')
    if working_directory is None:
        raise RuntimeError(
            'working_directory=None is illegal at this point')
    local_path = os.path.join(working_directory, file)
    logging.debug(f'local_path={local_path}')
    already_present = os.path.exists(local_path)
    if force or not already_present:
        logging.debug(
            f'local_path={local_path} does not exist ... attempting download')
        return attempt_to_download_file(
            repository,
            file,
            local_path,
            max_attempts=max_attempts,
            callback=callback)
    logging.debug(f'local_path={local_path} exists ... return')
    return local_path
170 |
--------------------------------------------------------------------------------
/mdshare/test/test_repository.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/mdshare project.
2 | # Copyright (C) 2017-2019 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import random
19 | import string
20 | import pytest
21 | import os
22 | from yaml import dump
23 | from ..utils import LoadError
24 | from ..utils import file_hash
25 | from ..repository import Category
26 | from ..repository import Repository
27 |
28 |
def randomizer(length, pattern=None):
    """Return a random string of ASCII letters.

    Without a pattern the result is `length` random letters; with a pattern
    the same random sample is placed on both sides of it, separated by
    dashes.
    """
    letters = (random.choice(string.ascii_letters) for _ in range(length))
    sample = ''.join(letters)
    return sample if pattern is None else f'{sample}-{pattern}-{sample}'
35 |
36 |
def make_random_category_dict(n, m):
    """Create random category data: n patterns with m file names each.

    Returns the patterns, the file names and a dict which maps every file
    name to a random metadata dict with the keys 'size' and 'hash'.
    """
    patterns = []
    for index in range(n):
        patterns.append(randomizer(10) + str(index))
    files = [
        randomizer(10, pattern) + str(index)
        for pattern in patterns
        for index in range(m)]
    data = {}
    for name in files:
        size = random.randint(100, 100000)
        digest = randomizer(32)
        data[name] = {'size': size, 'hash': digest}
    return patterns, files, data
49 |
50 |
class RandomCatalogue(object):
    """Context manager which writes a random catalogue and its checksum.

    Entering writes '<file>.yaml' and '<file>.md5' and yields the catalogue
    data together with the random file stem; leaving removes both files.
    Modes 1, 2 and 3 drop the 'url', 'index' or 'containers' entry; mode 4
    appends random characters to the stored checksum.
    """

    def __init__(self, npattern, nentries, ncontainers, mode=0):
        index = make_random_category_dict(npattern, nentries)[-1]
        containers = make_random_category_dict(ncontainers, 1)[-1]
        url = f'http://{randomizer(10)}.{randomizer(3)}/{randomizer(7)}'
        self.data = {'url': url, 'index': index, 'containers': containers}
        self.offset = ''
        for code, key in ((1, 'url'), (2, 'index'), (3, 'containers')):
            if mode == code:
                del self.data[key]
                break
        else:
            if mode == 4:
                self.offset = randomizer(42)
        self.file = randomizer(25)

    def __enter__(self):
        catalogue_path = f'{self.file}.yaml'
        checksum_path = f'{self.file}.md5'
        with open(catalogue_path, 'w') as catalogue_fh:
            catalogue_fh.write(dump(self.data))
        with open(checksum_path, 'w') as checksum_fh:
            checksum_fh.write(file_hash(catalogue_path) + self.offset)
        return self.data, self.file

    def __exit__(self, exception_type, exception_value, traceback):
        for suffix in ('.yaml', '.md5'):
            os.remove(f'{self.file}{suffix}')
80 |
81 |
def test_category():
    """Check keys, pattern search, item access and str() of a Category."""

    def fail_if(condition):
        if condition:
            raise AssertionError()

    n, m = random.randint(2, 6), random.randint(2, 6)
    patterns, files, data = make_random_category_dict(n, m)
    category = Category(data)
    fail_if(len(category.keys()) != n * m)
    for pattern in patterns:
        fail_if(len(category.search(f'*-{pattern}-*')) != m)
        # a pattern stripped of its first and last character must not match
        fail_if(len(category.search(f'*-{pattern[1:-1]}-*')) != 0)
    listing = str(category)
    for name in files:
        for key in ('hash', 'size'):
            fail_if(category[name][key] != data[name][key])
        fail_if(name not in listing)
100 |
101 |
def test_repository():
    """Load a random catalogue and compare the Repository against its data."""

    def fail_if(condition):
        if condition:
            raise AssertionError()

    sizes = [random.randint(2, 7) for _ in range(3)]
    with RandomCatalogue(*sizes, mode=0) as (data, stem):
        repository = Repository(f'{stem}.yaml', f'{stem}.md5')
        listing = str(repository)
        fail_if(repository.url != data['url'])
        fail_if(data['url'] not in listing)
        # the same checks apply to both sections of the catalogue
        for section in ('index', 'containers'):
            for name in data[section]:
                expected = data[section][name]
                fail_if(name not in listing)
                fail_if(name not in getattr(repository, section))
                location, metadata = repository.lookup(name)
                fail_if(location != section)
                fail_if(metadata['size'] != expected['size'])
                fail_if(metadata['hash'] != expected['hash'])
                fail_if(repository.size(name) != expected['size'])
                fail_if(repository.hash(name) != expected['hash'])
143 |
144 |
def test_repository_break():
    """Check the errors for broken catalogues and for unknown file names."""
    for mode in (1, 2, 3, 4):
        with RandomCatalogue(4, 3, 2, mode=mode) as (_, stem):
            with pytest.raises(RuntimeError):
                Repository(f'{stem}.yaml', f'{stem}.md5')
    sizes = [random.randint(2, 7) for _ in range(3)]
    with RandomCatalogue(*sizes, mode=0) as (data, stem):
        repository = Repository(f'{stem}.yaml', f'{stem}.md5')
        for section in ('index', 'containers'):
            for name in data[section]:
                # a name without its first and last character is unknown
                truncated = name[1:-1]
                with pytest.raises(LoadError):
                    repository.lookup(truncated)
--------------------------------------------------------------------------------
/mdshare/api.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/mdshare project.
2 | # Copyright (C) 2017-2019 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import os
19 | import sys
20 | import tarfile
21 | from tempfile import mkdtemp
22 | from .utils import LoadError, download_wrapper
23 | from .repository import Repository
24 | from . import default_repository
25 |
26 |
def load_repository(catalogue_file, checksum_file=None):
    """Create a Repository object from a catalogue file.

    Arguments:
        catalogue_file (str): filename of the catalogue
        checksum_file (str): filename of the catalogue's checksum
    """
    repository = Repository(catalogue_file, checksum_file)
    return repository
35 |
36 |
def search(filename_pattern, repository=None):
    """Search a repository for files matching a filename pattern.

    Arguments:
        filename_pattern (str): filename pattern, allows for Unix shell-style wildcards
        repository (Repository): repository object; default_repository is
            used if None

    Returns the result of repository.search(filename_pattern).

    Raises:
        TypeError: repository is neither None nor a Repository
    """
    if repository is None:
        repository = default_repository
    if not isinstance(repository, Repository):
        # f-string, so that the message names the offending type
        raise TypeError(
            f'received {type(repository)} instead of Repository')
    return repository.search(filename_pattern)
49 |
50 |
def catalogue(repository=None):
    """Print the string representation of a repository.

    Arguments:
        repository (Repository): repository object; default_repository is
            used if None

    Raises:
        TypeError: repository is neither None nor a Repository
    """
    if repository is None:
        repository = default_repository
    if not isinstance(repository, Repository):
        # f-string, so that the message names the offending type
        raise TypeError(
            f'received {type(repository)} instead of Repository')
    print(repository)
62 |
63 |
def fetch(
        remote_filename, working_directory='.', repository=None,
        max_attempts=3, force=False, show_progress=True):
    """Download a file if it is not already at the target location.

    Arguments:
        remote_filename (str): name of the file in the repository
        working_directory (str): directory where the file should be saved;
            a fresh temporary directory is created if None
        repository (Repository): repository object; default_repository is
            used if None
        max_attempts (int): number of download attempts
        force (boolean): enforce download even if file exists
        show_progress (boolean): show download progress (only effective if
            the optional progress_reporter module can be imported)

    Returns the local path if there is exactly one resulting file, otherwise
    a list of local paths. Items flagged with 'unpack' are extracted and
    contribute the paths of their top-level members instead of the archive.

    Raises:
        TypeError: repository is neither None nor a Repository
        LoadError: remote_filename has no match in the repository
    """
    if repository is None:
        repository = default_repository
    if not isinstance(repository, Repository):
        # f-string, so that the message names the offending type
        raise TypeError(
            f'received {type(repository)} instead of Repository')
    if working_directory is None:
        working_directory = mkdtemp()
    else:
        os.makedirs(working_directory, exist_ok=True)
    try:
        import progress_reporter
        have_progress_reporter = True
    except ImportError:
        have_progress_reporter = False

    stack = repository.stack(remote_filename)
    if len(stack) == 0:
        raise LoadError(remote_filename, 'no match in repository')

    if have_progress_reporter and show_progress:
        callbacks = []
        pg = progress_reporter.ProgressReporter_()
        total = sum(item['size'] for item in stack)

        def update(n, blk, stage):
            # download_file reports (block index, block size); turn this
            # into an increment relative to what the bar already shows
            downloaded = n * blk
            inc = max(
                0, downloaded - pg._prog_rep_progressbars[stage].n)
            pg.update(inc, stage=stage)
            # total progress
            try:
                pg.update(inc, stage=-1)
            except RuntimeError:
                pass

        from functools import partial
        tqdm_args = dict(unit='B', file=sys.stdout, unit_scale=True)

        n_progress_bars = 0
        for stage, item in enumerate(stack):
            # working_directory cannot be None here: it was replaced by a
            # temporary directory above
            path = os.path.join(working_directory, item['file'])
            if os.path.exists(path) and not force:
                # nothing will be downloaded, so no progress bar is needed
                callbacks.append(None)
            else:
                pg.register(
                    item['size'],
                    description=f'downloading {item["file"]}',
                    tqdm_args=tqdm_args,
                    stage=stage)
                callbacks.append(partial(update, stage=stage))
                n_progress_bars += 1
        if n_progress_bars > 1:
            pg.register(
                total, description='total', tqdm_args=tqdm_args, stage=-1)
    else:
        # stand-in whose context() can be entered without any effect
        from unittest.mock import MagicMock
        pg = MagicMock()
        callbacks = [None] * len(stack)

    def inspect(members):
        # yield only the members at the top level of the archive
        for member in members:
            path, filename = os.path.split(member.name)
            if path == '':
                yield member, filename

    result = []
    with pg.context():
        for item, progress in zip(stack, callbacks):
            file = download_wrapper(
                repository,
                item['file'],
                working_directory=working_directory,
                max_attempts=max_attempts,
                force=force,
                callback=progress)
            if item['unpack']:
                # NOTE(review): extractall is called without an extraction
                # filter; this relies on the archive being trustworthy
                # after the checksum test -- confirm.
                with tarfile.open(file, 'r:gz') as fh:
                    members = []
                    for member, filename in inspect(fh):
                        members.append(member)
                        result.append(
                            os.path.join(working_directory, filename))
                    fh.extractall(
                        path=working_directory, members=members)
                os.remove(file)
            else:
                result.append(file)

    if len(result) == 0:
        raise LoadError(remote_filename, 'this should not have happend!')
    elif len(result) == 1:
        return result[0]
    return result
171 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
--------------------------------------------------------------------------------