├── mdshare ├── test │ ├── __init__.py │ ├── test_api.py │ ├── test_utils.py │ └── test_repository.py ├── data │ ├── mdshare-catalogue.md5 │ ├── template.yaml │ └── mdshare-catalogue.yaml ├── __init__.py ├── repository.py ├── utils.py └── api.py ├── .gitattributes ├── .git_archival.txt ├── requirements.txt ├── setup.cfg ├── MANIFEST.in ├── CHANGELOG.md ├── .circleci └── config.yml ├── .gitignore ├── setup.py ├── README.md ├── bin └── mdshare-index-maker.py └── LICENSE /mdshare/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | .git_archival.txt export-subst -------------------------------------------------------------------------------- /.git_archival.txt: -------------------------------------------------------------------------------- 1 | ref-names: HEAD -> master, tag: 0.4.2 -------------------------------------------------------------------------------- /mdshare/data/mdshare-catalogue.md5: -------------------------------------------------------------------------------- 1 | 3465007cf4a866ac4316b4e8afba4673 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools 2 | setuptools_scm 3 | setuptools_scm_git_archive 4 | pytest 5 | humanfriendly 6 | requests 7 | pyyaml 8 | tqdm -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [alias] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | filterwarnings = 6 | once::DeprecationWarning 7 | once::PendingDeprecationWarning 8 | -------------------------------------------------------------------------------- /MANIFEST.in: 
-------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include CHANGELOG.md 4 | include mdshare/data/mdshare-catalogue.yaml 5 | include mdshare/data/mdshare-catalogue.md5 6 | include mdshare/data/template.md5 7 | 8 | # exclude compiled bytecode 9 | global-exclude *.pyc 10 | # exclude git backup files 11 | global-exclude *.orig 12 | 13 | # do not include eventually present eggs (installed during setup runtime) 14 | prune .eggs 15 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # mdshare — CHANGELOG 2 | 3 | ## 0.1.0 4 | - provides `mdshare.load()` 5 | 6 | ## 0.2.0 7 | - switching to LGPL 8 | - adding tests and CI 9 | 10 | ### 0.2.1 11 | - removing numpy dependency 12 | - code cleanup 13 | 14 | ## 0.3.0 15 | - provides `mdshare.catalogue()` and `mdshare.search()` 16 | - deprecates `mdshare.load()` 17 | - provides `mdshare.fetch()` as successor to `mdshare.load()` 18 | 19 | ### 0.3.1 20 | - updates to documentation and package 21 | 22 | ### 0.3.2 23 | - replaces FTP usage 24 | 25 | ## 0.4.0 26 | - added progressbars 27 | - major refactoring to have an offline catalogue available 28 | - allow to download/extract .tar.gz containers 29 | 30 | ## 0.4.1 31 | - updated file catalogue 32 | 33 | ### upcoming 34 | - removed obsolete/unused imports/variables/... 35 | - replaced assert statements 36 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | # specify the version you desire here 10 | # use `-browsers` prefix for selenium tests, e.g. 
`3.6.1-browsers` 11 | - image: circleci/python:3.6.1 12 | 13 | # Specify service dependencies here if necessary 14 | # CircleCI maintains a library of pre-built images 15 | # documented at https://circleci.com/docs/2.0/circleci-images/ 16 | # - image: circleci/postgres:9.4 17 | 18 | working_directory: ~/repo 19 | 20 | steps: 21 | - checkout 22 | 23 | # Download and cache dependencies 24 | - restore_cache: 25 | keys: 26 | - v1-dependencies-{{ checksum "requirements.txt" }} 27 | # fallback to using the latest cache if no exact match is found 28 | - v1-dependencies- 29 | 30 | - run: 31 | name: install dependencies 32 | command: | 33 | python3 -m venv venv 34 | . venv/bin/activate 35 | pip install -r requirements.txt 36 | 37 | - save_cache: 38 | paths: 39 | - ./venv 40 | key: v1-dependencies-{{ checksum "requirements.txt" }} 41 | 42 | # run tests! 43 | - run: 44 | name: run tests 45 | command: | 46 | . venv/bin/activate 47 | python setup.py test 48 | 49 | - store_artifacts: 50 | path: test-reports 51 | destination: test-reports 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # pytest 104 | .pytest_cache/ 105 | -------------------------------------------------------------------------------- /mdshare/data/template.yaml: -------------------------------------------------------------------------------- 1 | # MDSHARE TEMPLATE FILE 2 | # 3 | # This is the template to compile the current catalogue file. 4 | # Navigate to the directory where the data files are and run 5 | # 6 | # python path/to/mdshare-index-maker.py build path/to/template.yaml 7 | # 8 | # This will create a yaml file (NAME.yaml) and a corresponding MD5 9 | # checksum file (NAME.md5) where NAME corresponds to the 'name' entry 10 | # in the template. 11 | # 12 | # The 'url' entry points to the directory's URL. 13 | # 14 | # 'include' denotes all files in the current directory which should 15 | # be indexed; you can use unix-style wildcard patterns. 
16 | # 17 | # 'containers' denotes which files should be grouped in .tar.gz 18 | # archives; again, you can use unix-style wildcard patterns. The 19 | # files must be part of 'include'. 20 | 21 | name: mdshare-catalogue 22 | url: 'http://ftp.imp.fu-berlin.de/pub/cmb-data/' 23 | include: 24 | - alanine-dipeptide-*.npz 25 | - alanine-dipeptide-*-nowater.xtc 26 | - alanine-dipeptide-nowater.pdb 27 | - pentapeptide-*-500ns-impl-solv.xtc 28 | - pentapeptide-impl-solv.pdb 29 | - pyemma-tutorial-mt-data.npz 30 | - pyemma-tutorial-tpt-data.npz 31 | - pyemma-tutorial-us-data.npz 32 | - pyemma-tutorial-us-nacl.npz 33 | - methane-dimer-umbrella-sampling.npz 34 | - doublewell_disconnected.npy 35 | - doublewell_oneway.npy 36 | - hmm-doublewell-2d-100k.npz 37 | - mdshare-test-00.txt 38 | - imd_channel_transitionmatrix.npy 39 | - imd_full_system_trajectory.npy 40 | - alanine_dipeptide_parallel_tempering_energies.npz 41 | - alanine_dipeptide_parallel_tempering_dihedrals.npz 42 | containers: 43 | pyemma-tutorial-livecoms.tar.gz: 44 | - alanine-dipeptide-*-nowater.xtc 45 | - alanine-dipeptide-nowater.pdb 46 | - pentapeptide-*-500ns-impl-solv.xtc 47 | - pentapeptide-impl-solv.pdb 48 | - doublewell_disconnected.npy 49 | - doublewell_oneway.npy 50 | - hmm-doublewell-2d-100k.npz 51 | mdshare-test.tar.gz: 52 | - mdshare-test-00.txt 53 | -------------------------------------------------------------------------------- /mdshare/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/mdshare project. 2 | # Copyright (C) 2017-2019 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


# Package metadata.
__author__ = 'Christoph Wehmeyer'
__email__ = 'christoph.wehmeyer@fu-berlin.de'
# No trailing comma after the list: a trailing comma would turn the value
# into a 1-tuple that contains the list instead of the list itself.
__credits__ = ['Guillermo Pérez-Hernández', 'Martin K. Scherer']


# Resolve the installed version; fall back to 'unknown' when the package is
# used without being installed (e.g. straight from a source checkout).
from pkg_resources import get_distribution, DistributionNotFound
try:
    __version__ = get_distribution(__name__).version
except DistributionNotFound:
    __version__ = 'unknown'
# Keep the helper names out of the package namespace.
del get_distribution, DistributionNotFound


# Build the default repository from the catalogue shipped in mdshare/data.
# Failures are downgraded to warnings so that importing the package still
# works; default_repository is then None.
from .repository import Repository
from os.path import dirname, join
from warnings import warn
try:
    default_repository = Repository(
        join(dirname(__file__), 'data', 'mdshare-catalogue.yaml'),
        join(dirname(__file__), 'data', 'mdshare-catalogue.md5'))
except FileNotFoundError:
    warn('Cannot build the default repository: missing file(s)!')
    default_repository = None
except RuntimeError as e:
    # Repository raises RuntimeError for checksum mismatches and for
    # catalogues that lack a required key.
    warn(f'Cannot build the default repository: {e.args[0]}')
    default_repository = None
# Keep the helper names out of the package namespace.
del dirname, join, warn


# Public API re-exports.
from .api import load_repository, search, catalogue, fetch
from .utils import LoadError


def load(*args, **kwargs):
    """Placeholder for the former ``mdshare.load()``; use ``fetch`` instead.

    Accepts any arguments and always raises.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError('use fetch')
# Copyright (C) 2017, 2018 Computational Molecular Biology Group,
# Freie Universitaet Berlin (GER)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from setuptools import setup, find_packages
from setuptools.command.test import test as TestCommand
import sys


class PyTest(TestCommand):
    """``python setup.py test`` command that runs the test suite via pytest."""

    user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")]

    def initialize_options(self):
        """Default to running the tests of the ``mdshare`` package."""
        TestCommand.initialize_options(self)
        self.pytest_args = ['mdshare']

    def run_tests(self):
        """Run pytest and exit the process with pytest's return code."""
        # Imported here because pytest is only guaranteed to be available
        # once ``tests_require`` has been processed.
        import shlex
        import pytest
        args = self.pytest_args
        # A value given on the command line (-a / --pytest-args) arrives as
        # one string, whereas the default is a list; pytest.main wants a list.
        if isinstance(args, str):
            args = shlex.split(args)
        errno = pytest.main(args)
        sys.exit(errno)


setup(
    cmdclass={'test': PyTest},
    use_scm_version=True,
    name='mdshare',
    author='Christoph Wehmeyer',
    author_email='christoph.wehmeyer@fu-berlin.de',
    url='https://github.com/markovmodel/mdshare',
    description='Get easy access to our public data files.',
    packages=find_packages(),
    include_package_data=True,
    # The package sources use f-strings.
    python_requires='>=3.6',
    setup_requires=['setuptools_scm', 'setuptools_scm_git_archive'],
    # pyyaml is imported by mdshare/repository.py and by the index-maker
    # script, so it is a runtime dependency.
    # NOTE(review): requirements.txt also lists tqdm; add it here if
    # mdshare/utils.py imports it -- confirm.
    install_requires=[
        'humanfriendly',
        'requests',
        'pyyaml',
    ],
    tests_require=['pytest'],
    zip_safe=False,
    scripts=['bin/mdshare-index-maker.py'],
    classifiers=[
        'Development Status :: 4 - Beta',
        'Environment :: Console',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)',
        'Natural Language :: English',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 3 :: Only',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
        'Topic :: Scientific/Engineering :: Chemistry',
        'Topic :: Scientific/Engineering :: Mathematics',
        'Topic :: Scientific/Engineering :: Physics'])
import pytest
import os
from ..utils import LoadError
from ..utils import file_hash
from ..api import load_repository
from ..api import search
from ..api import catalogue
from ..api import fetch
from .. import default_repository

# Small test file listed in the catalogue, and its expected MD5 checksum.
FILE = 'mdshare-test-00.txt'
HASH = '5cbb04531c2e9fa7cc1e5d83195a2f81'


def file_check(file):
    """Verify the checksum of a downloaded file and delete the file.

    The file is removed before the comparison so that a failed check does
    not leave it behind for later tests (same order as the helper in
    test_utils.py).
    """
    checksum = file_hash(file)
    os.remove(file)
    if checksum != HASH:
        raise AssertionError()


def test_load_repository_break():
    """load_repository rejects None and non-existing catalogue files."""
    with pytest.raises(TypeError):
        load_repository(None)
    with pytest.raises(FileNotFoundError):
        load_repository('not-a-repository')


def test_search():
    """An exact name yields exactly that file; a fragment yields nothing."""
    matches = search(FILE)
    if len(matches) != 1:
        raise AssertionError()
    if matches[0] != FILE:
        raise AssertionError()
    # Without wildcards a fragment of the name must not match.
    if len(search(FILE[1:-1])) != 0:
        raise AssertionError()


def test_search_break():
    """search rejects non-repository objects and non-string patterns."""
    with pytest.raises(TypeError):
        search(FILE, 'not-a-repository')
    with pytest.raises(TypeError):
        search(None)


def test_catalogue(capsys):
    """catalogue() prints the default repository followed by a newline."""
    catalogue()
    captured = capsys.readouterr()
    if captured.out != f'{str(default_repository)}\n':
        raise AssertionError()


def test_catalogue_break():
    """catalogue rejects non-repository objects."""
    with pytest.raises(TypeError):
        catalogue('not-a-repository')


def test_fetch():
    """fetch works by exact name, by wildcard and with explicit repository."""
    file_check(fetch(FILE))
    file_check(fetch(f'*{FILE[1:-1]}*'))
    file_check(fetch(FILE, repository=default_repository))


def test_fetch_break():
    """fetch fails cleanly for bad directories, patterns and repositories."""
    file = fetch(FILE)
    try:
        # Using an existing regular file as working directory must fail.
        with pytest.raises(FileExistsError):
            fetch(FILE, working_directory=file)
    finally:
        # Remove the download even if the expectation above is not met.
        os.remove(file)
    with pytest.raises(TypeError):
        fetch(None)
    with pytest.raises(LoadError):
        fetch('not-an-existing-file-or-pattern')
    with pytest.raises(TypeError):
        fetch(FILE, repository='not-a-repository')
3 | 4 | [![CircleCI](https://circleci.com/gh/markovmodel/mdshare/tree/master.svg?style=svg)](https://circleci.com/gh/markovmodel/mdshare/tree/master) 5 | [![Codacy Badge](https://api.codacy.com/project/badge/Grade/b9a86155b4e84bf3b397bad0c04e42a9)](https://www.codacy.com/app/cwehmeyer/mdshare?utm_source=github.com&utm_medium=referral&utm_content=markovmodel/mdshare&utm_campaign=Badge_Grade) 6 | 7 | This is a downloader for molecular dynamics (MD) data from a public FTP server at FU Berlin. See [here](https://markovmodel.github.io/mdshare/) for a full list of available datasets and terms of use. 8 | 9 | ## Example 10 | This code will download a file (if it does not already exist locally) with a featurized set of three alanine dipeptide MD trajectories and store its content of three `numpy.ndarray` objects (each of `shape=[250000, 2], dtype=numpy.float32`) in the list `trajs`: 11 | 12 | ```python 13 | import mdshare 14 | import numpy as np 15 | 16 | local_filename = mdshare.fetch('alanine-dipeptide-3x250ns-backbone-dihedrals.npz') 17 | with np.load(local_filename) as fh: 18 | trajs = [fh[key] for key in sorted(fh.keys())] 19 | ``` 20 | 21 | By default, the `mdshare.fetch()` function will look in and download to the current directory (function parameter `working_directory='.'`). If you instead set this parameter to `None` ... 22 | 23 | ```python 24 | local_filename = mdshare.fetch( 25 | 'alanine-dipeptide-3x250ns-backbone-dihedrals.npz', 26 | working_directory=None) 27 | ``` 28 | 29 | ... the file will be downloaded to a temporary directory. In both cases, the function will return the path to the downloaded file. 30 | 31 | Should the requested file already be present in the `working_directory`, the download is skipped. 32 | 33 | Using `mdshare.catalogue()` to view the files and filesizes of the available trajectories ... 34 | 35 | ```python 36 | mdshare.catalogue() 37 | ``` 38 | 39 | ... 
produces the output: 40 | 41 | ```text 42 | Repository: http://ftp.imp.fu-berlin.de/pub/cmb-data/ 43 | Files: 44 | alanine-dipeptide-0-250ns-nowater.xtc 42.9 MB 45 | alanine-dipeptide-1-250ns-nowater.xtc 42.9 MB 46 | alanine-dipeptide-2-250ns-nowater.xtc 42.9 MB 47 | alanine-dipeptide-3x250ns-backbone-dihedrals.npz 6.0 MB 48 | alanine-dipeptide-3x250ns-heavy-atom-distances.npz 135.0 MB 49 | [...] 50 | Containers: 51 | mdshare-test.tar.gz 193.0 bytes 52 | pyemma-tutorial-livecoms.tar.gz 123.9 MB 53 | ``` 54 | 55 | Using `mdshare.search(filename_pattern)` to select for a given group of files ... 56 | 57 | ```python 58 | pentapeptide_xtcs = mdshare.search('penta*xtc') 59 | print(pentapeptide_xtcs) 60 | ``` 61 | 62 | ... produces the output: 63 | 64 | ```python 65 | ['pentapeptide-00-500ns-impl-solv.xtc', 66 | 'pentapeptide-01-500ns-impl-solv.xtc', 67 | 'pentapeptide-02-500ns-impl-solv.xtc', 68 | ... 69 | 'pentapeptide-22-500ns-impl-solv.xtc', 70 | 'pentapeptide-23-500ns-impl-solv.xtc', 71 | 'pentapeptide-24-500ns-impl-solv.xtc'] 72 | ``` 73 | -------------------------------------------------------------------------------- /mdshare/repository.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/mdshare project. 2 | # Copyright (C) 2017-2019 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from humanfriendly import format_size
from yaml import safe_load
import requests
import fnmatch
from .utils import LoadError, file_hash


class Category(dict):
    """One catalogue section: maps a file name to its metadata dict.

    The metadata dicts are expected to carry at least a 'size' entry
    (used by ``__str__``).
    """

    def __init__(self, data):
        super(Category, self).__init__(data)

    def search(self, pattern):
        """Return the keys matching the unix-style wildcard ``pattern``."""
        return fnmatch.filter(self.keys(), pattern)

    def __str__(self):
        """Return one 'name size unit' line per entry, sorted by name."""
        lines = []
        for key in sorted(self):
            # format_size yields e.g. '42.9 MB'; split it so the number can
            # be right-aligned with one decimal place.
            size, unit = format_size(self[key]['size']).split(' ')
            lines.append(f'{key:50s} {float(size):6.1f} {unit}')
        return '\n'.join(lines)


class Repository(object):
    """File catalogue of a remote data repository.

    Parameters
    ----------
    catalogue_file : str
        Path to the YAML catalogue; it must define the keys 'url',
        'index' and 'containers'.
    checksum_file : str, optional
        Path to a file holding the expected hash of ``catalogue_file``
        as computed by ``file_hash``. The check is skipped when None.

    Raises
    ------
    RuntimeError
        If the checksum does not match, or the catalogue is not a mapping
        or lacks a required key.
    """

    def __init__(self, catalogue_file, checksum_file=None):
        if checksum_file is not None:
            with open(checksum_file, 'r') as fh:
                # Tolerate surrounding whitespace such as a trailing
                # newline added by an editor.
                expected = fh.read().strip()
            if file_hash(catalogue_file) != expected:
                raise RuntimeError(
                    'Checksums do not match, check your catalogue files!')
        self.catalogue_file = catalogue_file
        with open(self.catalogue_file, 'r') as fh:
            data = safe_load(fh)
        # An empty or scalar YAML document is not a usable catalogue; report
        # it with the same exception type as a missing key.
        if not isinstance(data, dict):
            raise RuntimeError(
                'Cannot build repository catalogue: '
                'the catalogue file does not contain a mapping')
        for key in ('url', 'index', 'containers'):
            if key not in data:
                raise RuntimeError(
                    f'Cannot build repository catalogue without the {key} key')
        self.url = data['url']
        self.index = Category(data['index'])
        self.containers = Category(data['containers'])
        # HTTP session, created on first use by _get_connection().
        self._connection = None

    def lookup(self, key):
        """Return ``(section, metadata)`` for ``key``.

        ``section`` is 'index' or 'containers'; the index takes precedence.

        Raises
        ------
        LoadError
            If ``key`` is in neither section.
        """
        if key in self.index:
            return 'index', self.index[key]
        if key in self.containers:
            return 'containers', self.containers[key]
        raise LoadError(key, 'file not in repository catalogue')

    def size(self, key):
        """Return the catalogued size of ``key``."""
        _, data = self.lookup(key)
        return data['size']

    def hash(self, key):
        """Return the catalogued hash of ``key``."""
        _, data = self.lookup(key)
        return data['hash']

    def search(self, pattern):
        """Return the sorted names from both sections matching ``pattern``."""
        index = set(self.index.search(pattern))
        containers = set(self.containers.search(pattern))
        return sorted(index | containers)

    def stack(self, pattern):
        """Return one dict per file matching ``pattern``.

        Each dict has the keys 'file', 'size' and 'unpack'; 'unpack' is
        True for entries found in the containers section.
        """
        stack = []
        for file in self.search(pattern):
            location, data = self.lookup(file)
            stack.append(dict(
                file=file,
                size=data['size'],
                unpack=location == 'containers'))
        return stack

    def _get_connection(self):
        """Return the shared requests session, creating it on first use."""
        if self._connection is None:
            self._connection = requests.session()
        return self._connection

    def __str__(self):
        string = f'Repository: {self.url}\n'
        string += f'Files:\n{self.index}\n'
        string += f'Containers:\n{self.containers}'
        return string
from mdshare import fetch, Repository
from mdshare.utils import file_hash
from argparse import ArgumentParser
from yaml import safe_load, dump
import fnmatch
import shutil
import tarfile
import os


def filter_files(files, patterns):
    """Keep only those files which match at least one pattern.

    ``patterns`` are unix-style wildcards; the result is sorted and free
    of duplicates.
    """
    include = set()
    for pattern in patterns:
        include.update(fnmatch.filter(files, pattern))
    return sorted(include)


def get_metadata(file):
    """Get a dict with file hash and size"""
    return dict(
        hash=file_hash(file),
        size=os.path.getsize(file))


def make_container(container, files):
    """Make a .tar.gz container from a list of files"""
    with tarfile.open(container, 'w:gz') as fh:
        for file in files:
            fh.add(file)


def build(template_file):
    """Build the catalogue and its checksum file from the given template.

    Indexes the matching files of the current working directory, creates
    the .tar.gz containers there, and writes NAME.yaml and NAME.md5, where
    NAME is the template's 'name' entry.

    Raises
    ------
    RuntimeError
        If the template lacks one of the required keys.
    """
    with open(template_file, 'r') as fh:
        # safe_load: yaml.load() without an explicit Loader is deprecated
        # and rejected by recent PyYAML; the template is plain data.
        template = safe_load(fh)

    # Validate 'name' up front as well, so a bad template fails before any
    # container has been written.
    for key in ('name', 'url', 'include', 'containers'):
        if key not in template:
            raise RuntimeError(f'Cannot build without {key} key')

    db = dict(
        url=template['url'],
        index=dict(),
        containers=dict())

    files = filter_files(os.listdir(), template['include'])
    for file in files:
        db['index'][file] = get_metadata(file)

    # Containers can only hold files that are part of the index.
    for container, patterns in template['containers'].items():
        make_container(container, filter_files(files, patterns))
        db['containers'][container] = get_metadata(container)

    catalogue = f'{template["name"]}.yaml'
    with open(catalogue, 'w') as fh:
        fh.write(dump(db))

    # Hash the catalogue only after it has been closed (fully written).
    checksum = f'{template["name"]}.md5'
    with open(checksum, 'w') as fh:
        fh.write(file_hash(catalogue))

    print(f'catalogue written to: {catalogue}')
    print(f'checksum written to: {checksum}')


def test(catalogue_file, checksum_file):
    """Fetch every file and container listed in the catalogue once.

    Downloads go into a scratch directory which is removed afterwards,
    also when a download fails.
    """
    repository = Repository(catalogue_file, checksum_file)
    working_directory = 'mdshare-testing-area'
    os.mkdir(working_directory)
    try:
        for file in list(repository.index) + list(repository.containers):
            fetched = fetch(
                file,
                working_directory=working_directory,
                repository=repository)
            # fetch may hand back a single path or a collection of paths.
            if isinstance(fetched, (str, bytes, os.PathLike)):
                fetched = [fetched]
            for local_file in fetched:
                os.remove(local_file)
    finally:
        # Clean up even after a failure so that the next run does not trip
        # over an existing scratch directory.
        shutil.rmtree(working_directory)


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument(
        'mode',
        help='action to take [ build | test ]',
        metavar='MODE')
    parser.add_argument(
        'yaml',
        help='yaml file with catalogue or catalogue template',
        metavar='FILE')
    parser.add_argument(
        'md5',
        help='md5 checksum file of the catalogue',
        metavar='FILE',
        nargs='?')
    args = parser.parse_args()

    # The mode is matched case-insensitively.
    mode = args.mode.lower()
    if mode == 'build':
        build(args.yaml)
    elif mode == 'test':
        test(args.yaml, args.md5)
    else:
        raise ValueError(f'Unsupported mode: {args.mode}')
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import string
import random
import pytest
import os
from .. import default_repository as REPO
from ..utils import LoadError
from ..utils import file_hash
from ..utils import url_join
from ..utils import download_file
from ..utils import attempt_to_download_file
from ..utils import download_wrapper


# Repository URL without trailing slashes, used to build expected URLs.
REPO_URL = REPO.url.rstrip('/')
# Small reference file and the MD5 checksum the tests expect for it.
FILE = 'mdshare-test-00.txt'
HASH = '5cbb04531c2e9fa7cc1e5d83195a2f81'


def local_file():
    """Return a random 20-letter file name inside the current directory."""
    name = ''.join([random.choice(string.ascii_letters)
                    for _ in range(20)])
    return os.path.join('.', name)


def file_check(file):
    """Check that `file` has the reference checksum and delete it.

    The file is removed before the comparison, so it is cleaned up even
    when the check fails.

    Arguments:
        file (str): path of the downloaded file
    """
    checksum = file_hash(file)
    os.remove(file)
    if checksum != HASH:
        raise AssertionError()


def test_file_hash():
    """file_hash() reproduces the known checksum of the LICENSE file."""
    # relies on a LICENSE file being present in the working directory
    if file_hash('LICENSE') != 'bb3ca60759f3202f1ae42e3519cd06bc':
        raise AssertionError()


def test_file_hash_break():
    """file_hash() rejects None and non-existing paths."""
    with pytest.raises(TypeError):
        file_hash(None)
    with pytest.raises(FileNotFoundError):
        file_hash('THIS IS NOT A FILE')


def test_url_join():
    """url_join() yields one URL for every slash combination."""
    url = f'{REPO_URL}/{FILE}'
    if url_join(REPO_URL, FILE) != url:
        raise AssertionError()
    # trailing slash on the repository URL, plain file name; this check
    # used to be an exact duplicate of the previous one
    if url_join(f'{REPO_URL}/', FILE) != url:
        raise AssertionError()
    if url_join(REPO_URL, f'/{FILE}') != url:
        raise AssertionError()
    if url_join(f'{REPO_URL}/', f'/{FILE}') != url:
        raise AssertionError()
    if url_join(f'{REPO_URL}//', FILE) != url:
        raise AssertionError()
    if url_join(REPO_URL, f'//{FILE}') != url:
        raise AssertionError()


def test_url_join_break():
    """url_join() raises AttributeError for non-string arguments."""
    with pytest.raises(AttributeError):
        url_join(REPO_URL, None)
    with pytest.raises(AttributeError):
        url_join(None, FILE)
    with pytest.raises(AttributeError):
        url_join(REPO_URL, 1)
    with pytest.raises(AttributeError):
        url_join(1, FILE)


def test_download_file():
    """download_file() stores the reference file with the right checksum."""
    file_check(download_file(REPO, FILE, local_file()))


def test_download_file_break():
    """download_file() fails for unknown files and invalid repositories."""
    with pytest.raises(LoadError):
        download_file(REPO, None, local_file())
    with pytest.raises(LoadError):
        download_file(REPO, 'not-an-existing-file', local_file())
    with pytest.raises(AttributeError):
        download_file(None, FILE, local_file())
    with pytest.raises(AttributeError):
        download_file('not-a-repository', FILE, local_file())


def test_attempt_to_download_file():
    """attempt_to_download_file() works with default and custom attempts."""
    file_check(attempt_to_download_file(REPO, FILE, local_file()))
    file_check(
        attempt_to_download_file(
            REPO, FILE, local_file(), max_attempts=10))


def test_attempt_to_download_file_break():
    """attempt_to_download_file() reports each failure mode distinctly."""
    with pytest.raises(LoadError):
        attempt_to_download_file(REPO, None, local_file())
    with pytest.raises(AttributeError):
        attempt_to_download_file(None, FILE, local_file())
    # a directory is not a valid download target
    with pytest.raises(IsADirectoryError):
        attempt_to_download_file(REPO, FILE, '.')
    # without any attempt there is nothing to return
    with pytest.raises(LoadError):
        attempt_to_download_file(
            REPO, FILE, local_file(), max_attempts=0)
    with pytest.raises(LoadError):
        attempt_to_download_file(
            REPO, 'not-an-existing-file', local_file())


def test_download_wrapper():
    """download_wrapper() keeps existing files unless force=True."""
    file_check(download_wrapper(REPO, FILE))
    file_check(download_wrapper(REPO, FILE, max_attempts=10))
    with open(FILE, 'w') as fh:
        fh.write('nonsense content')
    # the existing (wrong) file must be returned untouched ...
    if file_hash(download_wrapper(REPO, FILE)) == HASH:
        raise AssertionError()
    # ... and replaced by the real one when the download is enforced
    file_check(download_wrapper(REPO, FILE, force=True))


def test_download_wrapper_break():
    """download_wrapper() fails for invalid files, paths and repositories."""
    with pytest.raises(TypeError):
        download_wrapper(REPO, None)
    with pytest.raises(LoadError):
        download_wrapper(REPO, 'not-an-existing-file')
    with pytest.raises(RuntimeError):
        download_wrapper(REPO, FILE, working_directory=None)
    with pytest.raises(AttributeError):
        download_wrapper(None, FILE)
    with pytest.raises(AttributeError):
        download_wrapper('not-a-repository', FILE)
    with pytest.raises(LoadError):
        download_wrapper(REPO, FILE, max_attempts=0)
db21a645b6aa39db4095fa0a5a61c8cc 35 | size: 3365062 36 | alanine_dipeptide_parallel_tempering_energies.npz: 37 | hash: b7b58af1dbd92168d63fac2376edf9cb 38 | size: 1685062 39 | doublewell_disconnected.npy: 40 | hash: 26717c09a92cf96cda412a8ab0119360 41 | size: 160096 42 | doublewell_oneway.npy: 43 | hash: 2843dc5108ffc5f2adf9d3d1c8cce804 44 | size: 160096 45 | hmm-doublewell-2d-100k.npz: 46 | hash: aaf37fb708a0f8f82f70d3aa06da205f 47 | size: 2000638 48 | imd_channel_transitionmatrix.npy: 49 | hash: fdbbd2376541e4be13c503cfa13e789c 50 | size: 8320 51 | imd_full_system_trajectory.npy: 52 | hash: 1d77481bc2d6527016e4791a01349347 53 | size: 8000128 54 | mdshare-test-00.txt: 55 | hash: 5cbb04531c2e9fa7cc1e5d83195a2f81 56 | size: 33 57 | methane-dimer-umbrella-sampling.npz: 58 | hash: e494f8bf0da3283c2d6a6ce9ce10c989 59 | size: 193346 60 | pentapeptide-00-500ns-impl-solv.xtc: 61 | hash: 16967d0bb09d24dc66de4d4885f953a4 62 | size: 2221296 63 | pentapeptide-01-500ns-impl-solv.xtc: 64 | hash: 9db9b87ecafb2d5eb0085adc01b97680 65 | size: 2221268 66 | pentapeptide-02-500ns-impl-solv.xtc: 67 | hash: 5ec89b72fa1e48c3a5e1b1ec707f66fb 68 | size: 2221392 69 | pentapeptide-03-500ns-impl-solv.xtc: 70 | hash: 726208c33f9c9ce0eb2688e5788b8d57 71 | size: 2221596 72 | pentapeptide-04-500ns-impl-solv.xtc: 73 | hash: 934d06ed03744c8cada2123a2cfd6fbf 74 | size: 2221604 75 | pentapeptide-05-500ns-impl-solv.xtc: 76 | hash: 395f614244f3d484db8d4a50f35252fd 77 | size: 2221020 78 | pentapeptide-06-500ns-impl-solv.xtc: 79 | hash: 3a94a06657b5bd7cadf6c3767fc4ca18 80 | size: 2221088 81 | pentapeptide-07-500ns-impl-solv.xtc: 82 | hash: 82c633ed92112bf62bb7070f4393ac2a 83 | size: 2221376 84 | pentapeptide-08-500ns-impl-solv.xtc: 85 | hash: c2f49e03f4c8ef6c8315d844d5d1be0e 86 | size: 2220668 87 | pentapeptide-09-500ns-impl-solv.xtc: 88 | hash: ea74fa65dcb80a086a15c34a907495f3 89 | size: 2221668 90 | pentapeptide-10-500ns-impl-solv.xtc: 91 | hash: 7b374e2402a37f139817be3137e6509e 92 | size: 2221300 93 | 
pentapeptide-11-500ns-impl-solv.xtc: 94 | hash: 0fe84f969978a492fe26dbb3bd39c6ee 95 | size: 2221672 96 | pentapeptide-12-500ns-impl-solv.xtc: 97 | hash: 8ddaaf213a4b4d34e92cbfbf6f3daad5 98 | size: 2221012 99 | pentapeptide-13-500ns-impl-solv.xtc: 100 | hash: 21ced9d0791a4330c2714a408c6b7b63 101 | size: 2222168 102 | pentapeptide-14-500ns-impl-solv.xtc: 103 | hash: ecd2213ac5de68ef0fc37e9723269d34 104 | size: 2221316 105 | pentapeptide-15-500ns-impl-solv.xtc: 106 | hash: f41d2a6e283d3b43fdf14aa3e95eff03 107 | size: 2221780 108 | pentapeptide-16-500ns-impl-solv.xtc: 109 | hash: 30c8f0805bfa248934714bfb663c4196 110 | size: 2221404 111 | pentapeptide-17-500ns-impl-solv.xtc: 112 | hash: 82c52922cb585962a0b1cab70cc32645 113 | size: 2221296 114 | pentapeptide-18-500ns-impl-solv.xtc: 115 | hash: 6906793b17978370a59025b6c2080ae1 116 | size: 2220924 117 | pentapeptide-19-500ns-impl-solv.xtc: 118 | hash: 4f7948b2f6a5515666b495a1054326c4 119 | size: 2220900 120 | pentapeptide-20-500ns-impl-solv.xtc: 121 | hash: f4f3c376e90826d3dd43bcdf08910863 122 | size: 2221344 123 | pentapeptide-21-500ns-impl-solv.xtc: 124 | hash: e5d2467545d04778d6005c2c904e6df3 125 | size: 2221000 126 | pentapeptide-22-500ns-impl-solv.xtc: 127 | hash: b2cdf3759057ef1a9ccbf9ab41044efa 128 | size: 2220708 129 | pentapeptide-23-500ns-impl-solv.xtc: 130 | hash: 1169c0f9efd028155519fa9bf2a85971 131 | size: 2221592 132 | pentapeptide-24-500ns-impl-solv.xtc: 133 | hash: 8d016a9ea6a5b63843fca58b82b4573c 134 | size: 2220908 135 | pentapeptide-impl-solv.pdb: 136 | hash: c52f482024e0ec7dcd64f2b925b53c2b 137 | size: 7501 138 | pyemma-tutorial-mt-data.npz: 139 | hash: 02f7b91ea2cac71b762f85bdbc4086e9 140 | size: 4763982 141 | pyemma-tutorial-tpt-data.npz: 142 | hash: e3d95283e915cc7992aaefd30cea5462 143 | size: 512952 144 | pyemma-tutorial-us-data.npz: 145 | hash: eb36cfe14a61a5d9a7d5b57ef7f829e8 146 | size: 1622382 147 | pyemma-tutorial-us-nacl.npz: 148 | hash: fca286bd0ffb8f30315e27bff2c1a772 149 | size: 241062 150 
# This file is part of the markovmodel/mdshare project.
# Copyright (C) 2017-2019 Computational Molecular Biology Group,
# Freie Universitaet Berlin (GER)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import logging
from requests import HTTPError
from hashlib import md5


class LoadError(KeyError):
    """Error raised when a repository file cannot be loaded.

    Arguments:
        file (str): name of the file which caused the error
        message (str): short description of what went wrong
    """

    def __init__(self, file, message, *args, **kwargs):
        super(LoadError, self).__init__(*args, **kwargs)
        self.file = file
        self.message = message

    def __str__(self):
        return f'{self.file} [{self.message}]'


def file_hash(file, chunk_size=65536):
    """Compute the MD5 hash of a file.

    The file is read in chunks, so it never has to fit into memory.

    Arguments:
        file (str): path of the file to be hashed
        chunk_size (int): size of chunks to read

    Returns:
        str: hexadecimal MD5 digest of the file content
    """
    hash_ = md5()
    with open(file, 'rb') as fh:
        while True:
            data = fh.read(chunk_size)
            if not data:
                break
            hash_.update(data)
    return hash_.hexdigest()


def url_join(repository_url, file):
    """Compose a URL.

    Trailing slashes of the repository URL and leading slashes of the
    file name are dropped, so both parts are joined by exactly one slash.

    Arguments:
        repository_url (str): url of the repository
        file (str): name of the file in the repository

    Returns:
        str: the composed URL
    """
    return f'{repository_url.rstrip("/")}/{file.lstrip("/")}'


def download_file(repository, file, local_path, callback=None):
    """Download a file.

    Arguments:
        repository (Repository): repository object
        file (str): name of the file in the repository
        local_path (str): local path where the file should be saved
        callback (callable): callback function; called after every
            written chunk with the chunk index and the block size

    Returns:
        str: local_path of the downloaded file

    Raises:
        LoadError: if the checksum of the downloaded file differs from
            the checksum stored in the repository
    """
    location, metadata = repository.lookup(file)
    logging.debug(
        f'Repository::{location}::{file} has checksum {metadata["hash"]}'
        f' and size {metadata["size"]}')
    logging.debug(
        f'From <{repository.url}> download <{file}> to <{local_path}>')
    # NOTE(review): the HTTP status is not inspected here; a failed request
    # is presumably caught by the checksum test below -- confirm
    response = repository._get_connection().get(
        url_join(repository.url, file),
        stream=True)
    blocksize = 1024 * 8
    with open(local_path, 'wb') as fh:
        for i, data in enumerate(response.iter_content(blocksize)):
            fh.write(data)
            if callback is not None:
                callback(i, blocksize)
    checksum = file_hash(local_path)
    logging.debug(f'Loaded file {local_path} has {checksum}')
    if checksum != metadata['hash']:
        raise LoadError(file, 'checksum test failed')
    return local_path


def attempt_to_download_file(
        repository,
        file,
        local_path,
        max_attempts=3,
        callback=None):
    """Retry to download a file several times if necessary.

    A failed attempt (HTTPError or IOError) is reported on stderr, the
    incomplete file at local_path is removed, and the download is tried
    again until max_attempts is reached. Any other exception raised by
    download_file() is passed on without a retry.

    Arguments:
        repository (Repository): repository object
        file (str): name of the file in the repository
        local_path (str): local path where the file should be saved
        max_attempts (int): number of download attempts
        callback (callable): callback function

    Returns:
        str: local_path of the downloaded file

    Raises:
        LoadError: if max_attempts does not allow for a single attempt
        HTTPError, IOError: the error of the last attempt, if all
            attempts failed
        KeyboardInterrupt: immediately, after removing the partial file
    """
    def remove_faulty_file():
        # best-effort cleanup: a failure to clean up must never hide the
        # error which caused the cleanup
        try:
            os.unlink(local_path)
        except FileNotFoundError:
            # the download failed before anything was written
            pass
        except (OSError, TypeError, ValueError):
            print(
                f'warning: could not remove file {local_path}',
                file=sys.stderr)

    attempt = 0
    while attempt < max_attempts:
        attempt += 1
        logging.debug(f'download attempt {attempt}/{max_attempts} ...')
        try:
            return download_file(
                repository,
                file,
                local_path,
                callback=callback)
        except KeyboardInterrupt as e:
            # the user wants to stop: clean up, but do not retry
            print(f'error: {e}', file=sys.stderr)
            remove_faulty_file()
            raise
        except (HTTPError, IOError) as e:
            print(f'error: {e}', file=sys.stderr)
            remove_faulty_file()
            if attempt >= max_attempts:
                # out of attempts: let the caller see the actual error
                raise
    # only reached if the loop body never ran (max_attempts < 1)
    raise LoadError(file, 'download failed')


def download_wrapper(
        repository, file, working_directory='.',
        max_attempts=3, force=False, callback=None):
    """Download a file if necessary.

    An already existing local file is returned as is (its content is not
    checked) unless force is set.

    Arguments:
        repository (Repository): repository object
        file (str): name of the file in the repository
        working_directory (str): directory where the file should be saved
        max_attempts (int): number of download attempts
        force (boolean): enforce download even if file exists
        callback (callable): callback function

    Returns:
        str: path of the local file

    Raises:
        RuntimeError: if working_directory is None
    """
    logging.debug(
        f'download_wrapper({repository.url}, {file},'
        f' working_directory="{working_directory}",'
        f' max_attempts={max_attempts}, force={force})')
    if working_directory is None:
        raise RuntimeError(
            'working_directory=None is illegal at this point')
    local_path = os.path.join(working_directory, file)
    logging.debug(f'local_path={local_path}')
    if os.path.exists(local_path) and not force:
        logging.debug(f'local_path={local_path} exists ... return')
        return local_path
    logging.debug(
        f'local_path={local_path} does not exist ... attempting download')
    return attempt_to_download_file(
        repository,
        file,
        local_path,
        max_attempts=max_attempts,
        callback=callback)
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import random
import string
import pytest
import os
from yaml import dump
from ..utils import LoadError
from ..utils import file_hash
from ..repository import Category
from ..repository import Repository


def randomizer(length, pattern=None):
    """Return a string of random ASCII letters.

    Arguments:
        length (int): number of random letters
        pattern (str): if given, the result is
            '<letters>-<pattern>-<letters>' with the same letters twice
    """
    letters = [random.choice(string.ascii_letters) for _ in range(length)]
    sample = ''.join(letters)
    return sample if pattern is None else f'{sample}-{pattern}-{sample}'


def make_random_category_dict(n, m):
    """Create random category data with m files for each of n patterns.

    Arguments:
        n (int): number of patterns
        m (int): number of files per pattern

    Returns the tuple (patterns, files, data), where data maps every
    file name to a dict with a random 'size' and a random 'hash'.
    """
    patterns = [randomizer(10) + str(i) for i in range(n)]
    files = [
        randomizer(10, pattern) + str(i)
        for pattern in patterns
        for i in range(m)]
    data = {}
    for name in files:
        size = random.randint(100, 100000)
        data[name] = dict(size=size, hash=randomizer(32))
    return patterns, files, data


class RandomCatalogue(object):
    """Context manager which provides a random catalogue on disk.

    On entry, '<file>.yaml' (the catalogue) and '<file>.md5' (its
    checksum) are written; on exit, both files are removed again.

    Arguments:
        npattern (int): number of patterns in the index
        nentries (int): number of index files per pattern
        ncontainers (int): number of containers
        mode (int): 0 writes a complete catalogue; 1, 2 and 3 drop the
            'url', 'index' and 'containers' entry, respectively; 4
            appends random letters to the checksum
    """

    def __init__(self, npattern, nentries, ncontainers, mode=0):
        index = make_random_category_dict(npattern, nentries)[2]
        containers = make_random_category_dict(ncontainers, 1)[2]
        url = f'http://{randomizer(10)}.{randomizer(3)}/{randomizer(7)}'
        self.data = dict(url=url, index=index, containers=containers)
        # modes 1-3 remove one of the catalogue entries
        dropped = {1: 'url', 2: 'index', 3: 'containers'}.get(mode)
        if dropped is not None:
            del self.data[dropped]
        # mode 4 spoils the checksum instead
        self.offset = randomizer(42) if mode == 4 else ''
        self.file = randomizer(25)

    def __enter__(self):
        catalogue_file = f'{self.file}.yaml'
        checksum_file = f'{self.file}.md5'
        with open(catalogue_file, 'w') as fh:
            fh.write(dump(self.data))
        with open(checksum_file, 'w') as fh:
            fh.write(file_hash(catalogue_file) + self.offset)
        return (self.data, self.file)

    def __exit__(self, exception_type, exception_value, traceback):
        for suffix in ('yaml', 'md5'):
            os.remove(f'{self.file}.{suffix}')


def test_category():
    """A Category exposes all entries and finds them by pattern."""
    n, m = random.randint(2, 6), random.randint(2, 6)
    patterns, files, data = make_random_category_dict(n, m)
    category = Category(data)
    if len(category.keys()) != n * m:
        raise AssertionError()
    for pattern in patterns:
        matches = category.search(f'*-{pattern}-*')
        if len(matches) != m:
            raise AssertionError()
        # a pattern without its first and last character must not match
        mismatches = category.search(f'*-{pattern[1:-1]}-*')
        if len(mismatches) != 0:
            raise AssertionError()
    listing = str(category)
    for name in files:
        if any(category[name][key] != data[name][key]
               for key in ('hash', 'size')):
            raise AssertionError()
        if name not in listing:
            raise AssertionError()


def test_repository():
    """A Repository reproduces url, index and containers of its catalogue."""
    args = [random.randint(2, 7) for _ in range(3)]
    with RandomCatalogue(*args, mode=0) as (data, file):
        repository = Repository(f'{file}.yaml', f'{file}.md5')
        listing = str(repository)
        if repository.url != data['url']:
            raise AssertionError()
        if data['url'] not in listing:
            raise AssertionError()
        # the same checks apply to index files and to containers
        for location in ('index', 'containers'):
            for name, expected in data[location].items():
                if name not in listing:
                    raise AssertionError()
                if name not in getattr(repository, location):
                    raise AssertionError()
                found, metadata = repository.lookup(name)
                if found != location:
                    raise AssertionError()
                if metadata['size'] != expected['size']:
                    raise AssertionError()
                if metadata['hash'] != expected['hash']:
                    raise AssertionError()
                if repository.size(name) != expected['size']:
                    raise AssertionError()
                if repository.hash(name) != expected['hash']:
                    raise AssertionError()


def test_repository_break():
    """Broken catalogues and unknown file names are rejected."""
    # modes 1-4 of RandomCatalogue are the four broken variants
    for mode in (1, 2, 3, 4):
        with RandomCatalogue(4, 3, 2, mode=mode) as (data, file):
            with pytest.raises(RuntimeError):
                Repository(f'{file}.yaml', f'{file}.md5')
    args = [random.randint(2, 7) for _ in range(3)]
    with RandomCatalogue(*args, mode=0) as (data, file):
        repository = Repository(f'{file}.yaml', f'{file}.md5')
        for location in ('index', 'containers'):
            for name in data[location]:
                # a truncated name is not part of the repository
                with pytest.raises(LoadError):
                    repository.lookup(name[1:-1])
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import tarfile
from tempfile import mkdtemp
from .utils import LoadError, download_wrapper
from .repository import Repository
from . import default_repository


def _resolve_repository(repository):
    """Return the repository to use, falling back to the default one.

    Arguments:
        repository (Repository): repository object or None

    Raises:
        TypeError: if the result is not a Repository object
    """
    if repository is None:
        repository = default_repository
    if not isinstance(repository, Repository):
        raise TypeError(
            f'received {type(repository)} instead of Repository')
    return repository


def load_repository(catalogue_file, checksum_file=None):
    """Load a repository catalogue from file

    Arguments:
        catalogue_file (str): filename of the catalogue
        checksum_file (str): filename of the catalogue's checksum

    Returns:
        Repository: the repository described by the catalogue
    """
    return Repository(catalogue_file, checksum_file)


def search(filename_pattern, repository=None):
    """Returns a list of available files matching a filename_pattern.

    Arguments:
        filename_pattern (str): filename pattern, allows for Unix
            shell-style wildcards
        repository (Repository): repository object; the default
            repository is used if None

    Raises:
        TypeError: if repository is not a Repository object
    """
    return _resolve_repository(repository).search(filename_pattern)


def catalogue(repository=None):
    """Prints a human-friendly list of available files/sizes.

    Arguments:
        repository (Repository): repository object; the default
            repository is used if None

    Raises:
        TypeError: if repository is not a Repository object
    """
    print(_resolve_repository(repository))


def fetch(
        remote_filename, working_directory='.', repository=None,
        max_attempts=3, force=False, show_progress=True):
    """Download a file if it is not already at the target location.

    Arguments:
        remote_filename (str): name of the file in the repository
        working_directory (str): directory where the file should be saved;
            it is created if necessary, and a new temporary directory is
            used if None
        repository (Repository): repository object; the default
            repository is used if None
        max_attempts (int): number of download attempts
        force (boolean): enforce download even if file exists
        show_progress (boolean): show download progress; only effective
            if the optional progress_reporter package can be imported

    Returns:
        the local path of the fetched file, or a list of paths if more
        than one file was fetched or unpacked

    Raises:
        TypeError: if repository is not a Repository object
        LoadError: if remote_filename has no match in the repository
    """
    repository = _resolve_repository(repository)
    if working_directory is None:
        working_directory = mkdtemp()
    else:
        os.makedirs(working_directory, exist_ok=True)
    try:
        import progress_reporter
        have_progress_reporter = True
    except ImportError:
        have_progress_reporter = False

    stack = repository.stack(remote_filename)
    if len(stack) == 0:
        raise LoadError(remote_filename, 'no match in repository')

    if have_progress_reporter and show_progress:
        callbacks = []
        pg = progress_reporter.ProgressReporter_()
        total = sum(item['size'] for item in stack)

        def update(n, blk, stage):
            # the callback reports the chunk index n and the block size;
            # advance the bar by what it has not counted yet
            downloaded = n * blk
            inc = max(
                0, downloaded - pg._prog_rep_progressbars[stage].n)
            pg.update(inc, stage=stage)
            # total progress
            # NOTE(review): stage -1 is only registered for more than one
            # download; presumably RuntimeError signals the missing
            # stage -- confirm
            try:
                pg.update(inc, stage=-1)
            except RuntimeError:
                pass

        from functools import partial
        tqdm_args = dict(unit='B', file=sys.stdout, unit_scale=True)

        n_progress_bars = 0
        for stage, item in enumerate(stack):
            # working_directory cannot be None here (see above)
            path = os.path.join(working_directory, item['file'])
            if os.path.exists(path) and not force:
                # nothing will be downloaded, so no progress bar is needed
                callbacks.append(None)
            else:
                pg.register(
                    item['size'],
                    description=f'downloading {item["file"]}',
                    tqdm_args=tqdm_args,
                    stage=stage)
                callbacks.append(partial(update, stage=stage))
                n_progress_bars += 1
        if n_progress_bars > 1:
            pg.register(
                total, description='total', tqdm_args=tqdm_args, stage=-1)
    else:
        # stand-in whose context() works as a no-op context manager
        from unittest.mock import MagicMock
        pg = MagicMock()
        callbacks = [None] * len(stack)

    def inspect(members):
        # select the archive members which have no directory part
        for member in members:
            path, filename = os.path.split(member.name)
            if path == '':
                yield member, filename

    result = []
    with pg.context():
        for item, progress in zip(stack, callbacks):
            file = download_wrapper(
                repository,
                item['file'],
                working_directory=working_directory,
                max_attempts=max_attempts,
                force=force,
                callback=progress)
            if item['unpack']:
                # containers are replaced by their extracted content
                with tarfile.open(file, 'r:gz') as fh:
                    members = []
                    for member, filename in inspect(fh):
                        members.append(member)
                        result.append(
                            os.path.join(working_directory, filename))
                    fh.extractall(
                        path=working_directory, members=members)
                os.remove(file)
            else:
                result.append(file)

    if len(result) == 0:
        raise LoadError(remote_filename, 'this should not have happend!')
    elif len(result) == 1:
        return result[0]
    return result
2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 
43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 
80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. 
If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. --------------------------------------------------------------------------------