├── tests ├── samples │ ├── list-depths.avro │ ├── list-lengths.avro │ ├── mc10events.root │ ├── list-depths.parquet │ ├── list-lengths.parquet │ ├── nano-2017-08-31.root │ ├── nullable-depths.avro │ ├── nullable-levels.avro │ ├── record-primitives.avro │ ├── list-depths-records.avro │ ├── list-depths-simple.avro │ ├── list-depths-strings.avro │ ├── nonnullable-depths.avro │ ├── nullable-depths.parquet │ ├── nullable-levels.parquet │ ├── list-depths-records.parquet │ ├── list-depths-simple.parquet │ ├── list-depths-strings.parquet │ ├── nonnullable-depths.parquet │ ├── nullable-list-depths.avro │ ├── record-primitives.parquet │ ├── list-depths-records-list.avro │ ├── nullable-list-depths.parquet │ ├── list-depths-records-list.parquet │ ├── nullable-record-primitives.avro │ ├── nullable-list-depths-records.avro │ ├── nullable-list-depths-strings.avro │ ├── nullable-record-primitives.parquet │ ├── nullable-list-depths-records.parquet │ ├── nullable-list-depths-strings.parquet │ ├── nullable-list-depths-records-list.avro │ ├── nullable-record-primitives-simple.avro │ ├── nullable-list-depths-records-list.parquet │ └── nullable-record-primitives-simple.parquet ├── test_issues.py ├── __init__.py ├── test_backend_numpyfile.py ├── test_backend_root.py ├── test_fill.py ├── test_database.py └── test_proxy.py ├── .travis-conda.py ├── .travis.yml ├── LICENSE ├── oamap ├── backend │ ├── __init__.py │ ├── numpyfile.py │ ├── arrow.py │ ├── root │ │ ├── cmsnano.py │ │ └── __init__.py │ └── packing.py ├── extension │ ├── __init__.py │ └── common.py ├── version.py ├── __init__.py ├── util.py ├── proxy.py ├── inference.py ├── fill.py ├── fillable.py └── dataset.py ├── .gitignore ├── setup.py └── README.rst /tests/samples/list-depths.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/list-depths.avro -------------------------------------------------------------------------------- 
/tests/samples/list-lengths.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/list-lengths.avro -------------------------------------------------------------------------------- /tests/samples/mc10events.root: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/mc10events.root -------------------------------------------------------------------------------- /tests/samples/list-depths.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/list-depths.parquet -------------------------------------------------------------------------------- /tests/samples/list-lengths.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/list-lengths.parquet -------------------------------------------------------------------------------- /tests/samples/nano-2017-08-31.root: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nano-2017-08-31.root -------------------------------------------------------------------------------- /tests/samples/nullable-depths.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-depths.avro -------------------------------------------------------------------------------- /tests/samples/nullable-levels.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-levels.avro 
-------------------------------------------------------------------------------- /tests/samples/record-primitives.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/record-primitives.avro -------------------------------------------------------------------------------- /tests/samples/list-depths-records.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/list-depths-records.avro -------------------------------------------------------------------------------- /tests/samples/list-depths-simple.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/list-depths-simple.avro -------------------------------------------------------------------------------- /tests/samples/list-depths-strings.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/list-depths-strings.avro -------------------------------------------------------------------------------- /tests/samples/nonnullable-depths.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nonnullable-depths.avro -------------------------------------------------------------------------------- /tests/samples/nullable-depths.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-depths.parquet -------------------------------------------------------------------------------- /tests/samples/nullable-levels.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-levels.parquet -------------------------------------------------------------------------------- /tests/samples/list-depths-records.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/list-depths-records.parquet -------------------------------------------------------------------------------- /tests/samples/list-depths-simple.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/list-depths-simple.parquet -------------------------------------------------------------------------------- /tests/samples/list-depths-strings.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/list-depths-strings.parquet -------------------------------------------------------------------------------- /tests/samples/nonnullable-depths.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nonnullable-depths.parquet -------------------------------------------------------------------------------- /tests/samples/nullable-list-depths.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-list-depths.avro -------------------------------------------------------------------------------- /tests/samples/record-primitives.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/record-primitives.parquet -------------------------------------------------------------------------------- 
/tests/samples/list-depths-records-list.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/list-depths-records-list.avro -------------------------------------------------------------------------------- /tests/samples/nullable-list-depths.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-list-depths.parquet -------------------------------------------------------------------------------- /tests/samples/list-depths-records-list.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/list-depths-records-list.parquet -------------------------------------------------------------------------------- /tests/samples/nullable-record-primitives.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-record-primitives.avro -------------------------------------------------------------------------------- /tests/samples/nullable-list-depths-records.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-list-depths-records.avro -------------------------------------------------------------------------------- /tests/samples/nullable-list-depths-strings.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-list-depths-strings.avro -------------------------------------------------------------------------------- /tests/samples/nullable-record-primitives.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-record-primitives.parquet -------------------------------------------------------------------------------- /tests/samples/nullable-list-depths-records.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-list-depths-records.parquet -------------------------------------------------------------------------------- /tests/samples/nullable-list-depths-strings.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-list-depths-strings.parquet -------------------------------------------------------------------------------- /tests/samples/nullable-list-depths-records-list.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-list-depths-records-list.avro -------------------------------------------------------------------------------- /tests/samples/nullable-record-primitives-simple.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-record-primitives-simple.avro -------------------------------------------------------------------------------- /tests/samples/nullable-list-depths-records-list.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-list-depths-records-list.parquet -------------------------------------------------------------------------------- /tests/samples/nullable-record-primitives-simple.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/diana-hep/oamap/HEAD/tests/samples/nullable-record-primitives-simple.parquet -------------------------------------------------------------------------------- /tests/test_issues.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class TestIssues(unittest.TestCase): 5 | 6 | def runTest(self): 7 | pass 8 | 9 | def test_issue7(self): 10 | with open('README.rst') as f: 11 | try: 12 | content = f.read() 13 | except UnicodeDecodeError as e: 14 | self.fail("Cannot read README.rst: " + str(e)) 15 | -------------------------------------------------------------------------------- /.travis-conda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | if os.environ["TRAVIS_PYTHON_VERSION"] == "2.6": 6 | miniconda = False 7 | 8 | elif os.environ["TRAVIS_PYTHON_VERSION"] == "2.7": 9 | miniconda = True 10 | os.system("wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh") 11 | 12 | else: 13 | miniconda = True 14 | os.system("wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh") 15 | 16 | if miniconda: 17 | os.system("bash miniconda.sh -b -p {0}/miniconda".format(os.environ["HOME"])) 18 | os.system("{0}/miniconda/bin/conda config --set always_yes yes --set changeps1 no".format(os.environ["HOME"])) 19 | os.system("{0}/miniconda/bin/conda update -q conda".format(os.environ["HOME"])) 20 | os.system("{0}/miniconda/bin/conda info -a".format(os.environ["HOME"])) 21 | os.system("{0}/miniconda/bin/conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numba".format(os.environ["HOME"])) 22 | os.system("source {0}/miniconda/bin/activate test-environment; python setup.py install".format(os.environ["HOME"])) 23 | 24 | else: 25 | os.system("mkdir -p miniconda/bin") 26 | open("miniconda/bin/activate", "w").write("") 27 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | os: 4 | - linux 5 | 6 | python: 7 | - 2.6 8 | - 2.7 9 | - 3.4 10 | - 3.5 11 | - 3.6 12 | 13 | addons: 14 | apt: 15 | packages: 16 | - python-setuptools 17 | - libsnappy-dev 18 | 19 | install: 20 | - sudo apt-get update 21 | - pip install --upgrade pip 22 | 23 | install: 24 | - pip install --upgrade setuptools_scm 25 | 26 | script: 27 | python .travis-conda.py ; source $HOME/miniconda/bin/activate test-environment; python setup.py test 28 | 29 | deploy: 30 | provider: pypi 31 | user: pivarski 32 | password: 33 | secure: "irt16TqzfFa1A47AgrSEnZz89Tam7g36wUMFRB2cseipVDzk1pmN8xcxj2xebpRXWHhyKmpPUetQ1gwgYn5brK5xl0iQ/eNT4U3tWLWowtBxINYhhErSSAnMVGX+FJliex5fv/yEuU158BviLPLjhYMDXjtFH6TQmFExSoHTaZL8aX0Xswt8Ku0etJHgf4O8D2b1L5yQ1fOHy2vBhfGXhT8jI/rvwGu9DF2iJYIdnrf1jdy3aCvpiBhTUbxLO0sJVSGVpbC3L7uKwPMt+t3gb8iQL7llZL9DgCj4YEIAhLnIRhuTTXkKQ2cfYMX+b6hFiSV816Z1VR+sckfY915mPF+M/k9+m7xqcDRtYYeRsS68sKFgICdDUONR3nMvCJxYPmfSWOo0qvXPh0tjMfJ1lQOideY9ToR2fYzwzL4MGyzn/FrlXUoMNRfYJ8an1X9Xds2Bm9AVF6W1JviKOboHDDg0TqJXScy2LmMaaSdub2lN/a3iioYdK/0RtKWZ6N/qg8b0E91sVFD4zgZ/1qDm7JQhvoqmvhMQQ091Yl0xOmxmmERhMxEyYlcp+8RcAwAxl5KqwkZv2Ni2ReVBJxqfe5wvC7FP412pG1Zdd2FL2UqbyEIq2GJPE+LQGS5KhjfajWacb9wW+6tp4aCUITjU2Eboqq/y0L/R4QKS6HAWitc=" 34 | on: 35 | tags: true 36 | branch: master 37 | condition: "$TRAVIS_PYTHON_VERSION = 2.7" 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, DIANA-HEP 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 4 | # All rights reserved. 
5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /oamap/backend/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 4 | # All rights reserved. 
5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /oamap/extension/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 4 | # All rights reserved. 
5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /oamap/version.py: -------------------------------------------------------------------------------- 
1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | 31 | import re 32 | 33 | __version__ = "0.12.4" 34 | version = __version__ 35 | version_info = tuple(re.split(r"[-\.]", __version__)) 36 | 37 | del re 38 | -------------------------------------------------------------------------------- /oamap/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | from oamap.schema import * 32 | import oamap.compiler 33 | 34 | # convenient access to the version number 35 | from oamap.version import __version__ 36 | 37 | -------------------------------------------------------------------------------- /tests/test_backend_numpyfile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 
19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | import math 32 | import tempfile 33 | import shutil 34 | 35 | import unittest 36 | 37 | from oamap.schema import * 38 | from oamap.backend.numpyfile import * 39 | 40 | class TestBackendNumpyfile(unittest.TestCase): 41 | def runTest(self): 42 | pass 43 | 44 | def test_database(self): 45 | tmpdir = tempfile.mkdtemp() 46 | try: 47 | db = NumpyFileDatabase(tmpdir) 48 | db.fromdata("one", List(Record({"x": "int32", "y": "float64"})), [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}, {"x": 6, "y": 6.6}]) 49 | 50 | db.data.two = db.data.one.define("z", lambda obj: obj.x + obj.y) 51 | 52 | self.assertEqual([(obj.x, obj.y, obj.z) for obj in db.data.two], [(1, 1.1, 2.1), (2, 2.2, 4.2), (3, 3.3, 6.3), (4, 4.4, 8.4), (5, 5.5, 10.5), (6, 6.6, 12.6)]) 53 | 54 | del db.data.one 55 | del db.data.two 56 | 57 | finally: 58 | shutil.rmtree(tmpdir) 59 | -------------------------------------------------------------------------------- /tests/test_backend_root.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 
4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | 31 | import math 32 | import tempfile 33 | import shutil 34 | 35 | import unittest 36 | 37 | import oamap.backend.root 38 | import oamap.database 39 | 40 | class TestBackendRoot(unittest.TestCase): 41 | def runTest(self): 42 | pass 43 | 44 | def test_database(self): 45 | dataset = oamap.backend.root.dataset("tests/samples/mc10events.root", "Events") 46 | 47 | self.assertEqual(repr(dataset[0].Electron[0].pt), "28.555809") 48 | 49 | db = oamap.database.InMemoryDatabase() 50 | 51 | db.data.one = dataset 52 | 53 | self.assertEqual(repr(db.data.one[0].Electron[0].pt), "28.555809") 54 | 55 | def test_transform(self): 56 | dataset = oamap.backend.root.dataset("tests/samples/mc10events.root", "Events") 57 | 58 | self.assertEqual(repr(dataset[0].Electron[0].pt * math.sinh(dataset[0].Electron[0].eta)), "-17.956890574044056") 59 | 60 | db = oamap.database.InMemoryDatabase.writable(oamap.database.DictBackend()) 61 | db.data.one = dataset.define("pz", lambda x: x.pt * math.sinh(x.eta), at="Electron", numba=False) 62 | 63 | self.assertEqual(repr(db.data.one[0].Electron[0].pz), "-17.956890574044056") 64 | -------------------------------------------------------------------------------- /oamap/backend/numpyfile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 
15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | import numpy 32 | 33 | import oamap.database 34 | 35 | class NumpyFileBackend(oamap.database.FilesystemBackend): 36 | def __init__(self, directory): 37 | super(NumpyFileBackend, self).__init__(directory, arraysuffix=".npy") 38 | 39 | @property 40 | def args(self): 41 | return (self._directory,) 42 | 43 | def tojson(self): 44 | return {"class": self.__class__.__module__ + "." 
+ self.__class__.__name__, 45 | "directory": self._directory} 46 | 47 | @staticmethod 48 | def fromjson(obj, namespace): 49 | return NumpyFileBackend(obj["directory"]) 50 | 51 | def instantiate(self, partitionid): 52 | return NumpyArrays(lambda name: self.fullname(partitionid, name, create=False), 53 | lambda name: self.fullname(partitionid, name, create=True)) 54 | 55 | class NumpyArrays(object): 56 | def __init__(self, loadname, storename): 57 | self._loadname = loadname 58 | self._storename = storename 59 | 60 | def __getitem__(self, name): 61 | return numpy.load(self._loadname(name)) 62 | 63 | def __setitem__(self, name, value): 64 | numpy.save(self._storename(name), value) 65 | 66 | class NumpyFileDatabase(oamap.database.FilesystemDatabase): 67 | def __init__(self, directory, namespace=""): 68 | super(NumpyFileDatabase, self).__init__(directory, backends={namespace: NumpyFileBackend(directory)}, namespace=namespace) 69 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2017, DIANA-HEP 5 | # All rights reserved. 6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions are met: 9 | # 10 | # * Redistributions of source code must retain the above copyright notice, this 11 | # list of conditions and the following disclaimer. 12 | # 13 | # * Redistributions in binary form must reproduce the above copyright notice, 14 | # this list of conditions and the following disclaimer in the documentation 15 | # and/or other materials provided with the distribution. 16 | # 17 | # * Neither the name of the copyright holder nor the names of its 18 | # contributors may be used to endorse or promote products derived from 19 | # this software without specific prior written permission. 
20 | # 21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | import os.path 33 | 34 | from setuptools import find_packages 35 | from setuptools import setup 36 | 37 | def get_version(): 38 | g = {} 39 | exec(open(os.path.join("oamap", "version.py")).read(), g) 40 | return g["__version__"] 41 | 42 | setup(name = "oamap", 43 | version = get_version(), 44 | packages = find_packages(exclude = ["tests"]), 45 | scripts = [], 46 | data_files = ["README.rst"], 47 | description = "Perform high-speed calculations on columnar data without creating intermediate objects.", 48 | long_description = open("README.rst").read().strip(), 49 | author = "Jim Pivarski (DIANA-HEP)", 50 | author_email = "pivarski@fnal.gov", 51 | maintainer = "Jim Pivarski (DIANA-HEP)", 52 | maintainer_email = "pivarski@fnal.gov", 53 | url = "https://github.com/diana-hep/oamap", 54 | download_url = "https://github.com/diana-hep/oamap/releases", 55 | license = "BSD 3-clause", 56 | test_suite = "tests", 57 | install_requires = ["numpy"], 58 | tests_require = ["uproot", "thriftpy", "python-snappy"], 59 | classifiers = [ 60 | "Development Status :: 4 - Beta", 61 | "Intended Audience :: Developers", 62 | "Intended Audience :: Information 
Technology", 63 | "Intended Audience :: Science/Research", 64 | "License :: OSI Approved :: BSD License", 65 | "Operating System :: MacOS", 66 | "Operating System :: POSIX", 67 | "Operating System :: Unix", 68 | "Programming Language :: Python", 69 | "Programming Language :: Python :: 2.7", 70 | "Programming Language :: Python :: 3.4", 71 | "Programming Language :: Python :: 3.5", 72 | "Programming Language :: Python :: 3.6", 73 | "Programming Language :: Python :: 3.7", 74 | "Topic :: Scientific/Engineering", 75 | "Topic :: Scientific/Engineering :: Information Analysis", 76 | "Topic :: Scientific/Engineering :: Mathematics", 77 | "Topic :: Scientific/Engineering :: Physics", 78 | "Topic :: Software Development", 79 | "Topic :: Utilities", 80 | ], 81 | platforms = "Any", 82 | ) 83 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | OAMap: Object-Array Mapping 2 | =========================== 3 | 4 | .. image:: https://travis-ci.org/diana-hep/oamap.svg?branch=master 5 | :target: https://travis-ci.org/diana-hep/oamap 6 | 7 | Introduction 8 | ------------ 9 | 10 | Data analysts are often faced with a choice between speed and flexibility. Tabular data, such as SQL tables, can be processed rapidly enough for a truly interactive analysis session, but hierarchically nested formats, such as JSON, are better at representing relationships in complex data models. In some domains (such as particle physics), we want to perform calculations on JSON-like structures at the speed of SQL. 11 | 12 | The key to high throughput on large datasets, particularly ones with more attributes than are accessed in a single pass, is laying out the data in "columns." All values of an attribute should be contiguous on disk or memory because data are paged from one cache to the next in locally contiguous blocks. 
The `ROOT `_ and `Parquet `_ file formats represent JSON-like data in columns on disk, but these data are usually deserialized into objects for processing in memory. Higher performance can be achieved by maintaining the columnar structure through all stages of the calculation (see `this talk `_ and `this paper `_). 13 | 14 | The OAMap toolkit implements an Object Array Mapping in Python. Object Array Mappings, by analogy with Object Relational Mappings (ORMs) are one-to-one relationships between conceptual objects and physical arrays. You can write functions that appear to be operating on ordinary Python objects-- lists, tuples, class instances-- but are actually being performed on low-level, contiguous buffers (Numpy arrays). The result is fast processing of large, complex datasets with a low memory footprint. 15 | 16 | OAMap has two primary modes: (1) pure-Python object proxies, which pretend to be Python objects but actually access array data on demand, and (2) bare-metal bytecode compiled by `Numba `_. The pure-Python form is good for low-latency, exploratory work, while the compiled form is good for high throughput. They are seamlessly interchangeable: a Python proxy converts to the compiled form when it enters a Numba-compiled function and switches back when it leaves. You can, for instance, do a fast search in compiled code and examine the results more fully by hand. 17 | 18 | Any columnar file format or database can be used as a data source: OAMap can get arrays of data from any dict-like object (any Python object implementing ``__getitem__``), even from within a Numba-compiled function. Backends to ROOT, Parquet, and HDF5 are included, as well as a Python ``shelve`` alternative. Storing and accessing a complete dataset, including metadata, requires no more infrastructure than a collection of named arrays. (Data types are encoded in the names, values in the arrays.) 
OAMap is intended as a middleware layer above file formats and databases but below a fully integrated analysis suite. 19 | 20 | Installation 21 | ------------ 22 | 23 | Install OAMap like any other Python package: 24 | 25 | .. code-block:: bash 26 | 27 | pip install oamap --user 28 | 29 | or similar (use ``sudo``, ``virtualenv``, or ``conda`` if you wish). 30 | 31 | **Strict dependencies:** 32 | 33 | - `Python `_ (2.6+, 3.4+) 34 | - `Numpy `_ 35 | 36 | **Recommended dependencies:** 37 | 38 | - `Numba and LLVM `_ to JIT-compile functions (requires a particular version of LLVM, follow instructions) 39 | - `thriftpy `_ to read Parquet files (pure Python, pip is fine) 40 | - `uproot `_ to read ROOT files (pure Python, pip is fine) 41 | - `h5py `_ to read HDF5 files (requires binary libraries; follow instructions) 42 | 43 | **Optional dependencies:** (all are bindings to binaries that can be package-installed) 44 | 45 | - `lz4 `_ compression used by some ROOT and Parquet files 46 | - `python-snappy `_ compression used by some Parquet files 47 | - `lzo `_ compression used by some Parquet files 48 | - `brotli `_ compression used by some Parquet files 49 | -------------------------------------------------------------------------------- /oamap/extension/common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 
15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

import codecs
import sys

import numpy

import oamap.generator

class _GenerateBytes(object):
    """Mixin that materializes a list-of-uint8 generator's content as bytes.

    Shared by the ByteString and UTF8String extended generators; subclasses
    must also inherit an oamap.generator.ExtendedGenerator so that
    ``self.generic`` and ``self._getarrays`` exist.
    """

    # single Python-2/3 switch, evaluated once at class-creation time
    py3 = sys.version_info[0] >= 3

    def _generatebytes(self, arrays, index, cache):
        """Return the raw byte content at ``index``, or None if masked out."""
        listgen = self.generic
        primgen = self.generic.content

        if isinstance(listgen, oamap.generator.MaskedListGenerator):
            # lazily load the mask array into the cache on first use
            mask = cache[listgen.maskidx]
            if mask is None:
                self._getarrays(arrays, cache, listgen._toget(arrays, cache))
                mask = cache[listgen.maskidx]

            value = mask[index]
            if value == listgen.maskedvalue:
                return None
            else:
                # masked lists indirect: the mask entry is the compact index
                index = value

        starts = cache[listgen.startsidx]
        stops = cache[listgen.stopsidx]
        data = cache[primgen.dataidx]
        if starts is None or stops is None or data is None:
            # fetch starts/stops/data together in a single backend request
            toget = listgen._toget(arrays, cache)
            toget.update(primgen._toget(arrays, cache))
            self._getarrays(arrays, cache, toget)
            starts = cache[listgen.startsidx]
            stops = cache[listgen.stopsidx]
            data = cache[primgen.dataidx]

        array = data[starts[index]:stops[index]]

        if isinstance(array, bytes):
            return array
        elif isinstance(array, numpy.ndarray):
            # tobytes() replaces tostring(), which was deprecated in
            # NumPy 1.19 and removed in NumPy 2.0; output is identical.
            return array.tobytes()
        elif self.py3:
            return bytes(array)
        else:
            # Python 2: no bytes(iterable) constructor; join chr-mapped values
            return "".join(map(chr, array))

    def degenerate(self, obj):
        """Convert a string (or None) back into raw uint8 content for filling."""
        if obj is None:
            return obj

        elif self.py3:
            if isinstance(obj, bytes):
                return obj
            else:
                return codecs.utf_8_encode(obj)[0]

        else:
            # Python 2: str is already bytes; unicode must be UTF-8 encoded first
            if isinstance(obj, str):
                return map(ord, obj)
            else:
                return map(ord, codecs.utf_8_encode(obj)[0])

class ByteStringGenerator(_GenerateBytes, oamap.generator.ExtendedGenerator):
    """Extended generator that presents list-of-uint8 data as raw bytes."""

    pattern = {"name": "ByteString", "type": "list", "content": {"type": "primitive", "dtype": "uint8", "nullable": False}}

    def _generate(self, arrays, index, cache):
        return self._generatebytes(arrays, index, cache)

class UTF8StringGenerator(_GenerateBytes, oamap.generator.ExtendedGenerator): 102 | pattern = {"name": "UTF8String", "type": "list", "content": {"type": "primitive", "dtype": "uint8", "nullable": False}} 103 | 104 | def _generate(self, arrays, index, cache): 105 | out = self._generatebytes(arrays, index, cache) 106 | if out is None: 107 | return out 108 | else: 109 | return codecs.utf_8_decode(out)[0] 110 | 111 | def ByteString(nullable=False, starts=None, stops=None, data=None, mask=None, packing=None, doc=None, metadata=None): 112 | import oamap.schema 113 | return oamap.schema.List(oamap.schema.Primitive(numpy.uint8, data=data), nullable=nullable, starts=starts, stops=stops, mask=mask, packing=packing, name="ByteString", doc=doc, metadata=metadata) 114 | 115 | def UTF8String(nullable=False, starts=None, stops=None, data=None, mask=None, packing=None, doc=None, metadata=None): 116 | import oamap.schema 117 | return oamap.schema.List(oamap.schema.Primitive(numpy.uint8, data=data), nullable=nullable, starts=starts, stops=stops, mask=mask, packing=packing, name="UTF8String", doc=doc, metadata=metadata) 118 | -------------------------------------------------------------------------------- /tests/test_fill.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 
15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | 31 | import unittest 32 | 33 | import oamap.inference 34 | import oamap.fill 35 | import oamap.proxy 36 | from oamap.schema import * 37 | 38 | class TestFill(unittest.TestCase): 39 | def runTest(self): 40 | pass 41 | 42 | def check(self, value, schema=None, debug=False): 43 | if schema is None: 44 | schema = oamap.inference.fromdata(value) 45 | if debug: 46 | print("schema: {0}".format(schema)) 47 | arrays = oamap.fill.fromdata(value, schema) 48 | if debug: 49 | print("arrays:") 50 | for n in sorted(arrays): 51 | print(" {0}: {1}".format(repr(n), arrays[n])) 52 | columnar = schema(arrays) 53 | if debug: 54 | print("columnar: {0}".format(columnar)) 55 | value2 = oamap.proxy.tojson(columnar) 56 | self.assertEqual(value, value2) 57 | 58 | def test_Primitive(self): 59 | self.check(3) 60 | self.check(3.14) 61 | self.check({"real": 3, "imag": 4}) 62 | self.check("inf") 63 | self.check("-inf") 64 | self.check("nan") 65 | # self.check([[1, 2], [3, 4]], Primitive("i8", (2, 2))) 66 | 67 | def test_List(self): 68 | self.check([], schema=List(Primitive("i8"))) 69 | self.check([], schema=List(List(List(List(Primitive("i8")))))) 70 | self.check([[[[]]]], schema=List(List(List(List(Primitive("i8")))))) 71 | self.check([1, 2, 3]) 72 | self.check([[1, 2, 3], [], [4, 5]]) 73 | self.check([[1, 2, None], [], [4, 5]]) 74 | 75 | def test_Union(self): 76 | self.check([1, 2, 3, 4.4, 5.5, 6.6], schema=List(Union([Primitive("i8"), Primitive("f8")]))) 77 | self.check([3.14, [], 1.1, 2.2, [1, 2, 3]]) 78 | self.check([3.14, [], 1.1, None, [1, 2, 3]]) 79 | 80 | def test_Record(self): 81 | self.check({"one": 1, "two": 2.2}) 82 | self.check({"one": {"uno": 1, "dos": 2}, "two": 2.2}) 83 | self.check({"one": {"uno": 1, "dos": [2]}, "two": 2.2}) 84 | self.check([{"one": 1, "two": 2.2}, {"one": 1.1, "two": 2.2}]) # two of same Record 85 | self.check([{"one": 1, "two": 2.2}, {"one": [1, 2, 3], "two": 2.2}]) # Union of attribute 86 | self.check([{"one": 1, "two": 2.2}, {"two": 2.2}]) # Union of 
Records 87 | self.check([{"one": 1, "two": 2.2}, None]) # nullable Record 88 | 89 | def test_Tuple(self): 90 | self.check([1, [2, 3], [[4, 5], [6]]], schema=Tuple([Primitive("i8"), List(Primitive("i8")), List(List(Primitive("i8")))])) 91 | self.check([1, [2, 3], None], schema=Tuple([Primitive("i8"), List(Primitive("i8")), List(List(Primitive("i8")), nullable=True)])) 92 | 93 | def test_Pointer(self): 94 | class Node(object): 95 | def __init__(self, label, next): 96 | self.label = label 97 | self.next = next 98 | 99 | schema = Record({"label": Primitive("i8")}, name="Node") 100 | schema["next"] = Pointer(schema) 101 | value = Node(0, Node(1, Node(2, None))) 102 | value.next.next.next = value 103 | 104 | arrays = oamap.fill.fromdata(value, schema) 105 | columnar = schema(arrays) 106 | 107 | self.assertEqual(value.label, columnar.label) 108 | self.assertEqual(value.next.label, columnar.next.label) 109 | self.assertEqual(value.next.next.label, columnar.next.next.label) 110 | self.assertEqual(value.next.next.next.label, columnar.next.next.next.label) 111 | self.assertEqual(value.next.next.next.next.label, columnar.next.next.next.next.label) 112 | self.assertEqual(value.next.next.next.next.next.label, columnar.next.next.next.next.next.label) 113 | self.assertEqual(value.next.next.next.next.next.next.label, columnar.next.next.next.next.next.next.label) 114 | -------------------------------------------------------------------------------- /oamap/backend/arrow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 
11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

import numpy

import oamap.schema
import oamap.generator
from oamap.util import OrderedDict

def schema(table):
    """Translate a pyarrow Table's schema into an oamap schema.

    Column array names are encoded as "<columnname>/<bufferindex>", where the
    buffer index follows Arrow's per-chunk buffer layout: for each nesting
    level, buffer `index` is the validity mask and `index + 1` is the offsets
    (for lists) or the data (for primitives).
    """
    import pyarrow
    def recurse(node, name, index, nullable):
        if isinstance(node, pyarrow.lib.ListType):
            # list level consumes two buffers: mask at `index`, offsets at `index + 1`
            # (starts and stops both point at the same offsets buffer; getall
            # derives stops from starts)
            return oamap.schema.List(recurse(node.value_type, name, index + 2, nullable),
                                     nullable=nullable,
                                     starts="{0}/{1}".format(name, index + 1),
                                     stops="{0}/{1}".format(name, index + 1),
                                     mask="{0}/{1}".format(name, index))
        elif isinstance(node, pyarrow.lib.DataType):
            # primitive leaf: mask at `index`, data at `index + 1`
            return oamap.schema.Primitive(node.to_pandas_dtype(),
                                          nullable=nullable,
                                          data="{0}/{1}".format(name, index + 1),
                                          mask="{0}/{1}".format(name, index))
        else:
            raise NotImplementedError(type(node))

    fields = []
    for n in table.schema.names:
        field = table.schema.field_by_name(n)
        fields.append((n, recurse(field.type, n, 0, field.nullable)))

    # top level is a list of records spanning the whole table; empty names are
    # resolved specially in getall below
    return oamap.schema.List(
        oamap.schema.Record(OrderedDict(fields)),
        starts="",
        stops="")

def proxy(table):
    """Wrap a pyarrow Table as an oamap ListProxy without copying data buffers
    (masks are the exception: Arrow validity bitmaps are unpacked and converted
    to oamap's index-style masks)."""
    import pyarrow
    class _ArrayDict(object):
        # dict-like source resolving "<column>/<bufferindex>" names to arrays
        def __init__(self, table):
            self.table = table

        def chop(self, name):
            # split "<column>/<bufferindex>" at the LAST slash
            slashindex = name.rindex("/")
            return name[:slashindex], int(name[slashindex + 1 :])

        def frombuffer(self, chunk, bufferindex):
            # walk the chunk's buffer layout to materialize buffer `bufferindex`
            def truncate(array, length, offset=0):
                # Arrow buffers may be padded; clip to logical length
                # (offset=1 keeps the extra entry of an offsets buffer)
                return array[:length + offset]

            def mask(index, length):
                buf = chunk.buffers()[index]
                if buf is None:
                    # no validity bitmap: all entries valid, identity mask
                    return numpy.arange(length, dtype=oamap.generator.Masked.maskdtype)
                else:
                    # unpack the validity bitmap, then build an index-style mask:
                    # valid entries get compact indices, invalid get maskedvalue
                    # NOTE(review): numpy.unpackbits is MSB-first per byte while
                    # Arrow bitmaps are LSB-first — confirm against a nullable sample
                    unmasked = truncate(numpy.unpackbits(numpy.frombuffer(buf, dtype=numpy.uint8)).view(numpy.bool_), length)
                    mask = numpy.empty(len(unmasked), dtype=oamap.generator.Masked.maskdtype)
                    mask[unmasked] = numpy.arange(unmasked.sum(), dtype=mask.dtype)
                    mask[~unmasked] = oamap.generator.Masked.maskedvalue
                    return mask

            def recurse(tpe, index, length):
                if isinstance(tpe, pyarrow.lib.ListType):
                    if index == bufferindex:
                        # list mask
                        return mask(index, length)
                    elif index + 1 == bufferindex:
                        # list starts
                        return truncate(numpy.frombuffer(chunk.buffers()[index + 1], dtype=numpy.int32), length, 1)
                    else:
                        # descend into list; the child's length is the last offset
                        length = truncate(numpy.frombuffer(chunk.buffers()[index + 1], dtype=numpy.int32), length, 1)[-1]
                        return recurse(tpe.value_type, index + 2, length)

                elif isinstance(tpe, pyarrow.lib.DataType):
                    if index == bufferindex:
                        # data mask
                        return mask(index, length)
                    elif index + 1 == bufferindex:
                        # data
                        return truncate(numpy.frombuffer(chunk.buffers()[index + 1], dtype=tpe.to_pandas_dtype()), length)
                    else:
                        raise AssertionError

                else:
                    raise NotImplementedError

            return recurse(chunk.type, 0, len(chunk))

        def getall(self, names):
            out = {}
            for name in names:
                if len(str(name)) == 0:
                    # empty name: the synthetic outermost list over all rows
                    if isinstance(name, oamap.generator.StartsRole):
                        out[name] = numpy.array([0], dtype=oamap.generator.ListGenerator.posdtype)
                    elif isinstance(name, oamap.generator.StopsRole):
                        out[name] = numpy.array([self.table.num_rows], dtype=oamap.generator.ListGenerator.posdtype)
                    else:
                        raise AssertionError

                elif isinstance(name, oamap.generator.StopsRole):
                    # stops are the starts (same offsets buffer) shifted by one;
                    # relies on starts having been requested in the same batch
                    out[name] = out[name.starts][1:]

                else:
                    columnname, bufferindex = self.chop(str(name))
                    column = self.table[self.table.schema.names.index(columnname)]
                    chunks = column.data.chunks
                    if len(chunks) == 0:
                        raise ValueError("Arrow column {0} has no chunks".format(repr(columnname)))
                    elif len(chunks) == 1:
                        out[name] = self.frombuffer(chunks[0], bufferindex)
                    else:
                        # multi-chunk columns are flattened into one array
                        # NOTE(review): concatenating raw offset buffers from
                        # several chunks does not rebase the offsets — confirm
                        # against a multi-chunk table
                        out[name] = numpy.concatenate([self.frombuffer(chunk, bufferindex) for chunk in chunks])

            return out

    return schema(table)(_ArrayDict(table))
-------------------------------------------------------------------------------- /oamap/backend/root/cmsnano.py: --------------------------------------------------------------------------------
#!/usr/bin/env python

# Copyright (c) 2017, DIANA-HEP
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import numpy

import oamap.backend.root
import oamap.schema
import oamap.dataset
import oamap.proxy
import oamap.extension.common   # FIX: used in default arguments below but was never imported
from oamap.util import OrderedDict

def dataset(path, treepath="Events", namespace=None, **kwargs):
    """Open CMS NanoAOD file(s) as an oamap Dataset.

    path: file path or glob understood by uproot.tree.numentries.
    treepath: name of the TTree inside each file (default "Events").
    namespace: backend namespace key; derived from *path* if None.
    Remaining kwargs are forwarded to uproot.tree.numentries.
    """
    import uproot

    if namespace is None:
        namespace = "root.cmsnano({0})".format(repr(path))

    if "localsource" not in kwargs:
        kwargs["localsource"] = lambda path: uproot.source.file.FileSource(path, chunkbytes=8*1024, limitbytes=None)
    kwargs["total"] = False
    kwargs["blocking"] = True

    paths2entries = uproot.tree.numentries(path, treepath, **kwargs)
    if len(paths2entries) == 0:
        raise ValueError("path {0} matched no TTrees".format(repr(path)))

    # cumulative entry offsets: partition i covers [offsets[i], offsets[i+1])
    offsets = [0]
    paths = []
    for path, numentries in paths2entries.items():
        offsets.append(offsets[-1] + numentries)
        paths.append(path)

    # the schema is inferred from the first file only (see "schemafrom" metadata)
    sch = schema(paths[0], namespace=namespace)
    doc = sch.doc
    sch.doc = None

    return oamap.dataset.Dataset(treepath,
                                 sch,
                                 {namespace: oamap.backend.root.ROOTBackend(paths, treepath, namespace)},
                                 oamap.dataset.SingleThreadExecutor(),
                                 offsets,
                                 extension=None,
                                 packing=None,
                                 doc=doc,
                                 metadata={"schemafrom": paths[0]})

def proxy(path, treepath="Events", namespace=None, extension=oamap.extension.common):
    """Open a single NanoAOD file as a lazy ListProxy of events."""
    import uproot

    if namespace is None:
        namespace = "root.cmsnano({0})".format(repr(path))

    def localsource(path):
        return uproot.source.file.FileSource(path, chunkbytes=8*1024, limitbytes=None)

    return _proxy(uproot.open(path, localsource=localsource)[treepath], namespace=namespace, extension=extension)

def _proxy(tree, namespace=None, extension=oamap.extension.common):
    """Build a ListProxy over an already-open uproot TTree."""
    if namespace is None:
        # FIX: this branch formerly formatted an undefined name "path" (NameError);
        # derive the default namespace from the tree's own source path instead.
        namespace = "root.cmsnano({0})".format(repr(tree._context.sourcepath))

    schema = _schema(tree, namespace=namespace)
    generator = schema.generator(extension=extension)

    return oamap.proxy.ListProxy(generator, oamap.backend.root.ROOTArrays(tree, oamap.backend.root.ROOTBackend([tree._context.sourcepath], tree._context.treename, namespace)), generator._newcache(), 0, 1, tree.numentries)

def schema(path, treepath="Events", namespace=None):
    """Infer the NanoAOD-grouped schema from one file."""
    import uproot

    if namespace is None:
        namespace = "root.cmsnano({0})".format(repr(path))

    def localsource(path):
        return uproot.source.file.FileSource(path, chunkbytes=8*1024, limitbytes=None)

    return _schema(uproot.open(path, localsource=localsource)[treepath], namespace=namespace)

def _schema(tree, namespace=None):
    """Build a NanoAOD-specific schema from *tree*: flat "Group_field" branches
    are regrouped into records/lists ("Muon_pt" -> Muon[].pt), and HLT_*/Flag_*
    branches are collected into HLT and Flag records."""
    if namespace is None:
        # FIX: this branch formerly formatted an undefined name "path" (NameError);
        # derive the default namespace from the tree's own source path instead.
        namespace = "root.cmsnano({0})".format(repr(tree._context.sourcepath))

    schema = oamap.backend.root._schema(tree, namespace=namespace)

    groups = OrderedDict()
    for name in list(schema.content.keys()):   # list(): keys are deleted in the loop
        if isinstance(schema.content[name], oamap.schema.List) and "_" in name:
            try:
                branch = tree[schema.content[name].starts]
            except KeyError:
                pass
            else:
                # jagged "Group_field" branches share a count branch -> one List[Record]
                underscore = name.index("_")
                groupname, fieldname = name[:underscore], name[underscore + 1:]
                countbranchname = branch.countbranch.name
                if not isinstance(countbranchname, str):
                    countbranchname = countbranchname.decode("ascii")
                if groupname not in groups:
                    groups[groupname] = schema.content[groupname] = \
                        oamap.schema.List(oamap.schema.Record({}, name=groupname), starts=countbranchname, stops=countbranchname, namespace=namespace)
                assert countbranchname == schema.content[groupname].starts
                groups[groupname].content[fieldname] = schema.content[name].content
                del schema.content[name]

        elif "MET_" in name or name.startswith("LHE_") or name.startswith("Pileup_") or name.startswith("PV_"):
            # scalar per-event groups become plain Records
            underscore = name.index("_")
            groupname, fieldname = name[:underscore], name[underscore + 1:]
            if groupname not in groups:
                groups[groupname] = schema.content[groupname] = \
                    oamap.schema.Record({}, name=groupname)
            groups[groupname][fieldname] = schema.content[name]
            del schema.content[name]

    hlt = oamap.schema.Record({}, name="HLT")
    flag = oamap.schema.Record({}, name="Flag")
    # FIX: iterate over a snapshot -- the loop deletes keys from schema.content,
    # which raises RuntimeError on Python 3 when iterating the live keys() view.
    for name in list(schema.content.keys()):
        if name.startswith("HLT_"):
            hlt[name[4:]] = schema.content[name]
            del schema.content[name]
        elif name.startswith("Flag_"):
            flag[name[5:]] = schema.content[name]
            del schema.content[name]

    schema.content["HLT"] = hlt
    schema.content["Flag"] = flag
    schema.content.name = "Event"
    return schema
import math

import unittest

from oamap.schema import *
from oamap.database import *
from oamap.dataset import *
import oamap.operations

class TestDatabase(unittest.TestCase):
    def runTest(self):
        pass

    def test_data(self):
        """Round-trip columnar data through InMemoryDatabase and exercise
        recasting (project/drop/keep), transformations (filter), and
        actions (map/reduce) on a single-record dataset."""
        db = InMemoryDatabase()
        db.fromdata("one",
                    Record({"x": List("int32"), "y": List("float64")}),
                    {"x": [1, 2, 3, 4, 5], "y": [1.1, 2.2, 3.3]})

        one = db.data.one
        for index, expected in enumerate([1, 2, 3]):
            self.assertEqual(one().x[index], expected)
        for index, expected in enumerate([1.1, 2.2, 3.3]):
            self.assertEqual(one().y[index], expected)

        # recasting
        db.data.two = one.project("x")
        two = db.data.two
        for index in range(5):
            self.assertEqual(two[index], index + 1)

        db.data.two = one.drop("y")
        two = db.data.two
        for index in range(5):
            self.assertEqual(two().x[index], index + 1)

        db.data.two = one.drop("y").keep("x")
        two = db.data.two
        for index in range(5):
            self.assertEqual(two().x[index], index + 1)

        # transformation
        db.data.three = one.filter(lambda x: x % 2 == 0, at="x")
        three = db.data.three
        self.assertEqual(three().x, [2, 4])

        db.data.three = one.filter(lambda x: x > 1, at="x").filter(lambda x: x < 5, at="x")
        three = db.data.three
        self.assertEqual(three().x, [2, 3, 4])

        # action
        table = one.map(lambda x: x**2, at="x")
        self.assertEqual(table.result().tolist(), [1, 4, 9, 16, 25])

        summary = one.reduce(0, lambda x, tally: x + tally, at="x")
        self.assertEqual(summary.result(), sum([1, 2, 3, 4, 5]))

    def test_dataset(self):
        """Same operations on a two-partition List(Record) dataset, including
        per-partition checks and reference-count cleanup after deletion."""
        db = InMemoryDatabase()
        db.fromdata("one",
                    List(Record({"x": "int32", "y": "float64"})),
                    [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}],
                    [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}, {"x": 6, "y": 6.6}])
        one = db.data.one
        for index in range(6):
            self.assertEqual(one[index].x, index + 1)
        self.assertEqual([obj.x for obj in one], [1, 2, 3, 4, 5, 6])
        self.assertEqual([obj.y for obj in one], [1.1, 2.2, 3.3, 4.4, 5.5, 6.6])
        self.assertEqual(oamap.operations.project(one.partition(0), "x"), [1, 2, 3])
        self.assertEqual(oamap.operations.project(one.partition(1), "x"), [4, 5, 6])

        # recasting
        db.data.two = one.project("x")
        two = db.data.two
        self.assertEqual(two.partition(0), [1, 2, 3])
        self.assertEqual(two.partition(1), [4, 5, 6])
        self.assertEqual([x for x in two], [1, 2, 3, 4, 5, 6])

        db.data.two = one.drop("y").project("x")
        two = db.data.two
        self.assertEqual([x for x in two], [1, 2, 3, 4, 5, 6])
        self.assertEqual(two.partition(0), [1, 2, 3])
        self.assertEqual(two.partition(1), [4, 5, 6])

        # transformation
        db.data.three = one.filter(lambda obj: obj.x % 2 == 0)
        three = db.data.three
        self.assertEqual([obj.x for obj in three], [2, 4, 6])
        self.assertEqual([obj.y for obj in three], [2.2, 4.4, 6.6])
        self.assertEqual(oamap.operations.project(three.partition(0), "x"), [2])
        self.assertEqual(oamap.operations.project(three.partition(1), "x"), [4, 6])

        db.data.three = one.filter(lambda obj: obj.x > 1).filter(lambda obj: obj.x < 6)
        three = db.data.three
        self.assertEqual([obj.x for obj in three], [2, 3, 4, 5])
        self.assertEqual([obj.y for obj in three], [2.2, 3.3, 4.4, 5.5])
        self.assertEqual(oamap.operations.project(three.partition(0), "x"), [2, 3])
        self.assertEqual(oamap.operations.project(three.partition(1), "x"), [4, 5])

        # action
        table = one.map(lambda obj: None if obj.x % 2 == 0 else (obj.x, obj.y, obj.x + obj.y))
        self.assertEqual(table.result().tolist(), [(1, 1.1, 2.1), (3, 3.3, 6.3), (5, 5.5, 10.5)])

        summary = one.reduce(0, lambda obj, tally: obj.x + tally)
        self.assertEqual(summary.result(), sum([1, 2, 3, 4, 5, 6]))

        # deleting every dataset should release all backing arrays
        # (inspect db._backends[db._namespace]._arrays/_refcounts when debugging)
        del db.data.one
        del db.data.two
        del db.data.three

        self.assertEqual(len(db._backends[db._namespace]._refcounts.get(0, {})), 0)
        self.assertEqual(len(db._backends[db._namespace]._refcounts.get(1, {})), 0)
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | import ast 32 | import math 33 | import sys 34 | import types 35 | 36 | import numpy 37 | 38 | if sys.version_info[0] > 2: 39 | basestring = str 40 | unicode = str 41 | def MethodType(function, instance, cls): 42 | if instance is None: 43 | return function 44 | else: 45 | return types.MethodType(function, instance) 46 | else: 47 | MethodType = types.MethodType 48 | 49 | try: 50 | from collections import OrderedDict 51 | except ImportError: 52 | # simple OrderedDict implementation for Python 2.6 53 | class OrderedDict(dict): 54 | def __init__(self, items=(), **kwds): 55 | items = list(items) 56 | self._order = [k for k, v in items] + [k for k, v in kwds.items()] 57 | super(OrderedDict, self).__init__(items) 58 | def keys(self): 59 | return self._order 60 | def values(self): 61 | return [self[k] for k in self._order] 62 | def items(self): 63 | return [(k, self[k]) for k in self._order] 64 | def __setitem__(self, name, value): 65 | if name not in self._order: 66 | self._order.append(name) 67 | super(OrderedDict, self).__setitem__(name, value) 68 | def __delitem__(self, name): 69 | if name in self._order: 70 | self._order.remove(name) 71 | super(OrderedDict, self).__delitem__(name) 72 | def __repr__(self): 73 | return "OrderedDict([{0}])".format(", ".join("({0}, {1})".format(repr(k), repr(v)) for k, v in self.items())) 74 | 75 | try: 76 | from UserDict import DictMixin as MutableMapping 77 | except ImportError: 
78 | from collections import MutableMapping 79 | 80 | try: 81 | from importlib import import_module 82 | except ImportError: 83 | def import_module(modulename): 84 | module = __import__(modulename) 85 | for name in modulename.split(".")[1:]: 86 | module = module.__dict__[name] 87 | return module 88 | 89 | def slice2sss(index, length): 90 | step = 1 if index.step is None else index.step 91 | 92 | if step == 0: 93 | raise ValueError("slice step cannot be zero") 94 | 95 | elif step > 0: 96 | if index.start is None: 97 | start = 0 # in-range 98 | elif index.start >= 0: 99 | start = min(index.start, length) # in-range or length 100 | else: 101 | start = max(0, index.start + length) # in-range 102 | 103 | if index.stop is None: 104 | stop = length # length 105 | elif index.stop >= 0: 106 | stop = max(start, min(length, index.stop)) # in-range or length 107 | else: 108 | stop = max(start, index.stop + length) # in-range or length 109 | 110 | else: 111 | if index.start is None: 112 | start = length - 1 # in-range 113 | elif index.start >= 0: 114 | start = min(index.start, length - 1) # in-range 115 | else: 116 | start = max(index.start + length, -1) # in-range or -1 117 | 118 | if index.stop is None: 119 | stop = -1 # -1 120 | elif index.stop >= 0: 121 | stop = min(start, index.stop) # in-range or -1 122 | else: 123 | stop = min(start, max(-1, index.stop + length)) # in-range or -1 124 | 125 | return start, stop, step 126 | 127 | def json2python(value): 128 | def recurse(value): 129 | if isinstance(value, dict) and len(value) == 2 and set(value.keys()) == set(["real", "imag"]) and all(isinstance(x, (int, float)) for x in value.values()): 130 | return value["real"] + value["imag"]*1j 131 | elif value == "inf": 132 | return float("inf") 133 | elif value == "-inf": 134 | return float("-inf") 135 | elif value == "nan": 136 | return float("nan") 137 | elif isinstance(value, list): 138 | return [recurse(x) for x in value] 139 | elif isinstance(value, dict): 140 | return dict((n, 
recurse(x)) for n, x in value.items()) 141 | else: 142 | return value 143 | return recurse(value) 144 | 145 | def python2json(value, allowlinks=False): 146 | def recurse(value, memo): 147 | if id(value) in memo: 148 | if allowlinks: 149 | return memo[id(value)] 150 | else: 151 | raise TypeError("cross-linking within an object is not allowed") 152 | 153 | if value is None: 154 | memo[id(value)] = None 155 | 156 | elif isinstance(value, (numbers.Integral, numpy.integer)): 157 | memo[id(value)] = int(value) 158 | 159 | elif isinstance(value, (numbers.Real, numpy.floating)): 160 | if math.isnan(value): 161 | memo[id(value)] = "nan" 162 | elif math.isinf(value) and value > 0: 163 | memo[id(value)] = "inf" 164 | elif math.isinf(value): 165 | memo[id(value)] = "-inf" 166 | else: 167 | memo[id(value)] = float(value) 168 | 169 | elif isinstance(value, (numbers.Complex, numpy.complex)): 170 | memo[id(value)] = {"real": float(value.real), "imag": float(value.imag)} 171 | 172 | elif isinstance(value, basestring): 173 | memo[id(value)] = value 174 | 175 | elif isinstance(value, dict): 176 | memo[id(value)] = {} 177 | for n, x in value.items(): 178 | if not isinstance(n, basestring): 179 | raise TypeError("dict keys for JSON must be strings") 180 | memo[id(value)][n] = recurse(x, memo) 181 | 182 | else: 183 | memo[id(value)] = [] 184 | for x in value: 185 | memo[id(value)].append(recurse(x, memo)) 186 | 187 | return memo[id(value)] 188 | 189 | return recurse(value, {}) 190 | 191 | def python2hashable(value): 192 | def recurse(value): 193 | if isinstance(value, dict): 194 | return tuple((n, recurse(value[n])) for n in sorted(value)) 195 | elif isinstance(value, list): 196 | return tuple(recurse(x) for x in value) 197 | else: 198 | return value 199 | return recurse(python2json(value)) 200 | 201 | def varname(avoid, trial=None): 202 | while trial is None or trial in avoid: 203 | trial = "v" + str(len(avoid)) 204 | avoid.add(trial) 205 | return trial 206 | 207 | def 
def paramtypes(args):
    """Return a tuple of Numba types for *args*, or None if Numba is absent."""
    try:
        import numba as nb
    except ImportError:
        return None
    else:
        return tuple(nb.typeof(x) for x in args)

def doexec(module, env):
    """Execute a compiled code object in *env* (kept in its own function so
    exec runs without the caller's local scope)."""
    exec(module, env)

def stringfcn(fcn):
    """If *fcn* is a string, compile it into a function whose parameters are
    the expression's free variables; otherwise return *fcn* unchanged.

    The last expression statement of the string becomes the return value.
    math's functions and this module's globals are available to the body.
    """
    if isinstance(fcn, basestring):
        parsed = ast.parse(fcn).body
        if isinstance(parsed[-1], ast.Expr):
            # turn the trailing expression into "return <expr>"
            parsed[-1] = ast.Return(parsed[-1].value)
            parsed[-1].lineno = parsed[-1].value.lineno
            parsed[-1].col_offset = parsed[-1].value.col_offset

        env = dict(math.__dict__)
        env.update(globals())

        # collect names that are read but never assigned and not predefined:
        # these become the generated function's parameters
        free = set()
        defined = set(["None", "False", "True"])
        defined.update(env)
        def recurse(node):
            if isinstance(node, ast.Name):
                if isinstance(node.ctx, ast.Store):
                    defined.add(node.id)
                elif isinstance(node.ctx, ast.Load) and node.id not in defined:
                    free.add(node.id)
            elif isinstance(node, ast.AST):
                for n in node._fields:
                    recurse(getattr(node, n))
            elif isinstance(node, list):
                for x in node:
                    recurse(x)
        recurse(parsed)

        avoid = free.union(defined)
        fcnname = varname(avoid, "fcn")

        # FIX: sort the free variables so the generated parameter order is
        # deterministic (a bare set iterates in hash order, which varies).
        module = ast.parse("""
def {fcn}({params}):
    REPLACEME
""".format(fcn=fcnname, params=",".join(sorted(free))))
        module.body[0].body = parsed
        module = compile(module, "", "exec")

        doexec(module, env)
        fcn = env[fcnname]

    return fcn

def trycompile(fcn, paramtypes=None, numba=True):
    """Compile *fcn* (string or callable) with Numba if available.

    numba=False/None disables compilation; numba=True uses default options;
    a dict is passed to nb.jit as keyword options.
    """
    fcn = stringfcn(fcn)

    if numba is None or numba is False:
        return fcn

    try:
        import numba as nb
    except ImportError:
        return fcn

    if numba is True:
        numbaopts = {}
    else:
        numbaopts = numba

    if isinstance(fcn, nb.dispatcher.Dispatcher):
        # FIX: Numba Dispatchers expose the wrapped Python function as
        # .py_func; ".py_fcn" raised AttributeError.
        fcn = fcn.py_func

    if paramtypes is None:
        return nb.jit(**numbaopts)(fcn)
    else:
        return nb.jit(paramtypes, **numbaopts)(fcn)

def returntype(fcn, paramtypes):
    """Return the inferred return type of a compiled *fcn* for *paramtypes*,
    or None if Numba is absent, fcn is not compiled, or no overload exists."""
    try:
        import numba as nb
    except ImportError:
        return None

    if isinstance(fcn, nb.dispatcher.Dispatcher):
        overload = fcn.overloads.get(paramtypes, None)
        if overload is None:
            return None
        else:
            return overload.signature.return_type
import json
import sys

import numpy

import oamap.generator

if sys.version_info[0] > 2:
    basestring = str

class PackedSource(object):
    """Base for chainable array-source wrappers that store some arrays in a
    transformed ("packed") representation.

    getall/putall translate between the logical roles the generators request
    and whatever the wrapped source actually stores; subclasses override them
    and must define _tojsonargs() for (de)serialization and equality.
    """
    def __init__(self, source, suffix):
        self.source = source    # wrapped source: another PackedSource, a dict-like, or None
        self.suffix = suffix    # appended to array names for the packed form

    def __repr__(self):
        return "{0}({1}{2})".format(self.__class__.__name__, repr(self.source), "".join(", " + repr(x) for x in self._tojsonargs()))

    def getall(self, roles):
        # delegate to the wrapped source; plain mappings are indexed by name
        if hasattr(self.source, "getall"):
            return self.source.getall(roles)
        else:
            return dict((n, self.source[str(n)]) for n in roles)

    def putall(self, roles2arrays):
        if hasattr(self.source, "putall"):
            self.source.putall(roles2arrays)
        else:
            for n, x in roles2arrays.items():
                self.source[str(n)] = x

    def copy(self):
        return self.__class__(self.source, self.suffix)

    def anchor(self, source):
        """Attach a concrete *source* at the innermost end of the chain."""
        if self.source is None:
            return self.__class__(source, self.suffix)
        else:
            return self.__class__(self.source.anchor(source), self.suffix)

    def __eq__(self, other):
        return self.__class__.__name__ == other.__class__.__name__ and self._tojsonargs() == other._tojsonargs()

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash((PackedSource, self.__class__.__name__, tuple(self._tojsonargs())))

    def tojsonfile(self, file, *args, **kwds):
        json.dump(self.tojson(), file, *args, **kwds)

    def tojsonstring(self, *args, **kwds):
        return json.dumps(self.tojson(), *args, **kwds)

    def tojson(self):
        """Serialize the packing chain, outermost wrapper first."""
        out = []
        node = self
        while isinstance(node, PackedSource):
            # FIX: this loop formerly read self._tojsonargs() and
            # self.__class__, so every level of the chain serialized the
            # outermost node instead of its own class and arguments.
            args = node._tojsonargs()
            if len(args) == 0:
                out.append(node.__class__.__name__)
            else:
                out.append({node.__class__.__name__: args})
            node = node.source
        return out

    @staticmethod
    def fromjsonfile(file, *args, **kwds):
        return PackedSource.fromjson(json.load(file, *args, **kwds))

    @staticmethod
    def fromjsonstring(data, *args, **kwds):
        return PackedSource.fromjson(json.loads(data, *args, **kwds))

    @staticmethod
    def fromjson(data):
        """Rebuild a packing chain from tojson() output (innermost last)."""
        if isinstance(data, list):
            source = None
            for datum in reversed(data):
                if isinstance(datum, basestring):
                    classname = datum
                    args = ()
                elif isinstance(datum, dict) and len(datum) == 1:
                    classname, = datum.keys()
                    args, = datum.values()
                else:
                    raise ValueError("source packings JSON must be a list of strings or {\"classname\": [args]} dicts")
                try:
                    cls = globals()[classname]
                except KeyError:
                    raise ValueError("source packing class {0} not found".format(repr(classname)))
                source = cls(source, *args)
            return source
        else:
            raise ValueError("source packings JSON must be a list of strings or {\"classname\": [args]} dicts")

################################################################ BitPackMasks

class MaskBitPack(PackedSource):
    """Stores boolean-style mask arrays as packed bits (8 flags per byte)."""
    def __init__(self, source, suffix="-bitpacked"):
        super(MaskBitPack, self).__init__(source, suffix)

    def _tojsonargs(self):
        if self.suffix == "-bitpacked":
            return []
        else:
            return [self.suffix]

    def getall(self, roles):
        # request the suffixed packed arrays in place of mask roles, then unpack
        others = [n for n in roles if not isinstance(n, oamap.generator.MaskRole)]
        renamed = dict((oamap.generator.NoRole(str(n) + self.suffix, n.namespace), n) for n in roles if isinstance(n, oamap.generator.MaskRole))
        out = super(MaskBitPack, self).getall(others + list(renamed))
        for suffixedname, name in renamed.items():
            out[name] = self.unpack(out[suffixedname])
            del out[suffixedname]
        return out

    def putall(self, roles2arrays):
        out = {}
        for n, x in roles2arrays.items():
            if isinstance(n, oamap.generator.MaskRole):
                out[oamap.generator.NoRole(str(n) + self.suffix, n.namespace)] = self.pack(x)
            else:
                out[n] = x
        super(MaskBitPack, self).putall(out)

    @staticmethod
    def unpack(array):
        """Expand packed bits into an index-style mask (position index where
        unmasked, Masked.maskedvalue where masked)."""
        if not isinstance(array, numpy.ndarray):
            array = numpy.array(array, dtype=numpy.dtype(numpy.uint8))
        unmasked = numpy.unpackbits(array).view(numpy.bool_)
        mask = numpy.empty(len(unmasked), dtype=oamap.generator.Masked.maskdtype)
        mask[unmasked] = numpy.arange(unmasked.sum(), dtype=mask.dtype)
        mask[~unmasked] = oamap.generator.Masked.maskedvalue
        return mask

    @staticmethod
    def pack(array):
        """Compress an index-style mask into packed bits."""
        if not isinstance(array, numpy.ndarray):
            array = numpy.array(array, dtype=oamap.generator.Masked.maskdtype)
        return numpy.packbits(array != oamap.generator.Masked.maskedvalue)

################################################################ RunLengthMasks

# TODO: run-length encoding for masks
################################################################ ListsAsCounts

class ListCounts(PackedSource):
    """Stores a list's starts/stops pair as a single counts array (valid only
    when the list data are contiguous: starts[0] == 0, starts[1:] == stops[:-1])."""
    def __init__(self, source, suffix="-counts"):
        super(ListCounts, self).__init__(source, suffix)

    def _tojsonargs(self):
        if self.suffix == "-counts":
            return []
        else:
            return [self.suffix]

    def getall(self, roles):
        # request the suffixed counts arrays in place of starts/stops roles
        others = [n for n in roles if not isinstance(n, (oamap.generator.StartsRole, oamap.generator.StopsRole))]
        renamed = dict((oamap.generator.NoRole(str(n) + self.suffix, n.namespace), n) for n in roles if isinstance(n, oamap.generator.StartsRole))
        out = super(ListCounts, self).getall(others + list(renamed))
        for suffixedname, name in renamed.items():
            out[name], out[name.stops] = self.fromcounts(out[suffixedname])
            del out[suffixedname]
        return out

    def putall(self, roles2arrays):
        out = {}
        for n, x in roles2arrays.items():
            if isinstance(n, oamap.generator.StartsRole):
                out[oamap.generator.NoRole(str(n) + self.suffix, n.namespace)] = self.tocounts(x, roles2arrays[n.stops])
            elif isinstance(n, oamap.generator.StopsRole):
                pass   # stops are folded into the counts written for starts
            else:
                out[n] = x
        super(ListCounts, self).putall(out)

    @staticmethod
    def fromcounts(array):
        """Expand a counts array into (starts, stops) offset views."""
        offsets = numpy.empty(len(array) + 1, dtype=oamap.generator.ListGenerator.posdtype)
        offsets[0] = 0
        offsets[1:] = numpy.cumsum(array)
        return offsets[:-1], offsets[1:]

    @staticmethod
    def tocounts(starts, stops):
        """Collapse contiguous (starts, stops) into a counts array; raises
        ValueError if they are not contiguous."""
        if not isinstance(starts, numpy.ndarray):
            starts = numpy.array(starts, dtype=oamap.generator.ListGenerator.posdtype)
        # FIX: this guard formerly re-tested `starts`, so a non-ndarray
        # `stops` was never converted before the arithmetic below.
        if not isinstance(stops, numpy.ndarray):
            stops = numpy.array(stops, dtype=oamap.generator.ListGenerator.posdtype)
        if not starts[0] == 0 or not numpy.array_equal(starts[1:], stops[:-1]):
            raise ValueError("starts and stops cannot be converted to a single counts array")
        return stops - starts

################################################################ DropUnionOffsets

class UnionDropOffsets(PackedSource):
    """Drops union offset arrays on write and reconstructs them from the tags
    on read (offsets are fully determined by the tag sequence)."""
    def __init__(self, source):
        # FIX: formerly super(DropUnionOffsets, self) -- a NameError, since
        # this class is named UnionDropOffsets.
        super(UnionDropOffsets, self).__init__(source, "")

    def _tojsonargs(self):
        return []

    def getall(self, roles):
        nooffsets = [n for n in roles if not isinstance(n, oamap.generator.OffsetsRole)]
        out = super(UnionDropOffsets, self).getall(nooffsets)
        for n in roles:
            if isinstance(n, oamap.generator.TagsRole):
                out[n.offsets] = self.tags2offsets(out[n])
        return out

    def putall(self, roles2arrays):
        super(UnionDropOffsets, self).putall(dict((n, x) for n, x in roles2arrays.items() if not isinstance(n, oamap.generator.OffsetsRole)))

    @staticmethod
    def tags2offsets(tags):
        """Rebuild offsets: element i's offset is its ordinal among elements
        sharing the same tag."""
        if not isinstance(tags, numpy.ndarray):
            tags = numpy.array(tags, dtype=oamap.generator.UnionGenerator.tagdtype)
        offsets = numpy.empty(len(tags), dtype=oamap.generator.UnionGenerator.offsetdtype)
        for tag in numpy.unique(tags):
            hastag = (tags == tag)
            offsets[hastag] = numpy.arange(hastag.sum(), dtype=offsets.dtype)
        return offsets

################################################################ CompressAll

# TODO: apply a named compression algorithm
15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import numpy

import oamap.schema
import oamap.dataset
import oamap.database
import oamap.proxy
import oamap.extension.common   # FIX: used in default arguments below but was never imported
import oamap.backend.packing
from oamap.util import OrderedDict

def dataset(path, treepath, namespace=None, **kwargs):
    """Open ROOT file(s) as an oamap Dataset, one partition per file.

    path: file path or glob understood by uproot.tree.numentries.
    treepath: path of the TTree within each file.
    namespace: backend namespace key; derived from path/treepath if None.
    """
    import uproot

    if namespace is None:
        namespace = "root({0}, {1})".format(repr(path), repr(treepath))

    if "localsource" not in kwargs:
        kwargs["localsource"] = lambda path: uproot.source.file.FileSource(path, chunkbytes=8*1024, limitbytes=None)
    kwargs["total"] = False
    kwargs["blocking"] = True

    paths2entries = uproot.tree.numentries(path, treepath, **kwargs)
    if len(paths2entries) == 0:
        raise ValueError("path {0} matched no TTrees".format(repr(path)))

    # cumulative entry offsets: partition i covers [offsets[i], offsets[i+1])
    offsets = [0]
    paths = []
    for path, numentries in paths2entries.items():
        offsets.append(offsets[-1] + numentries)
        paths.append(path)

    # schema is inferred from the first file only (see "schemafrom" metadata)
    sch = schema(paths[0], treepath, namespace=namespace)
    doc = sch.doc
    sch.doc = None

    return oamap.dataset.Dataset(treepath.split("/")[-1].split(";")[0],
                                 sch,
                                 {namespace: ROOTBackend(paths, treepath, namespace)},
                                 oamap.dataset.SingleThreadExecutor(),
                                 offsets,
                                 extension=None,
                                 packing=None,
                                 doc=doc,
                                 metadata={"schemafrom": paths[0]})

def proxy(path, treepath, namespace="", extension=oamap.extension.common):
    """Open a single ROOT TTree as a lazy ListProxy of entries."""
    import uproot
    def localsource(path):
        return uproot.source.file.FileSource(path, chunkbytes=8*1024, limitbytes=None)
    return _proxy(uproot.open(path, localsource=localsource)[treepath], namespace=namespace, extension=extension)

def _proxy(tree, namespace="", extension=oamap.extension.common):
    """Build a ListProxy over an already-open uproot TTree."""
    schema = _schema(tree, namespace=namespace)
    generator = schema.generator(extension=extension)
    return oamap.proxy.ListProxy(generator, ROOTArrays(tree, ROOTBackend([tree._context.sourcepath], tree._context.treename, namespace)), generator._newcache(), 0, 1, tree.numentries)

def schema(path, treepath, namespace=""):
    """Infer an oamap schema from one ROOT TTree."""
    import uproot
    def localsource(path):
        return uproot.source.file.FileSource(path, chunkbytes=8*1024, limitbytes=None)
    return _schema(uproot.open(path, localsource=localsource)[treepath], namespace=namespace)

def _schema(tree, namespace=None):
    """Translate *tree*'s branch structure into an oamap List(Record(...)) schema,
    merging jagged branches that share a count branch into a single list."""
    import uproot

    if namespace is None:
        # FIX: this branch formerly formatted undefined names "path" and
        # "treepath" (NameError); derive the namespace from the tree itself.
        namespace = "root({0}, {1})".format(repr(tree._context.sourcepath), repr(tree._context.treename))

    def accumulate(node):
        # one Record field per branch; nested branches become nested Records
        out = oamap.schema.Record(OrderedDict(), namespace=namespace)
        for branchname, branch in node.iteritems(aliases=False) if isinstance(node, uproot.tree.TTreeMethods) else node.iteritems():
            if not isinstance(branchname, str):
                branchname = branchname.decode("ascii")
            fieldname = branchname.split(".")[-1]

            if len(branch.fBranches) > 0:
                subrecord = accumulate(branch)
                if len(subrecord.fields) > 0:
                    out[fieldname] = subrecord

            elif isinstance(branch.interpretation, (uproot.interp.asdtype, uproot.interp.numerical.asdouble32)):
                # fixed-size branch; extra dims become nested lists
                subnode = oamap.schema.Primitive(branch.interpretation.todtype, data=branchname, namespace=namespace)
                for i in range(len(branch.interpretation.todims)):
                    subnode = oamap.schema.List(subnode, starts="{0}:/{1}".format(branchname, i), stops="{0}:/{1}".format(branchname, i), namespace=namespace)
                out[fieldname] = subnode

            elif isinstance(branch.interpretation, uproot.interp.asjagged) and isinstance(branch.interpretation.asdtype, uproot.interp.asdtype):
                # jagged branch: wrap the element type in an outer List
                subnode = oamap.schema.Primitive(branch.interpretation.asdtype.todtype, data=branchname, namespace=namespace)
                for i in range(len(branch.interpretation.asdtype.todims)):
                    subnode = oamap.schema.List(subnode, starts="{0}:/{1}".format(branchname, i), stops="{0}:/{1}".format(branchname, i), namespace=namespace)
                out[fieldname] = oamap.schema.List(subnode, starts=branchname, stops=branchname, namespace=namespace)

            elif isinstance(branch.interpretation, uproot.interp.asstrings):
                # FIX: CHARTYPE lives in uproot.interp.strings; oamap has no
                # "interp" module, so the old reference raised AttributeError.
                out[fieldname] = oamap.schema.List(oamap.schema.Primitive(uproot.interp.strings.CHARTYPE, data=branchname, namespace=namespace), starts=branchname, stops=branchname, namespace=namespace, name="ByteString")

        return out

    def combinelists(schema):
        # if every field of a record is a jagged list driven by the same count
        # branch, hoist them into one List[Record] with shared starts/stops
        if isinstance(schema, oamap.schema.Record) and all(isinstance(x, oamap.schema.List) for x in schema.fields.values()):
            out = oamap.schema.List(oamap.schema.Record(OrderedDict(), namespace=namespace), namespace=namespace)

            countbranch = None
            for fieldname, field in schema.items():
                try:
                    branch = tree[field.starts]
                except KeyError:
                    return schema

                if branch.countbranch is None:
                    return schema

                if countbranch is None:
                    countbranch = branch.countbranch
                elif countbranch is not branch.countbranch:
                    return schema

                out.content[fieldname] = field.content

            if countbranch is not None:
                countbranchname = countbranch.name
                if not isinstance(countbranchname, str):
                    countbranchname = countbranchname.decode("ascii")
                out.starts = countbranchname
                out.stops = countbranchname
                return out

        return schema

    entries = accumulate(tree).replace(combinelists)
    entries.name = "Entry"

    doc = tree.title
    if not isinstance(doc, str):
        doc = doc.decode("ascii")

    return oamap.schema.List(entries, namespace=namespace, doc=doc)
stops=branchname, namespace=namespace) 121 | 122 | elif isinstance(branch.interpretation, uproot.interp.asstrings): 123 | out[fieldname] = oamap.schema.List(oamap.schema.Primitive(oamap.interp.strings.CHARTYPE, data=branchname, namespace=namespace), starts=branchname, stops=branchname, namespace=namespace, name="ByteString") 124 | 125 | return out 126 | 127 | def combinelists(schema): 128 | if isinstance(schema, oamap.schema.Record) and all(isinstance(x, oamap.schema.List) for x in schema.fields.values()): 129 | out = oamap.schema.List(oamap.schema.Record(OrderedDict(), namespace=namespace), namespace=namespace) 130 | 131 | countbranch = None 132 | for fieldname, field in schema.items(): 133 | try: 134 | branch = tree[field.starts] 135 | except KeyError: 136 | return schema 137 | 138 | if branch.countbranch is None: 139 | return schema 140 | 141 | if countbranch is None: 142 | countbranch = branch.countbranch 143 | elif countbranch is not branch.countbranch: 144 | return schema 145 | 146 | out.content[fieldname] = field.content 147 | 148 | if countbranch is not None: 149 | countbranchname = countbranch.name 150 | if not isinstance(countbranchname, str): 151 | countbranchname = countbranchname.decode("ascii") 152 | out.starts = countbranchname 153 | out.stops = countbranchname 154 | return out 155 | 156 | return schema 157 | 158 | entries = accumulate(tree).replace(combinelists) 159 | entries.name = "Entry" 160 | 161 | doc = tree.title 162 | if not isinstance(doc, str): 163 | doc = doc.decode("ascii") 164 | 165 | return oamap.schema.List(entries, namespace=namespace, doc=doc) 166 | 167 | class ROOTBackend(oamap.database.Backend): 168 | def __init__(self, paths, treepath, namespace): 169 | self._paths = tuple(paths) 170 | self._treepath = treepath 171 | self._namespace = namespace 172 | 173 | @property 174 | def args(self): 175 | return (self._paths, self._treepath) 176 | 177 | def tojson(self): 178 | return {"class": self.__class__.__module__ + "." 
+ self.__class__.__name__, 179 | "paths": list(self._paths), 180 | "treepath": self._treepath} 181 | 182 | @staticmethod 183 | def fromjson(obj, namespace): 184 | return ROOTBackend(obj["paths"], obj["treepath"], namespace) 185 | 186 | @property 187 | def namespace(self): 188 | return self._namespace 189 | 190 | def instantiate(self, partitionid): 191 | return ROOTArrays.frompath(self._paths[partitionid], self._treepath, self) 192 | 193 | class ROOTArrays(object): 194 | @staticmethod 195 | def frompath(path, treepath, backend): 196 | import uproot 197 | file = uproot.open(path) 198 | out = ROOTArrays(file[treepath], backend) 199 | out._source = file._context.source 200 | return out 201 | 202 | def __init__(self, tree, backend): 203 | self._tree = tree 204 | self._backend = backend 205 | self._keycache = {} 206 | 207 | @property 208 | def tree(self): 209 | return self._tree 210 | 211 | @property 212 | def backend(self): 213 | return self._backend 214 | 215 | def getall(self, roles): 216 | import uproot 217 | 218 | def chop(role): 219 | name = str(role).encode("ascii") 220 | try: 221 | colon = name.rindex(b":") 222 | except ValueError: 223 | return name, None 224 | else: 225 | return name[:colon], name[colon + 1:] 226 | 227 | arrays = self._tree.arrays(set(chop(x)[0] for x in roles), keycache=self._keycache) 228 | 229 | out = {} 230 | for role in roles: 231 | branchname, leafname = chop(role) 232 | array = arrays[branchname] 233 | 234 | if leafname is not None and leafname.startswith(b"/"): 235 | if isinstance(array, (uproot.interp.jagged.JaggedArray, uproot.interp.strings.Strings)): 236 | array = array.content 237 | 238 | length = array.shape[0] 239 | stride = 1 240 | for depth in range(int(leafname[1:])): 241 | length *= array.shape[depth + 1] 242 | stride *= array.shape[depth + 1] 243 | 244 | if isinstance(role, oamap.generator.StartsRole) and role not in out: 245 | offsets = numpy.arange(0, (length + 1)*stride, stride) 246 | out[role] = offsets[:-1] 247 | 
out[role.stops] = offsets[1:] 248 | 249 | elif isinstance(role, oamap.generator.StopsRole) and role not in out: 250 | offsets = numpy.arange(0, (length + 1)*stride, stride) 251 | out[role.starts] = offsets[:-1] 252 | out[role] = offsets[1:] 253 | 254 | elif isinstance(array, numpy.ndarray): 255 | if isinstance(role, oamap.generator.StartsRole) and role not in out: 256 | starts, stops = oamap.backend.packing.ListCounts.fromcounts(array) 257 | out[role] = starts 258 | out[role.stops] = stops 259 | 260 | elif isinstance(role, oamap.generator.StopsRole) and role not in out: 261 | starts, stops = oamap.backend.packing.ListCounts.fromcounts(array) 262 | out[role.starts] = starts 263 | out[role] = stops 264 | 265 | elif isinstance(role, oamap.generator.DataRole): 266 | if leafname is None: 267 | out[role] = array.reshape(-1) 268 | else: 269 | out[role] = array[leafname].reshape(-1) 270 | 271 | elif isinstance(array, (uproot.interp.jagged.JaggedArray, uproot.interp.strings.Strings)): 272 | if isinstance(role, oamap.generator.StartsRole): 273 | out[role] = array.starts 274 | 275 | elif isinstance(role, oamap.generator.StopsRole): 276 | out[role] = array.stops 277 | 278 | elif isinstance(role, oamap.generator.DataRole): 279 | if leafname is None: 280 | out[role] = array.content.reshape(-1) 281 | else: 282 | out[role] = array.content[leafname].reshape(-1) 283 | 284 | if role not in out: 285 | raise AssertionError(role) 286 | 287 | return out 288 | 289 | def close(self): 290 | if hasattr(self, "_source"): 291 | self._source.close() 292 | self._tree = None 293 | -------------------------------------------------------------------------------- /tests/test_proxy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 4 | # All rights reserved. 
5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import unittest

import oamap.proxy
from oamap.schema import *

class TestProxy(unittest.TestCase):
    """Behavioral tests for oamap proxies built directly from dicts of columnar
    arrays (keys follow the generator's naming scheme: -B starts, -E stops,
    -D data, -M mask, -T tags, -O offsets, -F/-U fields, -P/-X pointers)."""

    def runTest(self):
        # Allows direct instantiation of this TestCase without a method name.
        pass

    def test_ListProxy_slicing(self):
        """ListProxy slicing must agree with builtin list slicing, including
        out-of-range, negative, and re-sliced slices."""
        range100 = list(range(100))
        proxy100 = List(Primitive("i8"))({"object-B": [0], "object-E": [100], "object-L-Di8": range100})
        self.assertEqual(range100, proxy100)
        for start1 in [None, 0, 5, 95, 110, -1, -5, -95, -110]:
            for stop1 in [None, 0, 5, 95, 110, -1, -5, -95, -110]:
                for step1 in [None, 1, 2, 5, 90, 110, -1, -2, -5, -90, -110]:
                    sliced_range100 = range100[start1:stop1:step1]
                    sliced_proxy100 = proxy100[start1:stop1:step1]
                    self.assertEqual(sliced_range100, sliced_proxy100)
                    if len(sliced_range100) > 0:
                        for start2 in [None, 0, 5, -1, -5]:
                            for stop2 in [None, 0, 5, -1, -5]:
                                for step2 in [None, 1, 3, -1, -3]:
                                    self.assertEqual(sliced_range100[start2:stop2:step2], sliced_proxy100[start2:stop2:step2])

    def test_Primitive(self):
        self.assertEqual(Primitive("f8")({"object-Df8": [3.14]}), 3.14)
        self.assertEqual(Primitive("f8", nullable=True)({"object-Df8": [], "object-M": [-1]}), None)
        self.assertEqual(Primitive("f8", nullable=True)({"object-Df8": [3.14], "object-M": [0]}), 3.14)

    def test_List(self):
        self.assertEqual(List(Primitive("f8"))({"object-B": [0], "object-E": [5], "object-L-Df8": [1.1, 2.2, 3.3, 4.4, 5.5]}), [1.1, 2.2, 3.3, 4.4, 5.5])
        self.assertEqual(len(List(Primitive("f8"))({"object-B": [0], "object-E": [5], "object-L-Df8": [1.1, 2.2, 3.3, 4.4, 5.5]})), 5)
        self.assertEqual(List(List(Primitive("f8")))({"object-B": [0], "object-E": [3], "object-L-B": [0, 2, 2], "object-L-E": [2, 2, 5], "object-L-L-Df8": [1.1, 2.2, 3.3, 4.4, 5.5]}), [[1.1, 2.2], [], [3.3, 4.4, 5.5]])
        self.assertEqual(len(List(List(Primitive("f8")))({"object-B": [0], "object-E": [3], "object-L-B": [0, 2, 2], "object-L-E": [2, 2, 5], "object-L-L-Df8": [1.1, 2.2, 3.3, 4.4, 5.5]})), 3)
        self.assertEqual(list(map(len, List(List(Primitive("f8")))({"object-B": [0], "object-E": [3], "object-L-B": [0, 2, 2], "object-L-E": [2, 2, 5], "object-L-L-Df8": [1.1, 2.2, 3.3, 4.4, 5.5]}))), [2, 0, 3])
        self.assertEqual(List(List(Primitive("f8")), nullable=True)({"object-B": [], "object-E": [], "object-L-B": [], "object-L-E": [], "object-L-L-Df8": [], "object-M": [-1]}), None)
        self.assertEqual(List(List(Primitive("f8")), nullable=True)({"object-B": [0], "object-E": [3], "object-L-B": [0, 2, 2], "object-L-E": [2, 2, 5], "object-L-L-Df8": [1.1, 2.2, 3.3, 4.4, 5.5], "object-M": [0]}), [[1.1, 2.2], [], [3.3, 4.4, 5.5]])
        self.assertEqual(List(List(Primitive("f8"), nullable=True))({"object-B": [0], "object-E": [3], "object-L-B": [0, 2], "object-L-E": [2, 2], "object-L-L-Df8": [1.1, 2.2], "object-L-M": [0, 1, -1]}), [[1.1, 2.2], [], None])
        self.assertEqual(List(List(Primitive("f8"), nullable=True), nullable=True)({"object-B": [0], "object-E": [3], "object-L-B": [0, 2], "object-L-E": [2, 2], "object-L-L-Df8": [1.1, 2.2], "object-M": [0], "object-L-M": [0, 1, -1]}), [[1.1, 2.2], [], None])

    def test_List_slices(self):
        x = List(List(Primitive("f8")))({"object-B": [0], "object-E": [3], "object-L-B": [0, 2, 2], "object-L-E": [2, 2, 5], "object-L-L-Df8": [1.1, 2.2, 3.3, 4.4, 5.5]})

        self.assertEqual(x[0], [1.1, 2.2])
        self.assertEqual(x[1], [])
        self.assertEqual(x[2], [3.3, 4.4, 5.5])
        self.assertEqual(x[-1], [3.3, 4.4, 5.5])
        self.assertEqual(x[-2], [])
        self.assertEqual(x[-3], [1.1, 2.2])
        self.assertRaises(IndexError, lambda: x[3])
        self.assertRaises(IndexError, lambda: x[-4])

        self.assertEqual(x[0:1], [[1.1, 2.2]])
        self.assertEqual(x[0:2], [[1.1, 2.2], []])
        self.assertEqual(x[0:3], [[1.1, 2.2], [], [3.3, 4.4, 5.5]])
        self.assertEqual(x[:], [[1.1, 2.2], [], [3.3, 4.4, 5.5]])
        self.assertEqual(x[:10], [[1.1, 2.2], [], [3.3, 4.4, 5.5]])
        self.assertEqual(x[1:3], [[], [3.3, 4.4, 5.5]])
        self.assertEqual(x[2:3], [[3.3, 4.4, 5.5]])
        self.assertEqual(x[3:3], [])
        self.assertEqual(x[-3:1], [[1.1, 2.2]])
        self.assertEqual(x[-3:2], [[1.1, 2.2], []])
        self.assertEqual(x[-3:3], [[1.1, 2.2], [], [3.3, 4.4, 5.5]])
        self.assertEqual(x[-2:3], [[], [3.3, 4.4, 5.5]])
        self.assertEqual(x[-1:3], [[3.3, 4.4, 5.5]])
        self.assertEqual(x[-1:-1], [])
        self.assertEqual(x[-10:3], [[1.1, 2.2], [], [3.3, 4.4, 5.5]])
        self.assertEqual(x[::2], [[1.1, 2.2], [3.3, 4.4, 5.5]])
        self.assertEqual(x[1::2], [[]])

        # same slices on a list whose middle element is masked out (None)
        x = List(List(Primitive("f8"), nullable=True))({"object-B": [0], "object-E": [3], "object-L-B": [0, 2], "object-L-E": [2, 5], "object-L-M": [0, -1, 1], "object-L-L-Df8": [1.1, 2.2, 3.3, 4.4, 5.5]})

        self.assertEqual(x[1], None)
        self.assertEqual(x[-2], None)
        self.assertEqual(x[0:2], [[1.1, 2.2], None])
        self.assertEqual(x[0:3], [[1.1, 2.2], None, [3.3, 4.4, 5.5]])
        self.assertEqual(x[:], [[1.1, 2.2], None, [3.3, 4.4, 5.5]])
        self.assertEqual(x[:10], [[1.1, 2.2], None, [3.3, 4.4, 5.5]])
        self.assertEqual(x[1:3], [None, [3.3, 4.4, 5.5]])
        self.assertEqual(x[3:3], [])
        self.assertEqual(x[-3:2], [[1.1, 2.2], None])
        self.assertEqual(x[-3:3], [[1.1, 2.2], None, [3.3, 4.4, 5.5]])
        self.assertEqual(x[-2:3], [None, [3.3, 4.4, 5.5]])
        self.assertEqual(x[-1:-1], [])
        self.assertEqual(x[-10:3], [[1.1, 2.2], None, [3.3, 4.4, 5.5]])
        self.assertEqual(x[1::2], [None])

    def test_Union(self):
        self.assertEqual(Union([Primitive("i8"), Primitive("f8")])({"object-T": [0], "object-O": [0], "object-U0-Di8": [1], "object-U1-Df8": []}), 1)
        self.assertEqual(List(Union([Primitive("i8"), Primitive("f8")]))({"object-B": [0], "object-E": [7], "object-L-T": [0, 0, 1, 1, 1, 0, 0], "object-L-O": [0, 1, 0, 1, 2, 2, 3], "object-L-U0-Di8": [1, 2, 3, 4], "object-L-U1-Df8": [1.1, 2.2, 3.3]}), [1, 2, 1.1, 2.2, 3.3, 3, 4])

        self.assertEqual(list(List(Union([Primitive("i8"), Primitive("f8")], nullable=True))({"object-L-U1-Df8": [1.1, 3.3], "object-L-T": [0, 1, 1, 0], "object-E": [7], "object-L-O": [0, 0, 1, 1], "object-L-M": [0, -1, 1, -1, 2, 3, -1], "object-L-U0-Di8": [1, 3], "object-B": [0]})), [1, None, 1.1, None, 3.3, 3, None])
        self.assertEqual(List(Union([Primitive("i8", nullable=True), Primitive("f8")]))({"object-L-U0-M": [0, -1, 1, -1], "object-L-T": [0, 0, 1, 1, 1, 0, 0], "object-E": [7], "object-L-O": [0, 1, 0, 1, 2, 2, 3], "object-L-U1-Df8": [1.1, 2.2, 3.3], "object-L-U0-Di8": [1, 3], "object-B": [0]}), [1, None, 1.1, 2.2, 3.3, 3, None])

        self.assertEqual(List(Union([Primitive("i8"), List(Primitive("f8"))]))({"object-B": [0], "object-E": [2], "object-L-T": [0, 1], "object-L-O": [0, 0], "object-L-U0-Di8": [3], "object-L-U1-B": [0], "object-L-U1-E": [3], "object-L-U1-L-Df8": [1.1, 2.2, 3.3]}), [3, [1.1, 2.2, 3.3]])

    def test_Record(self):
        x = Record({"x": Primitive("i8"), "y": Primitive("f8")})({"object-Fx-Di8": [3], "object-Fy-Df8": [3.14]})
        self.assertEqual(x.x, 3)
        self.assertEqual(x.y, 3.14)

        x = List(Record({"x": Primitive("i8"), "y": Primitive("f8")}))({"object-B": [0], "object-E": [3], "object-L-Fx-Di8": [1, 2, 3], "object-L-Fy-Df8": [1.1, 2.2, 3.3]})
        self.assertEqual(x[0].x, 1)
        self.assertEqual(x[1].x, 2)
        self.assertEqual(x[2].x, 3)
        self.assertEqual(x[0].y, 1.1)
        self.assertEqual(x[1].y, 2.2)
        self.assertEqual(x[2].y, 3.3)

        x = List(Record({"x": Primitive("i8"), "y": Primitive("f8", nullable=True)}))({"object-B": [0], "object-E": [3], "object-L-Fx-Di8": [1, 2, 3], "object-L-Fy-Df8": [2.2], "object-L-Fy-M": [-1, 0, -1]})
        self.assertEqual(x[0].x, 1)
        self.assertEqual(x[1].x, 2)
        self.assertEqual(x[2].x, 3)
        self.assertEqual(x[0].y, None)
        self.assertEqual(x[1].y, 2.2)
        self.assertEqual(x[2].y, None)

        x = List(Record({"x": Primitive("i8"), "y": Primitive("f8")}, nullable=True))({"object-B": [0], "object-E": [3], "object-L-M": [0, -1, 1], "object-L-Fx-Di8": [1, 3], "object-L-Fy-Df8": [1.1, 3.3]})
        self.assertEqual(x[0].x, 1)
        self.assertEqual(x[1], None)
        self.assertEqual(x[2].x, 3)
        self.assertEqual(x[0].y, 1.1)
        self.assertEqual(x[1], None)
        self.assertEqual(x[2].y, 3.3)

        x = Record({"x": Primitive("i8"), "y": List(Primitive("f8"))})({"object-Fx-Di8": [3], "object-Fy-B": [0], "object-Fy-E": [3], "object-Fy-L-Df8": [1.1, 2.2, 3.3]})
        self.assertEqual(x.x, 3)
        self.assertEqual(x.y, [1.1, 2.2, 3.3])

        x = Record({"x": Primitive("i8"), "y": Union([Primitive("i8"), Primitive("f8")])})({"object-Fx-Di8": [3], "object-Fy-T": [0], "object-Fy-O": [0], "object-Fy-U0-Di8": [1], "object-Fy-U1-Df8": [1.1]})
        self.assertEqual(x.x, 3)
        self.assertEqual(x.y, 1)

        x = Record({"x": Primitive("i8"), "y": List(Union([Primitive("i8"), Primitive("f8")]))})({"object-Fx-Di8": [3], "object-Fy-B": [0], "object-Fy-E": [3], "object-Fy-L-T": [0, 1, 1], "object-Fy-L-O": [0, 0, 1], "object-Fy-L-U0-Di8": [1], "object-Fy-L-U1-Df8": [1.1, 2.2]})
        self.assertEqual(x.x, 3)
        self.assertEqual(x.y, [1, 1.1, 2.2])

        x = List(Union([Primitive("i8"), Record({"x": Primitive("i8"), "y": Primitive("f8")})]))({"object-B": [0], "object-E": [4], "object-L-T": [0, 1, 1, 0], "object-L-O": [0, 0, 1, 1], "object-L-U0-Di8": [99, 98], "object-L-U1-Fx-Di8": [1, 2], "object-L-U1-Fy-Df8": [1.1, 2.2]})
        self.assertEqual(x[0], 99)
        self.assertEqual(x[1].x, 1)
        self.assertEqual(x[1].y, 1.1)
        self.assertEqual(x[2].x, 2)
        self.assertEqual(x[2].y, 2.2)
        self.assertEqual(x[3], 98)

    def test_Tuple(self):
        x = Tuple((Primitive("i8"), Primitive("f8")))({"object-F0-Di8": [3], "object-F1-Df8": [3.14]})
        self.assertEqual(x[0], 3)
        self.assertEqual(x[1], 3.14)

        x = List(Tuple((Primitive("i8"), Primitive("f8"))))({"object-B": [0], "object-E": [3], "object-L-F0-Di8": [1, 2, 3], "object-L-F1-Df8": [1.1, 2.2, 3.3]})
        self.assertEqual(x[0][0], 1)
        self.assertEqual(x[1][0], 2)
        self.assertEqual(x[2][0], 3)
        self.assertEqual(x[0][1], 1.1)
        self.assertEqual(x[1][1], 2.2)
        self.assertEqual(x[2][1], 3.3)

        x = List(Tuple((Primitive("i8"), Primitive("f8", nullable=True))))({"object-B": [0], "object-E": [3], "object-L-F0-Di8": [1, 2, 3], "object-L-F1-Df8": [2.2], "object-L-F1-M": [-1, 0, -1]})
        self.assertEqual(x[0][0], 1)
        self.assertEqual(x[1][0], 2)
        self.assertEqual(x[2][0], 3)
        self.assertEqual(x[0][1], None)
        self.assertEqual(x[1][1], 2.2)
        self.assertEqual(x[2][1], None)

        x = List(Tuple((Primitive("i8"), Primitive("f8")), nullable=True))({"object-B": [0], "object-E": [3], "object-L-M": [0, -1, 1], "object-L-F0-Di8": [1, 3], "object-L-F1-Df8": [1.1, 3.3]})
        self.assertEqual(x[0][0], 1)
        self.assertEqual(x[1], None)
        self.assertEqual(x[2][0], 3)
        self.assertEqual(x[0][1], 1.1)
        self.assertEqual(x[1], None)
        self.assertEqual(x[2][1], 3.3)

        x = Tuple((Primitive("i8"), List(Primitive("f8"))))({"object-F0-Di8": [3], "object-F1-B": [0], "object-F1-E": [3], "object-F1-L-Df8": [1.1, 2.2, 3.3]})
        self.assertEqual(x[0], 3)
        self.assertEqual(x[1], [1.1, 2.2, 3.3])

        x = Tuple((Primitive("i8"), Union([Primitive("i8"), Primitive("f8")])))({"object-F0-Di8": [3], "object-F1-T": [0], "object-F1-O": [0], "object-F1-U0-Di8": [1], "object-F1-U1-Df8": [1.1]})
        self.assertEqual(x[0], 3)
        self.assertEqual(x[1], 1)

        x = Tuple((Primitive("i8"), List(Union([Primitive("i8"), Primitive("f8")]))))({"object-F0-Di8": [3], "object-F1-B": [0], "object-F1-E": [3], "object-F1-L-T": [0, 1, 1], "object-F1-L-O": [0, 0, 1], "object-F1-L-U0-Di8": [1], "object-F1-L-U1-Df8": [1.1, 2.2]})
        self.assertEqual(x[0], 3)
        self.assertEqual(x[1], [1, 1.1, 2.2])

        x = List(Union([Primitive("i8"), Tuple((Primitive("i8"), Primitive("f8")))]))({"object-B": [0], "object-E": [4], "object-L-T": [0, 1, 1, 0], "object-L-O": [0, 0, 1, 1], "object-L-U0-Di8": [99, 98], "object-L-U1-F0-Di8": [1, 2], "object-L-U1-F1-Df8": [1.1, 2.2]})
        self.assertEqual(x[0], 99)
        self.assertEqual(x[1][0], 1)
        self.assertEqual(x[1][1], 1.1)
        self.assertEqual(x[2][0], 2)
        self.assertEqual(x[2][1], 2.2)
        self.assertEqual(x[3], 98)

    def test_Pointer(self):
        self.assertEqual(Pointer(Primitive("f8"))({"object-P": [3], "object-X-Df8": [0.0, 1.1, 2.2, 3.3, 4.4]}), 3.3)

        # self-referential schema: a tree of lists
        tree = Pointer(None)
        tree.target = List(tree)

        self.assertEqual(tree({"object-P": [0], "object-X-B": [0], "object-X-E": [0], "object-X-L-P-object-X-Df8": []}), [])

        self.assertEqual(repr(tree({"object-P": [0], "object-X-B": [0], "object-X-E": [1], "object-X-L-P-object-X": [0]})), "[[...]]")

        self.assertEqual(tree({"object-P": [0, 1], "object-X-B": [0, 1], "object-X-E": [1, 1], "object-X-L-P-object-X": [1]}), [[]])
        self.assertEqual(tree({"object-P": [0, 1], "object-X-B": [0, 2], "object-X-E": [2, 2], "object-X-L-P-object-X": [1, 1]}), [[], []])

        # circular linked list via a self-referential Record
        linkedlist = Record({"label": Primitive("i8")})
        linkedlist["next"] = Pointer(linkedlist)

        x = linkedlist({"object-Flabel-Di8": [0, 1, 2], "object-Fnext-P-object": [1, 2, 0]})
        self.assertEqual(x.label, 0)
        self.assertEqual(x.next.label, 1)
        self.assertEqual(x.next.next.label, 2)
        self.assertEqual(x.next.next.next.label, 0)

        # nullable pointer terminates the chain
        linkedlist = Record({"label": Primitive("i8")})
        linkedlist["next"] = Pointer(linkedlist, nullable=True)

        x = linkedlist({"object-Flabel-Di8": [0, 1, 2], "object-Fnext-P-object": [1, 2], "object-Fnext-M": [0, 1, -1]})
        self.assertEqual(x.label, 0)
        self.assertEqual(x.next.label, 1)
        self.assertEqual(x.next.next.label, 2)
        self.assertEqual(x.next.next.next, None)
#!/usr/bin/env python

# Copyright (c) 2017, DIANA-HEP
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED.
# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import bisect
import json
import numbers
import sys
import math

import numpy

import oamap.util

if sys.version_info[0] > 2:
    xrange = range

# base class of all runtime types that require proxies: List, Record, and Tuple
class Proxy(object): pass

def tojson(value):
    """Recursively convert a proxy (or plain value) into JSON-serializable
    Python objects: proxies become lists/dicts, numpy scalars become Python
    numbers, and non-finite floats become the strings "nan"/"inf"/"-inf"."""
    if isinstance(value, ListProxy):
        return [tojson(x) for x in value]
    elif isinstance(value, RecordProxy):
        return dict((n, tojson(getattr(value, n))) for n in value._fields)
    elif isinstance(value, TupleProxy):
        return [tojson(x) for x in value]
    elif isinstance(value, (numbers.Integral, numpy.integer)):
        return int(value)
    elif isinstance(value, (numbers.Real, numpy.floating)):
        # JSON has no representation for non-finite floats; use strings.
        if math.isnan(value):
            return "nan"
        elif value == float("-inf"):
            return "-inf"
        elif value == float("inf"):
            return "inf"
        else:
            return float(value)
    elif isinstance(value, (numbers.Complex, numpy.complexfloating)):
        # numpy.complex (an alias of builtin complex) was removed in NumPy
        # 1.24; numpy.complexfloating covers all numpy complex scalar types.
        return {"real": tojson(value.real), "imag": tojson(value.imag)}
    elif isinstance(value, numpy.ndarray):
        return value.tolist()
    else:
        return value

def tojsonstring(value, *args, **kwds):
    """Serialize `value` to a JSON string; extra arguments go to json.dumps."""
    return json.dumps(tojson(value), *args, **kwds)

def tojsonfile(file, value, *args, **kwds):
    """Serialize `value` as JSON into the open file-like object `file`.

    Note: json.dump takes (obj, fp) — the original call passed the file
    first, which raised at runtime for any real file object.
    """
    json.dump(tojson(value), file, *args, **kwds)
################################################################ Lists

class ListProxy(Proxy):
    """Immutable, lazily-evaluated list view over columnar arrays.

    A ListProxy is defined by a generator (schema-derived), the backing
    `arrays`, a shared element `cache`, and a (whence, stride, length)
    window; slicing produces a new window over the same data, copying
    nothing.
    """

    __slots__ = ["_generator", "_arrays", "_cache", "_whence", "_stride", "_length"]

    def __init__(self, generator, arrays, cache, whence, stride, length):
        assert stride != 0
        assert length >= 0
        self._generator = generator
        self._arrays = arrays
        self._cache = cache
        self._whence = whence
        self._stride = stride
        self._length = length

    def __repr__(self, memo=None):
        # `memo` guards against infinite recursion through Pointer cycles.
        if memo is None:
            memo = set()
        key = (id(self._generator), self._whence, self._stride, self._length)
        if key in memo:
            return "[...]"
        memo = memo.union(set([key]))
        if len(self) > 10:
            # long lists: show the first and last five elements
            before = self[:5]
            after = self[-5:]
            return "[{0}, ..., {1}]".format(", ".join(x.__repr__(memo) if isinstance(x, (ListProxy, TupleProxy)) else repr(x) for x in before),
                                            ", ".join(x.__repr__(memo) if isinstance(x, (ListProxy, TupleProxy)) else repr(x) for x in after))
        else:
            return "[{0}]".format(", ".join(x.__repr__(memo) if isinstance(x, (ListProxy, TupleProxy)) else repr(x) for x in self))

    def __str__(self):
        return repr(self)

    def __getattr__(self, field):
        # expose oamap.operations (actions/transformations/recastings) as methods
        if field in self.__dict__:
            return self.__dict__[field]
        else:
            import oamap.operations
            for n, x in reversed(list(oamap.operations.actions.items()) + list(oamap.operations.transformations.items()) + list(oamap.operations.recastings.items())):
                if field == n:
                    return lambda *args, **kwargs: x(self, *args, **kwargs)
            raise AttributeError("ListProxy has no attribute {0}".format(repr(field)))

    @property
    def schema(self):
        return self._generator.schema

    @property
    def fields(self):
        """Field names of the contained records; TypeError if contents are not records."""
        import oamap.generator   # local import: not in this module's header (matches __getattr__ style)
        generator = self._generator
        while isinstance(generator, oamap.generator.ListGenerator):
            generator = generator.content
        if isinstance(generator, oamap.generator.RecordGenerator):
            return list(generator.fields)
        else:
            raise TypeError("list does not contain records")

    def indexed(self):
        return self

    def __len__(self):
        return self._length

    def __getslice__(self, start, stop):
        # for old-Python compatibility
        return self.__getitem__(slice(start, stop))

    def __getitem__(self, index):
        if isinstance(index, slice):
            start, stop, step = oamap.util.slice2sss(index, self._length)

            # a slice is just a new window over the same arrays: no copying
            whence = self._whence + self._stride*start
            stride = self._stride*step

            # number of elements: ceil(|stop - start| / |step|)
            d, m = divmod(abs(start - stop), abs(step))
            length = d + (1 if m != 0 else 0)

            return ListProxy(self._generator, self._arrays, self._cache, whence, stride, length)

        else:
            normalindex = index if index >= 0 else index + self._length
            if not 0 <= normalindex < self._length:
                raise IndexError("index {0} is out of bounds for size {1}".format(index, self._length))
            return self._generator.content._generate(self._arrays, self._whence + self._stride*normalindex, self._cache)

    def __iter__(self):
        return (self._generator.content._generate(self._arrays, i, self._cache) for i in xrange(self._whence, self._whence + self._stride*self._length, self._stride))

    def __hash__(self):
        # lists aren't usually hashable, but since ListProxy is immutable, we can add this feature
        return hash((ListProxy,) + tuple(self))

    def __eq__(self, other):
        if isinstance(other, ListProxy):
            return list(self) == list(other)
        elif isinstance(other, list):
            return list(self) == other
        else:
            return False

    def __lt__(self, other):
        if isinstance(other, ListProxy):
            return list(self) < list(other)
        elif isinstance(other, list):
            return list(self) < other
        else:
            # original used field "{1}" with a single format argument, which
            # raised IndexError instead of the intended TypeError message
            raise TypeError("unorderable types: list() < {0}()".format(other.__class__.__name__))

    # all of the following emulate normal list functionality using the overloaded methods above

    def __ne__(self, other): return not self.__eq__(other)
    def __le__(self, other): return self.__lt__(other) or self.__eq__(other)
    def __gt__(self, other): return not self.__lt__(other) and not self.__eq__(other)
    def __ge__(self, other): return not self.__lt__(other)

    def __add__(self, other): return list(self) + list(other)
    def __mul__(self, reps): return list(self) * reps
    def __rmul__(self, reps): return reps * list(self)
    def __reversed__(self):
        if sys.version_info[0] <= 2:
            return (self[i - 1] for i in xrange(len(self), 0, -1))
        else:
            return (self[i - 1] for i in range(len(self), 0, -1))
    def count(self, value): return sum(1 for x in self if x == value)
    def index(self, value, *args):
        if len(args) == 0:
            start = 0
            stop = len(self)
        elif len(args) == 1:
            start = args[0]
            stop = len(self)
        elif len(args) == 2:
            start, stop = args
        else:
            raise TypeError("index() takes at most 3 arguments ({0} given)".format(1 + len(args)))
        for i, x in enumerate(self):
            if x == value:
                return i
        raise ValueError("{0} is not in list".format(value))

    def __contains__(self, value):
        for x in self:
            if x == value:
                return True
        return False

################################################################ Records

class RecordProxy(Proxy):
    __slots__ = ["_generator", "_arrays", "_cache", "_index"]

    def __init__(self, generator, arrays, cache, index):
        self._generator = generator
        self._arrays = arrays
        self._cache = cache
        self._index = index

    def __repr__(self):
        return "<{0} at index {1}>".format("Record" if self._generator.name is None else self._generator.name, self._index)
| 238 | def __str__(self): 239 | return repr(self) 240 | 241 | @property 242 | def _fields(self): 243 | return list(self._generator.fields) 244 | 245 | def __dir__(self): 246 | return dir(super(RecordProxy, self)) + list(str(x) for x in self._fields) 247 | 248 | def __getattr__(self, field): 249 | try: 250 | # actual field names get priority (they're not allowed to start with underscore) 251 | generator = self._generator.fields[field] 252 | except KeyError: 253 | # barring any conflicts with actual field names, "schema" and "fields" are convenient 254 | if field == "schema": 255 | return self._generator.schema 256 | elif field == "fields": 257 | return self._fields 258 | elif field == "name": 259 | return self._generator.name 260 | else: 261 | import oamap.operations 262 | for n, x in reversed(list(oamap.operations.actions.items()) + list(oamap.operations.transformations.items()) + list(oamap.operations.recastings.items())): 263 | if field == n: 264 | return lambda *args, **kwargs: x(self, *args, **kwargs) 265 | raise AttributeError("{0} object has no attribute {1}".format(repr("Record" if self._generator.name is None else self._generator.name), repr(field))) 266 | else: 267 | return generator._generate(self._arrays, self._index, self._cache) 268 | 269 | def __hash__(self): 270 | return hash((RecordProxy, self._generator.name) + tuple(self._generator.fields.items())) 271 | 272 | def __eq__(self, other): 273 | return isinstance(other, RecordProxy) and self._generator.name == other._generator.name and set(self._generator.fields) == set(other._generator.fields) and all(self.__getattr__(n) == other.__getattr__(n) for n in self._generator.fields) 274 | 275 | def __lt__(self, other): 276 | if isinstance(other, RecordProxy) and self._generator.name == other._generator.name and set(self._generator.fields) == set(other._generator.fields): 277 | return [self.__getattr__(n) for n in self._generator.fields] < [other.__getattr__(n) for n in self._generator.fields] 278 | else: 
279 | raise TypeError("unorderable types: {0}() < {1}()".format("" if self._generator.name is None else "".format(repr(self._generator.name)), other.__class__)) 280 | 281 | def __ne__(self, other): return not self.__eq__(other) 282 | def __le__(self, other): return self.__lt__(other) or self.__eq__(other) 283 | def __gt__(self, other): return not self.__lt__(other) and not self.__eq__(other) 284 | def __ge__(self, other): return not self.__lt__(other) 285 | 286 | ################################################################ Tuples 287 | 288 | class TupleProxy(Proxy): 289 | __slots__ = ["_generator", "_arrays", "_cache", "_index"] 290 | 291 | def __init__(self, generator, arrays, cache, index): 292 | self._generator = generator 293 | self._arrays = arrays 294 | self._cache = cache 295 | self._index = index 296 | 297 | def __repr__(self, memo=None): 298 | if memo is None: 299 | memo = set() 300 | key = (self._index,) + tuple(id(x) for x in self._generator.types) 301 | if key in memo: 302 | return "(...)" 303 | memo = memo.union(set([key])) 304 | return "({0}{1})".format(", ".join(x.__repr__(memo) if isinstance(x, (ListProxy, TupleProxy)) else repr(x) for x in self), "," if len(self) == 1 else "") 305 | 306 | def __str__(self): 307 | return repr(self) 308 | 309 | def __getattr__(self, field): 310 | if field in self.__dict__: 311 | return self.__dict__[field] 312 | else: 313 | import oamap.operations 314 | for n, x in reversed(list(oamap.operations.actions.items()) + list(oamap.operations.transformations.items()) + list(oamap.operations.recastings.items())): 315 | if field == n: 316 | return lambda *args, **kwargs: x(self, *args, **kwargs) 317 | raise AttributeError("TupleProxy has no attribute {0}".format(repr(field))) 318 | 319 | def __len__(self): 320 | return len(self._generator.types) 321 | 322 | def __getslice__(self, start, stop): 323 | # for old-Python compatibility 324 | return self.__getitem__(slice(start, stop)) 325 | 326 | def __getitem__(self, index): 
327 | if isinstance(index, slice): 328 | lenself = len(self) 329 | start = 0 if index.start is None else index.start 330 | stop = lenself if index.stop is None else index.stop 331 | step = 1 if index.step is None else index.step 332 | return tuple(self[i] for i in range(start, stop, step)) 333 | 334 | else: 335 | return self._generator.types[index]._generate(self._arrays, self._index, self._cache) 336 | 337 | def __iter__(self): 338 | return (t._generate(self._arrays, self._index, self._cache) for t in self._generator.types) 339 | 340 | def __hash__(self): 341 | return hash(tuple(self)) 342 | 343 | def __eq__(self, other): 344 | if isinstance(other, TupleProxy): 345 | return tuple(self) == tuple(other) 346 | elif isinstance(other, tuple): 347 | return tuple(self) == other 348 | else: 349 | return False 350 | 351 | def __lt__(self, other): 352 | if isinstance(other, TupleProxy): 353 | return tuple(self) < tuple(other) 354 | elif isinstance(other, tuple): 355 | return tuple(self) < other 356 | else: 357 | raise TypeError("unorderable types: tuple() < {1}()".format(other.__class__)) 358 | 359 | # all of the following emulate normal tuple functionality using the overloaded methods above 360 | 361 | def __ne__(self, other): return not self.__eq__(other) 362 | def __le__(self, other): return self.__lt__(other) or self.__eq__(other) 363 | def __gt__(self, other): return not self.__lt__(other) and not self.__eq__(other) 364 | def __ge__(self, other): return not self.__lt__(other) 365 | 366 | def __add__(self, other): return tuple(self) + tuple(other) 367 | def __mul__(self, reps): return tuple(self) * reps 368 | def __rmul__(self, reps): return reps * tuple(self) 369 | def __reversed__(self): 370 | return (self[i - 1] for i in range(len(self), 0, -1)) 371 | def count(self, value): return sum(1 for x in self if x == value) 372 | def index(self, value, *args): 373 | if len(args) == 0: 374 | start = 0 375 | stop = len(self) 376 | elif len(args) == 1: 377 | start = args[0] 378 
| stop = len(self) 379 | elif len(args) == 2: 380 | start, stop = args 381 | else: 382 | raise TypeError("index() takes at most 3 arguments ({0} given)".format(1 + len(args))) 383 | for i, x in enumerate(self): 384 | if x == value: 385 | return i 386 | raise ValueError("{0} is not in list".format(value)) 387 | 388 | def __contains__(self, value): 389 | for x in self: 390 | if x == value: 391 | return True 392 | return False 393 | -------------------------------------------------------------------------------- /oamap/inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | import re 32 | import numbers 33 | import sys 34 | import math 35 | 36 | import numpy 37 | 38 | import oamap.schema 39 | from oamap.util import OrderedDict 40 | 41 | if sys.version_info[0] > 2: 42 | basestring = str 43 | 44 | ################################################################ inferring schemas from data 45 | 46 | def fromdata(obj, limit=None): 47 | if limit is None or (isinstance(limit, (numbers.Integral, numpy.integer)) and limit >= 0): 48 | pass 49 | else: 50 | raise TypeError("limit must be None or a non-negative integer, not {0}".format(limit)) 51 | 52 | class Intermediate(object): 53 | def __init__(self, nullable): 54 | self.nullable = nullable 55 | 56 | class Unknown(Intermediate): 57 | def resolve(self): 58 | raise TypeError("could not resolve a type (e.g. 
all examples of a List-typed attribute are empty, can't determine its content type)") 59 | 60 | class Boolean(Intermediate): 61 | def resolve(self): 62 | return oamap.schema.Primitive(numpy.dtype(numpy.bool_), nullable=self.nullable) 63 | 64 | class Number(Intermediate): 65 | max_uint8 = numpy.iinfo(numpy.uint8).max 66 | max_uint16 = numpy.iinfo(numpy.uint16).max 67 | max_uint32 = numpy.iinfo(numpy.uint32).max 68 | max_uint64 = numpy.iinfo(numpy.uint64).max 69 | min_int8 = numpy.iinfo(numpy.int8).min 70 | max_int8 = numpy.iinfo(numpy.int8).max 71 | min_int16 = numpy.iinfo(numpy.int16).min 72 | max_int16 = numpy.iinfo(numpy.int16).max 73 | min_int32 = numpy.iinfo(numpy.int32).min 74 | max_int32 = numpy.iinfo(numpy.int32).max 75 | min_int64 = numpy.iinfo(numpy.int64).min 76 | max_int64 = numpy.iinfo(numpy.int64).max 77 | def __init__(self, nullable, min, max, whole, real): 78 | Intermediate.__init__(self, nullable) 79 | self.min = min 80 | self.max = max 81 | self.whole = whole 82 | self.real = real 83 | def resolve(self): 84 | if self.whole: 85 | if self.min >= 0: 86 | if self.max <= self.max_uint8: 87 | t = numpy.uint8 88 | elif self.max <= self.max_uint16: 89 | t = numpy.uint16 90 | elif self.max <= self.max_uint32: 91 | t = numpy.uint32 92 | elif self.max <= self.max_uint64: 93 | t = numpy.uint64 94 | else: 95 | t = numpy.float64 96 | else: 97 | if self.min_int8 <= self.min and self.max <= self.max_int8: 98 | t = numpy.int8 99 | elif self.min_int16 <= self.min and self.max <= self.max_int16: 100 | t = numpy.int16 101 | elif self.min_int32 <= self.min and self.max <= self.max_int32: 102 | t = numpy.int32 103 | elif self.min_int64 <= self.min and self.max <= self.max_int64: 104 | t = numpy.int64 105 | else: 106 | t = numpy.float64 107 | elif self.real: 108 | t = numpy.float64 109 | else: 110 | t = numpy.complex128 111 | return oamap.schema.Primitive(numpy.dtype(t), nullable=self.nullable) 112 | 113 | class String(Intermediate): 114 | def __init__(self, nullable, 
utf8): 115 | Intermediate.__init__(self, nullable) 116 | self.utf8 = utf8 117 | def resolve(self): 118 | return oamap.schema.List(oamap.schema.Primitive(numpy.uint8), nullable=self.nullable, name=("UTF8String" if self.utf8 else "ByteString")) 119 | 120 | class IntermediateList(Intermediate): 121 | def __init__(self, nullable, content): 122 | Intermediate.__init__(self, nullable) 123 | self.content = content 124 | def resolve(self): 125 | return oamap.schema.List(self.content.resolve(), nullable=self.nullable) 126 | 127 | class IntermediateRecord(Intermediate): 128 | def __init__(self, nullable, fields, name): 129 | Intermediate.__init__(self, nullable) 130 | self.fields = fields 131 | self.name = name 132 | def resolve(self): 133 | return oamap.schema.Record(dict((n, x.resolve()) for n, x in self.fields.items()), nullable=self.nullable, name=self.name) 134 | 135 | class IntermediateTuple(Intermediate): 136 | def __init__(self, nullable, types): 137 | Intermediate.__init__(self, nullable) 138 | self.types = types 139 | def resolve(self): 140 | return oamap.schema.Tuple([x.resolve() for x in self.types], nullable=self.nullable) 141 | 142 | # Unions are special for type-inference 143 | class IntermediateUnion(Intermediate): 144 | def __init__(self, nullable, possibilities): 145 | Intermediate.__init__(self, nullable) 146 | self.possibilities = possibilities 147 | def resolve(self): 148 | return oamap.schema.Union([x.resolve() for x in self.possibilities], nullable=self.nullable) 149 | 150 | # no Pointers in type-inference (we'd have to keep a big map of *everything*!) 
151 | 152 | def flatten(possibilities): 153 | return [y for x in possibilities if isinstance(x, IntermediateUnion) for y in x.possibilities] + [x for x in possibilities if not isinstance(x, IntermediateUnion)] 154 | 155 | def unify2(x, y): 156 | nullable = x.nullable or y.nullable 157 | 158 | if isinstance(x, Unknown) and isinstance(y, Unknown): 159 | return Unknown(nullable) 160 | 161 | elif isinstance(x, Unknown): 162 | y.nullable = nullable 163 | return y 164 | 165 | elif isinstance(y, Unknown): 166 | x.nullable = nullable 167 | return x 168 | 169 | elif isinstance(x, Boolean) and isinstance(y, Boolean): 170 | return Boolean(nullable) 171 | 172 | elif isinstance(x, Number) and isinstance(y, Number): 173 | return Number(nullable, min(x.min, y.min), max(x.max, y.max), x.whole and y.whole, x.real and y.real) 174 | 175 | elif isinstance(x, String) and isinstance(y, String): 176 | return String(nullable, x.utf8 or y.utf8) 177 | 178 | elif isinstance(x, IntermediateList) and isinstance(y, IntermediateList): 179 | return IntermediateList(nullable, unify2(x.content, y.content)) 180 | 181 | elif isinstance(x, IntermediateRecord) and isinstance(y, IntermediateRecord) and set(x.fields) == set(y.fields) and (x.name is None or y.name is None or x.name == y.name): 182 | return IntermediateRecord(nullable, dict((n, unify2(x.fields[n], y.fields[n])) for n in x.fields), name=(y.name if x.name is None else x.name)) 183 | 184 | elif isinstance(x, IntermediateTuple) and isinstance(y, IntermediateTuple) and len(x.types) == len(y.types): 185 | return IntermediateTuple(nullable, [unify2(xi, yi) for xi, yi in zip(x.types, y.types)]) 186 | 187 | elif isinstance(x, IntermediateUnion) and isinstance(y, IntermediateUnion): 188 | return unify(x.possibilities + y.possibilities) 189 | 190 | elif isinstance(x, IntermediateUnion): 191 | return unify(x.possibilities + [y]) 192 | 193 | elif isinstance(y, IntermediateUnion): 194 | return unify([x] + y.possibilities) 195 | 196 | else: 197 | # can't 
be unified 198 | return IntermediateUnion(nullable, flatten([x, y])) 199 | 200 | def unify(possibilities): 201 | if len(possibilities) == 0: 202 | return Unknown(False) 203 | 204 | elif len(possibilities) == 1: 205 | return possibilities[0] 206 | 207 | elif len(possibilities) == 2: 208 | return unify2(possibilities[0], possibilities[1]) 209 | 210 | else: 211 | distinct = [] 212 | for x in flatten(possibilities): 213 | found = False 214 | 215 | for i, y in enumerate(distinct): 216 | merged = unify2(x, y) 217 | if not isinstance(merged, IntermediateUnion): 218 | distinct[i] = merged 219 | found = True 220 | break 221 | 222 | if not found: 223 | distinct.append(x) 224 | 225 | if len(distinct) == 1: 226 | return distinct[0] 227 | else: 228 | return IntermediateUnion(False, flatten(distinct)) 229 | 230 | def buildintermediate(obj, limit, memo): 231 | if id(obj) in memo: 232 | raise ValueError("cyclic reference in Python object at {0} (Pointer types cannot be inferred)".format(obj)) 233 | 234 | # by copying, rather than modifying in-place (memo.add), we find cyclic references, rather than DAGs 235 | memo = memo.union(set([id(obj)])) 236 | 237 | if obj is None: 238 | return Unknown(True) 239 | 240 | elif obj is False or obj is True: 241 | return Boolean(False) 242 | 243 | elif isinstance(obj, (numbers.Integral, numpy.integer)): 244 | return Number(False, int(obj), int(obj), True, True) 245 | 246 | elif isinstance(obj, (numbers.Real, numpy.floating)): 247 | return Number(False, float(obj), float(obj), False, True) 248 | 249 | elif isinstance(obj, (numbers.Complex, numpy.complex)): 250 | return Number(False, float("-inf"), float("inf"), False, False) 251 | 252 | elif isinstance(obj, bytes): 253 | return String(False, False) 254 | 255 | elif isinstance(obj, basestring): 256 | return String(False, True) 257 | 258 | elif isinstance(obj, dict): 259 | return IntermediateRecord(False, dict((n, buildintermediate(x, limit, memo)) for n, x in obj.items()), None) 260 | 261 | elif 
isinstance(obj, tuple) and hasattr(obj, "_fields"): 262 | # this is a namedtuple; interpret it as a Record, rather than a Tuple 263 | return IntermediateRecord(False, dict((n, buildintermediate(getattr(obj, n), limit, memo)) for n in obj._fields), obj.__class__.__name__) 264 | 265 | elif isinstance(obj, tuple): 266 | return IntermediateTuple(False, [buildintermediate(x, limit, memo) for x in obj]) 267 | 268 | else: 269 | try: 270 | limited = [] 271 | for x in obj: 272 | if limit is None or len(limited) < limit: 273 | limited.append(x) 274 | else: 275 | break 276 | except TypeError: 277 | # not iterable, so interpret it as a Record 278 | return IntermediateRecord(False, dict((n, buildintermediate(getattr(obj, n), limit, memo)) for n in dir(obj) if not n.startswith("_") and not callable(getattr(obj, n))), obj.__class__.__name__) 279 | else: 280 | # iterable, so interpret it as a List 281 | return IntermediateList(False, unify([buildintermediate(x, None, memo) for x in obj])) 282 | 283 | return buildintermediate(obj, limit, set()).resolve() 284 | 285 | ################################################################ inferring schemas from a namespace 286 | 287 | def fromnames(arraynames, prefix="object", delimiter="-"): 288 | def filter(arraynames, prefix): 289 | return [x for x in arraynames if x.startswith(prefix)] 290 | 291 | def recurse(arraynames, prefix, byname, internalpointers): 292 | prefixdelimiter = prefix + delimiter 293 | name = None 294 | for n in arraynames: 295 | if n.startswith(prefixdelimiter): 296 | if n[len(prefixdelimiter)] == "N": 297 | match = oamap.schema.Schema._identifier.match(n[len(prefixdelimiter) + 1:]) 298 | if match is not None: 299 | name = match.group(0) 300 | break 301 | 302 | if name is not None: 303 | prefix = prefixdelimiter + "N" + name 304 | prefixdelimiter = prefix + delimiter 305 | 306 | mask = prefixdelimiter + "M" 307 | starts = prefixdelimiter + "B" 308 | stops = prefixdelimiter + "E" 309 | content = prefixdelimiter + "L" 
310 | tags = prefixdelimiter + "T" 311 | offsets = prefixdelimiter + "O" 312 | uniondata = prefixdelimiter + "U" 313 | field = prefixdelimiter + "F" 314 | positions = prefixdelimiter + "P" 315 | external = prefixdelimiter + "X" 316 | primitive = prefixdelimiter + "D" 317 | 318 | nullable = mask in arraynames 319 | if not nullable: 320 | mask = None 321 | 322 | if starts in arraynames and stops in arraynames: 323 | byname[prefix] = None 324 | byname[prefix] = oamap.schema.List(recurse(filter(arraynames, content), content, byname, internalpointers), nullable=nullable, starts=None, stops=None, mask=None, name=name, doc=None) 325 | 326 | elif tags in arraynames: 327 | possibilities = [] 328 | while True: 329 | possibility = uniondata + repr(len(possibilities)) 330 | if any(x.startswith(possibility) for x in arraynames): 331 | possibilities.append(possibility) 332 | else: 333 | break 334 | byname[prefix] = None 335 | byname[prefix] = oamap.schema.Union([recurse(filter(arraynames, x), x, byname, internalpointers) for x in possibilities], nullable=nullable, tags=None, offsets=None, mask=None, name=name, doc=None) 336 | 337 | elif any(x.startswith(field) for x in arraynames): 338 | pattern = re.compile("^" + field + "(" + oamap.schema.Schema._identifier.pattern + ")") 339 | fields = {} 340 | for x in arraynames: 341 | matches = pattern.match(x) 342 | if matches is not None: 343 | if matches.group(1) not in fields: 344 | fields[matches.group(1)] = [] 345 | fields[matches.group(1)].append(x) 346 | 347 | types = [] 348 | while True: 349 | tpe = field + repr(len(types)) 350 | if any(x.startswith(tpe) for x in arraynames): 351 | types.append(tpe) 352 | else: 353 | break 354 | 355 | if len(fields) >= 0 and len(types) == 0: 356 | byname[prefix] = oamap.schema.Record(oamap.schema.OrderedDict([(n, recurse(fields[n], field + n, byname, internalpointers)) for n in sorted(fields)]), nullable=nullable, mask=None, name=name, doc=None) 357 | elif len(fields) == 0 and len(types) > 0: 358 
| byname[prefix] = oamap.schema.Tuple([recurse(filter(arraynames, n), n, byname, internalpointers) for n in types], nullable=nullable, mask=None, name=name, doc=None) 359 | else: 360 | raise KeyError("ambiguous set of array names: may be Record or Tuple at {0}".format(repr(prefix))) 361 | 362 | elif any(x.startswith(positions) for x in arraynames): 363 | if positions in arraynames: 364 | # external 365 | byname2 = {} 366 | internalpointers2 = [] 367 | target = finalize(recurse(filter(arraynames, external), external, byname2, internalpointers2), byname2, internalpointers2) 368 | byname[prefix] = oamap.schema.Pointer(target, nullable=nullable, positions=None, mask=None, name=name, doc=None) 369 | 370 | else: 371 | # internal 372 | matches = [x[len(positions) + 1:] for x in arraynames if x.startswith(positions)] 373 | if len(matches) != 1: 374 | raise KeyError("ambiguous set of array names: more than one internal Pointer at {0}".format(repr(prefix))) 375 | target = None # placeholder! see finalize 376 | byname[prefix] = oamap.schema.Pointer(target, nullable=nullable, positions=None, mask=None, name=name, doc=None) 377 | internalpointers.append((byname[prefix], matches[0])) 378 | 379 | elif any(x.startswith(primitive) for x in arraynames): 380 | matches = [x[len(primitive) - 1:] for x in arraynames if x.startswith(primitive)] 381 | if len(matches) != 1: 382 | raise KeyError("ambiguous set of array names: more than one Primitive at {0}".format(repr(prefix))) 383 | dtype = oamap.schema.Primitive._str2dtype(matches[0], delimiter) 384 | byname[prefix] = oamap.schema.Primitive(dtype, nullable=nullable, data=None, mask=None, name=name, doc=None) 385 | 386 | else: 387 | raise KeyError("missing array names: nothing found as {0} contents".format(repr(prefix))) 388 | 389 | return byname[prefix] 390 | 391 | def finalize(out, byname, internalpointers): 392 | for pointer, targetname in internalpointers: 393 | if targetname in byname: 394 | pointer.target = byname[targetname] 395 | 
else: 396 | raise KeyError("Pointer's internal target is {0}, but there is no object with that prefix".format(repr(targetname))) 397 | return out 398 | 399 | byname = {} 400 | internalpointers = [] 401 | return finalize(recurse(filter(arraynames, prefix), prefix, byname, internalpointers), byname, internalpointers) 402 | -------------------------------------------------------------------------------- /oamap/fill.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2017, DIANA-HEP 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | import re 32 | from functools import reduce 33 | 34 | import oamap.generator 35 | import oamap.inference 36 | import oamap.fillable 37 | 38 | def toarrays(fillables): 39 | return dict((n, x[:]) for n, x in fillables.items()) 40 | 41 | ################################################################ Python data, possibly made by json.load 42 | 43 | def _fromdata_initialize(gen, generator, fillables, pointers, pointerobjs_keys, targetids_keys, fillables_leaf_to_root, positions_to_pointerobjs): 44 | if isinstance(gen, oamap.generator.PrimitiveGenerator): 45 | fillables[gen.data].revert() 46 | forefront = len(fillables[gen.data]) 47 | fillables_leaf_to_root.append(fillables[gen.data]) 48 | 49 | elif isinstance(gen, oamap.generator.ListGenerator): 50 | _fromdata_initialize(gen.content, generator, fillables, pointers, pointerobjs_keys, targetids_keys, fillables_leaf_to_root, positions_to_pointerobjs) 51 | fillables[gen.starts].revert() 52 | fillables[gen.stops].revert() 53 | assert len(fillables[gen.starts]) == len(fillables[gen.stops]) 54 | forefront = len(fillables[gen.stops]) 55 | fillables_leaf_to_root.append(fillables[gen.starts]) 56 | fillables_leaf_to_root.append(fillables[gen.stops]) 57 | 58 | elif isinstance(gen, oamap.generator.UnionGenerator): 59 | for x in gen.possibilities: 60 | _fromdata_initialize(x, generator, fillables, pointers, pointerobjs_keys, targetids_keys, fillables_leaf_to_root, 
positions_to_pointerobjs) 61 | fillables[gen.tags].revert() 62 | fillables[gen.offsets].revert() 63 | assert len(fillables[gen.tags]) == len(fillables[gen.offsets]) 64 | forefront = len(fillables[gen.tags]) 65 | fillables_leaf_to_root.append(fillables[gen.tags]) 66 | fillables_leaf_to_root.append(fillables[gen.offsets]) 67 | 68 | elif isinstance(gen, oamap.generator.RecordGenerator): 69 | uniques = set(_fromdata_initialize(x, generator, fillables, pointers, pointerobjs_keys, targetids_keys, fillables_leaf_to_root, positions_to_pointerobjs) for x in gen.fields.values()) 70 | assert len(uniques) == 1 71 | forefront = list(uniques)[0] 72 | 73 | elif isinstance(gen, oamap.generator.TupleGenerator): 74 | uniques = set(_fromdata_initialize(x, generator, fillables, pointers, pointerobjs_keys, targetids_keys, fillables_leaf_to_root, positions_to_pointerobjs) for x in gen.types) 75 | assert len(uniques) == 1 76 | forefront = list(uniques)[0] 77 | 78 | elif isinstance(gen, oamap.generator.PointerGenerator): 79 | if gen._internal and gen.target is generator and len(fillables[gen.positions]) != 0: 80 | raise TypeError("the root of a Schema may be the target of a Pointer, but if so, it can only be filled from data once") 81 | 82 | if gen not in pointers: 83 | pointers.append(gen) 84 | pointerobjs_keys.append(id(gen)) 85 | targetids_keys.append(id(gen.target)) 86 | 87 | if not gen._internal: 88 | _fromdata_initialize(gen.target, generator, fillables, pointers, pointerobjs_keys, targetids_keys, fillables_leaf_to_root, positions_to_pointerobjs) 89 | fillables[gen.positions].revert() 90 | forefront = len(fillables[gen.positions]) 91 | fillables_leaf_to_root.append(fillables[gen.positions]) 92 | positions_to_pointerobjs[gen.positions] = id(gen) 93 | 94 | elif isinstance(gen, oamap.generator.ExtendedGenerator): 95 | forefront = _fromdata_initialize(gen.generic, generator, fillables, pointers, pointerobjs_keys, targetids_keys, fillables_leaf_to_root, positions_to_pointerobjs) 96 | 97 
| else: 98 | raise TypeError("unrecognized generator: {0}".format(repr(gen))) 99 | 100 | if isinstance(gen, oamap.generator.Masked): 101 | fillables[gen.mask].revert() 102 | # mask forefront overrides any other arrays 103 | forefront = len(fillables[gen.mask]) 104 | fillables_leaf_to_root.append(fillables[gen.mask]) 105 | 106 | return forefront 107 | 108 | def _fromdata_forefront(gen, fillables, pointerobjs, secondary=False): 109 | if not secondary and isinstance(gen, oamap.generator.Masked): 110 | # mask forefront overrides any other arrays 111 | return fillables[gen.mask].forefront() 112 | 113 | elif isinstance(gen, oamap.generator.PrimitiveGenerator): 114 | return fillables[gen.data].forefront() 115 | 116 | elif isinstance(gen, oamap.generator.ListGenerator): 117 | return fillables[gen.stops].forefront() 118 | 119 | elif isinstance(gen, oamap.generator.UnionGenerator): 120 | return fillables[gen.tags].forefront() 121 | 122 | elif isinstance(gen, oamap.generator.RecordGenerator): 123 | for x in gen.fields.values(): 124 | return _fromdata_forefront(x, fillables, pointerobjs) 125 | 126 | elif isinstance(gen, oamap.generator.TupleGenerator): 127 | for x in gen.types: 128 | return _fromdata_forefront(x, fillables, pointerobjs) 129 | 130 | elif isinstance(gen, oamap.generator.PointerGenerator): 131 | return len(pointerobjs[id(gen)]) 132 | 133 | elif isinstance(gen, oamap.generator.ExtendedGenerator): 134 | return _fromdata_forefront(gen.generic, fillables, pointerobjs) 135 | 136 | def _fromdata_unionnullable(union): 137 | for possibility in union.possibilities: 138 | if isinstance(possibility, oamap.generator.Masked): 139 | return True 140 | elif isinstance(possibility, oamap.generator.UnionGenerator): 141 | return _fromdata_unionnullable(possibility) 142 | return False 143 | 144 | def _fromdata_fill(obj, gen, fillables, targetids, pointerobjs, at, pointerat): 145 | if id(gen) in targetids: 146 | targetids[id(gen)][id(obj)] = (_fromdata_forefront(gen, fillables, 
pointerobjs), obj) 147 | 148 | if obj is None: 149 | if isinstance(gen, oamap.generator.Masked): 150 | fillables[gen.mask].append(gen.maskedvalue) 151 | return # only mask is filled 152 | elif isinstance(gen, oamap.generator.UnionGenerator) and _fromdata_unionnullable(gen): 153 | pass # mask to fill is in a Union possibility 154 | elif isinstance(gen, oamap.generator.ExtendedGenerator) and isinstance(gen.generic, oamap.generator.Masked): 155 | _fromdata_fill(obj, gen.generic, fillables, targetids, pointerobjs, at, pointerat) 156 | return # filled the generic generator's mask 157 | else: 158 | raise TypeError("cannot fill None where expecting type {0} at {1}".format(gen.schema, at)) 159 | 160 | # obj is not None (except for the Union case) 161 | if isinstance(gen, oamap.generator.Masked): 162 | fillables[gen.mask].append(_fromdata_forefront(gen, fillables, pointerobjs, secondary=True)) 163 | 164 | if isinstance(gen, oamap.generator.PrimitiveGenerator): 165 | fillables[gen.data].append(obj) 166 | 167 | elif isinstance(gen, oamap.generator.ListGenerator): 168 | start = stop = _fromdata_forefront(gen.content, fillables, pointerobjs) 169 | try: 170 | if isinstance(obj, dict) or (isinstance(obj, tuple) and hasattr(obj, "_fields")): 171 | raise TypeError 172 | it = iter(obj) 173 | except TypeError: 174 | raise TypeError("cannot fill {0} where expecting type {1} at {2}".format(repr(obj), gen.schema, at)) 175 | else: 176 | for x in it: 177 | _fromdata_fill(x, gen.content, fillables, targetids, pointerobjs, at + (stop - start,), pointerat) 178 | stop += 1 179 | 180 | fillables[gen.starts].append(start) 181 | fillables[gen.stops].append(stop) 182 | 183 | elif isinstance(gen, oamap.generator.UnionGenerator): 184 | tag = None 185 | for i, possibility in enumerate(gen.possibilities): 186 | if obj in possibility.schema: 187 | tag = i 188 | break 189 | if tag is None: 190 | raise TypeError("cannot fill {0} where expecting type {1} at {2}".format(repr(obj), gen.schema, at)) 191 | 
192 | offset = _fromdata_forefront(possibility, fillables, pointerobjs) 193 | _fromdata_fill(obj, possibility, fillables, targetids, pointerobjs, at + ("tag" + repr(tag),), pointerat) 194 | 195 | fillables[gen.tags].append(tag) 196 | fillables[gen.offsets].append(offset) 197 | 198 | elif isinstance(gen, oamap.generator.RecordGenerator): 199 | if isinstance(obj, dict): 200 | for n, x in gen.fields.items(): 201 | if n not in obj: 202 | raise TypeError("cannot fill {0} because its {1} field is missing at {2}".format(repr(obj), repr(n), at)) 203 | _fromdata_fill(obj[n], x, fillables, targetids, pointerobjs, at + (n,), pointerat) 204 | else: 205 | for n, x in gen.fields.items(): 206 | if not hasattr(obj, n): 207 | raise TypeError("cannot fill {0} because its {1} field is missing at {2}".format(repr(obj), repr(n), at)) 208 | _fromdata_fill(getattr(obj, n), x, fillables, targetids, pointerobjs, at + (n,), pointerat) 209 | 210 | elif isinstance(gen, oamap.generator.TupleGenerator): 211 | for i, x in enumerate(gen.types): 212 | try: 213 | v = obj[i] 214 | except (TypeError, IndexError): 215 | raise TypeError("cannot fill {0} because it does not have a field {1} at {2}".format(repr(obj), i, at)) 216 | else: 217 | _fromdata_fill(v, x, fillables, targetids, pointerobjs, at + (i,), pointerat) 218 | 219 | elif isinstance(gen, oamap.generator.PointerGenerator): 220 | # Pointers will be set after we see all the target values 221 | pointerobjs[id(gen)].append(obj) 222 | if id(gen) not in pointerat: 223 | pointerat[id(gen)] = at 224 | 225 | elif isinstance(gen, oamap.generator.ExtendedGenerator): 226 | _fromdata_fill(gen.degenerate(obj), gen.generic, fillables, targetids, pointerobjs, at, pointerat) 227 | 228 | def _fromdata_finish(fillables, pointers, pointerobjs, targetids, pointerat, pointer_fromequal, fillables_leaf_to_root): 229 | # do the pointers after everything else 230 | for pointer in pointers: 231 | while len(pointerobjs[id(pointer)]) > 0: 232 | pointerobjs2 = 
{id(pointer): []}
            for obj in pointerobjs[id(pointer)]:
                if id(obj) in targetids[id(pointer.target)] and targetids[id(pointer.target)][id(obj)][1] == obj:
                    # case 1: an object in the target *is* the object in the pointer (same ids)
                    position, _ = targetids[id(pointer.target)][id(obj)]

                else:
                    position = None
                    if pointer_fromequal:
                        # fallback to quadratic complexity search
                        for key, (pos, obj2) in targetids[id(pointer.target)].items():
                            if obj == obj2:
                                position = pos
                                break

                    if position is not None:
                        # case 2: an object in the target *is equal to* the object in the pointer (only check if pointer_fromequal)
                        pass

                    else:
                        # case 3: the object was not found; it must be added to the target (beyond indexes where it can be found)
                        _fromdata_fill(obj, pointer.target, fillables, targetids, pointerobjs2, pointerat[id(pointer)], pointerat)
                        position, _ = targetids[id(pointer.target)][id(obj)]

                # every obj in pointerobjs[id(pointer)] gets *one* append
                fillables[pointer.positions].append(position)

            # filling case-3 objects may have queued new pointer objects; loop until drained
            pointerobjs[id(pointer)] = pointerobjs2[id(pointer)]

    # commit: accepted length becomes the forefront for every fillable, leaves first
    for fillable in fillables_leaf_to_root:
        fillable.update()

def fromdata(value, generator=None, pointer_fromequal=False):
    """Fill a single *value* into freshly created in-memory arrays and return them."""
    if generator is None:
        generator = oamap.inference.fromdata(value).generator()
    if not isinstance(generator, oamap.generator.Generator):
        generator = generator.generator()

    return toarrays(fromdatamore(value, oamap.fillable.arrays(generator), generator=generator, pointer_fromequal=pointer_fromequal))

def fromdatamore(value, fillables, generator=None, pointer_fromequal=False):
    """Fill *value* into existing *fillables*; only List schemas may be filled more than once."""
    if generator is None:
        generator = oamap.inference.fromdata(value).generator()
    if not isinstance(generator, oamap.generator.Generator):
        generator = generator.generator()

    pointers = []
    pointerobjs_keys = []
    targetids_keys = []
    fillables_leaf_to_root = []
    positions_to_pointerobjs = {}

    _fromdata_initialize(generator, generator, fillables, pointers, pointerobjs_keys, targetids_keys, fillables_leaf_to_root, positions_to_pointerobjs)

    pointerat = {}
    targetids = dict((x, {}) for x in targetids_keys)
    pointerobjs = dict((x, []) for x in pointerobjs_keys)

    if _fromdata_forefront(generator, fillables, pointerobjs) != 0 and not isinstance(generator, oamap.generator.ListGenerator):
        raise TypeError("non-Lists can only be filled from data once")

    _fromdata_fill(value, generator, fillables, targetids, pointerobjs, (), pointerat)
    _fromdata_finish(fillables, pointers, pointerobjs, targetids, pointerat, pointer_fromequal, fillables_leaf_to_root)

    return fillables

def fromiterdata(values, generator=None, limit=lambda entries, arrayitems, arraybytes: False, pointer_fromequal=False):
    """Generator: fill entries from the iterable *values*, yielding (numentries, arrays)
    partitions whenever the *limit* callback signals a partition boundary.

    NOTE(review): with the default limit (always False), `not limit(...)` is always
    True, i.e. every entry takes the "limit too large" branch — confirm the intended
    truth convention of *limit* against the project documentation.
    """
    if generator is None:
        generator = oamap.inference.fromdata(values).generator()
    if not isinstance(generator, oamap.generator.Generator):
        generator = generator.generator()
    if not isinstance(generator, oamap.generator.ListGenerator):
        raise TypeError("non-Lists cannot be filled iteratively")

    # starting set of fillables
    fillables = oamap.fillable.arrays(generator)
    factor = dict((n, x.dtype.itemsize) for n, x in fillables.items())

    pointers = []
    pointerobjs_keys = []
    targetids_keys = []
    fillables_leaf_to_root = []
    positions_to_pointerobjs = {}

    _fromdata_initialize(generator, generator, fillables, pointers, pointerobjs_keys, targetids_keys, fillables_leaf_to_root, positions_to_pointerobjs)

    pointerat = {}
    targetids = dict((x, {}) for x in targetids_keys)
    pointerobjs = dict((x, []) for x in pointerobjs_keys)

    start = stop = _fromdata_forefront(generator.content, fillables, pointerobjs)

    for value in values:
        # prospectively fill a value
        _fromdata_fill(value, generator.content, fillables, targetids, pointerobjs, (), pointerat)

        # criteria for ending a limit based on forefront (_potential_ size), rather than len (_accepted_ size)
        arrayitems = {}
        arraybytes = {}
        for n, x in fillables.items():
            if n in positions_to_pointerobjs:
                # pointer positions are still queued, not yet in the fillable
                arrayitems[n] = len(pointerobjs[positions_to_pointerobjs[n]])
            else:
                arrayitems[n] = x.forefront()
            arraybytes[n] = arrayitems[n]*factor[n]

        if not limit((stop - start) + 1, arrayitems, arraybytes):
            # accepting this entry would make the limit too large
            fillables[generator.starts].append(start)
            fillables[generator.stops].append(stop)
            _fromdata_finish(fillables, pointers, pointerobjs, targetids, pointerat, pointer_fromequal, fillables_leaf_to_root)
            # yield a new limit of arrays
            yield stop - start, toarrays(fillables)

            # and make a new set of fillables (along with everything that depends on it)
            fillables = oamap.fillable.arrays(generator)

            pointers = []
            pointerobjs_keys = []
            targetids_keys = []
            fillables_leaf_to_root = []
            positions_to_pointerobjs = {}

            _fromdata_initialize(generator, generator, fillables, pointers, pointerobjs_keys, targetids_keys, fillables_leaf_to_root, positions_to_pointerobjs)

            pointerat = {}
            targetids = dict((x, {}) for x in targetids_keys)
            pointerobjs = dict((x, []) for x in pointerobjs_keys)

            start = stop = _fromdata_forefront(generator.content, fillables, pointerobjs)

            # really fill it in this new partition
            _fromdata_fill(value, generator.content, fillables, targetids, pointerobjs, (), pointerat)
            stop += 1
            for fillable in fillables_leaf_to_root:
                fillable.update()

        else:
            # else accept the data into the fillables and move on
            stop += 1
            for fillable in fillables_leaf_to_root:
                fillable.update()

    # always yield at the end
    fillables[generator.starts].append(start)
    fillables[generator.stops].append(stop)
    _fromdata_finish(fillables, pointers, pointerobjs, targetids, pointerat, pointer_fromequal, fillables_leaf_to_root)
    yield (stop - start), toarrays(fillables)

--------------------------------------------------------------------------------
/oamap/fillable.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Copyright (c) 2017, DIANA-HEP
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED.
 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import math
import struct
import sys

import numpy

import oamap.generator

if sys.version_info[0] > 2:
    xrange = range

class Fillable(object):
    """Abstract chunked, append-only array.

    Two sizes are tracked: forefront() is the *potential* size including
    uncommitted appends; len() is the *accepted* size. update() commits the
    forefront; revert() discards uncommitted appends.
    """

    def __init__(self, dtype):
        raise NotImplementedError

    def __len__(self):
        # accepted (committed) number of items
        return self._len

    def forefront(self):
        # potential size: committed plus uncommitted appends
        return self._chunkindex*self.chunksize + self._indexinchunk

    def append(self, value):
        raise NotImplementedError

    def extend(self, values):
        raise NotImplementedError

    def update(self):
        # commit: accepted length catches up to the forefront
        self._len = self.forefront()

    def revert(self):
        # discard uncommitted appends: move the write cursor back to the accepted length
        self._chunkindex, self._indexinchunk = divmod(self._len, self.chunksize)

    def close(self):
        pass

    def __getitem__(self, index):
        raise NotImplementedError

    def __array__(self, dtype=None, copy=False, order="K", subok=False, ndmin=0):
        if dtype is None:
            dtype = self.dtype
        elif not isinstance(dtype, numpy.dtype):
            dtype = numpy.dtype(dtype)

        if dtype == self.dtype and not copy and not subok and ndmin == 0:
            return self[:]
        else:
            return numpy.array(self[:], dtype=dtype, copy=copy, order=order, subok=subok, ndmin=ndmin)

################################################################ make fillables

def _makefillables(generator, fillables, makefillable):
    # Walk the generator tree, creating one fillable per named array via makefillable(name, dtype).
    if isinstance(generator, oamap.generator.Masked):
        # Masked is a mixin: also fall through to the concrete generator type below
        fillables[generator.mask] = makefillable(generator.mask, generator.maskdtype)

    if isinstance(generator, oamap.generator.PrimitiveGenerator):
        if generator.dtype is None:
            raise ValueError("dtype is unknown (None) for Primitive generator at {0}".format(repr(generator.data)))
        fillables[generator.data] = makefillable(generator.data, generator.dtype)

    elif isinstance(generator, oamap.generator.ListGenerator):
        fillables[generator.starts] = makefillable(generator.starts, generator.posdtype)
        fillables[generator.stops] = makefillable(generator.stops, generator.posdtype)
        _makefillables(generator.content, fillables, makefillable)

    elif isinstance(generator, oamap.generator.UnionGenerator):
        fillables[generator.tags] = makefillable(generator.tags, generator.tagdtype)
        fillables[generator.offsets] = makefillable(generator.offsets, generator.offsetdtype)
        for possibility in generator.possibilities:
            _makefillables(possibility, fillables, makefillable)

    elif isinstance(generator, oamap.generator.RecordGenerator):
        for field in generator.fields.values():
            _makefillables(field, fillables, makefillable)

    elif isinstance(generator, oamap.generator.TupleGenerator):
        for field in generator.types:
            _makefillables(field, fillables, makefillable)

    elif isinstance(generator, oamap.generator.PointerGenerator):
        fillables[generator.positions] = makefillable(generator.positions, generator.posdtype)
        # internal pointers reference a target that is filled elsewhere in the tree
        if not generator._internal:
            _makefillables(generator.target, fillables, makefillable)

    elif isinstance(generator, oamap.generator.ExtendedGenerator):
        _makefillables(generator.generic, fillables, makefillable)

    else:
        raise AssertionError("unrecognized generator type: {0}".format(generator))

def arrays(generator, chunksize=8192):
    """Create in-memory FillableArrays for every array named by *generator*."""
    if not isinstance(generator,
oamap.generator.Generator): 125 | generator = generator.generator() 126 | fillables = {} 127 | _makefillables(generator, fillables, lambda name, dtype: FillableArray(dtype, chunksize=chunksize)) 128 | return fillables 129 | 130 | def files(generator, directory, chunksize=8192, lendigits=16): 131 | if not isinstance(generator, oamap.generator.Generator): 132 | generator = generator.generator() 133 | if not os.path.exists(directory): 134 | os.mkdir(directory) 135 | fillables = {} 136 | _makefillables(generator, fillables, lambda name, dtype: FillableFile(os.path.join(directory, name), dtype, chunksize=chunksize, lendigits=lendigits)) 137 | return fillables 138 | 139 | def numpyfiles(generator, directory, chunksize=8192, lendigits=16): 140 | if not isinstance(generator, oamap.generator.Generator): 141 | generator = generator.generator() 142 | if not os.path.exists(directory): 143 | os.mkdir(directory) 144 | fillables = {} 145 | _makefillables(generator, fillables, lambda name, dtype: FillableNumpyFile(os.path.join(directory, name), dtype, chunksize=chunksize, lendigits=lendigits)) 146 | return fillables 147 | 148 | ################################################################ FillableArray 149 | 150 | class FillableArray(Fillable): 151 | # Numpy arrays and list items have 96+8 byte (80+8 byte) overhead in Python 2 (Python 3) 152 | # compared to 8192 1-byte values (8-byte values), this is 1% overhead (0.1% overhead) 153 | def __init__(self, dtype, chunksize=8192): 154 | if not isinstance(dtype, numpy.dtype): 155 | dtype = numpy.dtype(dtype) 156 | self._data = [numpy.empty(chunksize, dtype=dtype)] 157 | self._len = 0 158 | self._indexinchunk = 0 159 | self._chunkindex = 0 160 | 161 | @property 162 | def dtype(self): 163 | return self._data[0].dtype 164 | 165 | @property 166 | def chunksize(self): 167 | return self._data[0].shape[0] 168 | 169 | def append(self, value): 170 | if self._indexinchunk >= len(self._data[self._chunkindex]): 171 | while len(self._data) <= 
self._chunkindex + 1: 172 | self._data.append(numpy.empty(self.chunksize, dtype=self.dtype)) 173 | self._indexinchunk = 0 174 | self._chunkindex += 1 175 | 176 | self._data[self._chunkindex][self._indexinchunk] = value 177 | self._indexinchunk += 1 178 | 179 | def extend(self, values): 180 | chunkindex = self._chunkindex 181 | indexinchunk = self._indexinchunk 182 | 183 | while len(values) > 0: 184 | if indexinchunk >= len(self._data[chunkindex]): 185 | while len(self._data) <= chunkindex + 1: 186 | self._data.append(numpy.empty(self.chunksize, dtype=self.dtype)) 187 | indexinchunk = 0 188 | chunkindex += 1 189 | 190 | tofill = min(len(values), self.chunksize - indexinchunk) 191 | self._data[chunkindex][indexinchunk : indexinchunk + tofill] = values[:tofill] 192 | indexinchunk += tofill 193 | values = values[tofill:] 194 | 195 | self._chunkindex = chunkindex 196 | self._indexinchunk = indexinchunk 197 | 198 | def __getitem__(self, index): 199 | if isinstance(index, slice): 200 | lenself = len(self) 201 | step = 1 if index.step is None else index.step 202 | if step > 0: 203 | start = 0 if index.start is None else index.start 204 | stop = lenself if index.stop is None else index.stop 205 | else: 206 | start = lenself - 1 if index.start is None else index.start 207 | stop = 0 if index.stop is None else index.stop 208 | 209 | if start < 0: 210 | start += lenself 211 | if stop < 0: 212 | stop += lenself 213 | 214 | start = min(lenself, max(0, start)) 215 | stop = min(lenself, max(0, stop)) 216 | 217 | if step == 0: 218 | raise ValueError("slice step cannot be zero") 219 | 220 | else: 221 | if step > 0: 222 | start_chunkindex = int(math.floor(float(start) / self.chunksize)) 223 | stop_chunkindex = int(math.ceil(float(stop) / self.chunksize)) 224 | start_indexinchunk = start - start_chunkindex*self.chunksize 225 | stop_indexinchunk = stop - (stop_chunkindex - 1)*self.chunksize 226 | else: 227 | start_chunkindex = int(math.floor(float(start) / self.chunksize)) 228 | 
stop_chunkindex = int(math.floor(float(stop) / self.chunksize)) - 1 229 | start_indexinchunk = start - start_chunkindex*self.chunksize 230 | stop_indexinchunk = stop - (stop_chunkindex + 1)*self.chunksize 231 | 232 | def beginend(): 233 | offset = 0 234 | for chunkindex in xrange(start_chunkindex, stop_chunkindex, 1 if step > 0 else -1): 235 | if step > 0: 236 | if chunkindex == start_chunkindex: 237 | begin = start_indexinchunk 238 | else: 239 | begin = offset 240 | if chunkindex == stop_chunkindex - 1: 241 | end = stop_indexinchunk 242 | else: 243 | end = self.chunksize 244 | offset = (begin - self.chunksize) % step 245 | else: 246 | if chunkindex == start_chunkindex: 247 | begin = start_indexinchunk 248 | else: 249 | begin = self.chunksize - 1 - offset 250 | if chunkindex == stop_chunkindex + 1 and index.stop is not None: 251 | end = stop_indexinchunk 252 | else: 253 | end = None 254 | offset = (begin - -1) % -step 255 | yield chunkindex, begin, end 256 | 257 | length = 0 258 | for chunkindex, begin, end in beginend(): 259 | if step > 0: 260 | length += int(math.ceil(float(end - begin) / step)) 261 | elif end is None: 262 | length += int(math.ceil(-float(begin + 1) / step)) 263 | else: 264 | length += int(math.ceil(-float(begin - end) / step)) 265 | 266 | out = numpy.empty(length, dtype=self.dtype) 267 | outi = 0 268 | 269 | for chunkindex, begin, end in beginend(): 270 | array = self._data[chunkindex][begin:end:step] 271 | 272 | out[outi : outi + len(array)] = array 273 | outi += len(array) 274 | if outi >= len(out): 275 | break 276 | 277 | return out 278 | 279 | else: 280 | lenself = len(self) 281 | normalindex = index if index >= 0 else index + lenself 282 | if not 0 <= normalindex < lenself: 283 | raise IndexError("index {0} is out of bounds for size {1}".format(index, lenself)) 284 | 285 | chunkindex, indexinchunk = divmod(index, self.chunksize) 286 | return self._data[chunkindex][indexinchunk] 287 | 288 | 
################################################################ FillableFile

class FillableFile(Fillable):
    """Fillable that streams full chunks to a raw binary file (no header)."""

    def __init__(self, filename, dtype, chunksize=8192, lendigits=16):
        if not isinstance(dtype, numpy.dtype):
            dtype = numpy.dtype(dtype)
        self._data = numpy.zeros(chunksize, dtype=dtype)   # 'zeros', not 'empty' for security
        self._len = 0
        self._indexinchunk = 0
        self._chunkindex = 0
        self._filename = filename
        self._openfile(filename, lendigits)

    def _openfile(self, filename, lendigits):
        # truncate/create, then reopen unbuffered for read+write
        open(filename, "wb", 0).close()
        self._file = open(filename, "r+b", 0)
        self._datapos = 0
        # a plain file has no header

    @property
    def filename(self):
        return self._file.name

    @property
    def dtype(self):
        return self._data.dtype

    @property
    def chunksize(self):
        return self._data.shape[0]

    def append(self, value):
        self._data[self._indexinchunk] = value
        self._indexinchunk += 1

        # write to disk one full chunk at a time
        if self._indexinchunk == self.chunksize:
            self._flush()
            self._indexinchunk = 0
            self._chunkindex += 1

    def _flush(self):
        # write the in-memory chunk at its position in the file
        self._file.seek(self._datapos + self._chunkindex*self.chunksize*self.dtype.itemsize)
        self._file.write(self._data.tostring())

    def extend(self, values):
        chunkindex = self._chunkindex
        indexinchunk = self._indexinchunk

        while len(values) > 0:
            tofill = min(len(values), self.chunksize - indexinchunk)
            self._data[indexinchunk : indexinchunk + tofill] = values[:tofill]
            indexinchunk += tofill
            values = values[tofill:]

            if indexinchunk == self.chunksize:
                self._file.seek(self._datapos + chunkindex*self.chunksize*self.dtype.itemsize)
                self._file.write(self._data.tostring())
                indexinchunk = 0
                chunkindex += 1

        self._chunkindex = chunkindex
        self._indexinchunk = indexinchunk

    def revert(self):
        """Discard uncommitted appends; reload the accepted chunk from disk if we moved past it."""
        chunkindex, self._indexinchunk = divmod(self._len, self.chunksize)
        if self._chunkindex != chunkindex:
            self._file.seek(self._datapos + chunkindex*self.chunksize*self.dtype.itemsize)
            olddata = numpy.frombuffer(self._file.read(self.chunksize*self.dtype.itemsize), dtype=self.dtype)
            self._data[:len(olddata)] = olddata

        self._chunkindex = chunkindex

    def close(self):
        # guarded: __init__ may have failed before _file was assigned
        if hasattr(self, "_file"):
            self._flush()
            self._file.close()

    def __del__(self):
        self.close()

    def __enter__(self, *args, **kwds):
        return self

    def __exit__(self, *args, **kwds):
        self.close()

    def __getitem__(self, value):
        if not self._file.closed:
            self._flush()

        if isinstance(value, slice):
            lenself = len(self)
            if lenself == 0:
                array = numpy.empty(lenself, dtype=self.dtype)
            else:
                array = numpy.memmap(self.filename, self.dtype, "r", self._datapos, lenself, "C")
            if value.start is None and value.stop is None and value.step is None:
                return array
            else:
                return array[value]

        else:
            lenself = len(self)
            # FIX: the parameter is named 'value'; this branch previously used an
            # undefined name 'index' (NameError on any integer access)
            normalindex = value if value >= 0 else value + lenself
            if not 0 <= normalindex < lenself:
                raise IndexError("index {0} is out of bounds for size {1}".format(value, lenself))

            # FIX: compute itemsize before branching; it was defined only in the
            # open-file branch but also used in the closed-file branch below
            itemsize = self.dtype.itemsize
            if not self._file.closed:
                # since the file's still open, get it from here instead of making a new filehandle
                try:
                    self._file.seek(self._datapos + normalindex*itemsize)
                    return numpy.frombuffer(self._file.read(itemsize), self.dtype)[0]
                finally:
                    # restore the write position
                    self._file.seek(self._datapos + self._chunkindex*self.chunksize*self.dtype.itemsize)
            else:
                # otherwise, you have to open a new file
                with open(self.filename, "rb") as file:
                    file.seek(self._datapos + normalindex*itemsize)
                    return
numpy.frombuffer(file.read(itemsize), self.dtype)[0] 408 | 409 | ################################################################ FillableNumpyFile (FillableFile with a self-describing header) 410 | 411 | class FillableNumpyFile(FillableFile): 412 | def _openfile(self, filename, lendigits): 413 | magic = b"\x93NUMPY\x01\x00" 414 | header1 = "{{'descr': {0}, 'fortran_order': False, 'shape': (".format(repr(str(self.dtype))).encode("ascii") 415 | header2 = "{0}, }}".format(repr((10**lendigits - 1,))).encode("ascii")[1:] 416 | 417 | unpaddedlen = len(magic) + 2 + len(header1) + len(header2) 418 | paddedlen = int(math.ceil(float(unpaddedlen) / self.dtype.itemsize)) * self.dtype.itemsize 419 | header2 = header2 + b" " * (paddedlen - unpaddedlen) 420 | self._lenpos = len(magic) + 2 + len(header1) 421 | self._datapos = len(magic) + 2 + len(header1) + len(header2) 422 | assert self._datapos % self.dtype.itemsize == 0 423 | 424 | open(filename, "wb", 0).close() 425 | self._file = open(filename, "r+b", 0) 426 | self._formatter = "{0:%dd}" % lendigits 427 | self._file.write(magic) 428 | self._file.write(struct.pack("".format(self.__class__.__name__, self._name, repr(self._args), repr(self._kwargs)) 70 | 71 | def __str__(self): 72 | return ".{0}({1}{2})".format(self._name, ", ".join(repr(x) for x in self._args), "".join(", {0}={1}".format(n, repr(x)) for n, x in self._kwargs.items())) 73 | 74 | @property 75 | def name(self): 76 | return self._name 77 | 78 | @property 79 | def args(self): 80 | return self._args 81 | 82 | @property 83 | def kwargs(self): 84 | return self._kwargs 85 | 86 | @property 87 | def function(self): 88 | return self._function 89 | 90 | def apply(self, data): 91 | return self._function(*((data,) + self._args), **self._kwargs) 92 | 93 | class Recasting(Operation): pass 94 | class Transformation(Operation): pass 95 | class Action(Operation): pass 96 | 97 | class Operable(object): 98 | def __init__(self): 99 | self._operations = () 100 | 101 | @staticmethod 
    def update_operations():
        """(Re)bind every registered recasting/transformation/action as a method on Operable."""
        def newrecasting(name, function):
            @functools.wraps(function)
            def recasting(self, *args, **kwargs):
                # copy-on-write: return a new Operable with this operation appended
                out = self.__class__.__new__(self.__class__)
                Operable.__init__(out)
                out.__dict__ = self.__dict__.copy()
                out._operations = self._operations + (Recasting(name, args, kwargs, function),)
                return out
            return recasting

        def newtransformation(name, function):
            @functools.wraps(function)
            def transformation(self, *args, **kwargs):
                out = self.__class__.__new__(self.__class__)
                Operable.__init__(out)
                out.__dict__ = self.__dict__.copy()
                out._operations = self._operations + (Transformation(name, args, kwargs, function),)
                return out
            return transformation

        def newaction(name, function):
            @functools.wraps(function)
            def action(self, *args, **kwargs):
                # an explicit combiner kwarg overrides the function's default combiner
                try:
                    combiner = kwargs.pop("combiner")
                except KeyError:
                    combiner = function.combiner
                out = self.__class__.__new__(self.__class__)
                Operable.__init__(out)
                out.__dict__ = self.__dict__.copy()
                out._operations = self._operations + (Action(name, args, kwargs, function),)
                # actions execute immediately
                return out.act(combiner)
            return action

        for n, x in oamap.operations.recastings.items():
            setattr(Operable, n, oamap.util.MethodType(newrecasting(n, x), None, Operable))

        for n, x in oamap.operations.transformations.items():
            setattr(Operable, n, oamap.util.MethodType(newtransformation(n, x), None, Operable))

        for n, x in oamap.operations.actions.items():
            setattr(Operable, n, oamap.util.MethodType(newaction(n, x), None, Operable))

    def _nooperations(self):
        # True if no operations are queued on this object
        return len(self._operations) == 0

    def _notransformations(self):
        # True if only Recastings (no Transformations or Actions) are queued
        return all(isinstance(x, Recasting) for x in self._operations)

Operable.update_operations()

class _Data(Operable):
    def 
__init__(self, name, schema, backends, executor, extension=None, packing=None, doc=None, metadata=None):
        super(_Data, self).__init__()
        self._name = name
        self._schema = schema
        self._backends = backends        # namespace -> backend
        self._executor = executor
        self._extension = extension      # None, module name, or list of module names
        self._packing = packing
        self._doc = doc
        self._metadata = metadata
        self._cachedobject = None        # lazily materialized proxy

    def __repr__(self):
        # NOTE(review): only argument {1} is used by the format string; the name
        # ({0}) is dropped -- possibly markup lost in extraction. Confirm against
        # the project repository.
        return "{1}".format(repr(self._name), "".join(str(x) for x in self._operations))

    def __str__(self):
        return "{1}".format(repr(self._name), "".join("\n " + str(x) for x in self._operations))

    @property
    def name(self):
        return self._name

    @property
    def schema(self):
        # deep copy so callers cannot mutate the internal schema
        return self._schema.deepcopy()

    @property
    def extension(self):
        return self._extension

    @property
    def packing(self):
        return self._packing

    @property
    def doc(self):
        return self._doc

    @property
    def metadata(self):
        return self._metadata

    def arrays(self):
        return DataArrays(self._backends)

    def transform(self, name, namespace, update):
        """Apply queued operations, persist results in *namespace*'s backend, and
        return a list of futures for the updated Data/Dataset."""
        if self._nooperations():
            # nothing to do
            return [SingleThreadExecutor.PseudoFuture(update(self))]

        elif self._notransformations():
            # recastings only: no new arrays need to be written
            result = self()
            for operation in self._operations:
                result = operation.apply(result)
            if isinstance(result, oamap.proxy.ListProxy):
                out = Dataset(name, result._generator.schema, self._backends, self._executor, [0, len(result)], extension=self._extension, packing=None, doc=self._doc, metadata=self._metadata)
            else:
                out = Data(name, result._generator.schema, self._backends, self._executor, extension=self._extension, packing=None, doc=self._doc, metadata=self._metadata)
            return [SingleThreadExecutor.PseudoFuture(update(out))]

        else:
            def task(name, dataset, namespace, update):
                # run operations, then write the resulting arrays into the backend
                result = dataset()
                for operation in dataset._operations:
                    result = operation.apply(result)

                backend = dataset._backends[namespace]
                schema, roles2arrays = oamap.operations._DualSource.collect(result._generator.namedschema(), result._arrays, namespace, backend.prefix(name), backend.delimiter())

                active = backend.instantiate(0)
                if hasattr(active, "putall"):
                    active.putall(roles2arrays)
                else:
                    for n, x in roles2arrays.items():
                        active[str(n)] = x

                if isinstance(result, oamap.proxy.ListProxy):
                    out = Dataset(name, schema, dataset._backends, dataset._executor, [0, len(result)], extension=dataset._extension, packing=None, doc=dataset._doc, metadata=dataset._metadata)
                else:
                    out = Data(name, schema, dataset._backends, dataset._executor, extension=dataset._extension, packing=None, doc=dataset._doc, metadata=dataset._metadata)
                return update(out)

            return [self._executor.submit(task, name, self, namespace, update)]

    def act(self, combiner):
        """Run queued operations and pass the single resulting future to *combiner*."""
        def task(dataset):
            result = dataset()
            for operation in dataset._operations:
                result = operation.apply(result)
            return result

        return combiner([self._executor.submit(task, self)])

class Data(_Data):
    def __call__(self):
        # materialize (and cache) the proxy for this unpartitioned data
        if self._cachedobject is None:
            if self._extension is None:
                extension = oamap.util.import_module("oamap.extension.common")
            elif isinstance(self._extension, basestring):
                # NOTE(review): 'basestring' is Python-2-only; confirm that a py3
                # shim is defined near the top of this module
                extension = oamap.util.import_module(self._extension)
            else:
                extension = [oamap.util.import_module(x) for x in self._extension]

            self._cachedobject = self._schema(self.arrays(), extension=extension, packing=self._packing)

        return self._cachedobject

class DataArrays(object):
    """Lazy dict-like source of arrays, instantiating one active backend per namespace."""

    def __init__(self, backends):
        self._backends = backends    # namespace -> backend
        self._active = {}            # namespace -> instantiated backend (lazy)
        self._partitionid = 0        # unpartitioned data always uses partition 0

267 | def _toplevel(self, out, filtered): 268 | return filtered 269 | 270 | def getall(self, roles): 271 | out = {} 272 | for namespace, backend in self._backends.items(): 273 | filtered = self._toplevel(out, [x for x in roles if x.namespace == namespace]) 274 | 275 | if len(filtered) > 0: 276 | active = self._active.get(namespace, None) 277 | if active is None: 278 | active = self._active[namespace] = backend.instantiate(self._partitionid) 279 | 280 | if hasattr(active, "getall"): 281 | out.update(active.getall(filtered)) 282 | else: 283 | for x in roles: 284 | out[x] = active[str(x)] 285 | 286 | return out 287 | 288 | def close(self): 289 | for namespace, active in self._active.items(): 290 | if hasattr(active, "close"): 291 | active.close() 292 | self._active[namespace] = None 293 | 294 | class Dataset(_Data): 295 | def __init__(self, name, schema, backends, executor, offsets, extension=None, packing=None, doc=None, metadata=None): 296 | if not isinstance(schema, oamap.schema.List): 297 | raise TypeError("Dataset must have a list schema, not\n\n {0}".format(schema.__repr__(indent=" "))) 298 | 299 | super(Dataset, self).__init__(name, schema, backends, executor, extension=extension, packing=packing, doc=doc, metadata=metadata) 300 | 301 | if not isinstance(offsets, numpy.ndarray): 302 | try: 303 | if not all(isinstance(x, (numbers.Integral, numpy.integer)) and x >= 0 for x in offsets): 304 | raise TypeError 305 | except TypeError: 306 | raise TypeError("offsets must be an iterable of non-negative integers") 307 | offsets = numpy.array(offsets, dtype=numpy.int64) 308 | if len(offsets.shape) != 1: 309 | raise ValueError("offsets must be one-dimensional") 310 | if len(offsets) < 2 or offsets[0] != 0: 311 | raise ValueError("offsets must have at least two items, and the first one must be zero") 312 | if not numpy.all(offsets[:-1] <= offsets[1:]): 313 | raise ValueError("offsets must be monotonically increasing") 314 | self._offsets = offsets 315 | 
self._cachedpartition = None 316 | 317 | def __repr__(self): 318 | return "{3}".format(repr(self._name), self.numpartitions, self.numentries, "".join(str(x) for x in self._operations)) 319 | 320 | def __str__(self): 321 | return "{3}".format(repr(self._name), self.numpartitions, self.numentries, "".join("\n " + str(x) for x in self._operations)) 322 | 323 | @property 324 | def offsets(self): 325 | return self._offsets.tolist() 326 | 327 | @property 328 | def starts(self): 329 | return self._offsets[:-1].tolist() 330 | 331 | @property 332 | def stops(self): 333 | return self._offsets[1:].tolist() 334 | 335 | @property 336 | def partitions(self): 337 | return zip(self.start, self.stop) 338 | 339 | @property 340 | def numpartitions(self): 341 | return len(self._offsets) - 1 342 | 343 | @property 344 | def numentries(self): 345 | return int(self._offsets[-1]) 346 | 347 | def partition(self, partitionid): 348 | if self._cachedpartition != partitionid: 349 | self._cachedpartition = partitionid 350 | 351 | if self._extension is None: 352 | extension = oamap.util.import_module("oamap.extension.common") 353 | elif isinstance(self._extension, basestring): 354 | extension = oamap.util.import_module(self._extension) 355 | else: 356 | extension = [oamap.util.import_module(x) for x in self._extension] 357 | 358 | self._cachedobject = self._schema(self.arrays(partitionid), extension=extension, packing=self._packing) 359 | 360 | return self._cachedobject 361 | 362 | def __iter__(self): 363 | for partitionid in range(self.numpartitions): 364 | for i in range(self._offsets[partitionid], self._offsets[partitionid + 1]): 365 | yield self[i] 366 | 367 | def __getitem__(self, index): 368 | if isinstance(index, slice): 369 | start, stop, step = oamap.util.slice2sss(index, self.numentries) 370 | partitionid = max(0, min(numpy.searchsorted(self._offsets, start, side="right") - 1, self.numpartitions - 1)) 371 | localstart = start - self._offsets[partitionid] 372 | localstop = stop - 
self._offsets[partitionid] 373 | if localstop < -1 or localstop > (self._offsets[partitionid + 1] - self._offsets[partitionid]): 374 | raise IndexError("slice spans multiple partitions") 375 | 376 | out = self.partition(partitionid) 377 | out._whence = localstart 378 | out._stride = step 379 | 380 | # out._length = int(math.ceil(float(abs(localstop - localstart)) / abs(step))) 381 | d, m = divmod(abs(localstart - localstop), abs(step)) 382 | out._length = d + (1 if m != 0 else 0) 383 | return out 384 | 385 | else: 386 | normindex = index if index >= 0 else index + self.numentries 387 | if not 0 <= normindex < self.numentries: 388 | raise IndexError("index {0} out of range for {1} entries".format(index, self.numentries)) 389 | partitionid = numpy.searchsorted(self._offsets, normindex, side="right") - 1 390 | localindex = normindex - self._offsets[partitionid] 391 | return self.partition(partitionid)[localindex] 392 | 393 | def arrays(self, partitionid): 394 | normid = partitionid if partitionid >= 0 else partitionid + self.numpartitions 395 | if not 0 <= normid < self.numpartitions: 396 | raise IndexError("partitionid {0} out of range for {1} partitions".format(partitionid, self.numpartitions)) 397 | 398 | startsrole = oamap.generator.StartsRole(self._schema._get_starts("object", "-"), self._schema.namespace, None) 399 | stopsrole = oamap.generator.StopsRole(self._schema._get_stops("object", "-"), self._schema.namespace, None) 400 | startsrole.stops = stopsrole 401 | stopsrole.starts = startsrole 402 | return DatasetArrays(normid, startsrole, stopsrole, self._offsets[normid + 1] - self._offsets[normid], self._backends) 403 | 404 | def transform(self, name, namespace, update): 405 | if self._nooperations(): 406 | return [SingleThreadExecutor.PseudoFuture(update(self))] 407 | 408 | elif self._notransformations(): 409 | result = self.partition(0) 410 | for operation in self._operations: 411 | result = operation.apply(result) 412 | if isinstance(result, 
oamap.proxy.ListProxy): 413 | out = Dataset(name, result._generator.schema, self._backends, self._executor, self._offsets, extension=self._extension, packing=None, doc=self._doc, metadata=self._metadata) 414 | else: 415 | out = Data(name, result._generator.schema, self._backends, self._executor, extension=self._extension, packing=None, doc=self._doc, metadata=self._metadata) 416 | return [SingleThreadExecutor.PseudoFuture(update(out))] 417 | 418 | else: 419 | def task(name, dataset, namespace, partitionid): 420 | result = dataset.partition(partitionid) 421 | for operation in dataset._operations: 422 | result = operation.apply(result) 423 | 424 | backend = dataset._backends[namespace] 425 | schema, roles2arrays = oamap.operations._DualSource.collect(result._generator.namedschema(), result._arrays, namespace, backend.prefix(name), backend.delimiter()) 426 | 427 | active = backend.instantiate(partitionid) 428 | if hasattr(active, "putall"): 429 | active.putall(roles2arrays) 430 | else: 431 | for n, x in roles2arrays.items(): 432 | active[str(n)] = x 433 | if isinstance(result, oamap.proxy.ListProxy): 434 | return schema, len(result) 435 | else: 436 | return schema, 1 437 | 438 | tasks = [self._executor.submit(task, name, self, namespace, i) for i in range(self.numpartitions)] 439 | 440 | def collect(name, dataset, results, update): 441 | if isinstance(results[0], tuple) and len(results[0]) == 2 and isinstance(results[0][0], oamap.schema.Schema): 442 | offsets = numpy.cumsum([0] + [numentries for schema, numentries in results], dtype=numpy.int64) 443 | schema = results[0][0] 444 | else: 445 | offsets = numpy.cumsum([0] + [x.result()[1] for x in results], dtype=numpy.int64) 446 | schema = results[0].result()[0] 447 | 448 | if isinstance(schema, oamap.schema.List): 449 | out = Dataset(name, schema, dataset._backends, dataset._executor, offsets, extension=dataset._extension, packing=None, doc=dataset._doc, metadata=dataset._metadata) 450 | else: 451 | out = Data(name, 
schema, dataset._backends, dataset._executor, extension=dataset._extension, packing=None, doc=dataset._doc, metadata=dataset._metadata) 452 | return update(out) 453 | 454 | tasks.append(self._executor.submit(collect, name, self, tuple(tasks), update)) 455 | return tasks 456 | 457 | def act(self, combiner): 458 | def task(dataset, partitionid): 459 | result = dataset.partition(partitionid) 460 | for operation in dataset._operations: 461 | result = operation.apply(result) 462 | return result 463 | 464 | return combiner([self._executor.submit(task, self, i) for i in range(self.numpartitions)]) 465 | 466 | class DatasetArrays(DataArrays): 467 | def __init__(self, partitionid, startsrole, stopsrole, numentries, backends): 468 | super(DatasetArrays, self).__init__(backends) 469 | self._partitionid = partitionid 470 | self._startsrole = startsrole 471 | self._stopsrole = stopsrole 472 | self._numentries = numentries 473 | 474 | def _toplevel(self, out, filtered): 475 | try: 476 | index = filtered.index(self._startsrole) 477 | except ValueError: 478 | pass 479 | else: 480 | del filtered[index] 481 | out[self._startsrole] = numpy.array([0], dtype=oamap.generator.ListGenerator.posdtype) 482 | 483 | try: 484 | index = filtered.index(self._stopsrole) 485 | except ValueError: 486 | pass 487 | else: 488 | del filtered[index] 489 | out[self._stopsrole] = numpy.array([self._numentries], dtype=oamap.generator.ListGenerator.posdtype) 490 | 491 | return filtered 492 | 493 | --------------------------------------------------------------------------------