├── .gitignore ├── MANIFEST.in ├── docs ├── _static │ ├── example.pdf │ ├── beginner_guide_data.zip │ └── example.svg ├── api │ ├── steps.rst │ ├── pygrametl.rst │ ├── tables.rst │ ├── fifodict.rst │ ├── parallel.rst │ ├── aggregators.rst │ ├── datasources.rst │ ├── drawntabletesting.rst │ ├── jdbcconnectionwrapper.rst │ └── jythonmultiprocessing.rst ├── _exts │ ├── rtdmockup.py │ ├── formatref.py │ └── autoformat.py ├── index.rst ├── examples │ ├── database.rst │ ├── jython.rst │ ├── bulkloading.rst │ ├── facttables.rst │ └── datasources.rst ├── make.bat ├── Makefile ├── quickstart │ └── install.rst └── conf.py ├── pygrametl ├── jythonsupport │ ├── Value.class │ └── Value.java ├── jythonmultiprocessing.py ├── drawntabletesting │ ├── formattable.py │ └── dttr.py ├── aggregators.py └── FIFODict.py ├── tests ├── drawntabletesting │ ├── dttr │ │ ├── datasource_in_config.dtt │ │ ├── rows.csv │ │ ├── read_dt_from_csv.dtt │ │ ├── assert_subset.dtt │ │ ├── assert_equal.dtt │ │ ├── assert_disjoint.dtt │ │ ├── connection_in_config.dtt │ │ └── config.py │ ├── __init__.py │ ├── test_dttr.py │ └── test_dtt.py ├── __init__.py ├── tables │ └── __init__.py ├── utilities.py └── test_datasources.py ├── .github └── workflows │ └── python-unittest-on-pr-and-push.yml ├── LICENSE.txt ├── setup.py ├── README.rst └── CHANGELOG.rst /.gitignore: -------------------------------------------------------------------------------- 1 | docs/_build 2 | *.egg-info/ 3 | __pycache__/ 4 | *.pyc 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include *.txt 3 | recursive-include docs * 4 | prune docs/_build 5 | -------------------------------------------------------------------------------- /docs/_static/example.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chrthomsen/pygrametl/HEAD/docs/_static/example.pdf -------------------------------------------------------------------------------- /docs/_static/beginner_guide_data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chrthomsen/pygrametl/HEAD/docs/_static/beginner_guide_data.zip -------------------------------------------------------------------------------- /pygrametl/jythonsupport/Value.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chrthomsen/pygrametl/HEAD/pygrametl/jythonsupport/Value.class -------------------------------------------------------------------------------- /docs/api/steps.rst: -------------------------------------------------------------------------------- 1 | steps 2 | ===== 3 | 4 | .. automodule:: pygrametl.steps 5 | :show-inheritance: 6 | :undoc-members: 7 | :members: 8 | -------------------------------------------------------------------------------- /docs/api/pygrametl.rst: -------------------------------------------------------------------------------- 1 | pygrametl 2 | ========= 3 | 4 | .. automodule:: pygrametl 5 | :show-inheritance: 6 | :undoc-members: 7 | :members: 8 | -------------------------------------------------------------------------------- /docs/api/tables.rst: -------------------------------------------------------------------------------- 1 | tables 2 | ====== 3 | 4 | .. automodule:: pygrametl.tables 5 | :show-inheritance: 6 | :undoc-members: 7 | :members: 8 | -------------------------------------------------------------------------------- /docs/api/fifodict.rst: -------------------------------------------------------------------------------- 1 | FIFODict 2 | ======== 3 | 4 | .. 
automodule:: pygrametl.FIFODict 5 | :show-inheritance: 6 | :undoc-members: 7 | :members: 8 | -------------------------------------------------------------------------------- /docs/api/parallel.rst: -------------------------------------------------------------------------------- 1 | parallel 2 | ======== 3 | 4 | .. automodule:: pygrametl.parallel 5 | :show-inheritance: 6 | :undoc-members: 7 | :members: 8 | -------------------------------------------------------------------------------- /docs/api/aggregators.rst: -------------------------------------------------------------------------------- 1 | aggregators 2 | =========== 3 | 4 | .. automodule:: pygrametl.aggregators 5 | :show-inheritance: 6 | :undoc-members: 7 | :members: 8 | -------------------------------------------------------------------------------- /docs/api/datasources.rst: -------------------------------------------------------------------------------- 1 | datasources 2 | =========== 3 | 4 | .. automodule:: pygrametl.datasources 5 | :show-inheritance: 6 | :undoc-members: 7 | :members: 8 | -------------------------------------------------------------------------------- /tests/drawntabletesting/dttr/datasource_in_config.dtt: -------------------------------------------------------------------------------- 1 | book 2 | | bid:int (pk) | title:text | genre:text | 3 | | ------------ | ---------- | ---------- | 4 | csv_two rows.csv , 5 | -------------------------------------------------------------------------------- /tests/drawntabletesting/dttr/rows.csv: -------------------------------------------------------------------------------- 1 | 1,Unknown,Unknown 2 | 2,Nineteen Eighty-Four,Novel 3 | 3,Calvin and Hobbes One,Comic 4 | 4,Calvin and Hobbes Two,Comic 5 | 5,The Silver Spoon,Cookbook -------------------------------------------------------------------------------- /docs/api/drawntabletesting.rst: -------------------------------------------------------------------------------- 1 | drawntabletesting 2 | 
================= 3 | 4 | .. automodule:: pygrametl.drawntabletesting 5 | :show-inheritance: 6 | :undoc-members: 7 | :members: 8 | -------------------------------------------------------------------------------- /tests/drawntabletesting/dttr/read_dt_from_csv.dtt: -------------------------------------------------------------------------------- 1 | book 2 | | bid:int (pk) | title:text | genre:text | 3 | | ------------ | --------------------- | ---------- | 4 | csv rows.csv , 5 | -------------------------------------------------------------------------------- /docs/api/jdbcconnectionwrapper.rst: -------------------------------------------------------------------------------- 1 | JDBCConnectionWrapper 2 | ===================== 3 | 4 | .. automodule:: pygrametl.JDBCConnectionWrapper 5 | :show-inheritance: 6 | :undoc-members: 7 | :members: 8 | -------------------------------------------------------------------------------- /docs/api/jythonmultiprocessing.rst: -------------------------------------------------------------------------------- 1 | jythonmultiprocessing 2 | ===================== 3 | 4 | .. 
automodule:: pygrametl.jythonmultiprocessing 5 | :show-inheritance: 6 | :undoc-members: 7 | :members: 8 | -------------------------------------------------------------------------------- /.github/workflows/python-unittest-on-pr-and-push.yml: -------------------------------------------------------------------------------- 1 | name: Python unittest 2 | 3 | on: 4 | pull_request: 5 | branches: [ "main" ] 6 | push: 7 | branches: [ "main" ] 8 | 9 | jobs: 10 | python_unittest: 11 | name: Python unittest 12 | 13 | runs-on: ${{ matrix.operating-system }} 14 | strategy: 15 | matrix: 16 | operating-system: [ubuntu-latest, macos-latest, windows-latest] 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Python unittest 21 | run: python3 -m unittest --verbose 22 | -------------------------------------------------------------------------------- /tests/drawntabletesting/dttr/assert_subset.dtt: -------------------------------------------------------------------------------- 1 | book 2 | | bid:int (pk) | title:text | genre:text | 3 | | ------------ | --------------------- | ---------- | 4 | | 1 | Unknown | Unknown | 5 | | 2 | Nineteen Eighty-Four | Novel | 6 | | 3 | Calvin and Hobbes One | Comic | 7 | | 4 | Calvin and Hobbes Two | Comic | 8 | | 5 | The Silver Spoon | Cookbook | 9 | 10 | book, subset 11 | | bid:int (pk) | title:text | genre:text | 12 | | ------------ | --------------------- | ---------- | 13 | | 1 | Unknown | Unknown | -------------------------------------------------------------------------------- /tests/drawntabletesting/dttr/assert_equal.dtt: -------------------------------------------------------------------------------- 1 | book 2 | | bid:int (pk) | title:text | genre:text | 3 | | ------------ | --------------------- | ---------- | 4 | | 1 | Unknown | Unknown | 5 | | 2 | Nineteen Eighty-Four | Novel | 6 | | 3 | Calvin and Hobbes One | Comic | 7 | | 4 | Calvin and Hobbes Two | Comic | 8 | | 5 | The Silver Spoon | Cookbook | 9 | 10 | book, equal 11 | | 
bid:int (pk) | title:text | genre:text | 12 | | ------------ | --------------------- | ---------- | 13 | | 1 | Unknown | Unknown | 14 | | 2 | Nineteen Eighty-Four | Novel | 15 | | 3 | Calvin and Hobbes One | Comic | 16 | | 4 | Calvin and Hobbes Two | Comic | 17 | | 5 | The Silver Spoon | Cookbook | -------------------------------------------------------------------------------- /tests/drawntabletesting/dttr/assert_disjoint.dtt: -------------------------------------------------------------------------------- 1 | book 2 | | bid:int (pk) | title:text | genre:text | 3 | | ------------ | --------------------- | ---------- | 4 | | 1 | Unknown | Unknown | 5 | | 2 | Nineteen Eighty-Four | Novel | 6 | | 3 | Calvin and Hobbes One | Comic | 7 | | 4 | Calvin and Hobbes Two | Comic | 8 | | 5 | The Silver Spoon | Cookbook | 9 | 10 | book, disjoint 11 | | bid:int (pk) | title:text | genre:text | 12 | | ------------ | --------------------- | ---------- | 13 | | 1 | The Silver Spoon | Cookbook | 14 | | 2 | Calvin and Hobbes One | Comic | 15 | | 3 | Calvin and Hobbes Two | Comic | 16 | | 4 | Nineteen Eighty-Four | Novel | 17 | | 5 | Unknown | Unknown | -------------------------------------------------------------------------------- /tests/drawntabletesting/dttr/connection_in_config.dtt: -------------------------------------------------------------------------------- 1 | book@oltp 2 | | bid:int (pk) | title:text | genre:text | 3 | | ------------ | --------------------- | ---------- | 4 | | 1 | Unknown | Unknown | 5 | | 2 | Nineteen Eighty-Four | Novel | 6 | | 3 | Calvin and Hobbes One | Comic | 7 | | 4 | Calvin and Hobbes Two | Comic | 8 | | 5 | The Silver Spoon | Cookbook | 9 | 10 | book@oltp, equal 11 | | bid:int (pk) | title:text | genre:text | 12 | | ------------ | --------------------- | ---------- | 13 | | 1 | Unknown | Unknown | 14 | | 2 | Nineteen Eighty-Four | Novel | 15 | | 3 | Calvin and Hobbes One | Comic | 16 | | 4 | Calvin and Hobbes Two | Comic | 17 | | 5 | The Silver 
Spoon | Cookbook | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009-2020, Aalborg University (pygrametl@cs.aau.dk) 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | - Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | - Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Aalborg University (pygrametl@cs.aau.dk) 2 | # All rights reserved.
3 | 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | 7 | # - Redistributions of source code must retain the above copyright notice, this 8 | # list of conditions and the following disclaimer. 9 | 10 | # - Redistributions in binary form must reproduce the above copyright notice, 11 | # this list of conditions and the following disclaimer in the documentation 12 | # and/or other materials provided with the distribution. 13 | 14 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /tests/tables/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Aalborg University (pygrametl@cs.aau.dk) 2 | # All rights reserved. 3 | 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | 7 | # - Redistributions of source code must retain the above copyright notice, this 8 | # list of conditions and the following disclaimer.
9 | 10 | # - Redistributions in binary form must reproduce the above copyright notice, 11 | # this list of conditions and the following disclaimer in the documentation 12 | # and/or other materials provided with the distribution. 13 | 14 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /tests/drawntabletesting/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Aalborg University (pygrametl@cs.aau.dk) 2 | # All rights reserved. 3 | 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | 7 | # - Redistributions of source code must retain the above copyright notice, this 8 | # list of conditions and the following disclaimer. 9 | 10 | # - Redistributions in binary form must reproduce the above copyright notice, 11 | # this list of conditions and the following disclaimer in the documentation 12 | # and/or other materials provided with the distribution.
13 | 14 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | import os 4 | import sys 5 | 6 | import pygrametl 7 | 8 | setup( 9 | name='pygrametl', 10 | version=pygrametl.__version__, 11 | author='Aalborg University', 12 | author_email='pygrametl@cs.aau.dk', 13 | packages=find_packages(), 14 | package_data={ 15 | 'pygrametl': [ 16 | 'jythonsupport/Value.class', 17 | 'jythonsupport/Value.java']}, 18 | url='http://pygrametl.org/', 19 | license='BSD', 20 | description='ETL programming in Python', 21 | long_description=open('README.rst').read(), 22 | long_description_content_type="text/x-rst", 23 | classifiers=[ 24 | 'Development Status :: 5 - Production/Stable', 25 | 'Intended Audience :: Developers', 26 | 'License :: OSI Approved :: BSD License', 27 | 'Programming Language :: Java', 28 | 'Programming Language :: Python', 29 | 'Programming Language :: Python :: 2.7', 30 | 'Programming Language :: Python :: 3', 31 | 'Topic :: Database', 32 | 'Topic :: Database :: Front-Ends', 33 | 
'Topic :: Software Development', 34 | 'Topic :: Software Development :: Libraries :: Python Modules', 35 | 'Topic :: Software Development :: Libraries :: Application ' 36 | 'Frameworks'], 37 | entry_points={ 38 | 'console_scripts': [ 39 | 'dttr = pygrametl.drawntabletesting.dttr:main'] 40 | } 41 | ) 42 | -------------------------------------------------------------------------------- /tests/drawntabletesting/dttr/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Aalborg University (pygrametl@cs.aau.dk) 2 | # All rights reserved. 3 | 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | 7 | # - Redistributions of source code must retain the above copyright notice, this 8 | # list of conditions and the following disclaimer. 9 | 10 | # - Redistributions in binary form must reproduce the above copyright notice, 11 | # this list of conditions and the following disclaimer in the documentation 12 | # and/or other materials provided with the distribution. 13 | 14 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 25 | import csv 26 | import sqlite3 27 | 28 | 29 | def csv_two(columns, path, delimiter): 30 | with open(path) as csvfile: 31 | f = csv.DictReader(csvfile, fieldnames=columns, delimiter=delimiter) 32 | return list(f) 33 | 34 | 35 | connection = sqlite3.connect(':memory:') 36 | oltp = sqlite3.connect(':memory:') 37 | -------------------------------------------------------------------------------- /pygrametl/jythonsupport/Value.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011-2020, Aalborg University (pygrametl@cs.aau.dk) 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * - Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 11 | * - Redistributions in binary form must reproduce the above copyright notice, 12 | * this list of conditions and the following disclaimer in the documentation 13 | * and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | 27 | 28 | package pygrametl.jythonsupport; 29 | 30 | public class Value { 31 | private int theValue; 32 | 33 | public Value(char type, int value) { 34 | if(!(type == 'b' || type == 'i' || type == 'B' || type == 'h' || 35 | type == 'H' || type == 'l')) { 36 | throw new 37 | IllegalArgumentException("Only the types 'b', 'B', 'h', " 38 | + "'H', 'i', and 'l' are supported"); 39 | } 40 | theValue = value; 41 | } 42 | 43 | public synchronized int getValue() { 44 | return theValue; 45 | } 46 | 47 | public synchronized void setValue(int newVal) { 48 | theValue = newVal; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /tests/drawntabletesting/test_dttr.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Aalborg University (pygrametl@cs.aau.dk) 2 | # All rights reserved. 3 | 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | 7 | # - Redistributions of source code must retain the above copyright notice, this 8 | # list of conditions and the following disclaimer. 9 | 10 | # - Redistributions in binary form must reproduce the above copyright notice, 11 | # this list of conditions and the following disclaimer in the documentation 12 | # and/or other materials provided with the distribution.
13 | 14 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | import unittest 26 | import subprocess 27 | import os 28 | 29 | class DTTRTest(unittest.TestCase): 30 | 31 | def test_dttr(self): 32 | # A copy of the existing environment is needed on Windows 33 | # See https://stackoverflow.com/questions/58997105/fatal-python-error-failed-to-get-random-numbers-to-initialize-python 34 | newenv = os.environ.copy() 35 | newenv['PYTHONPATH'] = '.' 36 | 37 | process = subprocess.run([ 38 | 'python3', 39 | 'pygrametl/drawntabletesting/dttr.py', 40 | '-f', 41 | 'tests/drawntabletesting/dttr/' 42 | ], env=newenv, capture_output=True) 43 | 44 | process.check_returncode() 45 | self.assertEqual(b'', process.stdout) 46 | self.assertEqual(b'', process.stderr) 47 | -------------------------------------------------------------------------------- /docs/_exts/rtdmockup.py: -------------------------------------------------------------------------------- 1 | """Simple mock-up of the external dependencies so documentation can be 2 | created without requiring the installation of Java and Jython. 3 | 4 | The code used is made publicly available by www.readthedocs.org, under the 5 | MIT license. 
For more information see the following links: 6 | https://github.com/rtfd/readthedocs.org 7 | https://read-the-docs.readthedocs.org/en/latest/index.html 8 | """ 9 | 10 | # Copyright (c) 2011 Charles Leifer, Eric Holscher, Bobby Grace 11 | 12 | # Permission is hereby granted, free of charge, to any person 13 | # obtaining a copy of this software and associated documentation 14 | # files (the "Software"), to deal in the Software without 15 | # restriction, including without limitation the rights to use, 16 | # copy, modify, merge, publish, distribute, sublicense, and/or sell 17 | # copies of the Software, and to permit persons to whom the 18 | # Software is furnished to do so, subject to the following 19 | # conditions: 20 | 21 | # The above copyright notice and this permission notice shall be 22 | # included in all copies or substantial portions of the Software. 23 | 24 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 26 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 28 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 29 | # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 30 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 31 | # OTHER DEALINGS IN THE SOFTWARE. 
32 | 33 | import sys 34 | 35 | 36 | class Mock(object): 37 | 38 | def __init__(self, *args, **kwargs): 39 | pass 40 | 41 | def __call__(self, *args, **kwargs): 42 | return Mock() 43 | 44 | @classmethod 45 | def __getattr__(cls, name): 46 | if name in ('__file__', '__path__'): 47 | return '/dev/null' 48 | elif name[0] == name[0].upper(): 49 | mockType = type(name, (), {}) 50 | mockType.__module__ = __name__ 51 | return mockType 52 | else: 53 | return Mock() 54 | 55 | 56 | def mockModules(modules): 57 | for mod_name in modules: 58 | sys.modules[mod_name] = Mock() 59 | -------------------------------------------------------------------------------- /pygrametl/jythonmultiprocessing.py: -------------------------------------------------------------------------------- 1 | """A module for Jython emulating (a small part of) CPython's multiprocessing. 2 | With this, pygrametl can be made to use multiprocessing, but actually use 3 | threads when used from Jython (where there is no GIL). 4 | """ 5 | 6 | # Copyright (c) 2011-2020, Aalborg University (pygrametl@cs.aau.dk) 7 | # All rights reserved. 8 | 9 | # Redistribution and use in source and binary forms, with or without 10 | # modification, are permitted provided that the following conditions are met: 11 | 12 | # - Redistributions of source code must retain the above copyright notice, this 13 | # list of conditions and the following disclaimer. 14 | 15 | # - Redistributions in binary form must reproduce the above copyright notice, 16 | # this list of conditions and the following disclaimer in the documentation 17 | # and/or other materials provided with the distribution. 18 | 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | from threading import Thread 31 | 32 | from pygrametl.jythonsupport import Value 33 | 34 | 35 | # Needed for both Python 2 and Python 3 to be supported 36 | try: 37 | from Queue import Queue 38 | except ImportError: 39 | from queue import Queue 40 | 41 | # NOTE: This module is made for Jython. 42 | 43 | __all__ = ['JoinableQueue', 'Process', 'Queue', 'Value'] 44 | 45 | 46 | class Process(Thread): 47 | pid = '' 48 | daemon = property(Thread.isDaemon, Thread.setDaemon) 49 | name = property(Thread.getName, Thread.setName) 50 | 51 | 52 | class JoinableQueue(Queue): 53 | 54 | def close(self): 55 | pass 56 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ##################################### 2 | pygrametl - ETL Development in Python 3 | ##################################### 4 | pygrametl is a package for creating Extract-Transform-Load (ETL) programs in Python. 5 | 6 | The package contains several classes for filling fact tables and dimensions 7 | (including snowflaked and slowly changing dimensions), classes for extracting 8 | data from different sources, classes for optionally defining an ETL flow using 9 | steps, classes for parallelizing an ETL flow, classes for testing an ETL flow, 10 | and convenient functions for often-needed ETL functionality.
11 | 12 | The package's modules are: 13 | 14 | * **datasources** for access to different data sources 15 | * **tables** for giving easy and abstracted access to dimension and fact tables 16 | * **parallel** for parallelizing an ETL flow 17 | * **JDBCConnectionWrapper** and **jythonmultiprocessing** for Jython support 18 | * **aggregators** for aggregating data 19 | * **steps** for defining steps in an ETL flow 20 | * **FIFODict** for a dict with a limited size and where elements are removed in first-in, first-out order 21 | * **drawntabletesting** for testing an ETL flow 22 | 23 | 24 | pygrametl is currently being maintained at Aalborg University in Denmark by the following people: 25 | 26 | **Current Maintainers** 27 | - Christian Thomsen 28 | - Søren Kejser Jensen 29 | 30 | **Former Maintainers** 31 | - Christoffer Moesgaard 32 | - Ove Andersen 33 | 34 | Getting started 35 | =============== 36 | 37 | .. toctree:: 38 | :maxdepth: 1 39 | 40 | quickstart/install 41 | quickstart/beginner 42 | 43 | Code Examples 44 | ============= 45 | 46 | .. toctree:: 47 | :maxdepth: 1 48 | 49 | examples/database 50 | examples/datasources 51 | examples/dimensions 52 | examples/facttables 53 | examples/bulkloading 54 | examples/parallel 55 | examples/jython 56 | Testing 57 | 58 | API 59 | === 60 | 61 | .. toctree:: 62 | :maxdepth: 1 63 | 64 | api/pygrametl 65 | api/datasources 66 | api/tables 67 | api/parallel 68 | api/jdbcconnectionwrapper 69 | api/jythonmultiprocessing 70 | api/aggregators 71 | api/steps 72 | api/fifodict 73 | api/drawntabletesting 74 | 75 | .. Prevents the indices from being generated in the LaTeX documentation 76 | .. 
only:: html 77 | 78 | Indices and tables 79 | ================== 80 | 81 | * :ref:`genindex` 82 | * :ref:`modindex` 83 | * :ref:`search` 84 | -------------------------------------------------------------------------------- /pygrametl/drawntabletesting/formattable.py: -------------------------------------------------------------------------------- 1 | """Script that automatically formats a drawn table testing table.""" 2 | 3 | # Copyright (c) 2021, Aalborg University (pygrametl@cs.aau.dk) 4 | # All rights reserved. 5 | 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | 9 | # - Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | 12 | # - Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 27 | 28 | import sys 29 | import pygrametl.drawntabletesting as dtt 30 | 31 | 32 | if len(sys.argv) != 3: 33 | print("usage: " + sys.argv[0] + " file line") 34 | sys.exit(1) 35 | path = sys.argv[1] 36 | point = int(sys.argv[2]) - 1 # Expected to be one-based 37 | 38 | # Extracts the table from the document 39 | with open(path, 'r') as f: 40 | lines = f.readlines() 41 | length = len(lines) 42 | 43 | start = point 44 | while start >= 0 and '|' in lines[start]: 45 | start -= 1 46 | start += 1 # Do not include the header 47 | 48 | end = point 49 | while end < length and '|' in lines[end]: 50 | end += 1 51 | end -= 1 # Do not include the delimiter 52 | 53 | # The table's indentation must be taken into account 54 | table = ''.join(lines[start:end + 1]) 55 | first_char = table.find('|') 56 | last_char = table.rfind('|') 57 | prefix = table[:first_char] 58 | suffix = table[last_char + 1:] 59 | table = table[first_char:last_char + 1] 60 | 61 | # The indentation level must be added for each line 62 | table = dtt.Table('', table, testconnection=object()) 63 | table = str(table).split('\n') 64 | 65 | write = 0 66 | indentation = '\n' + ' ' * first_char 67 | for output in range(start, end): 68 | lines[output] = indentation + table[write] 69 | write += 1 70 | lines[start] = prefix + table[0] 71 | lines[end] = indentation + table[-1] + suffix 72 | 73 | # The file is updated to format the table 74 | with open(path, 'w') as f: 75 | f.writelines(lines) 76 | -------------------------------------------------------------------------------- /tests/utilities.py: -------------------------------------------------------------------------------- 1 | """Shared functionality used by the unit tests""" 2 | 3 | # Copyright (c) 2023, Aalborg University (pygrametl@cs.aau.dk) 4 | # All rights reserved.
5 | 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | 9 | # - Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | 12 | # - Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 27 | import os 28 | import locale 29 | import pygrametl 30 | import pygrametl.drawntabletesting 31 | 32 | def get_os_encoding(): 33 | """Get the OS's encoding in the same manner as open() so they match""" 34 | # https://docs.python.org/3/library/functions.html#open 35 | return locale.getpreferredencoding(False) 36 | 37 | def get_connection(): 38 | """Returns a new connection to the selected test database.""" 39 | 40 | # The unit tests default to SQLite as it has no dependencies 41 | connection_type_and_string = \ 42 | os.environ.get('PYGRAMETL_TEST_CONNECTIONSTRING', 'sqlite://:memory:') 43 | 44 | # Split the connection type from the connection string 45 | connection_type, connection_string = \ 46 | connection_type_and_string.split('://') 47 | 48 | if connection_type == 'sqlite': 49 | return __sqlite3_connection(connection_string) 50 | elif connection_type == 'psycopg2': 51 | return __psycopg2_connection(connection_string) 52 | else: 53 | raise ValueError( 54 | 'Expected sqlite:// or psycopg2:// and a connection string') 55 | 56 | 57 | def ensure_default_connection_wrapper(): 58 | """Ensure the default connection wrapper is ready for the next test.""" 59 | connection_wrapper = pygrametl.getdefaulttargetconnection() 60 | 61 | try: 62 | connection_wrapper.rollback() 63 | except Exception: 64 | # The connection is closed, so a new one is created 66 | connection_wrapper = pygrametl.ConnectionWrapper(get_connection()) 67 | connection_wrapper.setasdefault() 68 | 69 | # The database must be in a known good state before each test 70 | pygrametl.drawntabletesting.Table.clear() 71 | return connection_wrapper 72 | 73 | 74 | def remove_default_connection_wrapper(): 75 | """Ensure there is no default connection wrapper set.""" 76 | pygrametl._defaulttargetconnection = None 77 | 78 | 79 | def __sqlite3_connection(connection_string): 80 | """Create a new sqlite3 connection for use with unit tests.""" 81 | import sqlite3 82 | connection = sqlite3.connect(connection_string) 83 | connection.execute('PRAGMA foreign_keys = ON;')
84 | return connection 85 | 86 | 87 | def __psycopg2_connection(connection_string): 88 | """Create a new psycopg2 connection for use with unit tests.""" 89 | import psycopg2 90 | return psycopg2.connect(connection_string) 91 | -------------------------------------------------------------------------------- /docs/_exts/formatref.py: -------------------------------------------------------------------------------- 1 | """Replaces references to files with a relative reference to a local file when 2 | the documentation is exported to HTML and an absolute reference to the file 3 | on pygrametl.org when the documentation is exported to a PDF. 4 | """ 5 | 6 | # Copyright (c) 2022, Aalborg University (pygrametl@cs.aau.dk) 7 | # All rights reserved. 8 | 9 | # Redistribution and use in source and binary forms, with or without 10 | # modification, are permitted provided that the following conditions are met: 11 | 12 | # - Redistributions of source code must retain the above copyright notice, this 13 | # list of conditions and the following disclaimer. 14 | 15 | # - Redistributions in binary form must reproduce the above copyright notice, 16 | # this list of conditions and the following disclaimer in the documentation 17 | # and/or other materials provided with the distribution. 18 | 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | # DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | from pathlib import Path 31 | from docutils import nodes 32 | from sphinx.util import logging 33 | 34 | def role(name, rawtext, text, lineno, inliner, options={}, content=[]): 35 | # text is the role's input, i.e., the file name followed by the file path in angle brackets 36 | start_of_path = text.index('<') + 1 37 | end_of_path = text.rindex('>') 38 | file_name = text[:start_of_path - 1].strip() 39 | file_path = text[start_of_path:end_of_path] 40 | 41 | # References the file in an appropriate manner for the output format 42 | global sphinx_app 43 | if sphinx_app.builder.format == 'html': 44 | # For HTML :formatref:` <>` links to the local file like ` <>`_ 45 | node = nodes.reference(rawtext, file_name, 46 | refuri=str(file_path), **options) 47 | elif sphinx_app.builder.format == 'latex': 48 | # For PDF :formatref:` <>` links to www.pygrametl.org/doc/ 49 | # Thus, the file path is converted so it is relative to the root of the 50 | # HTML documentation instead of the folder containing the source file 51 | source = get_attribute('source', inliner.document.attlist()) 52 | absolute_file_path = (Path(source).parent / file_path).resolve() 53 | absolute_file_path_parts = absolute_file_path.parts 54 | index_of_docs = absolute_file_path_parts.index('docs') 55 | web_file_path = "/".join(absolute_file_path_parts[index_of_docs + 1:]) 56 | uri = "www.pygrametl.org/doc/%s" % web_file_path 57 | node = nodes.reference(rawtext, file_name, refuri=uri, **options) 58
| else: 59 | raise ValueError("Only HTML and LaTeX are supported") 60 | return [node], [] 61 | 62 | def get_attribute(name, attributes): 63 | for attribute in attributes: 64 | if attribute[0] == name: 65 | return attribute[1] 66 | 67 | def setup(app): 68 | app.add_role('formatref', role) 69 | 70 | # app is saved as the format builder is not available until role() is run 71 | global sphinx_app 72 | sphinx_app = app 73 | 74 | # If multiple formats are produced, e.g., make latexpdf html, Sphinx only 75 | # stores the name of the first format. Also, Sphinx caches the output of 76 | # role, so make clean must always be run before make latexpdf or make html 77 | logger = logging.getLogger(__name__) 78 | logger.warning('make clean must be run before make html and make latexpdf') 79 | -------------------------------------------------------------------------------- /docs/examples/database.rst: -------------------------------------------------------------------------------- 1 | .. _database: 2 | 3 | Database 4 | ======== 5 | Database access in pygrametl is done through either a :PEP:`249` connection if 6 | CPython is used, or a `JDBC `__ 7 | connection if pygrametl is running on Jython. pygrametl provides multiple 8 | abstractions on top of these connections, so using them directly to manipulate 9 | the database should generally not be necessary. Python's :class:`.dict` type is 10 | used as an abstraction for database rows, where the keys are the names of the 11 | columns in the table and the values are the data stored in that row. 12 | 13 | 14 | Connection Wrappers 15 | ------------------- 16 | Multiple connection wrappers are provided by the pygrametl framework to allow 17 | :PEP:`249` connections and `JDBC `__ 18 | connections to be used uniformly, and to allow multiple threads and processes to 19 | use the connection safely.
In addition, the connection wrappers for :PEP:`249` 20 | connections automatically convert from the pyformat parameter style used by 21 | pygrametl to any of the other parameter styles defined in :PEP:`249#paramstyle`. 22 | To simplify the use of database connections, the first connection wrapper 23 | created is set as the default. The default connection wrapper can be used by 24 | abstractions such as :class:`.tables.FactTable` and :class:`.tables.Dimension` 25 | without the user having to pass the connection wrapper to them explicitly. If 26 | another database connection should be used, for example, if data is read from one 27 | database and written to another, a specific connection can be explicitly passed 28 | as an argument to all pygrametl abstractions that can read from and/or write to 29 | a database. 30 | 31 | :class:`.ConnectionWrapper` and 32 | :class:`.JDBCConnectionWrapper.JDBCConnectionWrapper` are the two main 33 | connection wrappers provided by pygrametl. The interface provided by these two 34 | classes is just an abstraction on top of database operations, and provides 35 | methods for, among other things, executing statements, iterating over returned 36 | rows, and committing transactions. Note, however, that these connection wrappers 37 | cannot be used by multiple threads or processes in parallel. To ensure that 38 | database access is performed correctly in a parallel ETL program without 39 | burdening the user with the task, the class 40 | :class:`.parallel.SharedConnectionWrapperClient` is provided. This class can be 41 | created from an existing connection wrapper using the function 42 | :func:`.parallel.shareconnectionwrapper`. Each separate process can then be 43 | given a unique copy of the shared connection to access the database safely in 44 | parallel. For more information about the parallel capabilities of pygrametl see :ref:`parallel`.
45 | 46 | 47 | Experimental Connection Wrappers 48 | -------------------------------- 49 | pygrametl also provides two very experimental connection wrappers: 50 | :class:`.BackgroundConnectionWrapper` and 51 | :class:`.JDBCConnectionWrapper.BackgroundJDBCConnectionWrapper`. They are 52 | provided as alternatives to :class:`.ConnectionWrapper` and 53 | :class:`.JDBCConnectionWrapper.JDBCConnectionWrapper` and perform the database 54 | operations in a separate thread instead of in the same thread as the ETL program. 55 | As they are considered experimental, they are not set as the default upon 56 | creation, and must therefore be set as the default manually using the method 57 | :meth:`setasdefault`, which is available on all connection wrappers, or be 58 | passed around the program manually. 59 | 60 | For most use cases, the classes :class:`.ConnectionWrapper` and 61 | :class:`.JDBCConnectionWrapper.JDBCConnectionWrapper` will likely provide better 62 | performance than the background versions. Furthermore, a connection 63 | wrapper used in a parallel ETL program should always be wrapped using 64 | :func:`.parallel.shareconnectionwrapper` to ensure safe parallel database 65 | access, which itself runs the connection wrapper in a separate process or thread 66 | depending on the implementation. As the two implementations are very similar and 67 | provide an identical interface, either set of implementations might be removed 68 | in a future release. 69 | -------------------------------------------------------------------------------- /pygrametl/aggregators.py: -------------------------------------------------------------------------------- 1 | """A module with classes for aggregation. 2 | An Aggregator has two methods: process and finish. 3 | 4 | process(group, val) is called to "add" val to the aggregation of the set of 5 | values identified by the value of group.
The value in group (which can be 6 | any hashable type, e.g., a tuple such as ('A', 'B')) thus corresponds to the 7 | GROUP BY attributes in SQL. 8 | 9 | finish(group, default) is called to get the final result for group. 10 | If no such result exists, default is returned. 11 | """ 12 | 13 | # Copyright (c) 2011-2020, Aalborg University (pygrametl@cs.aau.dk) 14 | # All rights reserved. 15 | 16 | # Redistribution and use in source and binary forms, with or without 17 | # modification, are permitted provided that the following conditions are met: 18 | 19 | # - Redistributions of source code must retain the above copyright notice, this 20 | # list of conditions and the following disclaimer. 21 | 22 | # - Redistributions in binary form must reproduce the above copyright notice, 23 | # this list of conditions and the following disclaimer in the documentation 24 | # and/or other materials provided with the distribution. 25 | 26 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 29 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 30 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 32 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 33 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 34 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 | 37 | __all__ = ['Aggregator', 'SimpleAggregator', 'Sum', 'Count', 'CountDistinct', 38 | 'Max', 'Min', 'Avg'] 39 | 40 | 41 | class Aggregator(object): 42 | 43 | def process(self, group, val): 44 | raise NotImplementedError() 45 | 46 | def finish(self, group, default=None): 47 | raise NotImplementedError() 48 | 49 | 50 | class SimpleAggregator(Aggregator): 51 | 52 | def __init__(self): 53 | self._results = {} 54 | 55 | def process(self, group, val): 56 | pass 57 | 58 | def finish(self, group, default=None): 59 | return self._results.get(group, default) 60 | 61 | 62 | class Sum(SimpleAggregator): 63 | 64 | def process(self, group, val): 65 | tmp = self._results.get(group, 0) 66 | tmp += val 67 | self._results[group] = tmp 68 | 69 | 70 | class Count(SimpleAggregator): 71 | 72 | def process(self, group, val): 73 | tmp = self._results.get(group, 0) 74 | tmp += 1 75 | self._results[group] = tmp 76 | 77 | 78 | class CountDistinct(SimpleAggregator): 79 | 80 | def process(self, group, val): 81 | if group not in self._results: 82 | self._results[group] = set() 83 | self._results[group].add(val) 84 | 85 | def finish(self, group, default=None): 86 | if group not in self._results: 87 | return default 88 | return len(self._results[group]) 89 | 90 | 91 | class Max(SimpleAggregator): 92 | 93 | def process(self, group, val): 94 | if group not in self._results: 95 | self._results[group] = val 96 | else: 97 | tmp = self._results[group] 98 | if val > tmp: 99 | self._results[group] = val 100 | 101 | 102 | class Min(SimpleAggregator): 103 | 104 | def process(self, group, val): 105 | if group not in self._results: 106 | self._results[group] = val 107 | else: 108 | tmp = self._results[group] 109 | if val < tmp: 110 | self._results[group] = val 111 | 112 | 113 | class Avg(Aggregator): 114 | 115 | def __init__(self): 116 | self.__sum = Sum() 117 | self.__count = Count() 118 | 119 | def process(self, group, val): 120 | self.__sum.process(group, val) 121 | self.__count.process(group, val) 
122 | 123 | def finish(self, group, default=None): 124 | tmp = self.__sum.finish(group, None) 125 | if tmp is None: 126 | return default 127 | else: 128 | return float(tmp) / self.__count.finish(group) 129 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | pygrametl 2 | ========= 3 | |badge1| |badge2| 4 | 5 | .. |badge1| image:: https://github.com/chrthomsen/pygrametl/actions/workflows/python-unittest-on-pr-and-push.yml/badge.svg 6 | :target: https://github.com/chrthomsen/pygrametl/actions 7 | 8 | .. |badge2| image:: https://img.shields.io/pypi/dm/pygrametl?style=flat&label=Downloads 9 | :target: https://pypi.org/project/pygrametl/ 10 | 11 | `pygrametl `_ (pronounced py-gram-e-t-l) is a Python framework that provides functionality commonly used when developing Extract-Transform-Load (ETL) programs. It is fully open-source and released under a 2-clause BSD license. As shown in the figure below, an ETL program that uses pygrametl is a standard Python program that imports pygrametl and uses the abstractions it provides. To provide developers with complete control over the data warehouse's schema, pygrametl assumes that all of the dimension tables and fact tables used in the ETL program have already been created using SQL. 12 | 13 | .. image:: https://pygrametl.org/assets/etl-with-pygrametl.svg 14 | 15 | Defining the data warehouse's schema using SQL and implementing the ETL program itself using standard Python turns out to be very efficient and effective, even when compared to drawing the program in a graphical user interface like Apache Hop or Pentaho Data Integration. pygrametl supports CPython and Jython, so both existing Python code that uses native extension modules and PEP 249 connectors, and JVM-based code that uses JDBC drivers, can be used in the ETL program.
16 | 17 | When using pygrametl, the developer creates an object for each data source, dimension and fact table and operates on rows in the form of standard Python ``dict``\s. Thus, (s)he can easily read rows from a data source using a loop like ``for row in datasource:``, transform the rows using arbitrary Python code like ``row["price"] *= 1.25``, and then add new dimension members to a dimension and facts to a fact table using ``dimension.insert(row)`` and ``facttable.insert(row)``, respectively. This is a very simple example, but pygrametl also supports much more complicated scenarios. For example, it is possible to create a single object for an entire snowflaked dimension. It is then possible to add a new dimension member with a single method call by using ``snowflake.insert(row)``. This will automatically perform all of the necessary lookups and insertions in the tables participating in the snowflaked dimension. pygrametl also supports multiple types of slowly changing dimensions. Again, the programmer only has to invoke a single method: ``slowlychanging.scdensure(row)``. This will perform the needed updates of both type 1 (i.e., overwrites) and type 2 (i.e., adding new versions). 18 | 19 | pygrametl was first made publicly available in 2009. Since then, we have continuously made improvements and added new features. Version 2.8 was released in September 2023. Today, pygrametl is used in production systems in different sectors such as healthcare, finance, and transport. 20 | 21 | Installation 22 | ------------ 23 | pygrametl can be installed from `PyPI `_ with the following command: 24 | 25 | :code:`$ pip install pygrametl` 26 | 27 | The current development version of pygrametl is available on `GitHub `_: 28 | 29 | :code:`$ git clone https://github.com/chrthomsen/pygrametl.git` 30 | 31 | For more information about installation see the `Install Guide `_. 32 | 33 | Documentation 34 | ------------- 35 | The documentation is available in `HTML `_ and as a `PDF `_.
There are also `installation `_ and `beginner `_ guides available. 36 | 37 | In addition to the documentation, multiple papers have been published about pygrametl. The papers are listed `here `_ and provide a more detailed description of the foundational ideas behind pygrametl, but they are naturally not kept up to date with the changes and improvements implemented in the framework; for those, see the documentation. If you use pygrametl in academia, please cite the relevant paper(s). 38 | 39 | Community 40 | --------- 41 | To keep the development of pygrametl open for external participation, we have public mailing lists and use GitHub. Feel free to ask questions and provide all kinds of feedback: 42 | 43 | - `pygrametl-user `_ - For any questions about how to deploy and utilize pygrametl for ETL. 44 | - `pygrametl-dev `_ - For questions and discussion about the development of pygrametl. 45 | - `GitHub `_ - Bugs and patches should be submitted to GitHub as issues and pull requests. 46 | 47 | When asking a question or reporting a possible bug in pygrametl, please first verify that the problem still occurs with the latest version of pygrametl. If the problem persists after updating, please include the following information, preferably with detailed version information, when reporting the problem: 48 | 49 | - Operating System. 50 | - Python Implementation. 51 | - Relational Database Management System. 52 | - Python Database Connector. 53 | - A short description of the problem with a minimal code example that reproduces the problem. 54 | 55 | We encourage the use of GitHub and the mailing lists. For discussions not suitable for a public mailing list, you can, however, send us a private `email `_. 56 | 57 | Maintainers 58 | ----------- 59 | pygrametl is maintained at `Aalborg University `_ by `Christian Thomsen `_ and `Søren Kejser Jensen `_.
60 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^<target^>` where ^<target^> is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished.
The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pygrametl.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pygrametl.qhc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished.
113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 
178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
16 | 
17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
18 | 
19 | help:
20 | @echo "Please use \`make <target>' where <target> is one of"
21 | @echo " html to make standalone HTML files"
22 | @echo " dirhtml to make HTML files named index.html in directories"
23 | @echo " singlehtml to make a single large HTML file"
24 | @echo " pickle to make pickle files"
25 | @echo " json to make JSON files"
26 | @echo " htmlhelp to make HTML files and a HTML help project"
27 | @echo " qthelp to make HTML files and a qthelp project"
28 | @echo " devhelp to make HTML files and a Devhelp project"
29 | @echo " epub to make an epub"
30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
31 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
32 | @echo " text to make text files"
33 | @echo " man to make manual pages"
34 | @echo " texinfo to make Texinfo files"
35 | @echo " info to make Texinfo files and run them through makeinfo"
36 | @echo " gettext to make PO message catalogs"
37 | @echo " changes to make an overview of all changed/added/deprecated items"
38 | @echo " linkcheck to check all external links for integrity"
39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
40 | 
41 | clean:
42 | -rm -rf $(BUILDDIR)/*
43 | 
44 | html:
45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
46 | @echo
47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
48 | 
49 | dirhtml:
50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
51 | @echo
52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
53 | 
54 | singlehtml:
55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
56 | @echo
57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pygrametl.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pygrametl.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pygrametl" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pygrametl" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 
108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 
154 | -------------------------------------------------------------------------------- /docs/examples/jython.rst: --------------------------------------------------------------------------------
1 | .. _jython:
2 | 
3 | Jython
4 | ======
5 | pygrametl supports running ETL flows on Jython, an implementation of Python that
6 | runs on the JVM. Using Jython instead of CPython allows an ETL flow to be
7 | parallelized using multiple threads instead of multiple processes. This is
8 | because Jython does not have a global interpreter lock, which in CPython ensures
9 | that only a single thread is running per process at a given time. For more
10 | information about the GIL, see the Python wiki `GIL
11 | `__.
12 | 
13 | To make switching between CPython and Jython as simple as possible, two
14 | abstractions are provided by pygrametl. Firstly, :mod:`.JDBCConnectionWrapper`
15 | provides two connection wrappers for `JDBC
16 | `__ connections with the same interface as
17 | the connection wrappers for :pep:`249` connections. As the connection wrappers
18 | all share the same interface, the user usually only has to change the connection
19 | type (`JDBC `__ or :pep:`249`) and the
20 | connection wrapper when switching between CPython and Jython. For more
21 | information about database access in pygrametl see :ref:`database`. Secondly,
22 | Jython currently has no support for :mod:`multiprocessing`, as threads are more
23 | lightweight than processes and multiple threads can be run in parallel. So
24 | pygrametl includes the module :mod:`.jythonmultiprocessing`, which wraps Python's
25 | :mod:`threading` module and provides a very small part of Python's
26 | :mod:`multiprocessing` module. Thus, pygrametl exposes the same interface for
27 | creating parallel ETL flows regardless of whether CPython or Jython is used.
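The selection between the two modules can be sketched as a conditional import performed once at startup. Note that this is only a simplified illustration of the idea, not pygrametl's actual implementation; consult the API documentation of :mod:`.parallel` and :mod:`.jythonmultiprocessing` for the exact behavior:

```python
import platform

# Select a multiprocessing-like module once, at import time. On Jython,
# pygrametl's wrapper around threading is used; on CPython, the built-in
# process-based multiprocessing module is used instead.
if platform.python_implementation() == 'Jython':
    import pygrametl.jythonmultiprocessing as multiprocessing
else:
    import multiprocessing

# The rest of the ETL flow can now be written against one interface,
# e.g. multiprocessing.Process(target=...), on both implementations.
```

Because the dispatch happens once at import time, the remainder of the ETL flow does not need to check which Python implementation it is running on.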
28 | 29 | While both Jython and CPython are capable of executing the same language, the 30 | two platforms are implemented differently, so optimizations suitable for one 31 | platform may be less effective on the other. One aspect to be aware of when 32 | running high-performance pygrametl-based ETL flows on Jython is memory 33 | management. For example, Oracle's HotSpot JVM implements a generational garbage 34 | collector that uses a much slower garbage collection strategy for the old 35 | generations than for the young. Thus, allowing too many objects to be promoted 36 | to the old generations can reduce the throughput of an ETL flow significantly. 37 | Unfortunately, this can easily occur if the values controlling caching, such as 38 | :attr:`.Decoupled.batchsize`, are set too high. Similarly, if the value for 39 | :attr:`.Decoupled.batchsize` is set too low the overhead of transferring data 40 | between threads increases as smaller batches are used. Many tools for profiling 41 | programs running on the JVM exist: `JFR 42 | `__ 43 | and `JConsole 44 | `__ 45 | are bundled with the JDK, while tools such as `VisualVM 46 | `__ must be installed separately but often provide 47 | additional functionality. 48 | 49 | Setup 50 | ----- 51 | Using pygrametl with Jython requires an extra step compared to CPython, as 52 | Jython is less integrated with Python's package management system. Firstly, 53 | install pygrametl from `PyPI `__ or by 54 | downloading the development version from `GitHub 55 | `__. For more information about 56 | installing pygrametl for use with CPython see :ref:`install`. 57 | 58 | After pygrametl has been installed, the location it has been installed to must 59 | be added to the environment variable ``JYTHONPATH``, as Jython purposely does 60 | not import modules from CPython by default. The default directory used by 61 | CPython for packages depends on the operating system and whether a package is 62 | installed locally or globally. 
Check the output of the ``pip install`` command
63 | or its log for precise information about where the package has been installed.
64 | The method for setting this variable depends on the operating system. On most
65 | Unix-like systems, the variable can be set in ``~/.profile``, which will be
66 | sourced on login. On Windows, environment variables can be changed through the
67 | System setting in the Control Panel. Python's module search path can also be
68 | extended on a per-program basis by adding a path to :attr:`.sys.path` at the
69 | start of a Python program.
70 | 
71 | Usage
72 | -----
73 | Jython can in most cases be used as a direct replacement for CPython unless its
74 | C API is being used. While Jython does not implement the CPython C API, it can
75 | use libraries implemented in other JVM-based languages like Java, Scala, Clojure,
76 | and Kotlin. To use such libraries, they must be added to the JVM classpath by
77 | using the ``-J-cp`` command-line option. For more information about Jython's
78 | command-line flags, run the command ``jython -h``.
79 | 
80 | .. code-block:: python
81 | 
82 | from pygrametl.tables import FactTable
83 | from pygrametl.JDBCConnectionWrapper import JDBCConnectionWrapper
84 | 
85 | # The Java classes used must be explicitly imported into the program
86 | import java.sql.DriverManager
87 | 
88 | # The actual database connection is handled by a JDBC connection
89 | jconn = java.sql.DriverManager.getConnection(
90 | "jdbc:postgresql://localhost/dw?user=dwuser&password=dwpass")
91 | 
92 | # As PEP 249 and JDBC connections provide different interfaces, it is
93 | # necessary to use a JDBCConnectionWrapper instead of a ConnectionWrapper.
94 | # Both provide the same interface, so pygrametl can execute queries
95 | # without taking into account how the connection is implemented
96 | conn = JDBCConnectionWrapper(jdbcconn=jconn)
97 | 
98 | # This instance of FactTable manages the table "testresults" in the
99 | # database using the default connection wrapper created above
100 | factTable = FactTable(
101 | name='testresults',
102 | measures=['errors'],
103 | keyrefs=['pageid', 'testid', 'dateid'])
104 | 
105 | The above example demonstrates how few changes are needed to port the first
106 | example from :ref:`facttables` from CPython to Jython. The database
107 | connection is changed from a :pep:`249` connection to a `JDBC
108 | `__ connection, and
109 | :class:`.ConnectionWrapper` is changed to
110 | :class:`.JDBCConnectionWrapper.JDBCConnectionWrapper`. The creation of the
111 | :class:`.FactTable` object does not need to be changed to run on Jython, as the
112 | connection wrappers abstract away the differences between `JDBC
113 | `__ and :pep:`249`. The other Jython
114 | module, :mod:`.jythonmultiprocessing`, is even simpler to use, as pygrametl's
115 | parallel module :mod:`.parallel` imports either it or CPython's built-in
116 | :mod:`.multiprocessing` module depending on whether Jython or CPython is used.
117 | -------------------------------------------------------------------------------- /docs/quickstart/install.rst: --------------------------------------------------------------------------------
1 | .. _install:
2 | 
3 | Install Guide
4 | =============
5 | Installing pygrametl is fairly simple, mainly due to the package having no
6 | mandatory dependencies. This guide contains all the information needed to
7 | install and use the package with CPython. pygrametl also supports the JVM-based
8 | Python implementation Jython. For more information about using pygrametl with
9 | Jython, see :ref:`jython`.
10 | 
11 | Installing a Python Implementation
12 | ----------------------------------
13 | pygrametl requires an implementation of the Python programming language to run.
14 | Currently, pygrametl officially supports the following implementations (other
15 | implementations like `PyPy `__ and `IronPython
16 | `__ might also work):
17 | 
18 | * `Jython `__, version 2.7 or above
19 | * `Python 2 `__, version 2.7 or above
20 | * `Python 3 `__, version 3.4 or above
21 | 
22 | .. warning::
23 | As Python 2 is no longer being `maintained
24 | `_, support for it will slowly
25 | be reduced as we continue to develop pygrametl. Currently, :mod:`.dttr` is
26 | the only pygrametl module that requires Python 3 (version 3.4 or above).
27 | 
28 | After a Python implementation has been installed and added to the system's
29 | path, it can be run from either the command prompt in Windows or the shell in
30 | Unix-like systems. This should launch the Python interpreter in interactive
31 | mode, allowing commands to be executed directly on the command line. ::
32 | 
33 | Python 3.9.2 (default, Feb 20 2021, 18:40:11)
34 | [GCC 10.2.0] on linux
35 | Type "help", "copyright", "credits" or "license" for more information.
36 | >>>
37 | 
38 | 
39 | Installing pygrametl
40 | --------------------
41 | pygrametl can either be installed from `PyPI
42 | `__ using a package manager, such as
43 | `pip `__ or `conda `__, or by
44 | manually checking out the latest development version from the official `GitHub
45 | repository `__. Installing pygrametl
46 | from `PyPI `__ is currently the
47 | simplest way to install it, as the process is automated by the package
48 | manager. Bug fixes and new experimental features are, of course,
49 | available first in the `GitHub repository
50 | `__.
51 | 
52 | Install from PyPI with pip
53 | ##########################
54 | pip can install pygrametl to the Python implementation's global package
55 | directory, or to the user's local package directory, which is usually located in
56 | the user's home directory. Installing pygrametl globally will often require root
57 | or administrator privileges, with the advantage that the package will be
58 | available to all users of that system. Installing it locally will only make it
59 | available to the current user, but the installation can be performed without
60 | additional privileges. The two types of installation can be performed using one
61 | of the following commands: ::
62 | 
63 | # Install pygrametl to the global package directory
64 | $ pip install pygrametl
65 | 
66 | # Install pygrametl to the user's local package directory
67 | $ pip install pygrametl --user
68 | 
69 | Install from PyPI with conda
70 | ############################
71 | conda is an alternative package manager for Python. It is bundled with the
72 | `Anaconda `__ CPython distribution
73 | from `Anaconda, Inc `__. There is no official
74 | pygrametl conda package, as conda uses a different package format than pip. It
75 | is, however, trivial to download, convert, and install the PyPI package using
76 | conda with only a few commands. ::
77 | 
78 | # Create a template for the conda package using the PyPI package
79 | $ conda skeleton pypi pygrametl
80 | 
81 | # Build the conda package
82 | $ conda build pygrametl/meta.yaml
83 | 
84 | # Install the conda package
85 | $ conda install --use-local pygrametl
86 | 
87 | Afterward, the folder containing the package template can be deleted, as it is
88 | only used for building the package.
89 | 
90 | Install from GitHub
91 | ###################
92 | The latest development version of pygrametl can be downloaded from the official
93 | `GitHub repository `__.
The project
94 | currently uses Git for version control, so the repository can be cloned using
95 | the following command. ::
96 | 
97 | # Clone the pygrametl repository from GitHub
98 | $ git clone https://github.com/chrthomsen/pygrametl.git
99 | 
100 | Before Python can import the modules, the pygrametl package must be added to
101 | :attr:`.sys.path`. This can be done manually in your Python programs, by setting
102 | ``PYTHONPATH`` if CPython is used, or by setting ``JYTHONPATH`` if Jython is
103 | used. More information about how `CPython
104 | `__ and
105 | `Jython
106 | `__
107 | locate modules can be found in the two links provided.
108 | 
109 | Verifying Installation
110 | ----------------------
111 | A simple way to verify that pygrametl has been installed correctly and is
112 | accessible to the Python interpreter is to start the interpreter in
113 | interactive mode from the command line and run the command ``import
114 | pygrametl``. ::
115 | 
116 | Python 3.9.2 (default, Feb 20 2021, 18:40:11)
117 | [GCC 10.2.0] on linux
118 | Type "help", "copyright", "credits" or "license" for more information.
119 | >>> import pygrametl
120 | >>>
121 | 
122 | If this fails with the message ``ImportError: No module named pygrametl``, then
123 | verify that the install location of the package is included in the
124 | environment variable ``PYTHONPATH`` if CPython is used, or in the environment
125 | variable ``JYTHONPATH`` if Jython is used. By including the location of
126 | pygrametl in these variables, it is available to all instances of that Python
127 | implementation just like any built-in Python package. As an alternative, the
128 | path to pygrametl can be set on a program-to-program basis by adding the path
129 | of pygrametl to :attr:`.sys.path` before importing the package in your code.
130 | 
131 | ..
code-block:: python
132 | 
133 | # The path to the pygrametl package is added to the path used by the Python
134 | # interpreter when modules are being imported; this must be done in every
135 | # program using a module not included in the default Python path
136 | import sys
137 | sys.path.append('/path/to/pygrametl')
138 | 
139 | # After the folder is added to Python's path, pygrametl can be imported
140 | import pygrametl
141 | -------------------------------------------------------------------------------- /tests/test_datasources.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2023, Aalborg University (pygrametl@cs.aau.dk)
2 | # All rights reserved.
3 | 
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | 
7 | # - Redistributions of source code must retain the above copyright notice, this
8 | # list of conditions and the following disclaimer.
9 | 
10 | # - Redistributions in binary form must reproduce the above copyright notice,
11 | # this list of conditions and the following disclaimer in the documentation
12 | # and/or other materials provided with the distribution.
13 | 
14 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | # DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 
25 | import sqlite3
26 | import unittest
27 | 
28 | import pygrametl
29 | from pygrametl.datasources import MappingSource, SQLTransformingSource
30 | from tests import utilities
31 | 
32 | 
33 | class MappingSourceTest(unittest.TestCase):
34 | @classmethod
35 | def setUpClass(cls):
36 | # Ensure other tests do not affect these tests
37 | utilities.remove_default_connection_wrapper()
38 | 
39 | def setUp(self):
40 | self.input_list = [
41 | { "id": 1, "title": "Unknown", "genre": "Unknown" },
42 | { "id": 2, "title": "Nineteen Eighty-Four", "genre": "Novel" },
43 | { "id": 3, "title": "Calvin and Hobbes One", "genre": "Comic" },
44 | { "id": 4, "title": "Calvin and Hobbes Two", "genre": "Comic" },
45 | { "id": 5, "title": "The Silver Spoon", "genre": "Cookbook" }
46 | ]
47 | 
48 | def test_mapping_single_callable(self):
49 | source = MappingSource(iter(self.input_list), {
50 | "id": lambda x: x + 1
51 | })
52 | expected = [
53 | { "id": 2, "title": "Unknown", "genre": "Unknown" },
54 | { "id": 3, "title": "Nineteen Eighty-Four", "genre": "Novel" },
55 | { "id": 4, "title": "Calvin and Hobbes One", "genre": "Comic" },
56 | { "id": 5, "title": "Calvin and Hobbes Two", "genre": "Comic" },
57 | { "id": 6, "title": "The Silver Spoon", "genre": "Cookbook" }
58 | ]
59 | 
60 | self.assertIsNone(pygrametl.getdefaulttargetconnection())
61 | self.assertEqual(expected, list(source))
62 | 
63 | def test_mapping_two_callables(self):
64 | source = MappingSource(iter(self.input_list), {
65 | "id": lambda x: x + 1,
66 | "genre": lambda x: x[0]
67 | })
68 | expected = [
69 | { "id": 2, "title": "Unknown", "genre": "U" },
70 | { "id": 3, "title": "Nineteen Eighty-Four", "genre": "N" },
71 | { "id": 4, "title": "Calvin and Hobbes One", "genre": "C" },
72 | { "id": 5, "title": "Calvin and Hobbes Two", "genre": "C" },
73 | { "id": 6, "title": "The Silver Spoon", "genre": "C" }
74 | ]
75 | 
76 | self.assertIsNone(pygrametl.getdefaulttargetconnection())
77 | self.assertEqual(expected, list(source))
78 | 
79 | 
80 | 
81 | class SQLTransformationSourceTest(unittest.TestCase):
82 | 
83 | @classmethod
84 | def setUpClass(cls):
85 | cls.input_list = [
86 | { "id": 1, "title": "Unknown", "genre": "Unknown" },
87 | { "id": 2, "title": "Nineteen Eighty-Four", "genre": "Novel" },
88 | { "id": 3, "title": "Calvin and Hobbes One", "genre": "Comic" },
89 | { "id": 4, "title": "Calvin and Hobbes Two", "genre": "Comic" },
90 | { "id": 5, "title": "The Silver Spoon", "genre": "Cookbook" }
91 | ]
92 | 
93 | cls.expected_group_by_genre = [
94 | {"genre": "Comic", "COUNT(title)": 2},
95 | {"genre": "Cookbook", "COUNT(title)": 1},
96 | {"genre": "Novel", "COUNT(title)": 1},
97 | {"genre": "Unknown", "COUNT(title)": 1}
98 | ]
99 | 
100 | # Ensure other tests do not affect these tests
101 | utilities.remove_default_connection_wrapper()
102 | 
103 | def test_transform(self):
104 | source = SQLTransformingSource(
105 | iter(self.input_list), "book",
106 | "SELECT genre, COUNT(title) FROM book GROUP BY genre")
107 | 
108 | self.assertIsNone(pygrametl.getdefaulttargetconnection())
109 | self.assertEqual(self.expected_group_by_genre, list(source))
110 | 
111 | def test_transform_with_batch_size_of_one(self):
112 | source = SQLTransformingSource(
113 | iter(self.input_list), "book",
114 | "SELECT genre, COUNT(title) FROM book GROUP BY genre",
115 | batchsize=1)
116 | 
117 | self.assertIsNone(pygrametl.getdefaulttargetconnection())
118 | 
self.assertEqual(self.expected_group_by_genre, list(source)) 119 | 120 | def test_transform_with_batch_size_of_one_and_perbatch(self): 121 | expected_group_by_genre_per_batch = [ 122 | {"genre": "Unknown", "COUNT(title)": 1}, 123 | {"genre": "Novel", "COUNT(title)": 1}, 124 | {"genre": "Comic", "COUNT(title)": 1}, 125 | {"genre": "Comic", "COUNT(title)": 1}, 126 | {"genre": "Cookbook", "COUNT(title)": 1} 127 | ] 128 | 129 | source = SQLTransformingSource( 130 | iter(self.input_list), "book", 131 | "SELECT genre, COUNT(title) FROM book GROUP BY genre", 132 | batchsize=1, perbatch=True) 133 | 134 | self.assertIsNone(pygrametl.getdefaulttargetconnection()) 135 | self.assertEqual(expected_group_by_genre_per_batch, list(source)) 136 | 137 | def test_transform_with_renamed_columns(self): 138 | expected_group_by_genre_renamed = [ 139 | {"genre": "Comic", "count": 2}, 140 | {"genre": "Cookbook", "count": 1}, 141 | {"genre": "Novel", "count": 1}, 142 | {"genre": "Unknown", "count": 1} 143 | ] 144 | 145 | source = SQLTransformingSource( 146 | iter(self.input_list), "book", 147 | "SELECT genre, COUNT(title) FROM book GROUP BY genre", 148 | columnnames=["genre", "count"]) 149 | 150 | self.assertIsNone(pygrametl.getdefaulttargetconnection()) 151 | self.assertEqual(expected_group_by_genre_renamed, list(source)) 152 | 153 | def test_transform_with_pep_connection(self): 154 | source = SQLTransformingSource( 155 | iter(self.input_list), "book", 156 | "SELECT genre, COUNT(title) FROM book GROUP BY genre", 157 | targetconnection=sqlite3.connect(":memory:")) 158 | 159 | self.assertIsNone(pygrametl.getdefaulttargetconnection()) 160 | self.assertEqual(self.expected_group_by_genre, list(source)) 161 | 162 | def test_transform_with_connection_wrapper(self): 163 | source = SQLTransformingSource( 164 | iter(self.input_list), "book", 165 | "SELECT genre, COUNT(title) FROM book GROUP BY genre", 166 | targetconnection=utilities.ensure_default_connection_wrapper()) 167 | 168 | # Ensure this 
test does not affect the other tests even if it fails
169 | utilities.remove_default_connection_wrapper()
170 | self.assertEqual(self.expected_group_by_genre, list(source))
171 | -------------------------------------------------------------------------------- /docs/examples/bulkloading.rst: --------------------------------------------------------------------------------
1 | .. _bulkloading:
2 | 
3 | Bulk Loading
4 | ============
5 | Bulk loading rows instead of inserting them one at a time can dramatically
6 | increase the throughput of an ETL program. Bulk loading works by loading data
7 | from a temporary file into the database. The actual process of bulk loading is
8 | unfortunately different for each RDBMS. Because of this, a user-defined function
9 | must be created that uses the functionality provided by a particular RDBMS to
10 | bulk load the data from a file. The following is a list of example functions
11 | showing how bulk loading can be performed for some of the more commonly used
12 | RDBMSs.
13 | 
14 | Currently, three classes in pygrametl use bulk loading: :class:`.BulkDimension`,
15 | :class:`.CachedBulkDimension`, and :class:`.BulkFactTable`. Thus, a function that
16 | can bulk load data from a file into the specific RDBMS used for the data
17 | warehouse must be passed to each of these classes' constructors. The function
18 | must have the following signature:
19 | 
20 | .. py:function:: func(name, attributes, fieldsep, rowsep, nullval, filehandle):
21 | 
22 | Required signature of a function bulk loading data from a file into an RDBMS
23 | in pygrametl.
24 | 
25 | :param name: The name of the table in the data warehouse.
26 | :param attributes: A list containing the sequence of attributes in the table.
27 | :param fieldsep: The string used to separate fields in the temporary file.
28 | :param rowsep: The string used to separate rows in the temporary file.
29 | :param nullval: If the class was passed a string to substitute None values with,
30 | that string is passed; otherwise, None is passed.
31 | :param filehandle: Either the name of the file or the file object itself,
32 | depending upon the value of the member :attr:`.usefilename` on the class.
33 | 
34 | PostgreSQL
35 | ----------
36 | For PostgreSQL the `copy_expert
37 | `__ method from
38 | psycopg2 can be used:
39 | 
40 | .. code-block:: python
41 | 
42 | # psycopg2
43 | def pgbulkloader(name, attributes, fieldsep, rowsep, nullval, filehandle):
44 | global connection
45 | cursor = connection.cursor()
46 | cursor.copy_expert(
47 | f"COPY {name} ({','.join(attributes)}) FROM STDIN WITH (FORMAT csv, DELIMITER '{fieldsep}', NULL '{nullval}');",
48 | filehandle
49 | )
50 | 
51 | If Jython is used, the `copyIn
52 | `__
53 | method in JDBC's :class:`CopyManager` class can be used:
54 | 
55 | .. code-block:: python
56 | 
57 | # JDBC
58 | def pgcopybulkloader(name, attributes, fieldsep, rowsep, nullval, filehandle):
59 | global pgconnection
60 | copymgr = pgconnection.getCopyAPI()
61 | sql = "COPY %s(%s) FROM STDIN WITH DELIMITER '%s'" % \
62 | (name, ', '.join(attributes), fieldsep)
63 | copymgr.copyIn(sql, filehandle)
64 | 
65 | MySQL
66 | -----
67 | For MySQL the `LOAD DATA INFILE
68 | `__ functionality
69 | provided by MySQL's SQL dialect can be used.
70 | 
71 | .. code-block:: python
72 | 
73 | # MySQLdb
74 | def mysqlbulkloader(name, attributes, fieldsep, rowsep, nullval, filehandle):
75 | global connection
76 | cursor = connection.cursor()
77 | sql = "LOAD DATA LOCAL INFILE '%s' INTO TABLE %s FIELDS TERMINATED BY '%s' LINES TERMINATED BY '%s' (%s);" % \
78 | (filehandle, name, fieldsep, rowsep, ', '.join(attributes))
79 | cursor.execute(sql)
80 | 
81 | Oracle
82 | ------
83 | Oracle supports two methods for bulk loading from text files: SQL Loader and
84 | External Tables.
The following example uses SQL Loader, as it does not require
85 | the creation of an additional table, which is problematic to do in a bulk
86 | loading function as the data types of each column must be specified.
87 | 
88 | SQL Loader is part of Oracle's client package. SQL Loader requires all
89 | configuration and data files to have specific suffixes, so a file must be
90 | created with the suffix .dat and passed to any bulk loading table as
91 | :attr:`.tempdest`.
92 | 
93 | .. code-block:: python
94 | 
95 | with tempfile.NamedTemporaryFile(suffix=".dat") as dat_handle:
96 | BulkDimension(
97 | ...
98 | tempdest=dat_handle)
99 | 
100 | 
101 | The bulk loading function shown below constructs a control file with the .ctl
102 | suffix using the function's arguments. SQL Loader is then executed (sqlldr
103 | must be in the system path) and passed the constructed .ctl file.
104 | 
105 | .. code-block:: python
106 | 
107 | # cx_Oracle or JDBC
108 | def oraclebulkloader(name, attributes, fieldsep, rowsep, nullval, filehandle):
109 | 
110 | # The configuration file used by SQL Loader must use the suffix .ctl
111 | with tempfile.NamedTemporaryFile(mode="w", suffix=".ctl") as ctl_handle:
112 | 
113 | # The attributes to be loaded are listed in parentheses without quotes
114 | unquoted_atts = str(tuple(attributes)).replace("'", "")
115 | ctl_contents = """
116 | LOAD DATA INFILE '%s' "str %r"
117 | APPEND INTO TABLE %s
118 | FIELDS TERMINATED BY %r
119 | %s
120 | """ % (filehandle.name, rowsep, name, fieldsep, unquoted_atts)
121 | 
122 | # Strips the multi-line string of unnecessary indentation, and ensures
123 | # that the contents are written to the file by flushing it
124 | ctl_contents = textwrap.dedent(ctl_contents).lstrip()
125 | ctl_handle.write(ctl_contents)
126 | ctl_handle.flush()
127 | 
128 | # Bulk loads the data using Oracle's SQL Loader. As a new connection
129 | # is created, the same username, password, etc.
must be given again
130 | os.system("sqlldr username/password@ip:port/sid control=" +
131 | str(ctl_handle.name))
132 | 
133 | 
134 | Microsoft SQL Server
135 | --------------------
136 | For Microsoft SQL Server the `BULK INSERT
137 | `__ functionality
138 | provided by Transact-SQL can be used.
139 | 
140 | There are a number of things to be aware of when using pygrametl with SQL
141 | Server. If the file used for bulk loading is located on a machine running
142 | Microsoft Windows, the file must be copied before bulk loading, as the locks
143 | placed on the file by the OS and pygrametl prevent SQL Server from opening it
144 | directly. Copying the file can be done, e.g., using `shutil.copyfile
145 | `__.
146 | 
147 | By default, BULK INSERT ignores column names, so the number and order of columns
148 | must match the table you are inserting into. This can be overcome by adding a
149 | `format file `__. In
150 | this case, we create a `non-XML format file
151 | `__.
152 | 
153 | A simple example of bulk loading in SQL Server along with the creation of a
154 | format file can be seen below:
155 | 
156 | .. code-block:: python
157 | 
158 | def sqlserverbulkloader(name, attributes, fieldsep, rowsep, nullval, filehandle):
159 | global msconn
160 | cursor = msconn.cursor()
161 | 
162 | # Copy the tempdest
163 | shutil.copyfile(filehandle, r'd:\dw\tmpfilecopy')
164 | 
165 | # Create format file
166 | fmt = open(r'd:\dw\format.fmt', 'w+')
167 | # 12.0 corresponds to the version of the bcp utility being used by SQL Server.
168 | # For more information, see the above link on non-XML format files.
169 | fmt.write("12.0\r\n%d\r\n" % len(attributes))
170 | count = 0
171 | sep = "\\t"
172 | for a in attributes:
173 | count += 1
174 | if count == len(attributes): sep = "\\n"
175 | # For information regarding the format values,
176 | # see the above link on non-XML format files.
177 |             fmt.write('%d SQLCHAR 0 8000 "%s" %d %s "Latin1_General_100_CI_AS_SC"\r\n' % (count, sep, count, a))
178 |         fmt.close()
179 |
180 |         sql = "BULK INSERT %s FROM '%s' WITH (FORMATFILE = '%s', FIELDTERMINATOR = '%s', ROWTERMINATOR = '%s')" % \
181 |             (name, r'd:\dw\tmpfilecopy', r'd:\dw\format.fmt', fieldsep, rowsep)
182 |         cursor.execute(sql)
183 |
--------------------------------------------------------------------------------
/docs/_exts/autoformat.py:
--------------------------------------------------------------------------------
1 | """Automatic addition of additional markup to the doc strings used by pygrametl,
2 | which should allow them to be readable in the source code and in the
3 | documentation after Sphinx has processed them.
4 | """
5 |
6 | # Copyright (c) 2014-2020, Aalborg University (pygrametl@cs.aau.dk)
7 | # All rights reserved.
8 |
9 | # Redistribution and use in source and binary forms, with or without
10 | # modification, are permitted provided that the following conditions are met:
11 |
12 | # - Redistributions of source code must retain the above copyright notice, this
13 | #   list of conditions and the following disclaimer.
14 |
15 | # - Redistributions in binary form must reproduce the above copyright notice,
16 | #   this list of conditions and the following disclaimer in the documentation
17 | #   and/or other materials provided with the distribution.
18 |
19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | # DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |
30 | import re
31 | import sys
32 |
33 |
34 | def correct_docstring(app, what, name, obj, options, lines):
35 |     """Makes some corrections to the markup; this keeps it readable in
36 |     the source files while having the output formatted by Sphinx.
37 |     """
38 |
39 |     # Iterates by index as the list of lines is mutated in place
40 |     for index, value in enumerate(lines):
41 |
42 |         # Adds additional backslashes to keep escape sequences as text
43 |         if '\\t' in value or '\\n' in value:
44 |             lines[index] = lines[index].replace("\\", "\\\\")
45 |
46 |         # Escapes * in argument descriptions to stop Sphinx using them as
47 |         # markup
48 |         if '*' in value:
49 |             lines[index] = escape_star(value)
50 |
51 |         # Formats the arguments header with bold and a newline
52 |         if value == 'Arguments:' or value == 'Keyword arguments:':
53 |             lines[index] = '**' + value + '**'
54 |             lines.insert(index + 1, '')
55 |
56 |
57 | def escape_star(line):
58 |     """Escape all unmatched stars (*) so Sphinx knows they aren't markup"""
59 |     line_split = line.split()
60 |
61 |     for index, value in enumerate(line_split):
62 |         # A star is only treated as markup if it is at the end of a word
63 |         if not value.endswith('*'):
64 |             line_split[index] = line_split[index].replace("*", "\\*")
65 |
66 |     return ' '.join(line_split)
67 |
68 |
69 | def correct_signature(app, what, name, obj, options, signature,
70 |                       return_annotation):
71 |     """Makes some
corrections to the markup to prevent Sphinx from using escape
72 |     sequences instead of just printing them"""
73 |
74 |     # Returns if the signature is empty, instead of doing None checks everywhere
75 |     if not signature:
76 |         return (signature, return_annotation)
77 |
78 |     # Adds additional backslashes to keep escape sequences as text
79 |     if '\\t' in signature or '\\n' in signature:
80 |         signature = signature.replace("\\", "\\\\")
81 |
82 |     # Removes the address added by Sphinx if a function pointer has defaults
83 |     if " 0: 55 |             raise ValueError("size must be positive")
56 |         if finalizer is not None and not callable(finalizer):
57 |             raise TypeError("finalizer must be None or a callable")
58 |
59 |         self.__size = size
60 |         self.__data = {}
61 |         self.__order = deque()
62 |         self.__finalizer = finalizer
63 |
64 |     def add(self, key, val):
65 |         """Add a key/value pair to the dict.
66 |
67 |         If a pair p with the same key already exists, p is replaced by the
68 |         new pair n, but n gets p's position in the FIFO dict and is deleted
69 |         when the old pair p would have been deleted. When the maximum
70 |         capacity is reached, the pair with the oldest key is deleted
71 |         from the dict.
72 |
73 |         The argument key is the key and the argument val is the value.
74 |         """
75 |         if key in self.__data:
76 |             self.__data[key] = val  # Replace old value
77 |         elif len(self.__order) < self.__size:
78 |             # The dict is not full yet. Just add the new pair.
79 |             self.__order.append(key)
80 |             self.__data[key] = val
81 |         else:
82 |             # The dict is full. We have to delete the oldest item first.
83 |             delKey = self.__order.popleft()
84 |             if self.__finalizer:
85 |                 self.__finalizer(delKey, self.__data[delKey])
86 |             del self.__data[delKey]
87 |             self.__order.append(key)
88 |             self.__data[key] = val
89 |
90 |     def get(self, key, default=None):
91 |         """Find and return the element a given key maps to.
92 |
93 |         Look for the given key in the dict and return the associated value
94 |         if found.
If not found, the value of default is returned.
95 |         """
96 |         return self.__data.get(key, default)
97 |
98 |     def clear(self):
99 |         """Delete all key/value pairs from the dict"""
100 |         self.__data = {}
101 |         self.__order = deque()  # Must be a deque as add() calls popleft()
102 |
103 |     def __setitem__(self, key, item):
104 |         self.add(key, item)
105 |
106 |     def __getitem__(self, key):
107 |         return self.__data[key]
108 |
109 |     def __len__(self):
110 |         return len(self.__data)
111 |
112 |     def __str__(self):
113 |         allitems = []
114 |         for key in self.__order:
115 |             val = self.__data[key]
116 |             item = "%s: %s" % (str(key), str(val))
117 |             allitems.append(item)
118 |         return "{%s}" % ", ".join(allitems)
119 |
120 |     def __contains__(self, item):
121 |         return (item in self.__data)
122 |
123 |     def __delitem__(self, item):
124 |         if item not in self.__data:
125 |             raise KeyError(item)
126 |
127 |         del self.__data[item]
128 |         self.__order.remove(item)
129 |
130 |     def __iter__(self):
131 |         for k in self.__order:
132 |             yield k
133 |
134 |
135 | class FIFODictOrderedDict:
136 |
137 |     """A simple FIFO mapping between keys and values. Implemented using an
138 |     OrderedDict. When the max. capacity is reached, the key/value pair that
139 |     has been in the dict the longest time is removed.
140 |     """
141 |
142 |     def __init__(self, size, finalizer=None):
143 |         """Create a FIFODictOrderedDict with the given maximum size.
144 |
145 |         Arguments:
146 |
147 |         - size: Determines the maximum size of the dict.
148 |         - finalizer: If finalizer is given, it must be a callable
149 |           f(key, value). It is then called when an item is removed due to
150 |           the size of the dict reaching the maximum (finalizer is NOT called
151 |           when an item is explicitly deleted with del d[key] or when the
152 |           dict is cleared).
153 | """ 154 | if not isinstance(size, type(0)): 155 | raise TypeError("size must be an int") 156 | if not size > 0: 157 | raise ValueError("size must be positive") 158 | if finalizer is not None and not callable(finalizer): 159 | raise TypeError("finalizer must be None or a callable") 160 | 161 | self.__size = size 162 | self.__data = OrderedDict() 163 | self.__finalizer = finalizer 164 | 165 | def add(self, key, val): 166 | """Add a key/value pair to the dict. 167 | 168 | If a pair p with the same key already exists, p is replaced by the 169 | new pair n, but n gets p's position in the FIFO dict and is deleted 170 | when the old pair p would have been deleted. When the maximum 171 | capacity is reached, the pair with the oldest key is deleted 172 | from the dict. 173 | 174 | The argument key is the key and the argument val is the value. 175 | """ 176 | if key in self.__data: 177 | self.__data[key] = val # Replace old value 178 | elif len(self.__data) < self.__size: 179 | # The dict is not full yet. Just add the new pair. 180 | self.__data[key] = val 181 | else: 182 | # The dict is full. We have to delete the oldest item first. 183 | if self.__finalizer: 184 | (delKey, delValue) = self.__data.popitem(last=False) 185 | self.__finalizer(delKey, delValue) 186 | else: 187 | self.__data.popitem(last=False) 188 | self.__data[key] = val 189 | 190 | def get(self, key, default=None): 191 | """Find and return the element a given key maps to. 192 | 193 | Look for the given key in the dict and return the associated value 194 | if found. If not found, the value of default is returned. 
195 |         """
196 |         return self.__data.get(key, default)
197 |
198 |     def clear(self):
199 |         """Delete all key/value pairs from the dict"""
200 |         self.__data = OrderedDict()
201 |
202 |     def __setitem__(self, key, item):
203 |         self.add(key, item)
204 |
205 |     def __getitem__(self, key):
206 |         return self.__data[key]
207 |
208 |     def __len__(self):
209 |         return len(self.__data)
210 |
211 |     def __str__(self):
212 |         allitems = []
213 |         for key in self.__data:
214 |             val = self.__data[key]
215 |             item = "%s: %s" % (str(key), str(val))
216 |             allitems.append(item)
217 |         return "{%s}" % ", ".join(allitems)
218 |
219 |     def __contains__(self, item):
220 |         return (item in self.__data)
221 |
222 |     def __delitem__(self, item):
223 |         if item not in self.__data:
224 |             raise KeyError(item)
225 |
226 |         del self.__data[item]
227 |
228 |     def __iter__(self):
229 |         for k in self.__data:
230 |             yield k
231 |
232 |
233 | # Exports the most appropriate version of FIFODict based on whether
234 | # OrderedDict is available in the version of Python used.
235 | try:
236 |     from collections import OrderedDict
237 |     FIFODict = FIFODictOrderedDict
238 | except ImportError:
239 |     from collections import deque
240 |     FIFODict = FIFODictDeque
241 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # pygrametl documentation build configuration file, created by
4 | # sphinx-quickstart2 on Thu Nov 14 21:04:01 2013.
5 | #
6 | # This file is execfile()d with the current directory set to its containing dir.
7 | #
8 | # Note that not all possible configuration values are present in this
9 | # autogenerated file.
10 | #
11 | # All configuration values have a default; values that are commented out
12 | # serve to show the default.
13 | 14 | 15 | # -- Interpreter configuration ------------------------------------------- 16 | 17 | from os.path import abspath, dirname, join 18 | import sys 19 | import os 20 | 21 | # Make sure we get this copy of pygrametl, and not one on the system 22 | sys.path.insert(1, dirname(dirname(abspath(__file__)))) 23 | import pygrametl 24 | 25 | # If extensions (or modules to document with autodoc) are in another directory, 26 | # add these directories to sys.path here. If the directory is relative to the 27 | # documentation root, use os.path.abspath to make it absolute, like shown here. 28 | sys.path.insert(0, os.path.abspath('_exts')) 29 | 30 | # Prevents the creation of .pyc and .pyo by the Python interpreter 31 | sys.dont_write_bytecode = True 32 | 33 | # Creates mockups of Java code used in the Jython modules used in the framework 34 | import rtdmockup 35 | rtdmockup.mockModules(['pygrametl.jythonsupport', 'java', 'java.sql']) 36 | 37 | # -- General configuration ----------------------------------------------- 38 | 39 | # If your documentation needs a minimal Sphinx version, state it here. 40 | #needs_sphinx = '1.0' 41 | 42 | # Add any Sphinx extension module names here, as strings. They can be extensions 43 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 44 | extensions = ['sphinx.ext.autodoc', 'autoformat', 'formatref'] 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ['_templates'] 48 | 49 | # The suffix of source filenames. 50 | source_suffix = '.rst' 51 | 52 | # The encoding of source files. 53 | #source_encoding = 'utf-8-sig' 54 | 55 | # The master toctree document. 56 | master_doc = 'index' 57 | 58 | # General information about the project. 
59 | project = u'pygrametl' 60 | copyright = u'2009 - 2023, Aalborg University' 61 | 62 | # The version info for the project you're documenting, acts as replacement for 63 | # |version| and |release|, also used in various other places throughout the 64 | # built documents. 65 | # 66 | # The short X.Y version. 67 | version = pygrametl.__version__ 68 | # The full version, including alpha/beta/rc tags. 69 | release = version 70 | 71 | # The language for content autogenerated by Sphinx. Refer to documentation 72 | # for a list of supported languages. 73 | #language = None 74 | 75 | # There are two options for replacing |today|: either, you set today to some 76 | # non-false value, then it is used: 77 | #today = '' 78 | # Else, today_fmt is used as the format for a strftime call. 79 | #today_fmt = '%B %d, %Y' 80 | 81 | # List of patterns, relative to source directory, that match files and 82 | # directories to ignore when looking for source files. 83 | exclude_patterns = ['_build', '_exts'] 84 | 85 | # The reST default role (used for this markup: `text`) to use for all documents. 86 | #default_role = None 87 | 88 | # If true, '()' will be appended to :func: etc. cross-reference text. 89 | #add_function_parentheses = True 90 | 91 | # If true, the current module name will be prepended to all description 92 | # unit titles (such as .. function::). 93 | #add_module_names = True 94 | 95 | # If true, sectionauthor and moduleauthor directives will be shown in the 96 | # output. They are ignored by default. 97 | #show_authors = False 98 | 99 | # The name of the Pygments (syntax highlighting) style to use. 100 | pygments_style = 'sphinx' 101 | 102 | # A list of ignored prefixes for module index sorting. 103 | #modindex_common_prefix = [] 104 | 105 | 106 | # -- Options for Autodoc ------------------------------------------------- 107 | 108 | # Both the class’ and the __init__ method’s docstring are concatenated and 109 | # inserted. 
110 | autoclass_content = 'both' 111 | 112 | # -- Options for HTML output --------------------------------------------- 113 | 114 | # The theme to use for HTML and HTML Help pages. See the documentation for 115 | # a list of builtin themes. 116 | html_theme = 'alabaster' 117 | 118 | # Theme options are theme-specific and customize the look and feel of a theme 119 | # further. For a list of options available for each theme, see the 120 | # documentation. 121 | html_theme_options = {'page_width': '1076px'} 122 | 123 | 124 | # Add any paths that contain custom themes here, relative to this directory. 125 | #html_theme_path = [] 126 | 127 | # The name for this set of Sphinx documents. If None, it defaults to 128 | # " v documentation". 129 | #html_title = None 130 | 131 | # A shorter title for the navigation bar. Default is the same as html_title. 132 | #html_short_title = None 133 | 134 | # The name of an image file (relative to this directory) to place at the top 135 | # of the sidebar. 136 | #html_logo = None 137 | 138 | # The name of an image file (within the static path) to use as favicon of the 139 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 140 | # pixels large. 141 | #html_favicon = None 142 | 143 | # Add any paths that contain custom static files (such as style sheets) here, 144 | # relative to this directory. They are copied after the builtin static files, 145 | # so a file named "default.css" will overwrite the builtin "default.css". 146 | html_static_path = ['_static'] 147 | 148 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 149 | # using the given strftime format. 150 | #html_last_updated_fmt = '%b %d, %Y' 151 | 152 | # If true, SmartyPants will be used to convert quotes and dashes to 153 | # typographically correct entities. 154 | #html_use_smartypants = True 155 | 156 | # Custom sidebar templates, maps document names to template names. 
157 | #html_sidebars = {}
158 |
159 | # Additional templates that should be rendered to pages, maps page names to
160 | # template names.
161 | #html_additional_pages = {}
162 |
163 | # If false, no module index is generated.
164 | #html_domain_indices = True
165 |
166 | # If false, no index is generated.
167 | #html_use_index = True
168 |
169 | # If true, the index is split into individual pages for each letter.
170 | #html_split_index = False
171 |
172 | # If true, links to the reST sources are added to the pages.
173 | #html_show_sourcelink = True
174 |
175 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
176 | #html_show_sphinx = True
177 |
178 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
179 | #html_show_copyright = True
180 |
181 | # If true, an OpenSearch description file will be output, and all pages will
182 | # contain a <link> tag referring to it. The value of this option must be the
183 | # base URL from which the finished HTML is served.
184 | #html_use_opensearch = ''
185 |
186 | # This is the file name suffix for HTML files (e.g. ".xhtml").
187 | #html_file_suffix = None
188 |
189 | # Output file base name for HTML help builder.
190 | htmlhelp_basename = 'pygrametldoc'
191 |
192 |
193 | # -- Options for LaTeX output --------------------------------------------
194 |
195 | latex_elements = {
196 |     # The paper size ('letterpaper' or 'a4paper').
197 |     #'papersize': 'letterpaper',
198 |
199 |     # The font size ('10pt', '11pt' or '12pt').
200 |     #'pointsize': '10pt',
201 |
202 |     # Additional stuff for the LaTeX preamble.
203 |     #'preamble': '',
204 |
205 |     # Removes the index from the finished PDF
206 |     'printindex': ''
207 | }
208 |
209 | # Grouping the document tree into LaTeX files. List of tuples
210 | # (source start file, target name, title, author, documentclass [howto/manual]).
211 | latex_documents = [ 212 | ('index', 'pygrametl.tex', u'pygrametl Documentation', 213 | u'Aalborg Universitet', 'manual'), 214 | ] 215 | 216 | # The name of an image file (relative to this directory) to place at the top of 217 | # the title page. 218 | #latex_logo = None 219 | 220 | # For "manual" documents, if this is true, then toplevel headings are parts, 221 | # not chapters. 222 | #latex_use_parts = False 223 | 224 | # If true, show page references after internal links. 225 | #latex_show_pagerefs = False 226 | 227 | # If true, show URL addresses after external links. 228 | #latex_show_urls = False 229 | 230 | # Documents to append as an appendix to all manuals. 231 | #latex_appendices = [] 232 | 233 | # If false, no module index is generated. 234 | latex_domain_indices = False 235 | 236 | 237 | # -- Options for manual page output -------------------------------------- 238 | 239 | # One entry per manual page. List of tuples 240 | # (source start file, name, description, authors, manual section). 241 | man_pages = [ 242 | ('index', 'pygrametl', u'pygrametl Documentation', 243 | [u'Aalborg Universitet'], 1) 244 | ] 245 | 246 | # If true, show URL addresses after external links. 247 | #man_show_urls = False 248 | 249 | 250 | # -- Options for Texinfo output ------------------------------------------ 251 | 252 | # Grouping the document tree into Texinfo files. List of tuples 253 | # (source start file, target name, title, author, 254 | # dir menu entry, description, category) 255 | texinfo_documents = [ 256 | ('index', 'pygrametl', u'pygrametl Documentation', 257 | u'Aalborg Universitet', 'pygrametl', 'One line description of project.', 258 | 'Miscellaneous'), 259 | ] 260 | 261 | # Documents to append as an appendix to all manuals. 262 | #texinfo_appendices = [] 263 | 264 | # If false, no module index is generated. 265 | texinfo_domain_indices = False 266 | 267 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 
268 | #texinfo_show_urls = 'footnote' 269 | -------------------------------------------------------------------------------- /pygrametl/drawntabletesting/dttr.py: -------------------------------------------------------------------------------- 1 | """This module is a test runner for tests defined using drawn tables.""" 2 | 3 | # Copyright (c) 2021-2023, Aalborg University (pygrametl@cs.aau.dk) 4 | # All rights reserved. 5 | 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | 9 | # - Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | 12 | # - Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | import os 28 | import sys 29 | import csv 30 | import shlex 31 | import types 32 | import sqlite3 33 | import argparse 34 | from collections import namedtuple 35 | from pathlib import Path 36 | 37 | import pygrametl.drawntabletesting as dtt 38 | 39 | 40 | # Types 41 | ReaderError = namedtuple('ReaderError', 'path start end name cause') 42 | PreCondition = namedtuple('PreCondition', 'path start end table') 43 | PostCondition = namedtuple('PostCondition', 'path start end table assert_name') 44 | 45 | 46 | class ExtendAction(argparse.Action): 47 | """Creates a list of arguments passed to a flag instead of one per flag.""" 48 | 49 | def __call__(self, parser, namespace, values, option_string=None): 50 | items = getattr(namespace, self.dest) or [] 51 | items.extend(values) 52 | setattr(namespace, self.dest, items) 53 | 54 | 55 | # Constants 56 | DEFAULT_CONNECTION_NAME = 'connection' 57 | 58 | 59 | # Functions 60 | def print_reason_for_failure(when, condition, reason): 61 | print("[{} {}({}-{})] {}".format(when, condition.path, condition.start, 62 | condition.end, reason), end='\n') 63 | 64 | 65 | def print_reader_error(path, firstlinenumber, lastlinenumber, reader_name, e): 66 | reader_error = ReaderError( 67 | path, firstlinenumber, lastlinenumber, reader_name, e) 68 | reason = reader_error.name + ' - ' + str(reader_error.cause) 69 | print_reason_for_failure("(Reader)", reader_error, reason) 70 | 71 | 72 | def read_csv(columns, path, delimiter): 73 | with open(path) as f: 74 | return list(csv.DictReader(f, fieldnames=columns, delimiter=delimiter)) 75 | 76 | 77 | def read_sql(columns, config, *arguments): 78 | if 'SELECT' == arguments[0]: 79 | connection = DEFAULT_CONNECTION_NAME 80 | query = ' '.join(arguments) 81 | else: 82 | connection = arguments[0] 83 | query = ' '.join(arguments[1:]) 84 | connection = getattr(config, connection) 85 | 86 | # DTT expects a sequence of dicts with the column names as keys 87 | cursor = connection.cursor() 88 | 
cursor.execute(query) 89 | rows = map(lambda row: dict(zip(columns, row)), cursor.fetchall()) 90 | cursor.close() 91 | return rows 92 | 93 | 94 | def read_dt(path, dt, lastlinenumber, pre_dtts, post_dtts, config, nullsubst, 95 | variableprefix, connection_wrappers, pre_conditions, 96 | post_conditions): 97 | header = list(map(lambda s: s.strip(), dt[0].split(','))) 98 | firstlinenumber = lastlinenumber - len(dt) + 1 99 | 100 | # If the last line does not start with a pipe it cannot be a DT column and 101 | # must instead be a data source with the column names defined by the DT 102 | loadFrom = None 103 | reader_name = "UNKNOWN" 104 | if '|' != dt[-1].strip()[0]: 105 | columns = [c.split(':')[0].strip() for c in dt[1].split('|') if c] 106 | (reader_name, *arguments) = shlex.split(dt[-1]) # Splits as POSIX SH 107 | try: 108 | if reader_name == 'csv': 109 | reader_function = read_csv 110 | elif reader_name == 'sql': 111 | reader_function = read_sql 112 | arguments.insert(0, config) 113 | else: 114 | reader_function = getattr(config, reader_name) 115 | arguments.insert(0, columns) 116 | loadFrom = reader_function(*arguments) 117 | dt = dt[:-1] # The external data source should be passed to Table 118 | except Exception as e: 119 | # Errors are caught so the test runner is not terminated 120 | print_reader_error(path, firstlinenumber, lastlinenumber, 121 | str(reader_name), e) 122 | return 123 | 124 | # If the user has not given a connection the default is used 125 | connection = DEFAULT_CONNECTION_NAME 126 | if '@' in header[0]: 127 | (header[0], connection) = header[0].split('@') 128 | 129 | # Ensures a connection to the test data is available and creates the table 130 | try: 131 | # Only one ConnectionWrapper should be created per connection 132 | if connection not in connection_wrappers: 133 | connection_wrappers[connection] = \ 134 | dtt.connectionwrapper(getattr(config, connection)) 135 | table = dtt.Table(name=header[0], table='\n'.join(dt[1:]), 136 | 
nullsubst=nullsubst, variableprefix=variableprefix,
137 |                           loadFrom=loadFrom,
138 |                           testconnection=connection_wrappers[connection])
139 |     except Exception as e:
140 |         # Errors are caught so the test runner is not terminated
141 |         print_reader_error(path, firstlinenumber, lastlinenumber,
142 |                            str(reader_name), e)
143 |         return
144 |
145 |     # Only postconditions include an assert
146 |     if len(header) == 1 and path in pre_dtts:
147 |         pre_conditions.append(PreCondition(path, firstlinenumber,
148 |                                            lastlinenumber, table))
149 |     elif len(header) == 2 and path in post_dtts:
150 |         post_conditions.append(PostCondition(path, firstlinenumber,
151 |                                              lastlinenumber, table,
152 |                                              header[1].capitalize()))
153 |
154 |
155 | def read_dtt_file(path, pre_dtts, post_dtts, config, nullsubst, variableprefix,
156 |                   connection_wrappers, pre_conditions, post_conditions):
157 |     linenumber = 0
158 |     with open(path, 'r') as f:
159 |         dt = []
160 |         for line in f:
161 |             line = line.strip()
162 |             linenumber += 1
163 |             # Lines with content are accumulated
164 |             if line:
165 |                 dt.append(line)
166 |             # Empty lines separate DTs in the file
167 |             elif dt:
168 |                 read_dt(path, dt, linenumber - 1, pre_dtts, post_dtts, config,
169 |                         nullsubst, variableprefix, connection_wrappers,
170 |                         pre_conditions, post_conditions)
171 |                 dt = []
172 |
173 |     # Reads the last DT if the file does not end with an empty line
174 |     if dt:
175 |         read_dt(path, dt, linenumber - 1, pre_dtts, post_dtts, config,
176 |                 nullsubst, variableprefix, connection_wrappers,
177 |                 pre_conditions, post_conditions)
178 |
179 |
180 | def ensure_pre_condition(pre_condition):
181 |     # Executes the ensure method without terminating if it fails
182 |     try:
183 |         pre_condition.table.ensure()
184 |     except Exception as e:
185 |         print_reason_for_failure("(Pre)", pre_condition, str(e))
186 |
187 |
188 | def assert_post_condition(post_condition):
189 |     # Executes the assert method without terminating if it fails
190 |     try:
191 |         getattr(post_condition.table,
"assert" + post_condition.assert_name)()
192 |     except AttributeError:
193 |         raise ValueError("Unsupported assert specified")
194 |     except Exception as e:
195 |         print_reason_for_failure("(Post)", post_condition, str(e))
196 |
197 |
198 | def usage(parser, verbose):
199 |     print("usage: " + Path(sys.argv[0]).stem + " [-" + "".join(map(
200 |         lambda a: a.option_strings[0][1:], parser._actions)) + "]", end="\n")
201 |
202 |     if verbose:
203 |         print()
204 |         print("Run tests specified in .dtt files.\n")
205 |         for action in parser._actions:
206 |             print(", ".join(action.option_strings), end="\t\t")
207 |             if action.metavar:
208 |                 print(action.metavar, end="\t")
209 |             else:
210 |                 print("", end='\t')
211 |             print(action.help, end="")
212 |             print()
213 |     sys.exit(1)
214 |
215 |
216 | def parse_arguments():
217 |     parser = argparse.ArgumentParser(add_help=False)
218 |     parser.register('action', 'extend', ExtendAction)
219 |
220 |     # HACK: Tab characters are inserted in the help strings for alignment
221 |     parser.add_argument('-e', '--etl', action='extend', nargs='+',
222 |                         metavar="ETL [ARGS...]",
223 |                         help="run the command ETL with the arguments ARGS")
224 |     parser.add_argument('-f', '--files', action='extend', nargs='+',
225 |                         metavar="FILES...",
226 |                         help="use only the conditions specified in FILES")
227 |     parser.add_argument('-h', '--help', action='store_true',
228 |                         help="\tshow this help message and exit")
229 |     parser.add_argument('-n', '--null', action='store', metavar="STRING",
230 |                         help="\tuse STRING to represent NULL (default: NULL)")
231 |     parser.add_argument('-p', '--pre', action='extend', nargs='+',
232 |                         metavar="FILES...",
233 |                         help="use only the preconditions specified in FILES")
234 |     parser.add_argument('-P', '--post', action='extend', nargs='+',
235 |                         metavar="FILES...",
236 |                         help="use only the postconditions specified in FILES")
237 |     parser.add_argument('-r', '--recursion-off', action='store_true',
238 |                         help="execute only the tests in cwd and not sub-folders")
239 |     parser.add_argument('-v', '--varprefix', action='store', metavar="STRING",
240 |                         help="\tuse STRING as prefix for variables (default: $)")
241 |
242 |     args, failed = parser.parse_known_args()
243 |     if args.help:
244 |         usage(parser, True)
245 |
246 |     if failed:
247 |         usage(parser, False)
248 |     return args
249 |
250 |
251 | # Main
252 | def main():
253 |     args = parse_arguments()
254 |
255 |     # If -f is given, conditions and config.py should be read from that folder
256 |     if args.files:
257 |         os.chdir(args.files[0])
258 |
259 |     # Ensures that the expected config.py file is loaded
260 |     try:
261 |         sys.path.insert(0, os.getcwd())
262 |         import config  # Must specify a PEP 249 connection named 'connection'
263 |         del sys.path[0]
264 |     except ImportError:
265 |         config = types.ModuleType('config')
266 |         config.connection = sqlite3.connect(':memory:')
267 |
268 |     # Reads only the DTT files required to execute the tests; the arguments
269 |     # to Table are always given to ensure the defaults in dttr.py are used
270 |     dtts = list(map(lambda p: str(p), Path(os.getcwd()).glob('*.dtt') if
271 |                     args.recursion_off else Path(os.getcwd()).rglob('*.dtt')))
272 |     if args.pre and args.post:
273 |         paths = set(args.pre + args.post)
274 |         dtts = filter(lambda path: str(path) in paths, dtts)
275 |     pre_dtts = set(args.pre if args.pre else dtts)
276 |     post_dtts = set(args.post if args.post else dtts)
277 |     nullsubst = args.null if args.null else 'NULL'
278 |     variableprefix = args.varprefix if args.varprefix else '$'
279 |     connection_wrappers = {}
280 |     pre_conditions = []
281 |     post_conditions = []
282 |     for dtt_path in dtts:
283 |         read_dtt_file(dtt_path, pre_dtts, post_dtts, config, nullsubst,
284 |                       variableprefix, connection_wrappers, pre_conditions,
285 |                       post_conditions)
286 |
287 |     # Ensures all preconditions can set up state for the tests
288 |     for pre_condition in pre_conditions:
289 |         ensure_pre_condition(pre_condition)
290 |
291 |     # Executes the ETL flow to load data into the test
warehouse 292 | if args.etl: 293 | os.system(' '.join(args.etl)) 294 | 295 | # Checks that postconditions are met after the ETL flow is run 296 | for post_condition in post_conditions: 297 | assert_post_condition(post_condition) 298 | 299 | 300 | if __name__ == '__main__': 301 | main() 302 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | Unreleased 2 | ---------- 3 | **Added** 4 | Support for specifying if all or only the latest version of a member should be 5 | updated when type 1 updates are applied to ``SlowlyChangingDimension``. 6 | 7 | ``lookuprow`` doing ``lookup`` followed by ``getbykey`` added to ``Dimension`` 8 | classes and ``Dimension``-like classes. Given a row with the ``lookupatts``, 9 | this method returns the full row. 10 | 11 | ``SlowlyChangingDimension.lookuprowasof`` doing ``lookupasof`` followed by 12 | ``getbykey`` added to ``SlowlyChangingDimension``. Given a row with the 13 | ``lookupatts`` and a timestamp, this method returns the full row version 14 | that was valid at the timestamp. 15 | 16 | ``__getitem__`` added to ``drawntabletesting.Table``. For a Table 17 | ``t``, this makes it possible to use ``t[n]`` to get the ``n``th row 18 | in ``t``. 19 | 20 | **Fixed** 21 | All uses of ``open()`` in the beginner guide now include "utf-8" to minimize 22 | the chance of errors due to different encodings. 23 | 24 | ``dependson`` is now a list instead of a filter iterator. This fixes issue #72 25 | where dependencies were only loaded in the first bulk load. 26 | 27 | ``MappingSource`` no longer duplicates rows when passing multiple callables. 28 | 29 | **Changed** 30 | ``SQLSource`` now has a ``fetchsize`` constructor parameter so the end-user can 31 | control how much data should be held in main memory for each round trip to the RDBMS. 
32 | 33 | Changed psycopg2 bulkloader documentation to use ``copy_expert`` instead of 34 | ``copy_from``. This solves issue #74, where newer psycopg2 versions escape 35 | table names to avoid SQL injection. 36 | 37 | Version 2.8 38 | ----------- 39 | **Added** 40 | pygrametl's existing set of unit tests. By default, the unit tests are executed 41 | against an in-memory SQLite database so no configuration is needed. 42 | 43 | ``SQLTransformingSource`` a new class supporting transformation of rows by loading 44 | them into a temporary table in an RDBMS and then retrieving them using an SQL 45 | query. 46 | 47 | ``SlowlyChangingDimension.lookupasof`` which can be used to look up the version of a 48 | member that was valid at a given time. 49 | 50 | **Changed** 51 | Beginner guide updated and dataset added to it. 52 | 53 | If a ``rowexpander`` does not return a row in the form of a ``dict``, 54 | ``Dimension.ensure`` now explicitly raises a ``TypeError`` with the name of 55 | the function set as the ``rowexpander``. 56 | 57 | ``ymdhmsparser`` can now handle ``datetime.datetime`` as input. Any other 58 | input is cast to a string. (GitHub issue #40) 59 | 60 | ``ymdparser`` can now handle ``datetime.datetime`` and ``datetime.date`` as 61 | input. Any other input is cast to a string. (GitHub issue #40) 62 | 63 | If ``orderingatt`` is not specified for a ``SlowlyChangingDimension``, 64 | ``fromatt`` will now be used if ``versionatt`` and ``toatt`` are not specified. 65 | 66 | When using ``fromatt`` or ``toatt`` as ``orderingatt``, the generated SQL 67 | will specify NULLS FIRST or NULLS LAST. Before this change, NULLS FIRST was 68 | assumed for ORDER BY DESC, but this is not guaranteed to hold for all 69 | DBMSs. The change thus requires the used DBMS to support that NULLS FIRST or 70 | NULLS LAST is specified in the generated SQL. 71 | 72 | **Fixed** 73 | ``BulkFactTable.__init__`` now sets the attributes ``keyrefs``, ``measures``, 74 | and ``all``.
These attributes are required by the ``FactTablePartitioner``. 75 | 76 | ``BulkFactTable`` constructed with ``usemultirow=True`` (the default is 77 | ``False``) can now load rows containing ``NULL`` values. (GitHub issue #50) 78 | 79 | Incorrect quotation of identifiers in ``SlowlyChangingDimension`` fixed. 80 | 81 | Missing key value of root when calling ``getbykey`` of ``SnowflakedDimension`` fixed. 82 | 83 | Added explicit commit and rollback to fix problems with hanging DTT. 84 | 85 | Version 2.7 86 | ----------- 87 | **Note** 88 | This is the last version to actively support Python 2. Support for it will 89 | slowly be reduced as we continue to develop pygrametl. 90 | 91 | **Added** 92 | ``drawntabletesting`` a new module for testing ETL flows. The module makes it 93 | easy to define the preconditions and postconditions for the database as part 94 | of each test. This is done simply by "drawing" the tables and their contents 95 | using strings. 96 | 97 | ``AccumulatingSnapshotFactTable`` a new class supporting accumulating snapshot 98 | fact tables where facts can be updated as a process progresses. 99 | 100 | ``BatchFactTable.__init__`` now optionally takes the argument ``usemultirow``. 101 | When this argument is ``True`` (the default is ``False``), batches are loaded 102 | using ``execute`` with a single ``INSERT INTO name VALUES`` statement instead 103 | of ``executemany()``. (GitHub issue #19). 104 | 105 | ``closecurrent`` method added to ``SlowlyChangingDimension`` to make it 106 | possible to set an end date for the most current version without adding a new 107 | version. 108 | 109 | A (read-only) property ``awaitingrows`` added to ``BatchFactTable`` and 110 | ``_BaseBulkloadable`` to get the number of inserted rows awaiting to be loaded 111 | into the database table. 
(GitHub issue #23) 112 | 113 | **Changed** 114 | ``SlowlyChangingDimension.scdensure`` now checks if the newest version has its 115 | ``toatt`` set to a value different from ``maxto`` (if ``toatt`` is defined). 116 | This can happen from a call to ``closecurrent`` or a manual update. If it is 117 | the case, a new version will be added when ``scdensure`` is called even if no 118 | other differences are present. 119 | 120 | Generators in ``datasources`` don't raise ``StopIteration`` anymore as 121 | required by PEP 479. 122 | 123 | ``__author__`` and ``__maintainer__`` removed from all .py files. 124 | 125 | ``__version__`` removed from all .py files except ``pygrametl/__init__.py``. 126 | The version of pygrametl is thus now available as ``pygrametl.__version__`` 127 | and will be updated for every release. 128 | 129 | **Fixed** 130 | Outdated information stating that type 1 slowly changing dimensions are not 131 | supported has been removed from the documentation. In addition, minor errors 132 | and inconsistencies have been corrected throughout the documentation. (GitHub 133 | issue #27) 134 | 135 | Wrong use of paramstyle in ``ConnectionWrapper.executemany`` fixed. 136 | 137 | A call to an incorrect method in ``aggregators.Avg.finish()`` fixed. 138 | 139 | The ``datespan()`` function now checks whether ``fromdate`` and ``todate`` are 140 | strings before calling ``.split()``. In addition, the function now uses 141 | ``dict.items()`` instead of ``dict.iteritems()``, which is not supported in 142 | Python 3. 143 | 144 | Version 2.6 145 | ----------- 146 | **Added** 147 | ``PandasSource`` a new class that, given a Pandas ``DataFrame``, acts as a data 148 | source. Each row of the ``DataFrame`` is returned as a ``dict`` that can be 149 | loaded into a data warehouse using ``tables``.
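The iteration contract the ``PandasSource`` entry above describes (each row delivered as a ``dict`` mapping column names to values) can be sketched without pandas or pygrametl installed; the column names and rows below are purely illustrative:

```python
# Minimal sketch of a row-dict data source; pandas is not required here.
# In PandasSource itself, the columns and values come from a DataFrame.
def rows_as_dicts(columns, rows):
    """Yield each row as a dict mapping column names to values."""
    for row in rows:
        yield dict(zip(columns, row))

columns = ['storeid', 'price']
rows = [(1, 50), (2, 75)]
result = list(rows_as_dicts(columns, rows))
# result: [{'storeid': 1, 'price': 50}, {'storeid': 2, 'price': 75}]
```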
150 | 151 | ``MappingSource`` a new class that, given a data source and a dictionary of 152 | columns to callables, maps the callables over each element of the specified 153 | column before returning the row. 154 | 155 | **Changed** 156 | ``SlowlyChangingDimension`` improved to make ``versionatt`` optional. (GitHub 157 | issue #12. Thanks to HereticSK) 158 | 159 | ``ConnectionWrapper.__init__`` now optionally takes the argument 160 | ``copyintonew``. When this argument is ``True`` (the default is ``False``), a 161 | new ``dict`` with parameters is created when a statement is executed. The new 162 | ``dict`` only holds the k/v pairs needed by the statement. This is to avoid 163 | ``DatabaseError: ORA-01036: illegal variable name/number`` with cx_Oracle. 164 | (GitHub issue #9). 165 | 166 | First argument to ``TypedCSVSource.__init__`` renamed from ``csvfile`` to 167 | ``f`` to be consistent with documentation and ``CSVSource``. 168 | 169 | **Fixed** 170 | ``ConnectionWrapper.execute`` does not pass the argument ``arguments`` to the 171 | underlying cursor's execute method if ``arguments`` is ``None``. Some drivers 172 | raise an ``Error`` if ``None`` is passed, some don't. 173 | 174 | Version 2.5 175 | ----------- 176 | **Added** 177 | ``TypedCSVSource`` a new class that reads a CSV file (by means of 178 | ``csv.DictReader``) and performs user-specified casts (or other function 179 | calls) on the values before returning the rows. 180 | 181 | Added ``definequote`` function to enable quoting of SQL identifiers in all 182 | tables. 183 | 184 | Added ``getdbfriendlystr`` function to enable conversion of values into 185 | strings that are accepted by an RDBMS. Boolean values become ``0`` or ``1``, 186 | and ``None`` values can be replaced by another value. 187 | 188 | All Bulkloadables now accept the argument ``strconverter`` to their 189 | ``__init__`` methods.
This should be a function that converts values into 190 | strings that are written to a temporary file and eventually bulkloaded. The 191 | default value is the new ``getdbfriendlystr``. 192 | 193 | ``SlowlyChangingDimension`` can now optionally be given the argument 194 | ``useorderby`` when instantiated. If ``True`` (the default), the SQL used by 195 | ``lookup`` uses ``ORDER BY`` (this is the same behaviour as before). If 196 | ``False``, ``ORDER BY`` is not used and the SQL used by ``lookup`` will fetch 197 | all versions of the member and then find the key value for the newest version 198 | with Python code. For some systems, this can lead to significant performance 199 | improvements. 200 | 201 | **Changed** 202 | Generator used in ``ConnectionWrapper.fetchalltuples`` to reduce memory 203 | consumption. (Thanks to Alexey Kuzmenko) 204 | 205 | ``SlowlyChangingDimension`` can sometimes avoid deleting from the cache on 206 | updates, now checked in the same way as in ``CachedDimension`` 207 | 208 | ``rowfactory`` now tries to use ``fetchmany``. (Suggested by Alexey Kuzmenko). 209 | 210 | ``_BaseBulkloadable`` now has the method ``insert`` while the methods 211 | ``_insertwithnull`` and ``_insertwithoutnull`` have been removed (and 212 | subclasses do thus not pick one of them at runtime). The ``insert`` method 213 | will always call ``strconverter`` (see above) no matter if a ``nullsubst`` has 214 | been specified or not. 215 | 216 | ``_BaseBulkloadable`` will now raise a ``TypeError`` if no ``nullsubst`` is 217 | specified and a ``None`` value is present. Before this change, the ``None`` 218 | value would silently be converted into the string ``'None'``. Users must now 219 | give a ``nullsubst`` argument when instantiating a subclass of 220 | ``_BaseBulkloadable`` that should be able to handle ``None`` values. 221 | 222 | ``SubprocessFactTable`` has been changed similarly to ``_BaseBulkloadable`` 223 | and does now define ``insert`` which uses ``strconverter``. 
Thus 224 | ``_insertwithnull`` and ``_insertwithoutnull`` have been removed. 225 | 226 | ``getunderlyingmodule`` has been changed and now tries different possible 227 | module names and looks for ``'paramstyle'`` and ``'connect'``. 228 | ``ConnectionWrapper`` now uses ``getunderlyingmodule`` in ``__init__`` when 229 | trying to determine the paramstyle to use. 230 | 231 | **Fixed** 232 | Using ``cachesize=0`` with ``SlowlyChangingDimension`` no longer causes a 233 | crash. 234 | 235 | Problem with double use of namemappings in ``_before_update`` in 236 | ``CachedDimension`` and ``SlowlyChangingDimension`` fixed. (Thanks to Alexey 237 | Kuzmenko). 238 | 239 | Problem with ``rowfactory`` only returning one row fixed. (Thanks to Alexey 240 | Kuzmenko). 241 | 242 | Problem with ``JDBCConnectionWrapper.rowfactory`` returning dictionaries with 243 | incorrect keys fixed. (GitHub issue #5). 244 | 245 | Problem with ``TypeOneSlowlyChangingDimension`` caching ``None`` after an 246 | update if a namemapping mapped to an attribute not in the update row fixed. 247 | 248 | Problem in ``__init__.copy`` fixed. 249 | 250 | Namemapping is now used when comparing measure values in ``FactTable.ensure`` 251 | with ``compare=True``. 252 | 253 | Version 2.4 254 | ----------- 255 | **Note** 256 | This is the last version to support versions of Python 2 older than 2.7. 257 | 258 | **Added** 259 | ``TypeOneSlowlyChangingDimension`` a new class that adds support for efficient 260 | loading and updating of a type 1 exclusive slowly changing dimension. 261 | 262 | ``CachedBulkLoadingDimension`` a new class that supports bulk loading a 263 | dimension without requiring the caching of all rows that are loaded. 264 | 265 | Alternative implementation of ``FIFODict`` based on an ``OrderedDict``. 266 | (Thanks to Alexey Kuzmenko). 267 | 268 | Dimension classes with finite caches can now be prefilled more efficiently 269 | using the ``FETCH FIRST`` SQL statement for increased performance.
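The prefilling described in the entry above can be sketched with the standard library alone. Note the assumptions: SQLite spells the row limit as ``LIMIT`` rather than the standard ``FETCH FIRST n ROWS ONLY`` form the entry refers to, and the table and cache layout below are illustrative, not pygrametl's actual implementation:

```python
import sqlite3

# Prefill a bounded lookup cache from the first rows of a dimension table.
# Illustrative schema; SQLite uses LIMIT where the SQL standard and the
# changelog entry above use FETCH FIRST n ROWS ONLY.
conn = sqlite3.connect(':memory:')
conn.execute("CREATE TABLE product(productid INTEGER PRIMARY KEY, name TEXT)")
conn.executemany("INSERT INTO product VALUES(?, ?)",
                 [(i, 'product%d' % i) for i in range(1, 101)])

cachesize = 10  # fetch no more rows than the finite cache can hold
cache = {}
for productid, name in conn.execute(
        "SELECT productid, name FROM product LIMIT ?", (cachesize,)):
    cache[name] = productid  # map lookup attribute to surrogate key
```

Fetching only ``cachesize`` rows avoids reading the whole dimension table just to warm a cache that could not have held it anyway.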
270 | 271 | Examples on how to perform bulk loading in MySQL, Oracle Database, and 272 | Microsoft SQL Server. (Thanks to Alexey Kuzmenko). 273 | 274 | **Changed** 275 | It is now verified that ``lookupatts`` is a subset of all attributes. 276 | 277 | All method calls to a superclass constructor now use named parameters. 278 | 279 | Made cosmetic changes and added additional information about how to ensure 280 | cache coherency between pygrametl and the database to existing docstrings. 281 | 282 | The entire codebase was updated to adhere more closely to PEP 8 using 283 | autopep8. 284 | 285 | **Fixed** 286 | Using ``dependson`` no longer causes crashes due to multiple loads of a table. 287 | (Thanks to Alexey Kuzmenko). 288 | 289 | Using ``defaultidvalue`` no longer causes ``Dimension.ensure`` to fail to 290 | insert correctly, or make ``CachedDimension.ensure`` produce duplicates. 291 | (Thanks to Alexey Kuzmenko). 292 | 293 | Using ``SlowlyChangingDimension`` with the cache disabled no longer causes a 294 | crash in ``SlowlyChangingDimension.scdensure``. 295 | 296 | Using ``BulkDimension``, ``CachedBulkDimension`` or ``BulkFactTable`` with 297 | ``tempdest`` and ``usefilename`` no longer causes a crash in 298 | ``_BaseBulkloadable._bulkloadnow``. 299 | 300 | Version 2.3.2 301 | ------------- 302 | **Fixed** 303 | ``SnowflakedDimension`` no longer crashes due to ``levellist`` not being a 304 | list before the length of it is computed. 305 | 306 | ``FactTable`` now inserts the correct number of commas into the SQL statements 307 | used for inserting rows, independent of the value of ``keyrefs``. 308 | 309 | Version 2.3.1 310 | ------------- 311 | **Fixed** 312 | Using other parameter styles than ``pyformat`` no longer causes a crash in 313 | ``ConnectionWrapper``. 314 | 315 | Version 2.3 316 | ------------- 317 | **Added** 318 | A new quick start guide was added to the documentation. 319 | 320 | Added code examples for all classes in pygrametl except ``Steps``.
321 | 322 | pygrametl now officially supports Python 2.6.X, Python 2.7.X, Python 3, Jython 323 | 2.5.X and Jython 2.7.X. 324 | 325 | ``BulkDimension`` a new class that supports bulk loading of dimension tables. 326 | 327 | ``_BaseBulkloadable`` with common functionality for ``BulkFactTable`` and 328 | ``BulkDimension``. 329 | 330 | ``SQLSource`` can now pass parameters to the cursor's ``execute`` function. 331 | 332 | **Fixed** 333 | Importing everything from ``tables`` using a wildcard no longer causes a 334 | crash. 335 | 336 | Version 2.2 337 | ----------- 338 | **Added** 339 | Created a PyPI package and uploaded it to `pypi.python.org/project/pygrametl 340 | `_. 341 | 342 | Added code examples for some of the classes in pygrametl. 343 | 344 | **Changed** 345 | Documentation is now written in reStructuredText and compiled using Sphinx. 346 | -------------------------------------------------------------------------------- /tests/drawntabletesting/test_dtt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Aalborg University (pygrametl@cs.aau.dk) 2 | # All rights reserved. 3 | 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | 7 | # - Redistributions of source code must retain the above copyright notice, this 8 | # list of conditions and the following disclaimer. 9 | 10 | # - Redistributions in binary form must reproduce the above copyright notice, 11 | # this list of conditions and the following disclaimer in the documentation 12 | # and/or other materials provided with the distribution. 13 | 14 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | # DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | import unittest 26 | import pygrametl 27 | import pygrametl.drawntabletesting as dtt 28 | from tests import utilities 29 | 30 | # Examples are from docs/examples/testing.rst 31 | class TableTest(unittest.TestCase): 32 | @classmethod 33 | def setUpClass(cls): 34 | utilities.ensure_default_connection_wrapper() 35 | cls.initial = dtt.Table("book", """ 36 | | bid:int (pk) | title:text | genre:text | 37 | | ------------ | --------------------- | ---------- | 38 | | 1 | Unknown | Unknown | 39 | | 2 | Nineteen Eighty-Four | Novel | 40 | | 3 | Calvin and Hobbes One | Comic | 41 | | 4 | Calvin and Hobbes Two | Comic | 42 | | 5 | The Silver Spoon | Cookbook | 43 | """) 44 | 45 | def setUp(self): 46 | utilities.ensure_default_connection_wrapper() 47 | 48 | def test_init_correct(self): 49 | dtt.Table("book", """ 50 | | bid:int (pk) | title:text (unique) | genre:text (not null) | 51 | | ------------ | --------------------- | --------------------- | 52 | | 1 | Unknown | Unknown | 53 | | 2 | Nineteen Eighty-Four | Novel | 54 | | 3 | Calvin and Hobbes One | Comic | 55 | | 4 | Calvin and Hobbes Two | Comic | 56 | | 5 | The Silver Spoon | Cookbook | 57 | """) 58 | 59 | def test_init_unknown_incorrect(self): 60 | # Unknown constraints 61 | with self.assertRaises(ValueError): 62 | dtt.Table("book", """ 63 | | bid:int (pk) | title:text (unique) | genre:text (notnull) | 64 | | ------------ | --------------------- | 
-------------------- | 65 | | 1 | Unknown | Unknown | 66 | | 2 | Nineteen Eighty-Four | Novel | 67 | | 3 | Calvin and Hobbes One | Comic | 68 | | 4 | Calvin and Hobbes Two | Comic | 69 | | 5 | The Silver Spoon | Cookbook | 70 | """) 71 | 72 | # Missing : between name and type 73 | with self.assertRaises(ValueError): 74 | dtt.Table("book", """ 75 | | bid int (pk) | title text (unique) | genre text (not null) | 76 | | ------------ | --------------------- | --------------------- | 77 | | 1 | Unknown | Unknown | 78 | | 2 | Nineteen Eighty-Four | Novel | 79 | | 3 | Calvin and Hobbes One | Comic | 80 | | 4 | Calvin and Hobbes Two | Comic | 81 | | 5 | The Silver Spoon | Cookbook | 82 | """) 83 | 84 | def test_ensure_and_foreign_key(self): 85 | dtt.Table("genre", """ 86 | | bid:int (pk) | genre:text | 87 | | ------------ | ---------- | 88 | | 1 | Unknown | 89 | | 2 | Novel | 90 | | 3 | Comic | 91 | | 4 | Cookbook | 92 | """).ensure() 93 | 94 | dtt.Table("book", """ 95 | | bid:int (pk) | title:text | gid:int (fk genre(bid)) | 96 | | ------------ | ---------------------- | ------------------------ | 97 | | 1 | Unknown | 1 | 98 | | 2 | Nineteen Eighty-Four | 2 | 99 | | 3 | Calvin and Hobbes One | 3 | 100 | | 4 | Calvin and Hobbes Two | 3 | 101 | | 5 | The Silver Spoon | 4 | 102 | """).ensure() 103 | 104 | def test_key(self): 105 | self.assertEqual(self.initial.key(), "bid") 106 | 107 | def test_getsqltocreate(self): 108 | self.assertEqual( 109 | self.initial.getSQLToCreate(), 110 | "CREATE TABLE book(bid int, title text, genre text, PRIMARY KEY (bid))") 111 | 112 | def test_getsqltoinsert(self): 113 | self.assertEqual(self.initial.getSQLToInsert(), ( 114 | "INSERT INTO book(bid, title, genre) VALUES" 115 | "(1, 'Unknown', 'Unknown'), " 116 | "(2, 'Nineteen Eighty-Four', 'Novel'), " 117 | "(3, 'Calvin and Hobbes One', 'Comic'), " 118 | "(4, 'Calvin and Hobbes Two', 'Comic'), " 119 | "(5, 'The Silver Spoon', 'Cookbook')")) 120 | 121 | def test_assert_equal(self): 122 | book = 
self.initial 123 | book.ensure() 124 | book.assertEqual() 125 | 126 | def test_assert_not_equal(self): 127 | book = self.initial 128 | book.ensure() 129 | with self.assertRaises(AssertionError): 130 | (book + "| 6 | Metro 2033 | Novel |").assertEqual() 131 | 132 | def test_assert_disjoint(self): 133 | self.initial.ensure() 134 | dtt.Table("book", """ 135 | | bid:int (pk) | title:text | genre:text | 136 | | ------------ | --------------------- | ---------- | 137 | | 1 | None | None | 138 | """).assertDisjoint() 139 | 140 | def test_assert_not_disjoint(self): 141 | book = self.initial 142 | book.ensure() 143 | with self.assertRaises(AssertionError): 144 | book.assertDisjoint() 145 | 146 | def test_assert_subset(self): 147 | self.initial.ensure() 148 | dtt.Table("book", """ 149 | | bid:int (pk) | title:text | genre:text | 150 | | ------------ | --------------------- | ---------- | 151 | | 1 | Unknown | Unknown | 152 | """).assertSubset() 153 | 154 | def test_assert_not_subset(self): 155 | book = self.initial 156 | book.ensure() 157 | with self.assertRaises(AssertionError): 158 | (book + "| 6 | Metro 2033 | Novel |").assertSubset() 159 | 160 | def test_create_reset_ensure_clear_drop(self): 161 | connection_wrapper = pygrametl.getdefaulttargetconnection() 162 | with self.assertRaises(Exception): 163 | connection_wrapper.execute("SELECT * FROM " + self.initial.name) 164 | 165 | self.initial.create() 166 | connection_wrapper.execute("SELECT * FROM " + self.initial.name) 167 | self.assertEqual(len(list(connection_wrapper.fetchalltuples())), 0) 168 | 169 | self.initial.reset() 170 | connection_wrapper.execute("SELECT * FROM " + self.initial.name) 171 | self.assertEqual(len(list(connection_wrapper.fetchalltuples())), 5) 172 | 173 | self.initial.ensure() 174 | connection_wrapper.execute("SELECT * FROM " + self.initial.name) 175 | self.assertEqual(len(list(connection_wrapper.fetchalltuples())), 5) 176 | 177 | self.initial.clear() 178 | with self.assertRaises(Exception): 179 | 
connection_wrapper.execute("SELECT * FROM " + self.initial.name) 180 | 181 | self.initial.create() 182 | with self.assertRaises(Exception): 183 | self.initial.ensure() 184 | 185 | self.initial.drop() 186 | with self.assertRaises(Exception): 187 | connection_wrapper.execute("SELECT * FROM " + self.initial.name) 188 | 189 | def test_add_update_and_additions(self): 190 | book = self.initial 191 | book_added = book + "| 6 | Metro 2033 | Novel |" \ 192 | + "| 7 | Metro 2034 | Novel |" 193 | book_updated = book_added.update(0, "| -1 | Unknown | Unknown |") 194 | book_expected = [ 195 | {'bid': -1, 'title': 'Unknown', 'genre': 'Unknown'}, 196 | {'bid': 6, 'title': 'Metro 2033', 'genre': 'Novel'}, 197 | {'bid': 7, 'title': 'Metro 2034', 'genre': 'Novel'} 198 | ] 199 | self.assertEqual(book_expected, book_updated.additions(withKey=True)) 200 | 201 | def test_variables_and_foreign_keys_correct(self): 202 | dtt.Table("genre", """ 203 | | gid:int (pk) | genre:text | 204 | | ------------ | ---------- | 205 | | 1 | Novel | 206 | | 2 | Comic | 207 | """).ensure() 208 | 209 | dtt.Table("book", """ 210 | | bid:int (pk) | title:text | gid:int (fk genre(gid)) | 211 | | ------------ | ---------------------- | ------------------------ | 212 | | 1 | Nineteen Eighty-Four | 1 | 213 | | 2 | Calvin and Hobbes One | 2 | 214 | | 3 | Calvin and Hobbes Two | 2 | 215 | """).ensure() 216 | 217 | dtt.Table("genre", """ 218 | | gid:int (pk) | genre:text | 219 | | ------------- | ---------- | 220 | | $1 | Novel | 221 | | $2 | Comic | 222 | """).assertEqual() 223 | 224 | dtt.Table("book", """ 225 | | bid:int (pk) | title:text | gid:int (fk genre(gid)) | 226 | | ------------ | ---------------------- | ----------------------- | 227 | | 1 | Nineteen Eighty-Four | $1 | 228 | | 2 | Calvin and Hobbes One | $2 | 229 | | 3 | Calvin and Hobbes Two | $2 | 230 | """).assertEqual() 231 | 232 | def test_variables_and_foreign_keys_wrong(self): 233 | dtt.Table("genre", """ 234 | | gid:int (pk) | genre:text | 235 | 
| ------------ | ---------- | 236 | | 1 | Novel | 237 | | 2 | Comic | 238 | """).ensure() 239 | 240 | dtt.Table("book", """ 241 | | bid:int (pk) | title:text | gid:int (fk genre(gid)) | 242 | | ------------ | ---------------------- | ------------------------ | 243 | | 1 | Nineteen Eighty-Four | 2 | 244 | | 2 | Calvin and Hobbes One | 1 | 245 | | 3 | Calvin and Hobbes Two | 1 | 246 | """).ensure() 247 | 248 | dtt.Table("genre", """ 249 | | gid:int (pk) | genre:text | 250 | | ------------- | ---------- | 251 | | $1 | Novel | 252 | | $2 | Comic | 253 | """).assertEqual() 254 | 255 | book = dtt.Table("book", """ 256 | | bid:int (pk) | title:text | gid:int (fk genre(gid)) | 257 | | ------------ | ---------------------- | ----------------------- | 258 | | 1 | Nineteen Eighty-Four | $1 | 259 | | 2 | Calvin and Hobbes One | $2 | 260 | | 3 | Calvin and Hobbes Two | $2 | 261 | """) 262 | 263 | with self.assertRaises(AssertionError): 264 | book.assertEqual() 265 | 266 | def test_variables_underscore(self): 267 | dtt.Table("address", """ 268 | | aid:int (pk) | dept:text | location:text | validfrom:date | validto:date | 269 | | ------------ | --------- | ----------------------- | -------------- | ------------ | 270 | | NULL | CS | Fredrik Bajers Vej 7 | 1990-01-01 | 2000-01-01 | 271 | | NULL | CS | Selma Lagerløfs Vej 300 | 2000-01-01 | NULL | 272 | """).ensure() 273 | 274 | dtt.Table("address", """ 275 | | aid:int (pk) | dept:text | location:text | validfrom:date | validto:date | 276 | | ------------ | --------- | ----------------------- | -------------- | ------------- | 277 | | $_ | CS | Fredrik Bajers Vej 7 | 1990-01-01 | $3 | 278 | | $_ | CS | Selma Lagerløfs Vej 300 | $3 | NULL | 279 | """).assertEqual() 280 | 281 | def test_variables_underscore_not_null(self): 282 | dtt.Table("address", """ 283 | | aid:int (pk) | dept:text | location:text | validfrom:date | validto:date | 284 | | ------------ | --------- | ----------------------- | -------------- | ------------ | 285 | | 
NULL | CS | Fredrik Bajers Vej 7 | 1990-01-01 | 2000-01-01 | 286 | | NULL | CS | Selma Lagerløfs Vej 300 | 2000-01-01 | NULL | 287 | """).ensure() 288 | 289 | address = dtt.Table("address", """ 290 | | aid:int (pk) | dept:text | location:text | validfrom:date | validto:date | 291 | | ------------ | --------- | ----------------------- | -------------- | ------------- | 292 | | $_! | CS | Fredrik Bajers Vej 7 | 1990-01-01 | $4 | 293 | | $_! | CS | Selma Lagerløfs Vej 300 | $4 | NULL | 294 | """) 295 | 296 | with self.assertRaises(AssertionError): 297 | address.assertEqual() 298 | 299 | 300 | # The tests are from docs/examples/testing.rst 301 | def executeETLFlow(cw, row): 302 | if row['bid'] == 5: 303 | cw.execute("INSERT INTO book (bid, title, genre) VALUES(" + 304 | (",".join(map(lambda x: "'" + x + "'" if type(x) is str 305 | else str(x), list(row.values())))) + ")") 306 | 307 | 308 | class BookStateTest(unittest.TestCase): 309 | @classmethod 310 | def setUpClass(cls): 311 | utilities.ensure_default_connection_wrapper() 312 | cls.initial = dtt.Table("book", """ 313 | | bid:int (pk) | title:text | genre:text | 314 | | ------------ | --------------------- | ---------- | 315 | | 1 | Unknown | Unknown | 316 | | 2 | Nineteen Eighty-Four | Novel | 317 | | 3 | Calvin and Hobbes One | Comic | 318 | | 4 | The Silver Spoon | Cookbook | 319 | """) 320 | 321 | def setUp(self): 322 | utilities.ensure_default_connection_wrapper() 323 | self.initial.reset() 324 | 325 | def test_insertNew(self): 326 | expected = self.initial + "| 5 | Calvin and Hobbes Two | Comic |" 327 | newrow = expected.additions(withKey=True)[0] 328 | executeETLFlow(pygrametl.getdefaulttargetconnection(), newrow) 329 | expected.assertEqual() 330 | 331 | def test_insertExisting(self): 332 | newrow = {'bid': 6, 'book': 'Calvin and Hobbes One', 'genre': 'Comic'} 333 | executeETLFlow(pygrametl.getdefaulttargetconnection(), newrow) 334 | self.initial.assertEqual() 335 | 
-------------------------------------------------------------------------------- /docs/examples/facttables.rst: -------------------------------------------------------------------------------- 1 | .. _facttables: 2 | 3 | Fact Tables 4 | =========== 5 | pygrametl provides multiple classes for representing fact tables. These classes 6 | enable facts to be loaded one at a time, as batches stored in memory, or in 7 | bulk from a file on disk. Loading facts with missing information and 8 | then updating them later is also supported. For information about how to load 9 | facts in parallel, see :ref:`parallel`. In the following examples, we use 10 | PostgreSQL as the RDBMS and psycopg2 as the database driver. 11 | 12 | All of the following classes are currently implemented in the 13 | :mod:`.pygrametl.tables` module. 14 | 15 | FactTable 16 | --------- 17 | The most basic class for representing a fact table is :class:`.FactTable`. 18 | Before creating a :class:`.FactTable` object, an appropriate table must be 19 | created in the database, and a :pep:`249` connection to the database must be 20 | created and wrapped by the class :class:`.ConnectionWrapper`. For more 21 | information about how database connections are used in pygrametl, see 22 | :ref:`database`. The :class:`.FactTable` constructor must be given the table's 23 | :attr:`name`, the attributes used as :attr:`measures` in the fact table, and the 24 | attributes referencing dimensions (:attr:`keyrefs`). Be aware that 25 | :class:`.FactTable` performs an insert in the database whenever the 26 | :meth:`.FactTable.insert` method is called, which can very quickly become a 27 | bottleneck. 28 | 29 | .. 
code-block:: python 30 | 31 | import psycopg2 32 | import pygrametl 33 | from pygrametl.tables import FactTable 34 | 35 | # The actual database connection is handled by a PEP 249 connection 36 | pgconn = psycopg2.connect("""host='localhost' dbname='dw' user='dwuser' 37 | password='dwpass'""") 38 | 39 | # This ConnectionWrapper will be set as a default and is then implicitly 40 | # used, but it is stored in conn so transactions can be committed and the 41 | # connection closed 42 | conn = pygrametl.ConnectionWrapper(connection=pgconn) 43 | 44 | # This instance of FactTable connects to the table facttable in the 45 | # database using the default connection wrapper created above 46 | factTable = FactTable( 47 | name='facttable', 48 | measures=['price'], 49 | keyrefs=['storeid', 'productid', 'dateid']) 50 | 51 | The above example shows the three step process needed to connect an instance of 52 | :class:`.FactTable` to an existing database table. Firstly, a :PEP:`249` 53 | connection to the database is created. Then an instance of 54 | :class:`.ConnectionWrapper` is created to provide a uniform interface to all 55 | types of database connections supported by pygrametl. The instance of 56 | :class:`.ConnectionWrapper` is also set as the default database connection to 57 | use for this ETL flow. Lastly, a :class:`.FactTable` is created as a 58 | representation of the actual database table. 59 | 60 | Operations on the fact table are done using three methods: 61 | :meth:`.FactTable.insert` inserts new facts directly into the fact table when 62 | they are passed to the method. :meth:`.FactTable.lookup` returns a fact if the 63 | database contains one with the given combination of keys referencing the 64 | dimensions. :meth:`.FactTable.ensure` combines :meth:`.FactTable.lookup` and 65 | :meth:`.FactTable.insert` by ensuring that a fact does not exist before 66 | inserting it. 
An example of each method and the automatic name mapping can be 67 | seen below, where the fact table from the last example is reused. 68 | 69 | .. code-block:: python 70 | 71 | import psycopg2 72 | import pygrametl 73 | from pygrametl.tables import FactTable 74 | 75 | # The actual database connection is handled by a PEP 249 connection 76 | pgconn = psycopg2.connect("""host='localhost' dbname='dw' user='dwuser' 77 | password='dwpass'""") 78 | 79 | # This ConnectionWrapper will be set as a default and is then implicitly 80 | # used, but it is stored in conn so transactions can be committed and the 81 | # connection closed 82 | conn = pygrametl.ConnectionWrapper(connection=pgconn) 83 | 84 | # This instance of FactTable connects to the table facttable in the 85 | # database using the default connection wrapper created above 86 | factTable = FactTable( 87 | name='facttable', 88 | measures=['price'], 89 | keyrefs=['storeid', 'productid', 'dateid']) 90 | 91 | # A list of facts ready to be inserted into the fact table 92 | facts = [{'storeid': 1, 'productid': 13, 'dateid': 4, 'price': 50}, 93 | {'storeid': 2, 'productid': 7, 'dateid': 4, 'price': 75}, 94 | {'storeid': 1, 'productid': 7, 'dateid': 4, 'price': 50}, 95 | {'storeid': 3, 'productid': 9, 'dateid': 4, 'price': 25}] 96 | 97 | # The facts can be inserted using the insert method 98 | for row in facts: 99 | factTable.insert(row) 100 | conn.commit() 101 | 102 | # Lookup returns the keys and measures given only the keys 103 | row = factTable.lookup({'storeid': 1, 'productid': 13, 'dateid': 4}) 104 | 105 | # Ensure should be used when loading facts that might already be loaded 106 | newFacts = [{'storeid': 2, 'itemid': 7, 'dateid': 4, 'price': 75}, 107 | {'storeid': 1, 'itemid': 7, 'dateid': 4, 'price': 50}, 108 | {'storeid': 1, 'itemid': 2, 'dateid': 7, 'price': 150}, 109 | {'storeid': 3, 'itemid': 3, 'dateid': 6, 'price': 100}] 110 | 111 | for row in newFacts: 112 | # The second argument forces ensure to not only 
match the keys for facts 113 | # to be considered equal, but also to check that the measures are the same 114 | # for facts with the same key, raising a ValueError if they are not. The third 115 | # argument renames itemid to productid using a name mapping 116 | factTable.ensure(row, True, {'productid': 'itemid'}) 117 | conn.commit() 118 | conn.close() 119 | 120 | BatchFactTable 121 | -------------- 122 | :class:`.BatchFactTable` loads facts into the fact table in batches instead of 123 | one at a time like :class:`.FactTable`. This reduces the number of round trips 124 | to the database, which improves the performance of the ETL flow. The size of each 125 | batch is determined by the :attr:`batchsize` parameter added to the class's 126 | constructor. :class:`.BatchFactTable` loads each batch using either the 127 | :meth:`executemany` method specified in :pep:`249` or a single SQL ``INSERT INTO 128 | facttable VALUES(...)`` statement depending on the value passed to 129 | :attr:`usemultirow` in the class's constructor. The 130 | :meth:`.ConnectionWrapper.commit` method must be called after all facts have 131 | been inserted into the fact table to both ensure that the last batch is loaded 132 | into the database from memory and that the transaction is committed. 133 | 134 | .. note:: Both :meth:`.BatchFactTable.lookup` and :meth:`.BatchFactTable.ensure` 135 | force the current batch of facts to be inserted. This is to keep 136 | them consistent with the facts already inserted into the fact table. Thus 137 | using these methods can reduce the benefit of batching insertions. 138 | 139 | BulkFactTable 140 | ------------- 141 | :class:`.BulkFactTable` also inserts facts in batches but writes the facts to a 142 | temporary file instead of keeping them in memory. Thus the size of a batch is 143 | limited by the size of the disk instead of the amount of memory available. 
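Conceptually, each batch of facts is written to the temporary file as delimited text before the user-supplied bulk loader is invoked. The rendering can be sketched as follows; the tab and newline separators below are illustrative assumptions, not guaranteed defaults:

```python
# Sketch: render a batch of facts as delimited text, mirroring the kind of
# temporary file BulkFactTable hands to its bulkloader function
facts = [{'storeid': 1, 'productid': 13, 'dateid': 4, 'price': 50},
         {'storeid': 2, 'productid': 7, 'dateid': 4, 'price': 75}]
attributes = ['storeid', 'productid', 'dateid', 'price']
fieldsep, rowsep = '\t', '\n'

# One delimited line per fact, with the attributes in a fixed order
rendered = rowsep.join(
    fieldsep.join(str(row[a]) for a in attributes) for row in facts)
```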
 144 | However, this prevents :meth:`BulkFactTable.lookup` and 145 | :meth:`BulkFactTable.ensure` from being implemented efficiently, so these 146 | methods are not available. Like for :class:`.BatchFactTable`, the method 147 | :meth:`.ConnectionWrapper.commit` must be called to ensure that the last batch 148 | of facts is loaded into the database. Multiple additional parameters have been 149 | added to the class's constructor to provide control over the temporary file used 150 | to store facts, such as what delimiters to use and the number of facts to be 151 | bulk loaded in each batch. All of these parameters have a default value except 152 | for :attr:`.bulkloader`. This parameter must be passed a function that will be 153 | called for each batch of facts to be loaded. This is necessary as the exact way 154 | to perform bulk loading differs from RDBMS to RDBMS. 155 | 156 | .. py:function:: func(name, attributes, fieldsep, rowsep, nullval, filehandle): 157 | 158 | Required signature of a function that bulk loads data from a file into an RDBMS 159 | in pygrametl. For more information about bulk loading see 160 | :ref:`bulkloading`. 161 | 162 | **Arguments:** 163 | 164 | - name: the name of the fact table in the data warehouse. 165 | - attributes: a list containing the attributes constituting 166 | the primary key of the fact table, as well as the measures. 167 | - fieldsep: the string used to separate fields in the temporary file. 168 | - rowsep: the string used to separate rows in the temporary file. 169 | - nullval: the string the :class:`.BulkFactTable` was given to substitute 170 | None values with, or None if no substitute string was given. 171 | - filehandle: either the name of the file or the file object itself, 172 | depending upon the value of :attr:`.BulkFactTable.usefilename`. Using 173 | the filename is necessary if the bulk loading is invoked through SQL 174 | (instead of directly via a method on the :pep:`249` driver). 
It is also 175 | necessary if the bulkloader runs in another process. 176 | 177 | 178 | In the following example, a :class:`.BulkFactTable` is used to bulk load facts 179 | into a data warehouse using the function :func:`pgbulkloader`. For information about 180 | how to bulk load data into other RDBMSs see :ref:`bulkloading`. 181 | 182 | .. code-block:: python 183 | 184 | import psycopg2 185 | import pygrametl 186 | from pygrametl.tables import BulkFactTable 187 | 188 | pgconn = psycopg2.connect("""host='localhost' dbname='dw' user='dwuser' 189 | password='dwpass'""") 190 | 191 | conn = pygrametl.ConnectionWrapper(connection=pgconn) 192 | 193 | facts = [{'storeid': 1, 'productid': 13, 'dateid': 4, 'price': 50}, 194 | {'storeid': 2, 'productid': 7, 'dateid': 4, 'price': 75}, 195 | {'storeid': 1, 'productid': 7, 'dateid': 4, 'price': 50}, 196 | {'storeid': 3, 'productid': 9, 'dateid': 4, 'price': 25}] 197 | 198 | 199 | # This function bulk loads a file into PostgreSQL using psycopg2 200 | def pgbulkloader(name, attributes, fieldsep, rowsep, nullval, filehandle): 201 | cursor = conn.cursor() 202 | # psycopg2 does not accept the default value used to represent NULL 203 | # by BulkFactTable, which is None. 
Here this is ignored as we have no 204 | # NULL values that we wish to substitute for a more descriptive value 205 | cursor.copy_from(file=filehandle, table=name, sep=fieldsep, 206 | columns=attributes) 207 | 208 | 209 | # The bulk loading function must be passed to BulkFactTable's constructor 210 | factTable = BulkFactTable( 211 | name='facttable', 212 | measures=['price'], 213 | keyrefs=['storeid', 'productid', 'dateid'], 214 | bulkloader=pgbulkloader) 215 | 216 | # commit() and close() must be called to ensure that all facts have been 217 | # inserted into the database and that the connection is closed correctly 218 | # afterward 219 | for row in facts: 220 | factTable.insert(row) 221 | conn.commit() 222 | conn.close() 223 | 224 | AccumulatingSnapshotFactTable 225 | ----------------------------- 226 | :class:`.AccumulatingSnapshotFactTable` represents a fact table where facts are 227 | updated as a process evolves. Typically different date references (OrderDate, 228 | PaymentDate, ShipDate, DeliveryDate, etc.) are set when they become known. 229 | Measures (e.g., measuring the lag between the different dates) are also often 230 | set as they become available. Like for :class:`.FactTable`, the class 231 | :class:`.AccumulatingSnapshotFactTable` performs an insert in the database 232 | whenever the :meth:`.AccumulatingSnapshotFactTable.insert` method is called. The 233 | following example illustrates how to create the class: 234 | 235 | .. 
code-block:: python 236 | 237 | import psycopg2 238 | import pygrametl 239 | from pygrametl.tables import AccumulatingSnapshotFactTable 240 | 241 | # The actual database connection is handled by a PEP 249 connection 242 | pgconn = psycopg2.connect("""host='localhost' dbname='dw' user='dwuser' 243 | password='dwpass'""") 244 | 245 | # This ConnectionWrapper will be set as a default and is then implicitly 246 | # used, but it is stored in conn so transactions can be committed and the 247 | # connection closed 248 | conn = pygrametl.ConnectionWrapper(connection=pgconn) 249 | 250 | 251 | # A factexpander can be used to modify a row only if it has been updated, note 252 | # that we ignore namemapping only for brevity; production code should use it 253 | def computelag(row, namemapping, updated): 254 | if 'shipmentdateid' in updated: 255 | row['shipmentlag'] = row['shipmentdateid'] - row['paymentdateid'] 256 | if 'deliverydateid' in updated: 257 | row['deliverylag'] = row['deliverydateid'] - row['shipmentdateid'] 258 | 259 | 260 | # This instance of AccumulatingSnapshotFactTable connects to the table 261 | # orderprocessing in the database using the connection created above 262 | asft = AccumulatingSnapshotFactTable( 263 | name='orderprocessing', 264 | keyrefs=['orderid', 'customerid', 'productid'], 265 | otherrefs=['paymentdateid', 'shipmentdateid', 'deliverydateid'], 266 | measures=['price', 'shipmentlag', 'deliverylag'], 267 | factexpander=computelag) 268 | 269 | Firstly a :PEP:`249` connection is created to perform the actual database 270 | operations, then an instance of :class:`.ConnectionWrapper` is created as a 271 | uniform wrapper around the :PEP:`249` connection and set as the default database 272 | connection for this ETL flow. Then a user-defined function to compute lag measures 273 | is defined. Lastly, an :class:`.AccumulatingSnapshotFactTable` is created. 
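As with :class:`.FactTable`, the orderprocessing table must already exist in the database before the class is instantiated. A minimal sketch of a matching table is shown below; the column types are assumptions for illustration, and an in-memory SQLite database is used here only so the sketch is self-contained:

```python
import sqlite3

# DDL matching the keyrefs, otherrefs, and measures used above;
# the column types are assumptions for illustration
ddl = """CREATE TABLE orderprocessing (
    orderid INTEGER NOT NULL,
    customerid INTEGER NOT NULL,
    productid INTEGER NOT NULL,
    paymentdateid INTEGER,
    shipmentdateid INTEGER,
    deliverydateid INTEGER,
    price INTEGER,
    shipmentlag INTEGER,
    deliverylag INTEGER,
    PRIMARY KEY (orderid, customerid, productid)
)"""

conn = sqlite3.connect(':memory:')
conn.execute(ddl)
```

The composite primary key also gives the :attr:`keyrefs` an index, which benefits the lookups and updates that use them in the ``WHERE`` clause.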
 274 | 275 | As stated, :meth:`.AccumulatingSnapshotFactTable.insert` inserts new facts 276 | directly into the fact table when they are passed to the method. 277 | :meth:`.AccumulatingSnapshotFactTable.lookup` checks if the database contains a 278 | fact with the given combination of keys referencing the dimensions. These 279 | methods behave in the same way as in :class:`.FactTable`. The method 280 | :meth:`.AccumulatingSnapshotFactTable.update` will, based on the :attr:`keyrefs`, 281 | find the fact and update it if there are any differences in :attr:`otherrefs` 282 | and :attr:`measures`. The method :meth:`.AccumulatingSnapshotFactTable.ensure` 283 | checks if the row it is given already exists in the database table. If it does 284 | not exist, it is immediately inserted. If it exists, the method will see if some 285 | of the values for :attr:`otherrefs` or :attr:`measures` have been updated in the 286 | passed row. If so, it will update the row in the database. Before that, it will, 287 | however, run the :func:`factexpander` if one was given to 288 | :meth:`.AccumulatingSnapshotFactTable.__init__` when the object was created. 289 | Note that the generated SQL for lookups and updates will use the :attr:`keyrefs` 290 | in the ``WHERE`` clause and an index on them should be considered. An example of 291 | how to use the class can be seen below: 292 | 293 | .. 
code-block:: python 294 | 295 | import psycopg2 296 | import pygrametl 297 | from pygrametl.tables import AccumulatingSnapshotFactTable 298 | 299 | # The actual database connection is handled by a PEP 249 connection 300 | pgconn = psycopg2.connect("""host='localhost' dbname='dw' user='dwuser' 301 | password='dwpass'""") 302 | 303 | # This ConnectionWrapper is set as the default and is stored in conn so 304 | # transactions can be committed and the connection closed 305 | conn = pygrametl.ConnectionWrapper(connection=pgconn) 306 | 307 | 308 | # A factexpander can be used to modify a row only if it has been updated, note 309 | # that we ignore namemapping only for brevity; production code should use it 310 | def computelag(row, namemapping, updated): 311 | if 'shipmentdateid' in updated: 312 | row['shipmentlag'] = row['shipmentdateid'] - row['paymentdateid'] 313 | if 'deliverydateid' in updated: 314 | row['deliverylag'] = row['deliverydateid'] - row['shipmentdateid'] 315 | 316 | 317 | # This instance of AccumulatingSnapshotFactTable connects to the table 318 | # orderprocessing in the database using the connection created above 319 | asft = AccumulatingSnapshotFactTable( 320 | name='orderprocessing', 321 | keyrefs=['orderid', 'customerid', 'productid'], 322 | otherrefs=['paymentdateid', 'shipmentdateid', 'deliverydateid'], 323 | measures=['price', 'shipmentlag', 'deliverylag'], 324 | factexpander=computelag) 325 | 326 | # A list of facts that are ready to be inserted into the fact table 327 | facts = [{'orderid': 1, 'customerid': 1, 'productid': 1, 'price': 10}, 328 | {'orderid': 2, 'customerid': 2, 'productid': 2, 'price': 20}, 329 | {'orderid': 3, 'customerid': 3, 'productid': 3, 'price': 30}] 330 | 331 | # The facts can be inserted using the ensure method. (If we had used the 332 | # insert method instead, we should have made sure the facts above had a 333 | # value for each attribute in the fact table. 
When using ensure, missing 334 | # attributes will be set to None before an insertion.) 335 | for row in facts: 336 | asft.ensure(row) 337 | 338 | # Now assume that the orders get paid and shipped 339 | facts[0]['paymentdateid'] = 12 340 | facts[0]['shipmentdateid'] = 14 341 | facts[2]['paymentdateid'] = 11 342 | 343 | # Update the accumulating fact table in the DW 344 | for row in facts: 345 | asft.ensure(row) # will call computelag and do the needed updates 346 | 347 | conn.commit() 348 | conn.close() 349 | -------------------------------------------------------------------------------- /docs/examples/datasources.rst: -------------------------------------------------------------------------------- 1 | .. _datasources: 2 | 3 | Data Sources 4 | ============ 5 | pygrametl supports numerous data sources, which are iterable classes that 6 | produce rows. A row is a Python :class:`.dict` where the keys are the names of 7 | the columns in the table where the row is from, and the values are the data 8 | stored in that row. Users can easily implement new data sources by implementing 9 | a version of the :meth:`__iter__` method that returns :class:`.dict`. As data 10 | sources are iterable, they can, e.g., be used in a loop as shown below: 11 | 12 | .. code-block:: python 13 | 14 | for row in datasource: 15 | ... 16 | 17 | While users can define their own data sources, pygrametl includes a number of 18 | commonly used data sources: 19 | 20 | SQLSource 21 | --------- 22 | :class:`.SQLSource` is a data source used to iterate over the results of a 23 | single SQL query. The data source's constructor must be passed a :PEP:`249` 24 | connection and not a :class:`.ConnectionWrapper`. As an example, a PostgreSQL 25 | connection created using the psycopg2 package is used below: 26 | 27 | .. 
code-block:: python 28 | 29 | import psycopg2 30 | from pygrametl.datasources import SQLSource 31 | 32 | conn = psycopg2.connect(database='db', user='dbuser', password='dbpass') 33 | sqlSource = SQLSource(connection=conn, query='SELECT * FROM table') 34 | 35 | In the above example, an :class:`.SQLSource` is created in order to extract all 36 | rows from the table named table. 37 | 38 | A tuple of strings can also optionally be supplied as the parameter :attr:`.names`, to 39 | automatically rename the elements in the query results. Naturally, the number of 40 | supplied names must match the number of elements in the result: 41 | 42 | .. code-block:: python 43 | 44 | import psycopg2 45 | from pygrametl.datasources import SQLSource 46 | 47 | conn = psycopg2.connect(database='db', user='dbuser', password='dbpass') 48 | sqlSource = SQLSource(connection=conn, query='SELECT * FROM table', 49 | names=('id', 'name', 'price')) 50 | 51 | :class:`.SQLSource` also makes it possible to supply an SQL expression that will 52 | be executed before the query, through the :attr:`.initsql` parameter. The result 53 | of the expression will not be returned. In the example below a new view is 54 | created and then used in the query: 55 | 56 | .. code-block:: python 57 | 58 | import psycopg2 59 | from pygrametl.datasources import SQLSource 60 | 61 | conn = psycopg2.connect(database='db', user='dbuser', password='dbpass') 62 | sqlSource = SQLSource(connection=conn, query='SELECT * FROM view', 63 | initsql='CREATE VIEW view AS SELECT id, name FROM table WHERE price > 10') 64 | 65 | CSVSource 66 | --------- 67 | :class:`.CSVSource` is a data source that returns a row for each line in a 68 | character-separated file. It is an alias for Python's `csv.DictReader 69 | `__ as it already is 70 | iterable and returns :class:`.dict`. An example of how to use 71 | :class:`.CSVSource` to read a file containing comma-separated values is shown 72 | below: 73 | 74 | .. 
code-block:: python 75 | 76 | from pygrametl.datasources import CSVSource 77 | 78 | # ResultsFile.csv contains: name,age,score 79 | csvSource = CSVSource(f=open('ResultsFile.csv', 'r', 16384), delimiter=',') 80 | 81 | In the above example, a :class:`.CSVSource` is initialized with a file handle 82 | that uses a buffer size of 16384. This particular buffer size is used as it 83 | performed better than the alternatives we evaluated it against. 84 | 85 | TypedCSVSource 86 | -------------- 87 | :class:`.TypedCSVSource` extends :class:`.CSVSource` with typecasting by 88 | wrapping `csv.DictReader 89 | `__ instead of simply 90 | being an alias. 91 | 92 | .. code-block:: python 93 | 94 | from pygrametl.datasources import TypedCSVSource 95 | 96 | # ResultsFile.csv contains: name,age,score 97 | typedCSVSource = TypedCSVSource(f=open('ResultsFile.csv', 'r', 16384), 98 | casts={'age': int, 'score': float}, 99 | delimiter=',') 100 | 101 | In the above example, a :class:`.TypedCSVSource` is initialized with a file 102 | handle that uses a buffer size of 16384. This particular buffer size is used as 103 | it performed better than the alternatives we evaluated it against. A dictionary 104 | is also passed which provides information about what type each column should be 105 | cast to. A cast is not performed for the name column as :class:`.TypedCSVSource` 106 | uses :class:`.str` as the default. 107 | 108 | PandasSource 109 | ------------- 110 | :class:`.PandasSource` wraps a Pandas DataFrame so it can be used as a data 111 | source. The class reuses existing functionality provided by `DataFrame 112 | `__. An 113 | example of how to use this class can be seen below. In this example data is 114 | loaded from a spreadsheet, then transformed using a Pandas DataFrame, and lastly 115 | converted to an iterable that produces :class:`.dict` for use with pygrametl: 116 | 117 | .. 
code-block:: python 118 | 119 | import pandas 120 | from pygrametl.datasources import PandasSource 121 | 122 | df = pandas.read_excel('Revenue.xls') 123 | df['price'] = df['price'].apply(lambda p: float(p) / 7.46) 124 | pandasSource = PandasSource(df) 125 | 126 | In the above example, a Pandas DataFrame is created from a spreadsheet 127 | containing revenue from some form of sales. Afterwards, the price 128 | column is transformed using one of the higher-order functions built into the 129 | Pandas package. Lastly, so the data can be loaded into a data warehouse using 130 | pygrametl, a :class:`.PandasSource` is created with the DataFrame as an 131 | argument, making the rows of the DataFrame accessible through a data source. 132 | 133 | MergeJoiningSource 134 | ------------------ 135 | In addition to the above data sources which read data from external sources, 136 | pygrametl also includes a number of data sources that take other data sources as 137 | input to transform and/or combine them. 138 | 139 | :class:`.MergeJoiningSource` can be used to equijoin the rows from two data 140 | sources. The rows of the two data sources must be delivered in sorted order. The 141 | shared attributes on which the rows are to be joined must also be given. 142 | 143 | .. code-block:: python 144 | 145 | from pygrametl.datasources import CSVSource, MergeJoiningSource 146 | 147 | products = CSVSource(f=open('products.csv', 'r', 16384), delimiter=',') 148 | sales = CSVSource(f=open('sales.csv', 'r', 16384), delimiter='\t') 149 | mergeJoiningSource = MergeJoiningSource(src1=products, key1='productid', 150 | src2=sales, key2='productid') 151 | 152 | In the above example, a :class:`.MergeJoiningSource` is used to join two data 153 | sources on their shared attribute productid. 154 | 155 | HashJoiningSource 156 | ----------------- 157 | :class:`.HashJoiningSource` functions similarly to :class:`.MergeJoiningSource`, 158 | but it performs the join using a hash map. 
Thus the two input data sources need 159 | not produce their rows in sorted order. 160 | 161 | .. code-block:: python 162 | 163 | from pygrametl.datasources import CSVSource, HashJoiningSource 164 | 165 | products = CSVSource(f=open('products.csv', 'r', 16384), delimiter=',') 166 | sales = CSVSource(f=open('sales.csv', 'r', 16384), delimiter='\t') 167 | hashJoiningSource = HashJoiningSource(src1=products, key1='productid', 168 | src2=sales, key2='productid') 169 | 170 | UnionSource 171 | ----------- 172 | The class :class:`.UnionSource` creates a union of the supplied data 173 | sources. :class:`.UnionSource` does not require that the input data sources all 174 | produce rows containing the same attributes, which also means that a 175 | :class:`.UnionSource` does not guarantee that all of the rows it produces 176 | contain the same attributes. 177 | 178 | .. code-block:: python 179 | 180 | from pygrametl.datasources import CSVSource, UnionSource 181 | 182 | salesOne = CSVSource(f=open('sales1.csv', 'r', 16384), delimiter='\t') 183 | salesTwo = CSVSource(f=open('sales2.csv', 'r', 16384), delimiter='\t') 184 | salesThree = CSVSource(f=open('sales3.csv', 'r', 16384), delimiter='\t') 185 | 186 | combinedSales = UnionSource(salesOne, salesTwo, salesThree) 187 | 188 | Each data source is exhausted before the next data source is read. This means 189 | that all rows are read from the first data source before any rows are read from 190 | the second data source, and so on. 191 | 192 | RoundRobinSource 193 | ---------------- 194 | It can also be beneficial to interleave rows, and for this purpose, 195 | :class:`.RoundRobinSource` can be used. 196 | 197 | .. 
code-block:: python 198 | 199 | from pygrametl.datasources import CSVSource, RoundRobinSource 200 | 201 | salesOne = CSVSource(f=open('sales1.csv', 'r', 16384), delimiter='\t') 202 | salesTwo = CSVSource(f=open('sales2.csv', 'r', 16384), delimiter='\t') 203 | salesThree = CSVSource(f=open('sales3.csv', 'r', 16384), delimiter='\t') 204 | 205 | combinedSales = RoundRobinSource((salesOne, salesTwo, salesThree), 206 | batchsize=500) 207 | 208 | In the above example, :class:`.RoundRobinSource` is given a number of data 209 | sources, and the argument :attr:`.batchsize`, which is the number of rows to be 210 | read from one data source before reading from the next in a round-robin fashion. 211 | 212 | ProcessSource 213 | ------------- 214 | :class:`.ProcessSource` is used for iterating over a data source using a 215 | separate worker process or thread. The worker reads data from the input data 216 | source and creates batches of rows. When a batch is complete, it is added to a 217 | queue so it can be consumed by another process or thread. If the queue is full, 218 | the worker blocks until an element is removed from the queue. The sizes of the 219 | batches and the queue are optional parameters, but tuning them can often improve 220 | throughput. For more examples of the parallel features provided by pygrametl see 221 | :doc:`parallel`. 222 | 223 | .. code-block:: python 224 | 225 | from pygrametl.datasources import CSVSource, ProcessSource 226 | 227 | sales = CSVSource(f=open('sales.csv', 'r', 16384), delimiter='\t') 228 | processSource = ProcessSource(source=sales, batchsize=1000, queuesize=20) 229 | 230 | FilteringSource 231 | --------------- 232 | :class:`.FilteringSource` is used to apply a filter to a data source. By 233 | default, the built-in Python function `bool 234 | `__ is used, which can be 235 | used to remove empty rows. 
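When no filter function is supplied, each row is simply passed to `bool`, so rows that are empty :class:`.dict` objects are dropped. This default behaviour can be sketched with plain dictionaries (the row values below are invented for illustration):

```python
# Sketch: bool() is falsy for an empty dict, so filtering rows with it
# drops empty rows while keeping all rows that contain any data
rows = [{'location': 'Aalborg'}, {}, {'location': 'Skagen'}]
nonempty = [row for row in rows if bool(row)]
```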
Alternatively, the user can supply a custom filter 236 | function, a callable :attr:`f(row)` that returns 237 | :attr:`True` when a row should be passed on. In the example below, rows are 238 | removed if the value of their location attribute is not Aalborg. 239 | 240 | .. code-block:: python 241 | 242 | from pygrametl.datasources import CSVSource, FilteringSource 243 | 244 | 245 | def locationfilter(row): 246 | return row['location'] == 'Aalborg' 247 | 248 | 249 | sales = CSVSource(f=open('sales.csv', 'r', 16384), delimiter='\t') 250 | salesFiltered = FilteringSource(source=sales, filter=locationfilter) 251 | 252 | MappingSource 253 | ------------- 254 | :class:`.MappingSource` can be used to apply functions to the columns of a data 255 | source. It can be given a dictionary where the keys are the columns and the 256 | values are callable functions of the form :attr:`f(val)`. The functions will be 257 | applied to the attributes in an undefined order. In the example below, a 258 | function is used to cast all values for the attribute price to integers while 259 | rows are being read from a CSV file. 260 | 261 | .. code-block:: python 262 | 263 | from pygrametl.datasources import CSVSource, MappingSource 264 | 265 | sales = CSVSource(f=open('sales.csv', 'r', 16384), delimiter=',') 266 | salesMapped = MappingSource(source=sales, callables={'price': int}) 267 | 268 | TransformingSource 269 | ------------------ 270 | :class:`.TransformingSource` can be used to apply functions to the rows of a 271 | data source. The class can be supplied with a number of callable functions of 272 | the form :attr:`f(row)`, which will be applied to the source in the given order. 273 | 274 | .. 
code-block:: python 275 | 276 | import pygrametl 277 | from pygrametl.datasources import CSVSource, TransformingSource 278 | 279 | 280 | def dkk_to_eur(row): 281 | price_as_a_number = int(row['price']) 282 | row['dkk'] = price_as_a_number 283 | row['eur'] = price_as_a_number / 7.43 284 | 285 | 286 | sales = CSVSource(f=open('sales.csv', 'r', 16384), delimiter=',') 287 | salesTransformed = TransformingSource(sales, dkk_to_eur) 288 | 289 | In the above example, the price is converted from a string to an integer and 290 | stored in the row as two currencies. 291 | 292 | SQLTransformingSource 293 | --------------------- 294 | :class:`.SQLTransformingSource` can be used to transform the rows of a data 295 | source using SQL. :class:`.SQLTransformingSource` loads the rows into a 296 | temporary table in an RDBMS and then retrieves them using an SQL query. By 297 | default each :class:`.SQLTransformingSource` uses a separate in-memory SQLite 298 | database but another database can be used by passing a :PEP:`249` connection or 299 | one of the :class:`.ConnectionWrapper` types as the parameter 300 | :attr:`.targetconnection`. By using an on-disk database 301 | :class:`.SQLTransformingSource` can be used with datasets that do not fit in 302 | memory. If an existing database is used the rows from the data source can also 303 | be enriched using data from other tables in the database, e.g., by joining the 304 | rows with an existing table in the database. Be aware that 305 | :class:`.SQLTransformingSource` creates, empties, and drops the temporary table. 306 | 307 | .. 
code-block:: python 308 | 309 | import pygrametl 310 | from pygrametl.datasources import TypedCSVSource, SQLTransformingSource 311 | 312 | sales = TypedCSVSource(f=open('sales.csv', 'r', 16384), 313 | casts={'price': int}, delimiter=',') 314 | 315 | salesTransformed = SQLTransformingSource(sales, 316 | "sales", "SELECT product, SUM(price) FROM sales GROUP BY product") 317 | 318 | In the above example, the total revenue is computed for each product. First, a 319 | temporary in-memory SQLite database is created. Then a temporary table named 320 | sales with the same schema as the rows in :attr:`sales` is created. Finally, the 321 | rows in :attr:`sales` are loaded into the temporary table in batches and then the 322 | final result is produced by executing the provided SQL query. In addition to 323 | the required parameters shown above, :class:`.SQLTransformingSource` also has 324 | multiple optional parameters, e.g., :attr:`.extendedcasts` accepts a 325 | :class:`.dict` that specifies how Python types should be mapped to SQL types, 326 | :attr:`.perbatch` specifies if the transformation should be applied for each 327 | batch of rows or for all rows in the input data source, and :attr:`.columnnames` 328 | allows the columns in the output rows to be renamed. 329 | 330 | CrossTabbingSource 331 | ------------------ 332 | :class:`.CrossTabbingSource` can be used to compute the cross tab of a data 333 | source. The class takes as parameters the names of the attributes that are to 334 | appear as rows and columns in the crosstab, as well as the name of the attribute 335 | to aggregate. By default, the values are aggregated using 336 | :class:`.pygrametl.aggregators.Sum`, but the class also accepts an alternate 337 | aggregator from the module :mod:`pygrametl.aggregators`. 338 | 339 | .. 
code-block:: python 340 | 341 | from pygrametl.datasources import CSVSource, CrossTabbingSource, \ 342 | TransformingSource 343 | from pygrametl.aggregators import Avg 344 | 345 | 346 | def dkk_to_eur(row): 347 | price_as_a_number = int(row['price']) 348 | row['dkk'] = price_as_a_number 349 | row['eur'] = price_as_a_number / 7.43 350 | 351 | 352 | sales = CSVSource(f=open('sales.csv', 'r', 16384), delimiter=',') 353 | salesTransformed = TransformingSource(sales, dkk_to_eur) 354 | 355 | crossTab = CrossTabbingSource(source=salesTransformed, rowvaluesatt='product', 356 | colvaluesatt='location', values='eur', 357 | aggregator=Avg()) 358 | 359 | In the above example, a crosstab is made from a table containing sales data in 360 | order to view the average price of products across different locations. 361 | :class:`.TransformingSource` is used to parse and convert the price from DKK to EUR. 362 | 363 | DynamicForEachSource 364 | -------------------- 365 | :class:`.DynamicForEachSource` is a data source that, for each element in an 366 | input sequence, creates a new data source and iterates over it. 367 | To create the new data sources the 368 | user must provide a function that, when called with a single argument, returns a 369 | new data source. In the example below, :class:`.DynamicForEachSource` is used to 370 | create a :class:`.CSVSource` for each of the CSV files in a directory. The 371 | :class:`.DynamicForEachSource` stores the input list in a multiprocessing-safe 372 | queue, and as such the :class:`.DynamicForEachSource` instance can be given to 373 | several :class:`.ProcessSource` instances. For information about pygrametl's parallel 374 | features see :doc:`parallel`. 375 | 376 | .. 
code-block:: python 377 | 378 | import glob 379 | from pygrametl.datasources import CSVSource, DynamicForEachSource 380 | 381 | 382 | def createCSVSource(filename): 383 | return CSVSource(f=open(filename, 'r', 16384), delimiter=',') 384 | 385 | 386 | salesFiles = glob.glob('sales/*.csv') 387 | combinedSales = DynamicForEachSource(seq=salesFiles, callee=createCSVSource) 388 | -------------------------------------------------------------------------------- /docs/_static/example.svg: -------------------------------------------------------------------------------- [SVG diagram: a star schema in which a Sale fact table (BookID, LocationID, TimeID as primary keys; Sale: Int) references the dimensions Time (TimeID: Int (PK), Day: Int, Month: Int, Year: Int), Location (LocationID: Int (PK), City: Text, Region: Text), and Book (BookID: Int (PK), Book: Text, Genre: Text).] --------------------------------------------------------------------------------