├── ETC
    ├── CCMC
    │   ├── __init__.py
    │   └── pairs_parallel.py
    ├── LZ76
    │   ├── __init__.py
    │   ├── lzc.py
    │   └── core.pyx
    ├── tests
    │   ├── __init__.py
    │   ├── test_recode.py
    │   ├── test_NSRWS1D.py
    │   └── test_NSRWS2D.py
    ├── NSRWS
    │   ├── __init__.py
    │   ├── x1D
    │   │   ├── __init__.py
    │   │   ├── distance.py
    │   │   ├── core.pyx
    │   │   ├── parallel.py
    │   │   └── onestep.py
    │   └── x2D
    │   │   ├── __init__.py
    │   │   ├── parallel.py
    │   │   ├── core.pyx
    │   │   └── onestep.py
    ├── seq
    │   ├── __init__.py
    │   ├── IO.py
    │   ├── check.py
    │   ├── estimates.pyx
    │   ├── process.py
    │   ├── recode.py
    │   └── markov.py
    ├── NCA
    │   ├── __init__.py
    │   ├── compute.py
    │   ├── parallelize_jl.py
    │   └── parallelize_mp.py
    ├── CCC
    │   ├── __init__.py
    │   ├── simulate_AR.py
    │   ├── compute_CCC.py
    │   ├── simulate_TentMap.py
    │   ├── calibrate_CCC.py
    │   └── _calibrate_CCC.py
    └── __init__.py
├── setup.py
├── .gitignore
├── demo.py
├── LICENSE
└── README.md


/ETC/CCMC/__init__.py:
--------------------------------------------------------------------------------
1 | #


--------------------------------------------------------------------------------
/ETC/LZ76/__init__.py:
--------------------------------------------------------------------------------
1 | #


--------------------------------------------------------------------------------
/ETC/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/ETC/NSRWS/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | 


--------------------------------------------------------------------------------
/ETC/seq/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | 


--------------------------------------------------------------------------------
/ETC/NSRWS/x1D/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | 


--------------------------------------------------------------------------------
/ETC/NSRWS/x2D/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | 


--------------------------------------------------------------------------------
/ETC/NCA/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | from ETC.NCA.compute import compute_CCC, compute_CCM, get_causal, get_NCA
3 | 


--------------------------------------------------------------------------------
/ETC/CCC/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | from ETC.CCC.compute_CCC import compute, get_params
3 | from ETC.CCC.simulate_AR import coupled_AR
4 | 


--------------------------------------------------------------------------------
/ETC/LZ76/lzc.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | 
 5 | 
 6 | @author: Pranay S. Yadav
 7 | """
 8 | from ETC.LZ76 import core
 9 | from ETC.seq.recode import cast
10 | from ETC.seq.check import arraytype
11 | 
12 | 
def compute_complexity(seq):
    """
    Compute the Lempel-Ziv (LZ76) complexity of a sequence.

    Parameters
    ----------
    seq : array.array, numpy.ndarray, list or tuple
        Input sequence. Anything that does not pass ``arraytype`` is first
        converted with ``cast``.

    Returns
    -------
    int or None
        Value returned by ``core.lzc_a``; 2 when all elements are identical;
        None when the input could not be cast.
    """
    # Only convert inputs that are not already of the accepted array type
    candidate = seq if arraytype(seq) else cast(seq)

    # A failed cast is signalled by None, pass that on to the caller
    if candidate is None:
        return None

    # Constant sequences are answered directly (LZ76 of such inputs is 2)
    if core.check_equality(candidate):
        print("> All elements in sequence are equal!")
        return 2

    # Everything else goes to the Cython implementation
    return core.lzc_a(candidate)
28 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | 
 5 | 
 6 | @author: Pranay S. Yadav
 7 | """
 8 | 
 9 | from setuptools import setup, find_packages
10 | from Cython.Build import cythonize
11 | import numpy
12 | 
# Package build configuration: the Cython sources listed below are compiled
# into extension modules, and the Python packages are discovered automatically.
setup(
    ext_modules=cythonize(
        [
            # One entry per Cython source file that has to be compiled
            "./ETC/NSRWS/x1D/core.pyx",
            "./ETC/NSRWS/x2D/core.pyx",
            "./ETC/seq/estimates.pyx",
            "./ETC/LZ76/core.pyx",
        ],
        annotate=False,
        compiler_directives={"language_level": "3"},
    ),
    # Header directory reported by NumPy, needed by sources that cimport numpy
    include_dirs=[numpy.get_include()],
    name="ETCPy",
    version="1.3.5",
    author_email="mail@pranaysy.com",
    description="Compute the Effort-To-Compress (ETC) of a symbolic sequence",
    packages=find_packages(),
    license="Apache License, Version 2.0",
)
32 | 


--------------------------------------------------------------------------------
/ETC/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | __import__("pkg_resources").declare_namespace(__name__)
 4 | 
 5 | from ETC.seq.IO import read, save
 6 | from ETC.seq.process import generate, entropy
 7 | from ETC.seq.recode import cast, recode_lexical, partition, partition_numpy
 8 | from ETC.seq import check
 9 | 
10 | from ETC.NSRWS.x1D.etc import compute as compute_1D
11 | 
12 | # from ETC.NSRWS.x1D.etc import compute_save as compute_save_1D
13 | # from ETC.NSRWS.x1D.onestep import onestep as onestep_1D
14 | 
15 | from ETC.NSRWS.x2D.etc import compute as compute_2D
16 | 
17 | # from ETC.NSRWS.x2D.etc import compute_save as compute_save_2D
18 | # from ETC.NSRWS.x2D.onestep import onestep as onestep_2D
19 | 
20 | # from ETC.CCC.compute_CCC import compute as compute_CCC
21 | 
22 | 
23 | from ETC.NSRWS.x1D.parallel import (
24 |     pcompute_multiple_seq,
25 |     pcompute_single,
26 |     pcompute_files,
27 |     pcompute_numpy,
28 | )
29 | 
30 | from ETC.LZ76.lzc import compute_complexity as LZC
31 | from ETC.CCMC.pairs import CCM_causality
32 | from ETC.CCMC.pairs_parallel import parallelized as CCM_causality_parallel
33 | from ETC.CCMC.pairs_parallel import get_rowpairs
34 | 


--------------------------------------------------------------------------------
/ETC/seq/IO.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | This module contains helper functions for reading and writing files.
 5 | 
 6 | @author: Pranay S. Yadav
 7 | """
 8 | 
 9 | from csv import DictWriter
10 | 
11 | # Import functions from standard library modules
12 | from pathlib import Path
13 | 
14 | 
def populate_files(filepath, suffix="*.txt"):
    """
    Recursively list the files below a directory that match a glob pattern.

    Parameters
    ----------
    filepath : str or pathlib.Path
        Directory to search.
    suffix : str, optional
        Glob pattern handed to ``Path.rglob``. The default is "*.txt".

    Returns
    -------
    generator or None
        Lazy iterator over matching paths, or None (after printing a message)
        when ``filepath`` is not an existing directory.
    """
    root = filepath if isinstance(filepath, Path) else Path(filepath)

    # Guard clause: anything that is not an existing directory is rejected
    if not (root.exists() and root.is_dir()):
        print("Invalid path")
        return None

    return root.rglob(suffix)
24 | 
25 | 
def read(filepath, delimiter=None):
    """
    Read a text file and optionally strip a delimiter from its contents.

    Parameters
    ----------
    filepath : str or pathlib.Path
        File to read.
    delimiter : str, optional
        When given (and non-empty), every occurrence is removed from the text.
        The default is None.

    Returns
    -------
    str
        Contents of the file, with the delimiter removed if one was supplied.
    """
    source = filepath if isinstance(filepath, Path) else Path(filepath)
    contents = source.read_text()

    # Nothing to strip: hand back the text untouched
    if not delimiter:
        return contents

    return contents.replace(delimiter, "")
35 | 
36 | 
def save(out, filename):
    """
    Write a collection of dictionaries to disk as a comma-separated file.

    Parameters
    ----------
    out : list or tuple of dict
        Rows to write. The keys of the first row define the header.
    filename : str or path-like
        Destination file; it is created or overwritten.

    Raises
    ------
    IndexError
        If ``out`` is empty. The destination file is left untouched then.
    """
    # Resolve the header before opening, so an empty `out` fails without
    # creating or truncating the destination file
    fieldnames = out[0].keys()

    # newline="" is what the csv module expects; without it every row is
    # followed by an extra blank line on platforms that translate "\n"
    with open(filename, "w", newline="") as fileout:
        writer = DictWriter(fileout, fieldnames=fieldnames, delimiter=",")
        writer.writeheader()
        writer.writerows(out)

    print(f">> Data successfully stored to disk as {filename}")
44 | 


--------------------------------------------------------------------------------
/ETC/seq/check.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | 
 5 | 
 6 | @author: Pranay S. Yadav
 7 | """
 8 | 
 9 | from ETC.seq import estimates
10 | from array import array
11 | import numpy as np
12 | 
13 | 
def zeroes(seq):
    """
    Report whether a collection contains the value 0.

    Parameters
    ----------
    seq : collection
        Anything that supports the ``in`` operator.

    Returns
    -------
    bool
        True if 0 is a member of ``seq``.
    """
    return 0 in seq
18 | 
19 | 
def equality(seq, legacy=False):
    """
    Check whether all elements of a collection are equal.

    Parameters
    ----------
    seq : list, tuple or array
        Sequence of integers.
    legacy : bool, optional
        When True, the pure-Python comparison is used even for inputs that
        pass ``arraytype``. The default is False.

    Returns
    -------
    bool
        True if all elements are equal.
    """
    # Accepted array types are delegated to the compiled routine
    if not legacy and arraytype(seq):
        return estimates.equality(seq)

    # Otherwise compare each element with the first, stopping at a mismatch
    return not any(seq[0] != element for element in seq)
45 | 
46 | 
def arraytype(seq):
    """
    Check whether the input is one of the two accepted array containers.

    Parameters
    ----------
    seq : object
        Object to inspect.

    Returns
    -------
    bool
        True for an ``array.array`` with typecode "I" or a ``numpy.ndarray``
        with dtype "uint32", False for anything else.
    """
    if isinstance(seq, array):
        return seq.typecode == "I"

    if isinstance(seq, np.ndarray):
        return seq.dtype == "uint32"

    return False
53 | 


--------------------------------------------------------------------------------
/ETC/tests/test_recode.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | 
 5 | 
 6 | @author: Pranay S. Yadav
 7 | """
 8 | from array import array
 9 | from hypothesis import given
10 | from hypothesis import strategies as st
11 | from ETC.seq import recode
12 | from collections import Counter
13 | from string import ascii_lowercase
14 | 
# Strategies producing elements that the cast tests below expect to be rejected:
# non-integer values, integers that are zero or negative, and integers of
# 2 ** 32 and above
invalid_types = (
    st.fractions(),
    st.characters(),
    st.floats(),
    st.text(),
    st.complex_numbers(),
    st.integers(max_value=0),
    st.integers(min_value=2 ** 32),
)
# Strategy producing integers from 1 up to 2 ** 32 - 1, which the cast tests
# below expect to be accepted
valid_types = st.integers(min_value=1, max_value=2 ** 32 - 1)
25 | 
def counts(x):
    """Return the number of occurrences of each distinct element of ``x`` as a tuple."""
    tally = Counter(x)
    return tuple(tally[symbol] for symbol in tally)
28 | 
@given(
    x=st.one_of(
        st.tuples(st.one_of(invalid_types)),
        # min_size=1: an empty list holds no invalid element, so it would not
        # exercise the rejection path this test is about
        st.lists(st.one_of(invalid_types), min_size=1),
    )
)
def test_cast_invalid(x):
    """Casting a non-empty collection of invalid elements must yield None."""
    assert recode.cast(x) is None
37 | 
38 | 
def test_cast_zeroes():
    """Casting a list made up only of zeroes must yield None."""
    assert recode.cast([0, 0, 0, 0]) is None
44 | 
45 | 
@given(x=st.one_of(st.tuples(valid_types), st.lists(valid_types, min_size=1)))
def test_cast_valid(x):
    """Casting a collection of valid integers must yield an array with typecode "I"."""
    converted = recode.cast(x)

    assert isinstance(converted, array)
    assert converted.typecode == "I"
52 | 
@given(x=st.text(min_size=1, alphabet=list(ascii_lowercase)))
def test_all_recodes(x):
    """All recoding schemes must agree on symbol multiplicities and alphabet size."""
    # Order of calls is kept: lexical, alphabetical, randint, random
    recoded = [
        recode.recode_lexical(x),
        recode.recode_alphabetical(x),
        recode.recode_randint(x),
        recode.recode_random(x),
    ]
    reference, others = recoded[0], recoded[1:]

    assert all(counts(other) == counts(reference) for other in others)
    assert all(len(set(other)) == len(set(reference)) for other in others)


--------------------------------------------------------------------------------
/ETC/seq/estimates.pyx:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | 
 5 | 
 6 | @author: Pranay S. Yadav
 7 | """
 8 | cimport cython
 9 | import numpy as np
10 | from libc.math cimport log2
11 | cimport numpy as np
12 | 
cpdef double entropy(unsigned int[::1] x):
    """
    Compute the Shannon entropy, in bits, of a sequence of unsigned integers.

    INPUT
    -----
    x : array.array or numpy.ndarray
        Contiguous buffer of 32-bit unsigned integers.

    OUTPUT
    ------
    double
        Shannon entropy of the empirical symbol distribution of x. An empty
        input gives 0.0.
    """
    # np.bincount returns its counts with dtype intp. Py_ssize_t has the same
    # width on every platform, whereas C long is only 32 bits wide on 64-bit
    # Windows and would be rejected there with a buffer dtype mismatch.
    cdef Py_ssize_t[:] counts_view = np.bincount(x)
    cdef Py_ssize_t counts_size = counts_view.shape[0]
    cdef Py_ssize_t m
    cdef double counts_total = 0
    cdef double E = 0.0
    cdef double prob

    # Total number of observations, used to normalize counts to probabilities
    for m in range(counts_size):
        counts_total += counts_view[m]

    # Accumulate -p * log2(p) over the symbols that actually occur
    for m in range(counts_size):
        if counts_view[m] != 0:
            prob = counts_view[m] / counts_total
            E = E - prob * log2(prob)

    return E
36 | 
37 | 
38 | # Function for checking whether all elements in input are identical
cpdef bint equality(const unsigned int[::1] x):
    """
    Check whether all elements of the input are identical.

    INPUT
    -----
    x : array.array
        Array object containing 32-bit unsigned integers.


    OUTPUT
    ------
    bool
        True if all elements are identical (also for an empty input)
    """
    # Loop bounds
    cdef Py_ssize_t total = x.shape[0]
    cdef Py_ssize_t idx

    # Compare every later element against the first one
    for idx in range(1, total):

        # Stop at the first element that differs
        if x[idx] != x[0]:
            return False

    return True


--------------------------------------------------------------------------------
/ETC/seq/process.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | 
 5 | 
 6 | @author: Pranay S. Yadav
 7 | """
 8 | from collections import Counter
 9 | from math import log2
10 | from random import choices
11 | from random import seed as seedvalue
12 | from ETC.seq import estimates, recode
13 | from array import array
14 | import re
15 | 
16 | 
def sanitize(text, whitespace=False, lowercase=False):
    """
    Strip everything except runs of ASCII letters from a text.

    Parameters
    ----------
    text : str
        Text to clean.
    whitespace : bool, optional
        Join the remaining letter runs with a single space instead of
        concatenating them directly. The default is False.
    lowercase : bool, optional
        Convert the text to lowercase before extraction. The default is False.

    Returns
    -------
    str
        Cleaned text.
    """
    source = text.lower() if lowercase else text
    separator = " " if whitespace else ""

    return separator.join(re.findall("[a-zA-Z]+", source))
31 | 
32 | 
def generate(size=10, partitions=2, seed=None):
    """
    Generate discrete random data of desired size and number of bins.

    Parameters
    ----------
    size : int, optional
        Length of sequence to generate. The default is 10.
    partitions : int, optional
        Number of bins/partitions to create; must be at least 2. The default is 2.
    seed : int, optional
        Seed value for initializing the random number generator. Any value
        other than None is used, including 0. The default is None.

    Returns
    -------
    array or None
        Symbols drawn uniformly from 1..partitions, as returned by
        ``recode.cast``. None if ``size`` or ``partitions`` is invalid.

    """
    if not (isinstance(partitions, int) and isinstance(size, int) and partitions >= 2):
        print(partitions, size)
        print(">> Number of bins is invalid ...")
        return None

    # Compare against None rather than truthiness so that seed=0 is honoured
    if seed is not None:
        seedvalue(seed)

    return recode.cast(choices(range(1, partitions + 1), k=size))
61 | 
62 | 
def frequencies(seq):
    """
    Tabulate how often each distinct element occurs.

    Parameters
    ----------
    seq : iterable
        Collection of hashable elements.

    Returns
    -------
    list of tuple
        (element, count) pairs ordered from most to least frequent; elements
        with equal counts keep their first-seen order.
    """
    tally = Counter(seq)
    return sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
66 | 
67 | 
def entropy(seq, legacy=False):
    """
    Compute the Shannon entropy of a given sequence.

    Parameters
    ----------
    seq : list, tuple or array
        Sequence of symbols.
    legacy : bool, optional
        When True, the pure-Python computation is used even for an
        ``array.array`` with typecode "I". The default is False.

    Returns
    -------
    float
        Shannon entropy of sequence, in bits.

    """
    # Unsigned integer arrays are handed to the compiled estimator
    if not legacy and isinstance(seq, array) and seq.typecode == "I":
        return estimates.entropy(seq)

    # Occurrence counts -> relative frequencies -> sum of -p * log2(p)
    tallies = Counter(seq).values()
    probabilities = (tally / len(seq) for tally in tallies)
    return sum(-p * log2(p) for p in probabilities)
91 | 


--------------------------------------------------------------------------------
/ETC/CCC/simulate_AR.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | 
 5 | 
 6 | @author: Pranay S. Yadav
 7 | """
 8 | 
 9 | import numpy as np
10 | 
11 | 
def coupled_AR(length=1000, a=0.9, b=0.8, c=0.8, e=0.01, burn=100, seed=1):
    """
    Generate discrete-time coupled AR processes with known parameters.

    Dependent process defined as:
        x[n] = a * x[n - 1] + b * y[n - 1] + e * noise_x[n]

    Independent process defined as:
        y[n] = c * y[n - 1] + e * noise_y[n]

    Parameters
    ----------
    length : int, optional
        Length of samples drawn from the process. The default is 1000.
    a : float, optional
        Coefficient for dependent process, capturing dependency on its own past.
        The default is 0.9.
    b : float, optional
        Coefficient for dependent process, capturing dependency on the independent
        process - causal interaction from independent to dependent. The default is 0.8.
    c : float, optional
        Coefficient for independent process, capturing dependency on its own past.
        The default is 0.8.
    e : float, optional
        Coefficient scaling the standard normal noise added at every step.
        The default is 0.01
    burn : int, optional
        Number of initial samples to burn. The default is 100.
    seed: int, optional
        Seed value for initialization of random number generator. The default is 1

    Returns
    -------
    dict
        Two key-value pairs:
            "dependent": Samples of the dependent process.
            "independent": Samples of the independent process.

    """
    # Anchor seed for reproducibility
    np.random.seed(seed)

    # AR processes: initialize
    x = np.zeros(length, dtype="float64")
    y = np.zeros(length, dtype="float64")

    # Generate noise vectors covering both the burn-in and the retained samples
    noise_x = e * np.random.normal(0, 1, length + burn)
    noise_y = e * np.random.normal(0, 1, length + burn)

    # Initialize starting points
    x[0] = np.random.uniform()
    y[0] = np.random.uniform()

    # Burn initial samples: the state is advanced in place, consuming the
    # first `burn` noise values
    for n in range(burn):
        x[0] = a * x[0] + b * y[0] + noise_x[n]
        y[0] = c * y[0] + noise_y[n]

    # Store further samples. The noise index is offset by `burn` so that the
    # values already consumed during burn-in are not used a second time; with
    # burn=0 the indexing is the same as without the offset.
    for n in range(1, length):
        x[n] = a * x[n - 1] + b * y[n - 1] + noise_x[burn + n]
        y[n] = c * y[n - 1] + noise_y[burn + n]

    return {"dependent": x, "independent": y}
77 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | cover/
 54 | 
 55 | # Translations
 56 | *.mo
 57 | *.pot
 58 | 
 59 | # Django stuff:
 60 | *.log
 61 | local_settings.py
 62 | db.sqlite3
 63 | db.sqlite3-journal
 64 | 
 65 | # Flask stuff:
 66 | instance/
 67 | .webassets-cache
 68 | 
 69 | # Scrapy stuff:
 70 | .scrapy
 71 | 
 72 | # Sphinx documentation
 73 | docs/_build/
 74 | 
 75 | # PyBuilder
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 98 | __pypackages__/
 99 | 
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 
134 | # pytype static type analyzer
135 | .pytype/
136 | 
137 | # Custom
138 | /dull
139 | 


--------------------------------------------------------------------------------
/ETC/LZ76/core.pyx:
--------------------------------------------------------------------------------
  1 | # cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, emit_code_comments=True, cdivision=True, embedsignature=True
  2 | #!/usr/bin/env python3
  3 | # -*- coding: utf-8 -*-
  4 | """
  5 | 
  6 | 
  7 | @author: Pranay S. Yadav
  8 | """
  9 | # Import stuff
 10 | # cimport cython
 11 | 
cpdef unsigned int lzc_a(const unsigned int[::1] intarray):
    """
    Lempel-Ziv (LZ76) complexity on 32-bit integer arrays

    INPUT
    -----
    intarray : contiguous buffer of unsigned integers
        Sequence whose complexity is computed.

    OUTPUT
    ------
    unsigned int
        Complexity count. The counter starts at 1, so an input that is too
        short to enter the loop (length 0 or 1) yields 1.
    """

    # Variables Initialization
    cdef Py_ssize_t arraylength = len(intarray)
    cdef unsigned int complexity = 1
    cdef Py_ssize_t prefix_len = 1
    cdef Py_ssize_t len_substring = 1
    cdef Py_ssize_t max_len_substring = 1
    cdef unsigned int pointer = 0

    # While we haven't decoded the full string we continue
    while prefix_len + len_substring <= arraylength:

        # Given a prefix length, find the largest substring: compare the
        # candidate starting at `pointer` with the text starting at `prefix_len`
        if (
            intarray[pointer + len_substring - 1]
            == intarray[prefix_len + len_substring - 1]
        ):
            len_substring += 1  # increase the length of the substring
        else:

            # Remember the longest match seen for this prefix, then try the
            # next starting position
            max_len_substring = max(len_substring, max_len_substring)
            pointer += 1

            # all the pointers have been investigated, we pick the largest for the jump
            if pointer == prefix_len:

                # Increment complexity
                complexity += 1

                # Increase the prefix length by the maximum substring size found so far
                prefix_len += max_len_substring

                # Reset the variables
                pointer = 0
                max_len_substring = 1

            # reset the length of the substring
            len_substring = 1

    # Check final repetition if we were in the middle of a substring
    if len_substring != 1:
        complexity += 1

    return complexity
 60 | 
cpdef unsigned int lzc_b(const unsigned char[:] bytestring):
    """
    Lempel-Ziv (LZ76) complexity on bytestrings

    INPUT
    -----
    bytestring : buffer of unsigned chars
        Sequence whose complexity is computed.

    OUTPUT
    ------
    unsigned int
        Complexity count. The counter starts at 1, so an input that is too
        short to enter the loop (length 0 or 1) yields 1.
    """

    # Variables Initialization
    cdef Py_ssize_t stringlength = len(bytestring)
    cdef unsigned int complexity = 1
    cdef Py_ssize_t prefix_len = 1
    cdef Py_ssize_t len_substring = 1
    cdef Py_ssize_t max_len_substring = 1
    cdef unsigned int pointer = 0

    # While we haven't decoded the full string we continue
    while prefix_len + len_substring <= stringlength:

        # Given a prefix length, find the largest substring: compare the
        # candidate starting at `pointer` with the text starting at `prefix_len`
        if (
            bytestring[pointer + len_substring - 1]
            == bytestring[prefix_len + len_substring - 1]
        ):
            len_substring += 1  # increase the length of the substring
        else:

            # Remember the longest match seen for this prefix, then try the
            # next starting position
            max_len_substring = max(len_substring, max_len_substring)
            pointer += 1

            # all the pointers have been investigated, we pick the largest for the jump
            if pointer == prefix_len:
                # Increase the complexity
                complexity += 1

                # Increase the prefix length by the maximum substring size found so far
                prefix_len += max_len_substring

                # Reset the variables
                pointer = 0
                max_len_substring = 1

            # reset the length of the substring
            len_substring = 1

    # Check final repetition if we were in the middle of a substring
    if len_substring != 1:
        complexity += 1

    return complexity
108 | 
109 | # Function for checking whether all elements in input are identical
cpdef bint check_equality(const unsigned int[::1] x):
    """
    Check whether all elements of the input are identical.

    INPUT
    -----
    x : array.array
        Array object containing 32-bit unsigned integers.


    OUTPUT
    ------
    bool
        True if all elements are identical (also for an empty input)
    """
    # Loop bounds
    cdef Py_ssize_t count = x.shape[0]
    cdef Py_ssize_t position

    # Walk over the elements after the first one
    for position in range(1, count):

        # Bail out as soon as one of them differs from the first
        if x[position] != x[0]:
            return False

    return True


--------------------------------------------------------------------------------
/ETC/CCMC/pairs_parallel.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | from ETC.CCMC.pairs import ETC_causality, LZ_causality, CCM_causality
  9 | from multiprocessing import Pool
 10 | from functools import partial
 11 | from itertools import combinations
 12 | 
 13 | 
 14 | def _kernel_seq(inputs, estimator):
 15 |     """
 16 |     Wrapper around a function that computes anything on two sequences and returns a dict
 17 | 
 18 |     While it is written as a general purpose kernel for anything, here it is used for
 19 |     causal discovery and estimation from CCM based methods.
 20 | 
 21 |     The function unpacks inputs into an index element and a sequence pair and runs the
 22 |     estimator function on the sequence pair, returning various estimates in a dict
 23 | 
 24 |     Parameters
 25 |     ----------
 26 |     inputs : tuple
 27 |         Tuple of two elements - (a, b) where a is an index, b is a tuple of two. a can
 28 |         be produced manually or more typically using enumerate; b holds the two sequences
 29 |         usually passed in by zip-ping larger iterables or itertools' product/combinations.
 30 |         a, the index, is passed to keep track of order in case of asynchronous execution
 31 |         Should look like this: (index, (sequence_x, sequence_y)
 32 |     estimator : function
 33 |         A function that can compute something on two arrays and return a dict. Preferably
 34 |         one that can compute something meaningful, like causal discovery
 35 | 
 36 |     Returns
 37 |     -------
 38 |     out : dict
 39 |         Estimates obtained by running estimator on inputs.
 40 | 
 41 |     """
 42 |     # Unpack inputs
 43 |     idx, seqs = inputs
 44 | 
 45 |     # Unpack sequences
 46 |     idx_x, idx_y, seq_x, seq_y = seqs
 47 | 
 48 |     # Initialize dictionary of output estimates with index
 49 |     out = {"index_pair": idx, "index_x": idx_x, "index_y": idx_y}
 50 | 
 51 |     # Execute the estimator function on the sequence pair
 52 |     out.update(estimator(seq_x, seq_y))
 53 | 
 54 |     # Some feedback to console
 55 |     # print(".", end="")
 56 | 
 57 |     return out
 58 | 
 59 | 
 60 | def get_rowpairs(matrix):
 61 |     """
 62 |     Create a generator for iterating over pairs of rows of an input matrix
 63 | 
 64 |     Parameters
 65 |     ----------
 66 |     matrix : numpy array, int or float, 2D
 67 |         Each row representing a different sequence. (Columns as time)
 68 | 
 69 |     Yields
 70 |     ------
 71 |     row1 : int
 72 |         Index of first row in the pair.
 73 |     row2 : int
 74 |         Index of second row in the pair.
 75 |     np.array, 1D, int
 76 |         Data of first row in the pair.
 77 |     np.array, 1D, int
 78 |         Data of first row in the pair.
 79 | 
 80 |     """
 81 |     for row1, row2 in combinations(range(0, matrix.shape[0]), 2):
 82 |         yield (row1, row2, matrix[row1, :], matrix[row2, :])
 83 | 
 84 | 
 85 | def parallelized(pairs, kernel="CCM"):
 86 |     """
 87 |     This function operates concurrently on a collection of sequence pairs and computes
 88 |     estimates using the chosen kernel function.
 89 | 
 90 |     Here used for computing causal estimates from sequences pairs in batch, each pair
 91 |     runs on a separate CPU core as a process.
 92 | 
 93 |     CAUTION: main module is unguarded, do not run these functions as is,
 94 |         particularly on Windows!
 95 | 
 96 |     Parameters
 97 |     ----------
 98 |     pairs : list/tuple/generator
 99 |         Collection of pairs of integer sequences.
100 |     kernel : str, optional
101 |         Name of an estimator function. Currently available: "CCM", "ETC" and "LZ". The
102 |         default is "CCM".
103 | 
104 |     Returns
105 |     -------
106 |     list of dict elements
107 |         Each dictionary element contains index, length of sequence & ETC.
108 | 
109 |     """
110 | 
111 |     if kernel == "CCM":
112 |         exec_kernel = partial(_kernel_seq, estimator=CCM_causality)
113 |     elif kernel == "ETC":
114 |         exec_kernel = partial(_kernel_seq, estimator=ETC_causality)
115 |     elif kernel == "LZ":
116 |         exec_kernel = partial(_kernel_seq, estimator=LZ_causality)
117 |     else:
118 |         print("> ERROR: Invalid kernel specified")
119 |         return None
120 | 
121 |     # Initialize pool of parallel workers
122 |     pool = Pool()
123 | 
124 |     # Confirm to stdout
125 |     print(f"Running kernel={kernel} in parallel on input ... ", end="")
126 | 
127 |     # Map-execute function across sequences
128 |     out = pool.map_async(exec_kernel, enumerate(pairs))
129 | 
130 |     # Graceful exit
131 |     pool.close()
132 |     pool.join()
133 | 
134 |     # Confirm completion
135 |     print("Done!")
136 | 
137 |     # Return collected results
138 |     return out.get()
139 | 


--------------------------------------------------------------------------------
/ETC/seq/recode.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | 
  9 | from string import ascii_lowercase
 10 | from random import shuffle, choices
 11 | from array import array
 12 | import numpy as np
 13 | from ETC.seq.check import zeroes
 14 | 
 15 | 
 16 | def cast(seq):
 17 | 
 18 |     if seq is not None:
 19 |         if isinstance(seq, np.ndarray):
 20 |             try:
 21 |                 out = seq.astype("uint32")
 22 |                 if zeroes(out):
 23 |                     print("> Input contains 0!")
 24 |                     print("> Symbols shifted up by 1 ")
 25 |                     return out + 1
 26 |                 return out
 27 | 
 28 |             except TypeError as error:
 29 |                 print("ERROR:", error)
 30 |                 print("> Input must be a list/tuple/array of positive integers!")
 31 |                 print('> Recode or partition using "ETC.seq.recode"')
 32 |                 return None
 33 | 
 34 |             except OverflowError as error:
 35 |                 print("ERROR:", error)
 36 |                 print("> Input must be a list/tuple/array of positive integers!")
 37 |                 print('> Recode or partition using "ETC.seq.recode"')
 38 |                 return None
 39 | 
 40 |         else:
 41 |             try:
 42 |                 out = array("I", seq)
 43 |                 if zeroes(out):
 44 |                     print("> Input contains 0!")
 45 |                     print('> Recode or partition using "ETC.seq.recode" ')
 46 |                     return None
 47 |                 return out
 48 | 
 49 |             except TypeError as error:
 50 |                 print("ERROR:", error)
 51 |                 print("> Input must be a list/tuple/array of positive integers!")
 52 |                 print('> Recode or partition using "ETC.seq.recode"')
 53 |                 return None
 54 | 
 55 |             except OverflowError as error:
 56 |                 print("ERROR:", error)
 57 |                 print("> Input must be a list/tuple/array of positive integers!")
 58 |                 print('> Recode or partition using "ETC.seq.recode"')
 59 |                 return None
 60 | 
 61 |     print("No input sequence provided.")
 62 |     return None
 63 | 
 64 | 
 65 | def recode_lexical(text, case_sensitive=True):
 66 | 
 67 |     if not isinstance(text, str):
 68 |         print("ERROR: Input is not a string.")
 69 |         return None
 70 |     if not case_sensitive:
 71 |         text = text.lower()
 72 |     alphabets = sorted(set(text))
 73 |     replacer = dict((y, x + 1) for x, y in enumerate(alphabets))
 74 |     text = cast([replacer[x] for x in text])
 75 |     return text
 76 | 
 77 | 
 78 | def recode_alphabetical(text):
 79 | 
 80 |     text = text.lower()
 81 |     if not set(text).issubset(ascii_lowercase):
 82 |         print("> Input contains non alphabetical characters!")
 83 |         return None
 84 |     replacer = dict((y, x + 1) for x, y in enumerate(ascii_lowercase))
 85 |     text = cast([replacer[x] for x in text])
 86 |     return text
 87 | 
 88 | 
 89 | def recode_dna(text):
 90 | 
 91 |     replacer = {"A": 1, "G": 1, "C": 2, "T": 2}
 92 |     text = cast([replacer[x] for x in text.upper()])
 93 |     return text
 94 | 
 95 | 
 96 | def recode_random(text):
 97 | 
 98 |     alphabets = list(set(text))
 99 |     shuffle(alphabets)
100 |     replacer = dict((y, x + 1) for x, y in enumerate(alphabets))
101 |     text = cast([replacer[x] for x in text])
102 |     return text
103 | 
104 | 
def recode_randint(text):
    """
    Recode a sequence by assigning a distinct random integer to each symbol.

    Every distinct symbol receives its own integer drawn from 1 to 2**20 - 1.
    Codes are drawn WITHOUT replacement so that two different symbols can never
    be merged into the same code (drawing with replacement could silently
    collapse symbols and alter the structure of the sequence). The recoded
    symbols are passed to ``cast``.

    Parameters
    ----------
    text : iterable of hashable
        Sequence of symbols to recode. It is traversed twice.

    Returns
    -------
    Output of ``cast`` for the recoded symbols.

    Raises
    ------
    ValueError
        If there are more distinct symbols than available codes (2**20 - 1).

    """
    # Local import: the module header only imports shuffle and choices
    from random import sample

    alphabets = list(set(text))

    # Unique codes -> the symbol-to-integer mapping is injective
    numbers = sample(range(1, 2 ** 20), k=len(alphabets))

    replacer = dict(zip(alphabets, numbers))
    return cast([replacer[x] for x in text])
112 | 
113 | 
def partition(seq, n_bins):
    """
    Bin a numeric sequence into equal-width, integer-labelled partitions.

    Values are min-max scaled and mapped to bin labels 1..n_bins. A small
    offset (1e-6) is added to the range so that the maximum normally falls in
    the last bin; the label is additionally clamped to n_bins, because for very
    wide ranges the offset is lost to floating-point rounding and the maximum
    would otherwise be labelled n_bins + 1.

    Parameters
    ----------
    seq : list/tuple of float
        Collection of numbers. Must be non-empty and re-iterable.
    n_bins : int
        Number of bins/partitions to create. Must be an int greater than 1.

    Returns
    -------
    list
        Collection of integers, each between 1 and n_bins (inclusive).

    Raises
    ------
    AssertionError
        If n_bins is not an int greater than 1.
    ValueError
        If seq is empty (raised by ``min``).

    """
    assert (
        isinstance(n_bins, int) and n_bins > 1
    ), "ERROR: Number of bins should be a positive integer"

    # Smallest and largest values
    lowest = min(seq)
    highest = max(seq)

    # Reciprocal of the bin width; the offset avoids division by zero for
    # constant input and keeps the maximum inside the last bin
    delta_inv = n_bins / (highest - lowest + 1e-6)

    # Transform each element; clamp guards against rounding past the last bin
    return [min(n_bins, 1 + int((elem - lowest) * delta_inv)) for elem in seq]
143 | 
144 | 
def partition_numpy(nparr, n_bins):
    """
    Bin each row of a 2D array into equal-width, integer-labelled partitions.

    Every row is min-max scaled independently and mapped to bin labels
    1..n_bins. A small offset (1e-6) is added to each row's range so that the
    row maximum normally falls in the last bin; the scaled values are also
    clamped, because for very wide ranges the offset is lost to floating-point
    rounding and the maximum would otherwise be labelled n_bins + 1.

    Parameters
    ----------
    nparr : numpy array, int or float, 2D
        Each row representing a different sequence. (Columns as time)
    n_bins : int
        Number of bins/partitions to create. Must be an int greater than 1.

    Returns
    -------
    numpy.ndarray, uint32, 2D
        Array of the same shape as the input holding bin labels between 1 and
        n_bins (inclusive).

    Raises
    ------
    AssertionError
        If n_bins is not an int greater than 1, or nparr is not a 2D ndarray.

    """
    assert (
        isinstance(n_bins, int) and n_bins > 1
    ), "ERROR: Number of bins should be a positive integer"

    assert (
        isinstance(nparr, np.ndarray) and nparr.ndim == 2
    ), ">ERROR: Input must be 2D NumPy array of numbers"

    # Row-wise smallest and largest values, kept as columns for broadcasting
    lowest = nparr.min(axis=1)[:, np.newaxis]
    highest = nparr.max(axis=1)[:, np.newaxis]

    # Reciprocal of the bin width per row
    delta_inv = n_bins / (highest - lowest + 1e-6)

    # Zero-based bin position; clamp so rounding cannot spill past the last bin
    scaled = np.minimum((nparr - lowest) * delta_inv, n_bins - 1)

    # Truncate to integers and shift labels to start from 1
    return 1 + scaled.astype("uint32")
180 | 


--------------------------------------------------------------------------------
/ETC/NSRWS/x1D/distance.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | 
  9 | 
 10 | from functools import partial
 11 | from itertools import islice
 12 | 
 13 | # Import functions from standard library modules
 14 | from multiprocessing import Pool
 15 | 
 16 | # Import local modules
 17 | import ETC
 18 | 
 19 | get1D = partial(ETC.compute_1D, order=2, verbose=False, truncate=True)
 20 | 
 21 | 
 22 | def _compute_distance(inputs):
 23 |     """
 24 |     This function operates on a single sequence and computes ETC.
 25 | 
 26 |     Parameters
 27 |     ----------
 28 |     seq : tuple of 2 elements
 29 |         1st element is index for tracking.
 30 |         2nd element is a sequence of integers used for ETC computation.
 31 |         Output of enumerate.
 32 | 
 33 |     Returns
 34 |     -------
 35 |     out : dict
 36 |         index of sequence, length of sequence and ETC estimate.
 37 | 
 38 |     """
 39 |     idx, seqs = inputs
 40 | 
 41 |     S1 = ETC.seq.recode.recode_lexical(seqs[0])
 42 |     S2 = ETC.seq.recode.recode_lexical(seqs[1])
 43 | 
 44 |     # Prepare output dictionary
 45 |     out = {"item": idx, "length_seq1": len(S1), "length_seq2": len(S2)}
 46 | 
 47 | 
 48 |     # Compute ETC and update output dictionary
 49 |     ETC1D_seq1 = get1D(S1)["ETC1D"]
 50 |     out.update({"ETC1D_seq1": ETC1D_seq1})
 51 | 
 52 |     ETC1D_seq2 = get1D(S2)["ETC1D"]
 53 |     out.update({"ETC1D_seq2": ETC1D_seq2})
 54 | 
 55 |     ETC1D_seq1seq2 = get1D(S1 + S2)["ETC1D"]
 56 |     out.update({"ETC1D_seq1seq2": ETC1D_seq1seq2})
 57 | 
 58 |     ETC1D_seq2seq1 = get1D(S2 + S1)["ETC1D"]
 59 |     out.update({"ETC1D_seq2seq1": ETC1D_seq2seq1})
 60 | 
 61 |     dETC = 0.5 * (ETC1D_seq1seq2 + ETC1D_seq2seq1 - ETC1D_seq1 - ETC1D_seq2)
 62 | 
 63 |     out.update({"distance": dETC})
 64 | 
 65 |     return out
 66 | 
 67 | 
 68 | def _overlapping_chunks(seq, size, offset=1):
 69 |     """
 70 |     This function takes an input sequence and produces chunks of chosen size.
 71 |     Offset can be used to control degree of overlap (or distance between chunks
 72 |     that don't overlap)
 73 | 
 74 |     Parameters
 75 |     ----------
 76 |     seq : tuple or list
 77 |         Sequence of integers.
 78 |     size : int
 79 |         Length of each produced chunk.
 80 |     offset : int, optional
 81 |         Number of elements to shift each chunk by. The default is 1.
 82 |         Setting this to any value less than size allows control of overlap.
 83 |         Setting this >= size produces non-overlapping chunks.
 84 | 
 85 |     Returns
 86 |     -------
 87 |     zip
 88 |         zip object that produces chunks of specified size, one at a time.
 89 | 
 90 |     """
 91 | 
 92 |     return zip(*(islice(seq, i, None, offset) for i in range(size)))
 93 | 
 94 | 
 95 | def _non_overlapping_chunks(seq, size):
 96 |     """
 97 |     This function takes an input sequence and produces chunks of chosen size
 98 |     that strictly do not overlap. This is a much faster implemetnation than
 99 |     _overlapping_chunks and should be preferred if running on very large seq.
100 | 
101 |     Parameters
102 |     ----------
103 |     seq : tuple or list
104 |         Sequence of integers.
105 |     size : int
106 |         Length of each produced chunk.
107 | 
108 |     Returns
109 |     -------
110 |     zip
111 |         zip object that produces chunks of specified size, one at a time.
112 | 
113 |     """
114 | 
115 |     return zip(*[iter(seq)] * size)
116 | 
117 | 
def pcompute_multiple_seq(iterable):
    """
    Compute ETC-based distances for a collection of sequence pairs in parallel.

    Each pair is enumerated and handed to ``_compute_distance`` through a
    multiprocessing pool. The pool is always closed and joined, also when a
    worker raises, so worker processes are not left behind.

    CAUTION: main module is unguarded, do not run these functions as is,
        particularly on Windows.

    Parameters
    ----------
    iterable : list/tuple/generator
        Collection of pairs of sequences.

    Returns
    -------
    list of dict elements
        One dictionary per pair, as returned by ``_compute_distance``, in the
        order of the input.

    """
    # Initialize pool of parallel workers
    pool = Pool()

    try:
        # Map-execute function across pairs (blocks until all results arrive)
        return pool.map(_compute_distance, enumerate(iterable))
    finally:
        # Graceful exit, even if a worker raised
        pool.close()
        pool.join()
149 | 
150 | 
def truncate(seq1, seq2):
    """
    Cut the longer of two sequences down to the length of the shorter one.

    Parameters
    ----------
    seq1, seq2 : sliceable sequences
        Sequences to bring to a common length.

    Returns
    -------
    tuple
        The two sequences, in the same order. The shorter one (or both, when
        the lengths already match) is returned unchanged; the longer one is
        sliced from its start.

    """
    len1, len2 = len(seq1), len(seq2)

    if len1 > len2:
        return seq1[:len2], seq2
    if len2 > len1:
        return seq1, seq2[:len1]

    # Equal lengths: nothing to do
    return seq1, seq2
163 | 
164 | 
def pcompute_single(seq1, seq2, size, offset=1):
    """
    Compute ETC-based distances between matching chunks of two sequences.

    The longer sequence is truncated to the length of the shorter one, both are
    split into chunks of the requested size, and the chunk pairs are processed
    in parallel by ``pcompute_multiple_seq``.

    CAUTION: main module is unguarded, do not run these functions as is,
        particularly on Windows.

    NOTE(review): the chunk helpers yield tuples, while ``_compute_distance``
    feeds each chunk to ``recode_lexical``, which returns None for anything
    that is not a str -- verify that this path works for the intended inputs.

    Parameters
    ----------
    seq1 : tuple or list
        First sequence.
    seq2 : tuple or list
        Second sequence.
    size : int
        Length of each produced chunk.
    offset : int, optional
        Number of elements to shift each chunk by. The default is 1.
        When equal to size, the dedicated non-overlapping chunker is used;
        any other value goes through the overlapping chunker.

    Returns
    -------
    list of dict elements
        One dictionary per chunk pair, as returned by ``pcompute_multiple_seq``.

    """
    # Bring both sequences to a common length
    seq1, seq2 = truncate(seq1, seq2)

    if offset != size:
        # General case: chunks shifted by `offset`
        chunks1 = _overlapping_chunks(seq1, size, offset)
        chunks2 = _overlapping_chunks(seq2, size, offset)
    else:
        # Offset equals size: use the dedicated non-overlapping chunker
        chunks1 = _non_overlapping_chunks(seq1, size)
        chunks2 = _non_overlapping_chunks(seq2, size)

    # Pair up chunks at the same locus and compute in parallel
    chunk_pairs = zip(chunks1, chunks2)
    return pcompute_multiple_seq(chunk_pairs)
206 | 


--------------------------------------------------------------------------------
/ETC/NCA/compute.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | This module has functions for multicore computation of NCA from a 2D matrix
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | import pandas as pd
  9 | from ETC.NCA import parallelize_jl as NCAP
 10 | 
 11 | 
 12 | def compute_CCC(matrix, CCC_params):
 13 |     """
 14 |     Compute causal complexity estimates for all pairs of rows of input matrix
 15 | 
 16 |     Estimates are derived as per 4 models: ETCP, ETCE, LZP and CCC
 17 | 
 18 |     Parameters
 19 |     ----------
 20 |     matrix : np.ndarray, 2d, uint32
 21 |         MxN matrix, C(M,2) rowpairs, with each row of length N.
 22 |     CCC_params : dict
 23 |         CCC parameters with the following names for keys:
 24 |             "LEN_past", "ADD_meas", "STEP_size"
 25 | 
 26 |     Returns
 27 |     -------
 28 |     pd.DataFrame
 29 |         DataFrame containing causal estimates from all 4 models.
 30 | 
 31 |     """
 32 |     # Create a generator that produces rowpairs one at a time
 33 |     rowpairs = NCAP.get_rowpairs(matrix)
 34 | 
 35 |     # Compute causal estimates in parallel across rowpairs
 36 |     estimates = NCAP.parallelized_CCC(rowpairs, CCC_params)
 37 | 
 38 |     # Convert estimates to a DataFrame and return
 39 |     return pd.DataFrame(estimates)
 40 | 
 41 | 
 42 | def compute_CCM(matrix, kernel="LZ"):
 43 |     """
 44 |     Compute causal complexity estimates for all pairs of rows of input matrix
 45 | 
 46 |     Estimates are derived as per 4 models: ETCP, ETCE, LZP and CCC
 47 | 
 48 |     Parameters
 49 |     ----------
 50 |     matrix : np.ndarray, 2d, uint32
 51 |         MxN matrix, C(M,2) rowpairs, with each row of length N.
 52 |     CCC_params : dict
 53 |         CCC parameters with the following names for keys:
 54 |             "LEN_past", "ADD_meas", "STEP_size"
 55 | 
 56 |     Returns
 57 |     -------
 58 |     pd.DataFrame
 59 |         DataFrame containing causal estimates from all 4 models.
 60 | 
 61 |     """
 62 |     # Create a generator that produces rowpairs one at a time
 63 |     rowpairs = NCAP.get_rowpairs(matrix)
 64 | 
 65 |     # Compute causal estimates in parallel across rowpairs
 66 |     estimates = NCAP.parallelized_CCM(rowpairs, kernel)
 67 | 
 68 |     # Convert estimates to a DataFrame and return
 69 |     return pd.DataFrame(estimates)
 70 | 
 71 | 
 72 | def get_causal(df):
 73 |     """
 74 |     Extract causal strengths from all estimates as a long-form (tidy) DataFrame
 75 | 
 76 |     Parameters
 77 |     ----------
 78 |     df : pd.DataFrame
 79 |         DataFrame containing raw causal estimates from all 4 models.
 80 |         As returned by get_estimates()
 81 | 
 82 |     Returns
 83 |     -------
 84 |     pd.DataFrame
 85 |         DataFrame containing only causal strengths from the 4 models in both directions.
 86 | 
 87 |     """
 88 |     # Fix pair identifiers
 89 |     identifiers = ["index_pair", "index_x", "index_y"]
 90 | 
 91 |     # Initialize aggregator of melted / tidied dataframes
 92 |     dfs = list()
 93 | 
 94 |     # Iterate over each model and tidy up
 95 |     for model in ["ETCP", "ETCE", "LZP", "CCC"]:
 96 | 
 97 |         if df.filter(like=model).shape[-1] != 0:
 98 |             dat = df.melt(
 99 |                 id_vars=identifiers,
100 |                 value_vars=[f"{model}_x_to_y", f"{model}_y_to_x"],
101 |                 var_name="direction",
102 |                 value_name=model,
103 |             )
104 | 
105 |             # Strip column names to make it generic -> easier to concatenate
106 |             dat["direction"] = dat["direction"].str.replace(f"{model}_", "")
107 | 
108 |             # Set identifiers as indices and append to aggregator
109 |             dfs.append(dat.set_index(identifiers + ["direction"]))
110 | 
111 |     # Concatenate all tidied dataframes and return
112 |     return pd.concat(dfs, axis=1).reset_index()
113 | 
114 | 
def get_NCA(df, k=0.1):
    """
    Compute NCA from top k causal strengths (top k pairs)

    For every model ("ETCP", "ETCE", "LZP", "CCC") that has a column in the
    input, the largest ``round(k * len(df))`` values of that column are
    summarised.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing only causal strengths in both directions in
        tidy/long-form, as returned by get_causal()

    k : float, optional, 0 < k < 1
        Top proportion of causal strengths. The default is 0.1.

    Returns
    -------
    pd.DataFrame
        One row per model (index "model") with the columns "mean", "median",
        "max", "min", "mad" (mean absolute deviation) and "std".

    """
    # Convert k to numeric based on available causal pairs
    k_int = round(k * len(df))

    # Initialize aggregator of estimates from each model
    agg = []

    # Iterate over each model, compute summary statistics and aggregate
    for model in ["ETCP", "ETCE", "LZP", "CCC"]:

        # Skip models that have no column in the input
        if df.filter(like=model).shape[-1] == 0:
            continue

        dat = df[model].nlargest(k_int)
        center = dat.mean()

        agg.append(
            {
                "model": model,
                "mean": center,
                "median": dat.median(),
                "max": dat.max(),
                "min": dat.min(),
                # Mean absolute deviation computed explicitly: Series.mad()
                # was removed in pandas 2.0
                "mad": (dat - center).abs().mean(),
                "std": dat.std(),
            }
        )

    # Combine all estimates into a DataFrame and return
    return pd.DataFrame(agg).set_index("model")
159 | 
160 | 
161 | # Function for earlier versions, preserved for later
162 | # def get_complexity(df):
163 | #     """
164 | #     Extract CCM estimates of complexity - ETC and LZ of each row element
165 | 
166 | #     Parameters
167 | #     ----------
168 | #     df : pd.DataFrame
169 | #         DataFrame containing raw causal estimates from all 4 models.
170 | #         As returned by get_estimates()
171 | 
172 | #     Returns
173 | #     -------
174 | #     pd.DataFrame
175 | #         DataFrame containing unique ETC and LZ estimates for each row.
176 | 
177 | #     """
178 | #     # Store estimates for one row of the pair
179 | #     dfx = df[["index_x", "ETC_x", "LZ_x"]]
180 | #     dfx = dfx.rename(columns=lambda m: m.replace("_x", ""))
181 | 
182 | #     # Store estimates for the other row of the pair
183 | #     dfy = df[["index_y", "ETC_y", "LZ_y"]]
184 | #     dfy = dfy.rename(columns=lambda m: m.replace("_y", ""))
185 | 
186 | #     # Combine the two, drop duplicates and return
187 | #     return pd.concat([dfx, dfy]).drop_duplicates().set_index("index")
188 | 


--------------------------------------------------------------------------------
/ETC/CCC/compute_CCC.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Compute the Compression-Complexity based Causality for two sequences.
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | 
  9 | # Import functions
 10 | from functools import partial
 11 | from ETC import compute_1D, compute_2D
 12 | from ETC.seq.recode import partition, cast
 13 | from ETC.seq.check import arraytype
 14 | 
 15 | import numpy as np
 16 | import array
 17 | 
 18 | # Curry the functions for computing 1D and 2D ETC estimates
 19 | get1D = partial(compute_1D, order=2, verbose=False, truncate=True)
 20 | get2D = partial(compute_2D, order=2, verbose=False, truncate=True)
 21 | 
 22 | 
 23 | def get_params():
 24 |     """
 25 |     Helper function for creating a dictionary of CCC params interactively
 26 | 
 27 |     Returns
 28 |     -------
 29 |     dict
 30 |         The main 3 parameters for passing directly into CCC.compute
 31 | 
 32 |     """
 33 |     print("#" * 80)
 34 |     print("-" * 80)
 35 |     print("Initialize CCC Parameters: The thorns")
 36 |     print("-" * 80, "\n")
 37 |     print("All the following 3 should be integers\n")
 38 |     LEN_past = int(input("1. Window length of immediate past values: "))
 39 |     ADD_meas = int(input("2. Window length of present values: "))
 40 |     STEP_size = int(input("3. Step-size for sliding window ahead: "))
 41 |     print("\n", "-" * 80)
 42 |     print("#" * 80)
 43 |     return {"LEN_past": LEN_past, "ADD_meas": ADD_meas, "STEP_size": STEP_size}
 44 | 
 45 | 
 46 | def compute(seq_x, seq_y, LEN_past, ADD_meas, STEP_size, n_partitions=False):
 47 |     """
 48 |     Estimate the Compression-Complexity based Causality for two sequences.
 49 | 
 50 |     The direction of causality being assessed is from seq_y -> seq_x. Various other
 51 |     parameters need to be specified, a brief description is offered below.
 52 | 
 53 |     For detailed explanations regarding the parameters, interpretations and of the inner
 54 |     workings, please refer to the research article along with the supplementary:
 55 |         Kathpalia, Aditi, and Nithin Nagaraj. “Data-Based Intervention Approach for
 56 |         Complexity-Causality Measure.” PeerJ Computer Science 5 (May 2019): e196.
 57 |         https://doi.org/10.7717/peerj-cs.196.
 58 | 
 59 |     Parameters
 60 |     ----------
 61 |     seq_x : list or tuple
 62 |         Sequence of numbers, if not integers specify n_partitions for binning.
 63 |     seq_y : list or tuple
 64 |         Sequence of numbers, if not integers specify n_partitions for binnings.
 65 |     LEN_past : int
 66 |         Parameter "L": Window length of immediate past values of seq_x and seq_y.
 67 |     ADD_meas : int
 68 |         Parameter "w": Window length of present values of seq_x. Minimal data length
 69 |         over which CC rate can be reliably estimated, application/domain-specific
 70 |     STEP_size : int
 71 |         Parameter "delta": Step-size for sliding chunks across both sequences. An overlap
 72 |         of 20-50% between successive chunks or windows suggested.
 73 |     n_partitions : int or bool, optional
 74 |         Parameter "B": Number of bins. Smalles number of symbols that capture the time
 75 |         series dynamics. The default is False indicating that the data is already in the
 76 |         form of discrete symbolic sequences.
 77 | 
 78 |     Returns
 79 |     -------
 80 |     CCC : float
 81 |         Estimated Compression-Complexity based Causality for direction seq_y -> seq_x.
 82 | 
 83 |     """
 84 |     # Sanity checks
 85 |     assert len(seq_x) == len(seq_y), "ERROR: Sequences must have the same length!"
 86 |     assert (
 87 |         isinstance(LEN_past, int) and LEN_past > 1
 88 |     ), "ERROR: LEN_past must be a positive integer!"
 89 |     assert (
 90 |         isinstance(ADD_meas, int) and ADD_meas > 1
 91 |     ), "ERROR: ADD_meas must be a positive integer!"
 92 |     assert (
 93 |         isinstance(STEP_size, int) and STEP_size > 1
 94 |     ), "ERROR: STEP_size must be a positive integer!"
 95 | 
 96 |     # Partition data if requested with the specificed number of bins
 97 |     if n_partitions:
 98 |         seq_x = partition(seq_x, n_partitions)
 99 |         seq_y = partition(seq_y, n_partitions)
100 | 
101 |     # Check whether input is a discrete symbolic sequence
102 |     if not arraytype(seq_x):
103 |         seq_x = cast(seq_x)
104 |     if not arraytype(seq_y):
105 |         seq_y = cast(seq_y)
106 | 
107 |     # Set switch for operating differently on native vs numpy arrays
108 |     if type(seq_x) == np.ndarray or type(seq_y) == np.ndarray:
109 |         combine = lambda x, y: np.hstack([x, y])
110 |     if type(seq_x) == array.array or type(seq_y) == array.array:
111 |         combine = lambda x, y: x + y
112 | 
113 |     # Setup variables
114 |     LEN = len(seq_x)
115 |     LEN_to_check = LEN_past + ADD_meas
116 | 
117 |     # Initialize aggregators
118 |     l_1D = []
119 |     l_2D = []
120 | 
121 |     # Iterate over chunks of both sequences
122 |     for k in range(0, LEN - LEN_to_check, STEP_size):
123 | 
124 |         ## Compression-Complexity of past values of seq_x
125 |         # 1D ETC of a chunk of seq_x of length LEN_past
126 |         ETC1D_ini = get1D(seq_x[k : k + LEN_past])["NETC1D"]
127 | 
128 |         ## Compression-Complexity of past values of seq_x and seq_y
129 |         # 2D ETC of chunks of both seq_x,seq_y of length LEN_past at the same locus
130 |         ETC2D_ini = get2D(seq_x[k : k + LEN_past], seq_y[k : k + LEN_past],)["NETC2D"]
131 | 
132 |         ## Compression-Complexity of present values of seq_x
133 |         # 1D ETC of a chunk of seq_x of length LEN_to_check
134 |         ETC1D_fin = get1D(seq_x[k : k + LEN_to_check])["NETC1D"]
135 | 
136 |         ## Compression-Complexity of values of seq_x & past of seq_y + present of seq_x
137 |         # 2D ETC of chunks of both seq_x, seq_y of length LEN_to_check at the same locus
138 |         ETC2D_fin = get2D(
139 |             seq_x[k : k + LEN_to_check],
140 |             combine(seq_y[k : k + LEN_past], seq_x[k + LEN_past : k + LEN_to_check]),
141 |         )["NETC2D"]
142 | 
143 |         # Dynamic Compression-Complexity of seq_x
144 |         ETC1D_delta = ETC1D_fin - ETC1D_ini
145 | 
146 |         # Dynamic Compression Complexity of seq_x conditional on seq_y
147 |         ETC2D_delta = ETC2D_fin - ETC2D_ini
148 | 
149 |         # Aggregate Dynamic CCs
150 |         l_1D.append(ETC1D_delta)
151 |         l_2D.append(ETC2D_delta)
152 | 
153 |     ## Compute Compession-Complexity Causality
154 |     # Average of the difference: CC(X | X_past) - CC(X | Y_past + X_present)
155 |     CCC = (sum(l_1D) - sum(l_2D)) / len(l_1D)
156 |     # print(f"CCC for seq_y -> seq_x = {CCC}")
157 |     return CCC
158 | 


--------------------------------------------------------------------------------
/ETC/CCC/simulate_TentMap.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | 
  9 | # Import calls
 10 | import numpy as np
 11 | from numba import vectorize, float64, njit
 12 | 
 13 | # Compute single step of iteration through skew-tent map
 14 | @vectorize([float64(float64, float64)])
 15 | def _skewtent_onestep(value, threshold):
 16 |     """
 17 |     Computes a single step of iteration through the skew-tent map given an
 18 |     input (previous) value and a threshold. Returns the next value as output.
 19 |     This function is called by _iterate_skewtent for iterating repeatedly.
 20 | 
 21 |     Parameters
 22 |     ----------
 23 |     value : scalar, float64
 24 |         Input value to the skew-tent map.
 25 |     threshold : scalar, float64
 26 |         Threshold value of the skew-tent map.
 27 | 
 28 |     Returns
 29 |     -------
 30 |     Output value as float64 from the skew-tent map.
 31 |     Computed conditionally as follows:
 32 |         If value < threshold, then output is value / threshold
 33 |         Else, output is (1 - value)/(1 - threshold)
 34 | 
 35 |     """
 36 |     if value < threshold:
 37 |         return value / threshold
 38 |     return (1 - value) / (1 - threshold)
 39 | 
 40 | 
 41 | # Multiple iterations along skew-tent map
 42 | @njit
 43 | def _iterate_skewtent(threshold, traj_vec, coupling):
 44 |     """
 45 |     Computes multiple steps of iteration through the skew-tent map given a
 46 |     starting condition, as the first element of an array full of zeros, and
 47 |     a threshold for the skew-tent map. This function calls _skewtent_onestep
 48 |     for running a single step, and is itself called by _compute_trajectory,
 49 |     which initializes the trajectory array.
 50 | 
 51 |     Parameters
 52 |     ----------
 53 |     threshold : vector of size 2, float64
 54 |         Threshold value of the skew-tent map.
 55 |     traj_vec : array, 2D, float64
 56 |         Pre-allocated array of zeroes with the 1st element containing a
 57 |         value corresponding to initial condition of the skew-tent map
 58 | 
 59 |     Returns
 60 |     -------
 61 |     traj_vec : array, 2D, float64
 62 |         Array populated with values corresponding to the trajectory taken by
 63 |         recursive iteration through a skew-tent map. Length of this trajectory
 64 |         is inferred from the array shape.
 65 | 
 66 |     """
 67 |     # Iteration using for-loop over indices
 68 |     for idx in range(1, max(traj_vec.shape)):
 69 | 
 70 |         # Execute single step of iteration using previous value and threshold
 71 |         traj_vec[0, idx] = _skewtent_onestep(traj_vec[0, idx - 1], threshold[0])
 72 | 
 73 |         # Linearly dependent tent map
 74 |         buffer = _skewtent_onestep(traj_vec[1, idx - 1], threshold[1])
 75 |         traj_vec[1, idx] = coupling[0] * traj_vec[0, idx] + (1 - coupling[0]) * buffer
 76 | 
 77 |         # Nonlinearly dependent tent map
 78 |         buffer = (
 79 |             coupling[1] * traj_vec[0, idx - 1]
 80 |             + (1 - coupling[1]) * traj_vec[2, idx - 1]
 81 |         )
 82 |         traj_vec[2, idx] = _skewtent_onestep(buffer, threshold[2])
 83 | 
 84 |     # Return populated array
 85 |     return traj_vec
 86 | 
 87 | 
 88 | # Compute trajectory given initial conditions, threshold and size
 89 | @njit
 90 | def _compute_trajectory(init_cond, threshold, length, coupling):
 91 |     """
 92 |     Computes the trajectory along a skew-tent map with given threshold and an
 93 |     initial condition for a given distance. Doesn't validate input. This is
 94 |     called by compute_trajectory after checking inputs.
 95 | 
 96 |     Parameters
 97 |     ----------
 98 |     init_cond : vector of size 3, float64
 99 |         Initial value for iterating through the skew-tent map.
100 |     threshold : vector of size 3, float64
101 |         Threshold value of the skew-tent map.
102 |     length : scalar, integer
103 |         Size of the trajectory to compute through iteration.
104 | 
105 |     Returns
106 |     -------
107 |     array, 2D, float64
108 |         Array of demanded size filled with values corresponding to the
109 |         trajectory.
110 | 
111 |     """
112 |     # Pre-allocate array for trajectory with known size
113 |     traj_vec = np.zeros((3, length), dtype=np.float64)
114 | 
115 |     # Assign initial condition to first elements of Y, Xlin, Xnonlin
116 |     traj_vec[:, 0] = init_cond
117 | 
118 |     # Run iterations and return populated array
119 |     return _iterate_skewtent(threshold, traj_vec, coupling)
120 | 
121 | 
122 | # Warmup for Numba cache initialization
def warmup():
    """
    Runs all the Numba-optimized functions to initialize Numba's JIT.
    Returns nothing and only prints to stdout.

    A short trajectory with a known final value (0.625 for every map) is
    computed and compared against the expectation. All inputs, including the
    couplings, are float64 arrays: Numba compiles one specialization per
    argument type, so warming up with integer couplings would not prepare the
    specialization used when float couplings are passed later.

    Returns
    -------
    None.

    """
    initials = np.array([0.1] * 3)
    threshs = np.array([0.2] * 3)
    # Float zeros (not ints) so the float64 signature gets compiled
    couplings = np.array([0.0] * 2)
    expected = np.array([0.625] * 3)

    # Test for a known value: 0.1 -> 0.5 -> 0.625 with threshold 0.2
    final = _compute_trajectory(initials, threshs, 3, couplings)[:, -1]
    if (final == expected).all():
        print("> Numba JIT warmup successful for chaotic_sampler ...")
    else:
        print("> Numba JIT warmup failed for chaotic_sampler ...")
142 | 
143 | 
def compute_trajectory(init_cond, threshold, length, burn, coupling):
    """
    Computes a trajectory of `length + burn` steps through _compute_trajectory
    and drops the first `burn` columns. The inputs are passed through as
    given; no range or type checks are performed here.

    Parameters
    ----------
    init_cond : vector of size 3, float64
        Initial value for iterating through the skew-tent map.
            intended range: 0 < init_cond < 1
    threshold : vector of size 3, float64
        Threshold value of the skew-tent map.
            intended range: 0 < threshold < 1
    length : scalar, integer
        Number of columns kept in the returned trajectory.
            intended range: 10^2 < length < 10^7
    burn : scalar, integer
        Number of leading columns discarded from the computed trajectory.
    coupling : vector
        Coupling values forwarded to _compute_trajectory.

    Returns
    -------
    array, 2D, float64
        Columns `burn` onwards of the computed 3-row trajectory.

    """
    # Simulate the discarded transient together with the requested samples
    total_steps = length + burn
    full_run = _compute_trajectory(init_cond, threshold, total_steps, coupling)

    # Keep only what comes after the burn-in columns
    return full_run[:, burn:]
172 | 
173 | 
def coupled_TM(threshold, length, burn, coupling, seed):
    """
    Samples three series from compute_trajectory, starting from seeded random
    initial conditions, and returns them labelled by role.

    Parameters
    ----------
    threshold : scalar
        Threshold, repeated for all three series.
    length : scalar, integer
        Number of samples kept per series.
    burn : scalar, integer
        Number of initial samples discarded.
    coupling : scalar
        Coupling value, repeated twice.
    seed : integer
        Seed handed to np.random.seed (this resets NumPy's global RNG state).

    Returns
    -------
    dict
        Keys "independent", "dependent_linear" and "dependent_nonlinear",
        mapped to rows 0, 1 and 2 of the trajectory respectively.

    """
    np.random.seed(seed)

    # Random starting point for each of the three series
    start = np.random.uniform(size=(3))

    trajectories = compute_trajectory(
        start,
        np.array([threshold] * 3),
        length,
        burn,
        np.array([coupling] * 2),
    )

    # Row order of the trajectory defines the role of each series
    roles = ("independent", "dependent_linear", "dependent_nonlinear")
    return {role: trajectories[row, :] for row, role in enumerate(roles)}
194 | 


--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | This is a demo script for showcasing this package's functionality in brief.
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | 
  9 | # Import call
 10 | import ETC
 11 | 
 12 | # ------------------------
 13 | # IO & SEQUENCE MANAGEMENT
 14 | # ------------------------
 15 | # Read data to a list
 16 | text = ETC.read(filepath="somefile.txt", delimiter=",") # Pick any file
 17 | 
 18 | # Check validity of input and automatically cast to the right form if valid
 19 | ETC.cast(text)
 20 | 
 21 | # Recode data to integers in lexicographic order
 22 | ETC.recode_lexical("bbacdbedf", case_sensitive=False)
 23 | 
 24 | # Partition real-valued data to integer-valued discrete data
 25 | ETC.partition([0.1, 0.34, 0.68, -1.9, 25.3], n_bins=2)
 26 | 
 27 | # Generate synthetic data from the discrete uniform distribution
 28 | ETC.generate(size=1000, partitions=4)
 29 | 
 30 | # Reproducibility of random generation can be controlled by passing the same seed value
 31 | ETC.generate(size=1000, partitions=4, seed=101)
 32 | 
 33 | # Compute Shannon Entropy for a sequence
 34 | ETC.entropy(seq=[1, 2, 1, 1, 1, 2, 1])
 35 | 
 36 | 
 37 | # ---------------------------------------
 38 | # 1D ETC ESTIMATION FOR A SINGLE SEQUENCE
 39 | # ---------------------------------------
 40 | # Generate a random discrete symbolic sequence
 41 | seq = ETC.generate(size=1000, partitions=2, seed=31)
 42 | 
 43 | # Simplest way to run
 44 | out = ETC.compute_1D(seq)
 45 | 
 46 | # The result is a dict of 2 key-value pairs: the raw and normalized ETC estimates
 47 | print(out)
 48 | 
 49 | # Get whichever is needed by using their respective keys
 50 | print(out.get('ETC1D'))
 51 | # [Out]: 225
 52 | 
 53 | print(out.get('NETC1D'))
 54 | # [Out]: 0.22522522522522523
 55 | 
 56 | # The normalization is done over one less than the overall length
 57 | print(out.get('ETC1D') / (len(seq) - 1))
 58 | # [Out]: 0.22522522522522523
 59 | 
 60 | # If more details about the trajectory are desired, set verbosity to True
 61 | out = ETC.compute_1D(seq, verbose=True)
 62 | 
 63 | # The result is now a dict of 3 elements: the 2 ETC estimates and the Trajectory
 64 | print(out.get('Trajectory')) # List of dicts - one dict for each step
 65 | 
 66 | # The default behavior is to truncate the iteration process until the sequence gets
 67 | # saturated to have all unique pairs occurring just once. This speeds up computation as
 68 | # the remaining steps don't need to be computed and ETC reduces to an analytic expression.
 69 | # However, the substitution table or features may be of interest and this truncation can
 70 | # then be turned off so that the iteration continues till entropy of 0 is reached:
 71 | out = ETC.compute_1D(seq, verbose=True, truncate=False)
 72 | 
 73 | print(out.get('Trajectory')) # Last step has length 1 and entropy 0
 74 | 
 75 | # This Trajectory can be saved to CSV for later use through a convenience function:
 76 | ETC.save(out.get('Trajectory'), filename="ETC_results.csv")
 77 | 
 78 | # -------------------------------------------------------------------------------------#
 79 | # Additionally, instead of pair-substitution (NSRPS), a window of any size may be
 80 | # substituted using the order switch, for example substitute triplets:
 81 | out = ETC.compute_1D(seq, order=3, verbose=True, truncate=False)
 82 | 
 83 | print(out.get("Trajectory"))
 84 | 
 85 | # The default function call ETC.compute_1D(seq) is the same as:
 86 | # ETC.compute_1D(seq, order=2, verbose=False, truncate=True)
 87 | 
 88 | # --------------------------------------------------------------
 89 | # PARALLELIZED 1D ETC ESTIMATION FOR CHUNKS OF A SINGLE SEQUENCE
 90 | # --------------------------------------------------------------
 91 | # Generate a long sequence
 92 | seq = ETC.generate(size=20000, partitions=2)
 93 | 
 94 | # Compute ETC on overlapping chunks of 1000 elements offsetted by 100, in parallel
 95 | if __name__ == "__main__":
 96 |     outp = ETC.pcompute_single(seq, size=1000, offset=100)
 97 | 
 98 | # The output is a list of dictionaries with estimates, one dict for each ordered chunk
 99 | print(outp)
100 | 
101 | # Compute ETC on non-overlapping chunks of 1000 elements (set offset = size), in parallel
102 | if __name__ == "__main__":
103 |     outp = ETC.pcompute_single(seq, size=1000, offset=1000)
104 | 
105 | # Similarly,
106 | print(outp)
107 | 
108 | # ------------------------------------------------------------------
109 | # PARALLELIZED 1D ETC ESTIMATION  FOR MULTIPLE SEQUENCES IN PARALLEL
110 | # ------------------------------------------------------------------
111 | # Generate 10 long sequences
112 | seqs = [ETC.generate(10000, 2) for _ in range(10)]
113 | 
114 | # Compute ETC estimates for each sequence in parallel
115 | if __name__ == "__main__":
116 |     outp = ETC.pcompute_multiple_seq(seqs)
117 | 
118 | print(outp)
119 | 
120 | 
121 | # --------------------------------
122 | # WORKS WITH NUMPY OUT OF THE BOX!
123 | # --------------------------------
124 | # Generate a random discrete symbolic sequence and compute 1D ETC on it
125 | import numpy as np
126 | np.random.seed(10)
127 | seq = np.random.randint(1, 3, size=5000)
128 | out = ETC.compute_1D(seq)
129 | 
130 | print(out)
131 | # {'ETC1D': 884, 'NETC1D': 0.17683536707341468}
132 | 
133 | # Parallelized ETC estimation - row-wise for 2D numpy arrays
134 | seq = np.random.normal(1, 3, size=[10,5000]) # Each row is a distinct sequence
135 | seq = ETC.partition_numpy(nparr=seq, n_bins=2)
136 | out = ETC.pcompute_numpy(nparr=seq)
137 | 
138 | print(out)
139 | # One estimate per row
140 | 
141 | # -----------------------------------------
142 | # 2D ETC ESTIMATION FOR A PAIR OF SEQUENCES
143 | # -----------------------------------------
144 | # Generate two random sequences
145 | seq_x = ETC.generate(size=1000, partitions=2, seed=17)
146 | seq_y = ETC.generate(size=1000, partitions=2, seed=19)
147 | 
148 | # Compute Effort To Compress using Non-Sequential Recursive Pair Substitution
149 | out = ETC.compute_2D(seq_x, seq_y, order=2, verbose=True, truncate=False)
150 | 
151 | # View estimates
152 | print(out.get('ETC2D'))
153 | print(out.get('NETC2D'))
154 | 
155 | # View trajectory
156 | print(out.get('Trajectory'))
157 | 
158 | # -----------------------------------------
159 | # CAUSALITY TESTING USING THE CCC FRAMEWORK
160 | # -----------------------------------------
161 | # Import call for CCC sub-package
162 | from ETC import CCC
163 | 
164 | # Compute CCC for the above two sequences
165 | ccc_est = CCC.compute(
166 |     seq_x, seq_y, LEN_past=150, ADD_meas=15, STEP_size=20, n_partitions=False
167 | )
168 | # [Out]: CCC for seq_y -> seq_x = -0.00301035257856264
169 | 
170 | # See docstrings for more information on CCC estimation
171 | # ?CCC.compute
172 | 
173 | # Simulate a pair of coupled first-order AR processes
174 | ar = CCC.coupled_AR(length=10000, a=0.8, b=0.9, c=0.8, e=0.01, burn=1000, seed=1)
175 | # ar is a dictionary of two key-value pairs with the following keys:
176 | #   "dependent" and "independent", each with their respective values in float arrays
177 | # ?CCC.coupled_AR for more information on sampling from AR processes
178 | 
179 | # Estimate CCC for the direction independent -> dependent with binning
180 | ccc_ar = CCC.compute(
181 |     seq_x=ar["dependent"],
182 |     seq_y=ar["independent"],
183 |     LEN_past=150,
184 |     ADD_meas=15,
185 |     STEP_size=20,
186 |     n_partitions=2,
187 | )
188 | # [Out]: CCC for seq_y -> seq_x = 0.005755172746030292
189 | 
190 | # And for the opposite direction
191 | ccc_ar = CCC.compute(
192 |     seq_x=ar["independent"],
193 |     seq_y=ar["dependent"],
194 |     LEN_past=150,
195 |     ADD_meas=15,
196 |     STEP_size=20,
197 |     n_partitions=2,
198 | )
199 | # [Out]: CCC for seq_y -> seq_x = 0.0002971309733327245
200 | 


--------------------------------------------------------------------------------
/ETC/CCC/calibrate_CCC.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | 
  9 | from ETC import compute_1D, compute_2D, generate
 10 | from ETC.seq.recode import partition
 11 | from itertools import product, chain
 12 | from functools import partial
 13 | from multiprocessing import Pool
 14 | from time import perf_counter
 15 | from matplotlib import pyplot as plt
 16 | import pandas as pd
 17 | import seaborn as sns
 18 | 
 19 | sns.set()
 20 | 
 21 | get1D = partial(compute_1D, order=2, verbose=False, truncate=True)
 22 | get2D = partial(compute_2D, order=2, verbose=False, truncate=True)
 23 | 
 24 | 
 25 | def test(seq_x, seq_y, past_win_size, delta, step_size, partitions=False):
 26 |     if partitions:
 27 |         seq_x = partition(seq_x, partitions)
 28 |         seq_y = partition(seq_y, partitions)
 29 | 
 30 |     aggregator = []
 31 | 
 32 |     length = len(seq_x)
 33 |     total_win_size = past_win_size + delta
 34 | 
 35 |     for n, k in enumerate(range(0, length - total_win_size, step_size)):
 36 |         out = {}
 37 |         out["partitions"] = partitions
 38 |         out["window"] = n + 1
 39 |         out["begin"] = k
 40 |         out["past_win_size"] = past_win_size
 41 |         out["end_past"] = k + past_win_size
 42 |         out["delta"] = delta
 43 |         out["total_win_size"] = total_win_size
 44 |         out["end_total"] = k + total_win_size
 45 |         out["step_size"] = step_size
 46 | 
 47 |         ## CC 1D for X --------------------------------------------------------
 48 |         # ETC 1D for past and current+past=total values of X
 49 |         ETC1D_X_total = get1D(seq_x[k : k + total_win_size])["ETC1D"]
 50 |         out["ETC_1D_X_total_raw"] = ETC1D_X_total
 51 | 
 52 |         ETC1D_X_total /= total_win_size - 1
 53 |         out["ETC_1D_X_total_norm"] = ETC1D_X_total
 54 | 
 55 |         # ETC 1D for past of Y and current of X
 56 |         segment = tuple(
 57 |             chain(
 58 |                 seq_y[k : k + past_win_size],
 59 |                 seq_x[k + past_win_size : k + total_win_size],
 60 |             )
 61 |         )
 62 |         ETC1D_X_Ypast = get1D(segment)["ETC1D"]
 63 |         out["ETC_1D_X_YpastXcurr_raw"] = ETC1D_X_Ypast
 64 | 
 65 |         ETC1D_X_Ypast /= total_win_size - 1
 66 |         out["ETC_1D_X_YpastXcurr_norm"] = ETC1D_X_Ypast
 67 | 
 68 |         ## --------------------------------------------------------------------
 69 | 
 70 |         ## CC 1D for Y --------------------------------------------------------
 71 |         # ETC 1D for past values of Y
 72 |         # ETC1D_Y_ini = get1D(seq_y[k : k + past_win_size])["ETC1D"]
 73 |         # out["ETC_1D_Y_past_raw"] = ETC1D_Y_ini
 74 | 
 75 |         # ETC1D_Y_ini /= past_win_size - 1
 76 |         # out["ETC_1D_Y_past_norm"] = ETC1D_Y_ini
 77 | 
 78 |         # # ETC 1D for past and current+past=total values of Y
 79 |         # ETC1D_Y_fin = get1D(seq_y[k : k + total_win_size])["ETC1D"]
 80 |         # out["ETC_1D_Y_total_raw"] = ETC1D_Y_fin
 81 | 
 82 |         # ETC1D_Y_fin /= total_win_size - 1
 83 |         # out["ETC_1D_Y_total_norm"] = ETC1D_Y_fin
 84 | 
 85 |         # # CC 1D for past and total values of Y
 86 |         # CC1D_Y_past = ETC1D_Y_fin - ETC1D_Y_ini
 87 |         # out["CC_1D_Y"] = CC1D_Y_past
 88 |         ## --------------------------------------------------------------------
 89 | 
 90 |         # ETC 2D for past values of X and Y -----------------------------------
 91 |         ETC2D_ini = get2D(seq_x[k : k + past_win_size], seq_y[k : k + past_win_size])[
 92 |             "ETC2D"
 93 |         ]
 94 |         out["ETC_2D_X_past_Y_past_raw"] = ETC2D_ini
 95 |         out["ETC_2D_Y_past_X_past_raw"] = ETC2D_ini
 96 | 
 97 |         ETC2D_ini /= past_win_size - 1
 98 |         out["ETC_2D_X_past_Y_past_norm"] = ETC2D_ini
 99 |         out["ETC_2D_Y_past_X_past_norm"] = ETC2D_ini
100 |         ## --------------------------------------------------------------------
101 | 
102 |         # ETC 2D for current+past=total values of X and past values of Y plus current values of X
103 |         ETC2D_X_fin = get2D(seq_x[k : k + total_win_size], segment,)["ETC2D"]
104 |         out["ETC_2D_X_total_Y_pastX_curr_raw"] = ETC2D_X_fin
105 | 
106 |         ETC2D_X_fin /= total_win_size - 1
107 |         out["ETC_2D_X_total_Y_pastX_curr_norm"] = ETC2D_X_fin
108 | 
109 |         ## --------------------------------------------------------------------
110 | 
111 |         # ETC 2D for current+past=total values of Y and past values of X plus current values of Y
112 |         # ETC2D_Y_fin = get2D(
113 |         #     seq_y[k : k + total_win_size],
114 |         #     seq_x[k : k + past_win_size]
115 |         #     + seq_y[k + past_win_size : k + total_win_size],
116 |         # )["ETC2D"]
117 |         # out["ETC_2D_Y_total_X_past_raw"] = ETC2D_Y_fin
118 | 
119 |         # ETC2D_Y_fin /= total_win_size - 1
120 |         # out["ETC_2D_Y_total_X_past_norm"] = ETC2D_Y_fin
121 | 
122 |         # # CC 2D for past and total values of X
123 |         # CC2D_Y_total_X_past = ETC2D_Y_fin - ETC2D_ini
124 |         # out["CC_2D_Y_by_X_past"] = CC2D_Y_total_X_past
125 |         ## --------------------------------------------------------------------
126 |         aggregator.append(out)
127 | 
128 |     return pd.DataFrame(aggregator)
129 | 
130 | 
def test_multiple(seq_x, seq_y):
    """
    Run `test` serially over a fixed grid of window parameters and time it.

    Returns
    -------
    tuple
        (concatenated DataFrame of all runs, elapsed seconds including the
        concatenation).

    """
    past_sizes = [100, 150, 175, 200]
    current_sizes = [10, 15, 20, 25]
    step_sizes = [10, 15, 20, 25, 30]

    started = perf_counter()

    # One frame per (past window, current window, step) combination
    frames = [
        test(seq_x, seq_y, past_win_size, delta, step_size)
        for past_win_size, delta, step_size in product(
            past_sizes, current_sizes, step_sizes
        )
    ]
    combined = pd.concat(frames)

    elapsed = perf_counter() - started

    return combined, elapsed
154 | 
155 | 
def unpack(function, params):
    """
    Call `function` with the three items of `params` as positional arguments.

    `params` must unpack into exactly three values (past window size, delta,
    step size); any other length raises ValueError from the unpacking.
    """
    (past_win_size, delta, step_size) = params
    result = function(past_win_size, delta, step_size)
    return result
159 | 
160 | 
def test_multiple_parallel(seq_x, seq_y):
    """
    Run `test` over a fixed grid of window parameters using a process pool and
    time the whole operation.

    The pool is used as a context manager so that worker processes are always
    released, even when one of the mapped calls raises.

    Returns
    -------
    tuple
        (concatenated DataFrame of all runs, elapsed seconds including pool
        start-up, shutdown and the concatenation).

    """
    # Past window size
    PWS = [50, 75]

    # Current window size
    CWS = range(10, 51, 5)

    # Jump step size
    SS = [25, 50]

    # Bind the sequences, then adapt to the single-argument form pool.map needs
    func = partial(test, seq_x, seq_y)
    func = partial(unpack, func)

    before = perf_counter()

    # Leaving the with-block terminates any workers still alive (error path)
    with Pool() as pool:

        # Map-execute function across parameter combinations
        results = pool.map(func, product(PWS, CWS, SS))

        # Graceful exit on the success path
        pool.close()
        pool.join()

    results = pd.concat(results)

    after = perf_counter()

    return results, after - before
191 | 
192 | 
193 | # x = generate(1000)
194 | # y = generate(1000)
195 | # a2, timings = test_multiple_parallel(x, y)
196 | 
197 | 
198 | # %%
199 | # fig, ax = plt.subplots(1,1)
200 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_1D_X_past_norm', ax=ax)
201 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_1D_X_total_norm', ax=ax)
202 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_2D_X_past_Y_past_norm', ax=ax)
203 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_2D_X_total_Y_past_norm', ax=ax)
204 | 
205 | # # %%
206 | # fig, ax = plt.subplots(1, 1)
207 | # sns.lineplot(
208 | #     data=a2,
209 | #     hue="past_win_size",
210 | #     y="ETC_1D_X_past_norm",
211 | #     x="delta",
212 | #     ci=None,
213 | #     ax=ax,
214 | #     palette="viridis",
215 | # )
216 | # sns.lineplot(
217 | #     data=a2,
218 | #     hue="past_win_size",
219 | #     y="ETC_1D_X_total_norm",
220 | #     x="delta",
221 | #     ci=None,
222 | #     ax=ax,
223 | #     palette="viridis",
224 | # )
225 | 


--------------------------------------------------------------------------------
/ETC/CCC/_calibrate_CCC.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | 
  9 | from ETC import compute_1D, compute_2D, generate
 10 | from ETC.seq.recode import partition
 11 | from itertools import product
 12 | from functools import partial
 13 | from multiprocessing import Pool
 14 | from time import perf_counter
 15 | from matplotlib import pyplot as plt
 16 | import pandas as pd
 17 | import seaborn as sns
 18 | 
 19 | sns.set()
 20 | 
 21 | get1D = partial(compute_1D, order=2, verbose=False, truncate=True)
 22 | get2D = partial(compute_2D, order=2, verbose=False, truncate=True)
 23 | 
 24 | 
 25 | def test(seq_x, seq_y, past_win_size, delta, step_size, partitions=False):
 26 |     if partitions:
 27 |         seq_x = partition(seq_x, partitions)
 28 |         seq_y = partition(seq_y, partitions)
 29 | 
 30 |     aggregator = []
 31 | 
 32 |     length = len(seq_x)
 33 |     total_win_size = past_win_size + delta
 34 | 
 35 |     for n, k in enumerate(range(0, length - total_win_size, step_size)):
 36 |         out = {}
 37 |         out["partitions"] = partitions
 38 |         out["window"] = n + 1
 39 |         out["begin"] = k
 40 |         out["past_win_size"] = past_win_size
 41 |         out["end_past"] = k + past_win_size
 42 |         out["delta"] = delta
 43 |         out["total_win_size"] = total_win_size
 44 |         out["end_total"] = k + total_win_size
 45 |         out["step_size"] = step_size
 46 | 
 47 |         ## CC 1D for X --------------------------------------------------------
 48 |         # ETC 1D for past values of X
 49 |         ETC1D_X_ini = get1D(seq_x[k : k + past_win_size])["ETC1D"]
 50 |         out["ETC_1D_X_past_raw"] = ETC1D_X_ini
 51 | 
 52 |         ETC1D_X_ini /= past_win_size - 1
 53 |         out["ETC_1D_X_past_norm"] = ETC1D_X_ini
 54 | 
 55 |         # ETC 1D for past and current+past=total values of X
 56 |         ETC1D_X_fin = get1D(seq_x[k : k + total_win_size])["ETC1D"]
 57 |         out["ETC_1D_X_total_raw"] = ETC1D_X_fin
 58 | 
 59 |         ETC1D_X_fin /= total_win_size - 1
 60 |         out["ETC_1D_X_total_norm"] = ETC1D_X_fin
 61 | 
 62 |         # CC 1D for past and total values of X
 63 |         CC1D_X_past = ETC1D_X_fin - ETC1D_X_ini
 64 |         out["CC_1D_X"] = CC1D_X_past
 65 |         ## --------------------------------------------------------------------
 66 | 
 67 |         ## CC 1D for Y --------------------------------------------------------
 68 |         # ETC 1D for past values of Y
 69 |         ETC1D_Y_ini = get1D(seq_y[k : k + past_win_size])["ETC1D"]
 70 |         out["ETC_1D_Y_past_raw"] = ETC1D_Y_ini
 71 | 
 72 |         ETC1D_Y_ini /= past_win_size - 1
 73 |         out["ETC_1D_Y_past_norm"] = ETC1D_Y_ini
 74 | 
 75 |         # ETC 1D for past and current+past=total values of Y
 76 |         ETC1D_Y_fin = get1D(seq_y[k : k + total_win_size])["ETC1D"]
 77 |         out["ETC_1D_Y_total_raw"] = ETC1D_Y_fin
 78 | 
 79 |         ETC1D_Y_fin /= total_win_size - 1
 80 |         out["ETC_1D_Y_total_norm"] = ETC1D_Y_fin
 81 | 
 82 |         # CC 1D for past and total values of Y
 83 |         CC1D_Y_past = ETC1D_Y_fin - ETC1D_Y_ini
 84 |         out["CC_1D_Y"] = CC1D_Y_past
 85 |         ## --------------------------------------------------------------------
 86 | 
 87 |         # ETC 2D for past values of X and Y -----------------------------------
 88 |         ETC2D_ini = get2D(seq_x[k : k + past_win_size], seq_y[k : k + past_win_size])[
 89 |             "ETC2D"
 90 |         ]
 91 |         out["ETC_2D_X_past_Y_past_raw"] = ETC2D_ini
 92 |         out["ETC_2D_Y_past_X_past_raw"] = ETC2D_ini
 93 | 
 94 |         ETC2D_ini /= past_win_size - 1
 95 |         out["ETC_2D_X_past_Y_past_norm"] = ETC2D_ini
 96 |         out["ETC_2D_Y_past_X_past_norm"] = ETC2D_ini
 97 |         ## --------------------------------------------------------------------
 98 | 
 99 |         # ETC 2D for current+past=total values of X and past values of Y plus current values of X
100 |         ETC2D_X_fin = get2D(
101 |             seq_x[k : k + total_win_size],
102 |             seq_y[k : k + past_win_size]
103 |             + seq_x[k + past_win_size : k + total_win_size],
104 |         )["ETC2D"]
105 |         out["ETC_2D_X_total_Y_past_raw"] = ETC2D_X_fin
106 | 
107 |         ETC2D_X_fin /= total_win_size - 1
108 |         out["ETC_2D_X_total_Y_past_norm"] = ETC2D_X_fin
109 | 
110 |         # CC 2D for past and total values of X
111 |         CC2D_X_total_Y_past = ETC2D_X_fin - ETC2D_ini
112 |         out["CC_2D_X_by_Y_past"] = CC2D_X_total_Y_past
113 |         ## --------------------------------------------------------------------
114 | 
115 |         # ETC 2D for current+past=total values of Y and past values of X plus current values of Y
116 |         ETC2D_Y_fin = get2D(
117 |             seq_y[k : k + total_win_size],
118 |             seq_x[k : k + past_win_size]
119 |             + seq_y[k + past_win_size : k + total_win_size],
120 |         )["ETC2D"]
121 |         out["ETC_2D_Y_total_X_past_raw"] = ETC2D_Y_fin
122 | 
123 |         ETC2D_Y_fin /= total_win_size - 1
124 |         out["ETC_2D_Y_total_X_past_norm"] = ETC2D_Y_fin
125 | 
126 |         # CC 2D for past and total values of X
127 |         CC2D_Y_total_X_past = ETC2D_Y_fin - ETC2D_ini
128 |         out["CC_2D_Y_by_X_past"] = CC2D_Y_total_X_past
129 |         ## --------------------------------------------------------------------
130 |         aggregator.append(out)
131 | 
132 |     return pd.DataFrame(aggregator)
133 | 
134 | 
def test_multiple(seq_x, seq_y):
    """
    Run `test` serially over a fixed grid of window parameters and time it.

    Returns
    -------
    tuple
        (concatenated DataFrame of all runs, elapsed seconds including the
        concatenation).

    """
    past_sizes = [100, 150, 175, 200]
    current_sizes = [10, 15, 20, 25]
    step_sizes = [10, 15, 20, 25, 30]

    started = perf_counter()

    # One frame per (past window, current window, step) combination
    frames = [
        test(seq_x, seq_y, past_win_size, delta, step_size)
        for past_win_size, delta, step_size in product(
            past_sizes, current_sizes, step_sizes
        )
    ]
    combined = pd.concat(frames)

    elapsed = perf_counter() - started

    return combined, elapsed
158 | 
159 | 
def unpack(function, params):
    """
    Call `function` with the three items of `params` as positional arguments.

    `params` must unpack into exactly three values (past window size, delta,
    step size); any other length raises ValueError from the unpacking.
    """
    (past_win_size, delta, step_size) = params
    result = function(past_win_size, delta, step_size)
    return result
163 | 
164 | 
def test_multiple_parallel(seq_x, seq_y):
    """
    Run `test` over a fixed grid of window parameters using a process pool and
    time the whole operation.

    The pool is used as a context manager so that worker processes are always
    released, even when one of the mapped calls raises.

    Returns
    -------
    tuple
        (concatenated DataFrame of all runs, elapsed seconds including pool
        start-up, shutdown and the concatenation).

    """
    # Past window size
    PWS = [50, 75]

    # Current window size
    CWS = range(10, 51, 5)

    # Jump step size
    SS = [25, 50]

    # Bind the sequences, then adapt to the single-argument form pool.map needs
    func = partial(test, seq_x, seq_y)
    func = partial(unpack, func)

    before = perf_counter()

    # Leaving the with-block terminates any workers still alive (error path)
    with Pool() as pool:

        # Map-execute function across parameter combinations
        results = pool.map(func, product(PWS, CWS, SS))

        # Graceful exit on the success path
        pool.close()
        pool.join()

    results = pd.concat(results)

    after = perf_counter()

    return results, after - before
195 | 
196 | 
197 | # x = generate(1000)
198 | # y = generate(1000)
199 | # a2, timings = test_multiple_parallel(x, y)
200 | 
201 | 
202 | # %%
203 | # fig, ax = plt.subplots(1,1)
204 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_1D_X_past_norm', ax=ax)
205 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_1D_X_total_norm', ax=ax)
206 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_2D_X_past_Y_past_norm', ax=ax)
207 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_2D_X_total_Y_past_norm', ax=ax)
208 | 
209 | # # %%
210 | # fig, ax = plt.subplots(1, 1)
211 | # sns.lineplot(
212 | #     data=a2,
213 | #     hue="past_win_size",
214 | #     y="ETC_1D_X_past_norm",
215 | #     x="delta",
216 | #     ci=None,
217 | #     ax=ax,
218 | #     palette="viridis",
219 | # )
220 | # sns.lineplot(
221 | #     data=a2,
222 | #     hue="past_win_size",
223 | #     y="ETC_1D_X_total_norm",
224 | #     x="delta",
225 | #     ci=None,
226 | #     ax=ax,
227 | #     palette="viridis",
228 | # )
229 | 


--------------------------------------------------------------------------------
/ETC/NSRWS/x1D/core.pyx:
--------------------------------------------------------------------------------
  1 | # cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, emit_code_comments=True, cdivision=True, embedsignature=True
  2 | #!/usr/bin/env python3
  3 | # -*- coding: utf-8 -*-
  4 | """
  5 | 
  6 | 
  7 | @author: Pranay S. Yadav
  8 | """
  9 | # Import stuff
 10 | from cpython cimport array, bool
 11 | cimport cython
 12 | import array
 13 | 
 14 | # Function for getting mask for pairs
 15 | cpdef array.array get_mask_pairs(const unsigned int[::1] x):
 16 |     """
 17 |     INPUT
 18 |     -----
 19 |     x : array.array
 20 |         Array object containing 32-bit integers.
 21 | 
 22 |     OUTPUT
 23 |     ------
 24 |     mask : array.array
 25 |         Array object containing 32-bit integers - 0s or 1s corresponding to values in
 26 |         x for which successive overlapping pairs occur.
 27 |     """
 28 |     # Get size of input
 29 |     cdef Py_ssize_t x_size = len(x)
 30 | 
 31 |     # Initialize a mask of Falses
 32 |     cdef array.array int_template = array.array('I', [])
 33 |     cdef array.array mask = array.clone(int_template, x_size-1, zero=True)
 34 |     cdef unsigned int[:] mask_view = mask
 35 | 
 36 |     # Initialize bounds for iteration
 37 |     cdef Py_ssize_t n = 0
 38 | 
 39 |     # Turn all values in mask to Trues
 40 |     for n in range(x_size-1):
 41 |         mask_view[n] += 1
 42 | 
 43 |     # Iterate over all values of input
 44 |     n = 0
 45 |     while n < x_size - 2:
 46 | 
 47 |         # If successive pairs match
 48 |         if x[n] == x[n+1] and x[n+1] == x[n+2]:
 49 | 
 50 |             # Mask out the second one
 51 |             mask_view[n+1] = 0
 52 | 
 53 |             # And slide over it
 54 |             n += 1
 55 | 
 56 |         # Increment while loop index
 57 |         n += 1
 58 | 
 59 |     return mask
 60 | 
 61 | # Function for substituting pairs
 62 | cpdef list substitute_pairs(unsigned int[::1] x, unsigned int[::1] pair, unsigned int value):
 63 |     """
 64 |     INPUT
 65 |     -----
 66 |     x : array.array
 67 |         Array object containing 32-bit unsigned integers.
 68 | 
 69 |     pair : array.array, length = 2
 70 |         Array object containing 2 32-bit unsigned integers.
 71 | 
 72 |     value : unsigned 32-bit int
 73 |         Value to substitute the first element of pair with
 74 | 
 75 |     OUTPUT
 76 |     ------
 77 |     out : list
 78 |         Array object containing 32-bit integers, with supplied pair replaced everywhere
 79 |         by the supplied value.
 80 |     """
 81 |     # Initialize looping variables and output list
 82 |     cdef Py_ssize_t n = 0
 83 |     cdef Py_ssize_t x_size = len(x)
 84 |     cdef list out = []
 85 | 
 86 |     # Loop over input and replace pair
 87 |     while n < x_size-1:
 88 | 
 89 |         # Check for match with supplied pair
 90 |         if x[n] == pair[0] and x[n+1] == pair[1]:
 91 | 
 92 |             # Replace first value with supplied value
 93 |             x[n] = value
 94 | 
 95 |             # Replace second value with 0
 96 |             x[n+1] = 0
 97 | 
 98 |             n += 1
 99 | 
100 |         n += 1
101 | 
102 |     # Reset indexing variable
103 |     n = 0
104 | 
105 |     # Loop over mutated input and append non-zero values to list
106 |     for n in range(x_size):
107 | 
108 |         if x[n]:
109 | 
110 |             out.append(x[n])
111 | 
112 |     return out
113 | 
114 | # Function for checking whether all elements in input are identical
cpdef bint check_equality(const unsigned int[::1] x):
    """
    INPUT
    -----
    x : array.array
        Array object containing 32-bit unsigned integers.


    OUTPUT
    ------
    bool
        True if all elements are identical (also True when the loop has
        nothing to compare), False as soon as one element differs from the
        first.
    """
    cdef Py_ssize_t idx
    cdef Py_ssize_t total = len(x)

    # Comparing the first element with itself can never fail, so start at 1
    for idx in range(1, total):

        # Bail out at the first element that differs from the first one
        if x[idx] != x[0]:
            return False

    return True
140 | 
141 | # Function for getting mask for windows of any length
cpdef array.array get_mask_windows(const unsigned int[::1] x, unsigned int order):
    """
    Build a 0/1 mask over window start positions, zeroing out windows that
    repeat an earlier, still-enabled window they overlap with.

    INPUT
    -----
    x : array.array
        Array object containing 32-bit unsigned integers.

    order: unsigned 32-bit int
        Length of the window to slide across input

    OUTPUT
    ------
    mask : array.array
        Array of len(x) - order + 1 unsigned integers, one per window start.
        An entry is set to 0 when the window starting there is identical to an
        enabled window starting 1 to order-1 positions earlier; other entries
        stay 1.
    """
    # Get size of input
    cdef Py_ssize_t x_size = len(x)

    # Allocate a zero-filled mask with one slot per window start position
    cdef array.array int_template = array.array('I', [])
    cdef array.array mask = array.clone(int_template, x_size - (order-1), zero=True)
    cdef unsigned int[:] mask_view = mask

     # Initialize variable for iteration
    cdef Py_ssize_t n = 0

    # Turn all values in mask to Trues (0 -> 1)
    for n in range(x_size - order + 1):
        mask_view[n] += 1

    # Initialize variables for iteration across input
    cdef Py_ssize_t k = 0 # Outer loop: shift of the 'next' window
    cdef Py_ssize_t m = 0 # Inner loop: position inside the window

    # Tracking variable for counting matching elements in pairwise window comparison
    cdef unsigned int track = 0

    # Iterate over every window start position [Outermost master loop]
    n = 0
    for n in range(x_size - (order-1)):

        # proceed only if mask is True for current element
        if mask_view[n]:

            # Outer loop for sliding the 'next' window by unit step (current vs next)
            for k in range(1,order):  # Start from 1 - begin comparing from next window

                # Inner loop for comparing elements in current and next windows
                for m in range(order):

                    # The shifted window would run past the end of the input:
                    # nothing further can be compared, so the mask is final.
                    # This exit also guarantees that the mask_view[n+k] write
                    # below only happens for n+k inside the mask.
                    if n+m+k >= x_size:
                        return mask

                    # If elements match, increment tracker
                    if x[n+m] == x[n+m+k]:
                        track += 1

                    # A mismatch does not end the inner loop early; the
                    # remaining positions are still compared
                    # else:
                    #     break

                # Trick: preserve mask only if track doesn't equal order
                # If track == order, short-circuit eval takes precedence, returning 0
                mask_view[n+k] = track!=order and mask_view[n+k]

                # Reset tracker
                track = 0

    return mask
212 | 
213 | # Function for substituting windows of any length
cpdef list substitute_windows(unsigned int[::1] x, unsigned int order, unsigned int[::1] window, unsigned int value):
    """
    Replace every left-to-right occurrence of a window in the input by a single
    value and return the shortened sequence.

    NOTE: the input buffer is modified in place - the first element of each matched
    window is overwritten with 'value' and the remaining elements with 0. Zeros are
    then dropped while collecting the output.

    INPUT
    -----
    x : array.array
        Array object containing 32-bit unsigned integers.

    order: unsigned 32-bit int
        Length of the window to slide across input

    window : array.array
        Array object containing 32-bit unsigned integers; its first 'order'
        elements are compared against the input.

    value : unsigned 32-bit int
        Value to substitute the first element of a matched window with

    OUTPUT
    ------
    out : list
        Non-zero elements of the mutated input, i.e. the input with the supplied
        window replaced by the supplied value.
    """
    # Looping variables, match flag and output list
    cdef Py_ssize_t start = 0   # Start index of the candidate window
    cdef Py_ssize_t offset = 0  # Position inside the candidate window
    cdef Py_ssize_t x_size = len(x)
    cdef bint matched = False
    cdef list out = []

    # The last candidate window begins 'order' elements before the end of the input
    for start in range(x_size-order+1):

        # Compare the candidate with the supplied window, bail out on first mismatch
        matched = True
        for offset in range(order):
            if x[start+offset] != window[offset]:
                matched = False
                break

        if matched:

            # Head of the window takes the substitute value ...
            x[start] = value

            # ... and the rest of the window is blanked out with zeros
            for offset in range(1, order):
                x[start+offset] = 0

    # Collect whatever is non-zero in the mutated input
    for start in range(x_size):

        if x[start]:

            out.append(x[start])

    return out


--------------------------------------------------------------------------------
/ETC/NSRWS/x1D/parallel.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | from functools import partial
  9 | from itertools import islice
 10 | 
 11 | # Import functions from standard library modules
 12 | from multiprocessing import Pool
 13 | 
 14 | # Import local modules
 15 | import ETC
 16 | from ETC.seq.process import entropy
 17 | 
 18 | # from ETC.seq.process import entropy
 19 | import numpy as np
 20 | 
 21 | # Function definitions
 22 | def _compute_single_file(filepath, order=2):
 23 |     """
 24 |     This function operates on a single file - reads sequence, computes ETC
 25 |     and writes to disk.
 26 | 
 27 |     Parameters
 28 |     ----------
 29 |     filepath : str or Path object
 30 |         Valid path to a file containing sequence.
 31 | 
 32 |     Returns
 33 |     -------
 34 |     out : dict
 35 |         filename, length of sequence and ETC estimate.
 36 | 
 37 |     """
 38 |     # Read file as a sequence
 39 |     seq = ETC.seq.IO.read(filepath)
 40 |     seq = ETC.seq.recode.recode_lexical(seq)
 41 | 
 42 |     # Filename for writing output of ETC computation
 43 |     fname = filepath.with_name(filepath.stem + f"_ETC_order{order}.csv")
 44 | 
 45 |     # Prepare output dictionary
 46 |     out = {"file": filepath.name, "length": len(seq), "entropy": entropy(seq)}
 47 | 
 48 |     # Compute ETC, write to file and update output dictionary
 49 |     out.update(ETC.NSRWS.x1D.etc.compute_save(seq, fname, order=order, truncate=True))
 50 | 
 51 |     return out
 52 | 
 53 | 
 54 | def pcompute_files(filelist, order=2):
 55 |     """
 56 |     This function operates concurrently on a list of files. Reads each as a
 57 |     sequence, computes ETC and writes output to disk.
 58 | 
 59 |     CAUTION: main module is unguarded, do not run these functions as is,
 60 |         particularly on Windows.
 61 | 
 62 |     Parameters
 63 |     ----------
 64 |     filelist : list/tuple/generator
 65 |         Collection of filenames of files containing sequence data.
 66 | 
 67 |     Returns
 68 |     -------
 69 |     list of dict elements
 70 |         Each dictionary element contains filename, length of sequence & ETC.
 71 | 
 72 |     """
 73 |     # Initialize pool of parallel workers
 74 |     pool = Pool()
 75 |     func = partial(_compute_single_file, order=order)
 76 |     # Map-execute function across files
 77 |     out = pool.map_async(func, filelist)
 78 | 
 79 |     # Graceful exit
 80 |     pool.close()
 81 |     pool.join()
 82 | 
 83 |     # Return collected results
 84 |     return out.get()
 85 | 
 86 | 
 87 | def _compute_single_seq(seq):
 88 |     """
 89 |     This function operates on a single sequence and computes ETC.
 90 | 
 91 |     Parameters
 92 |     ----------
 93 |     seq : tuple of 2 elements
 94 |         1st element is index for tracking.
 95 |         2nd element is a sequence of integers used for ETC computation.
 96 |         Output of enumerate.
 97 | 
 98 |     Returns
 99 |     -------
100 |     out : dict
101 |         index of sequence, length of sequence and ETC estimate.
102 | 
103 |     """
104 |     # Prepare output dictionary
105 |     out = {"index": seq[0], "length": len(seq[1]), "entropy": entropy(seq[1])}
106 | 
107 |     # Compute ETC and update output dictionary
108 |     out.update(ETC.compute_1D(seq[1], order=2, verbose=False, truncate=True))
109 | 
110 |     return out
111 | 
112 | 
113 | # TEMPORARY MOD FOR ASSEMBLY-FREE
114 | # def _compute_single_seq(seq):
115 | #     """
116 | #     This function operates on a single sequence and computes ETC.
117 | 
118 | #     Parameters
119 | #     ----------
120 | #     seq : tuple of 2 elements
121 | #         1st element is index for tracking.
122 | #         2nd element is a sequence of integers used for ETC computation.
123 | #         Output of enumerate.
124 | 
125 | #     Returns
126 | #     -------
127 | #     out : dict
128 | #         index of sequence, length of sequence and ETC estimate.
129 | 
130 | #     """
131 | #     data = ETC.seq.recode.recode_lexical(seq[1])
132 | 
133 | #     # Prepare output dictionary
134 | #     out = {"item": seq[0], "length": len(data)}
135 | 
136 | #     # Compute ETC and update output dictionary
137 | #     out.update(ETC.compute_1D(data, order=2, verbose=False, truncate=True))
138 | #     out.update({"Entropy": entropy(data, legacy=False)})
139 | 
140 | #     return out
141 | 
142 | 
def pcompute_multiple_seq(iterable):
    """
    This function operates concurrently on a collection of sequences. Loads
    each sequence and computes ETC.

    CAUTION: main module is unguarded, do not run these functions as is,
        particularly on Windows.

    Parameters
    ----------
    iterable : list/tuple/generator
        Collection of integer sequences.

    Returns
    -------
    list of dict elements
        One dictionary per sequence, as returned by _compute_single_seq; the
        "index" field holds the position of the sequence in the input.

    """
    # Initialize pool of parallel workers
    pool = Pool()

    try:
        # Map-execute function across (index, sequence) pairs, blocking until
        # all results are in
        results = pool.map(_compute_single_seq, enumerate(iterable))

    finally:
        # Graceful exit - also runs when consuming the input or a worker raises,
        # so the worker processes are never left behind
        pool.close()
        pool.join()

    # Return collected results
    return results
174 | 
175 | 
176 | def _overlapping_chunks(seq, size, offset=1):
177 |     """
178 |     This function takes an input sequence and produces chunks of chosen size.
179 |     Offset can be used to control degree of overlap (or distance between chunks
180 |     that don't overlap)
181 | 
182 |     Parameters
183 |     ----------
184 |     seq : tuple or list
185 |         Sequence of integers.
186 |     size : int
187 |         Length of each produced chunk.
188 |     offset : int, optional
189 |         Number of elements to shift each chunk by. The default is 1.
190 |         Setting this to any value less than size allows control of overlap.
191 |         Setting this >= size produces non-overlapping chunks.
192 | 
193 |     Returns
194 |     -------
195 |     zip
196 |         zip object that produces chunks of specified size, one at a time.
197 | 
198 |     """
199 | 
200 |     return zip(*(islice(seq, i, None, offset) for i in range(size)))
201 | 
202 | 
203 | def _non_overlapping_chunks(seq, size):
204 |     """
205 |     This function takes an input sequence and produces chunks of chosen size
206 |     that strictly do not overlap. This is a much faster implemetnation than
207 |     _overlapping_chunks and should be preferred if running on very large seq.
208 | 
209 |     Parameters
210 |     ----------
211 |     seq : tuple or list
212 |         Sequence of integers.
213 |     size : int
214 |         Length of each produced chunk.
215 | 
216 |     Returns
217 |     -------
218 |     zip
219 |         zip object that produces chunks of specified size, one at a time.
220 | 
221 |     """
222 | 
223 |     return zip(*[iter(seq)] * size)
224 | 
225 | 
def pcompute_single(seq, size, offset=1):
    """
    Split one sequence into chunks and compute ETC on the chunks concurrently.
    The offset parameter controls the degree of overlap (or non-overlap)
    between successive chunks.

    CAUTION: main module is unguarded, do not run these functions as is,
        particularly on Windows.

    Parameters
    ----------
    seq : tuple or list
        Sequence of integers.
    size : int
        Length of each produced chunk.
    offset : int, optional
        Number of elements to shift each chunk by. The default is 1.
        Setting this to any value less than size allows control of overlap.
        Setting this >= size produces non-overlapping chunks.

    Returns
    -------
    list of dict elements
        Output of pcompute_multiple_seq for the produced chunks.

    """
    # Back-to-back chunks (offset equal to size) take the dedicated chunker;
    # every other offset goes through the general strided one
    chunks = (
        _non_overlapping_chunks(seq, size)
        if offset == size
        else _overlapping_chunks(seq, size, offset)
    )

    # Execute parallel computation over chunks
    return pcompute_multiple_seq(chunks)
262 | 
263 | 
def pcompute_numpy(nparr):
    """
    Compute ETC concurrently for every row of a 2D NumPy array.

    CAUTION: main module is unguarded, do not run these functions as is,
        particularly on Windows.

    Parameters
    ----------
    nparr : numpy array, np.uint32, 2D
        Each row is treated as a separate sequence.

    Returns
    -------
    list of dict elements
        One dictionary per row, as returned by _compute_single_seq; the
        "index" field holds the row number.

    """
    # Validate type, dimensionality and dtype before spawning any worker
    is_valid = (
        isinstance(nparr, np.ndarray) and nparr.ndim == 2 and nparr.dtype == np.uint32
    )
    assert (
        is_valid
    ), ">ERROR: Input must be 2D NumPy array of 32-bit unsigned integers (np.uint32)"

    # Initialize pool of parallel workers
    workers = Pool()

    # Iterating a 2D array yields its rows; pair each row with its row number
    rows = list(nparr)
    pending = workers.map_async(_compute_single_seq, enumerate(rows))

    # Graceful exit
    workers.close()
    workers.join()

    # Return collected results
    return pending.get()
300 | 


--------------------------------------------------------------------------------
/ETC/tests/test_NSRWS1D.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | 
  9 | from array import array
 10 | from random import choice
 11 | 
 12 | from hypothesis import given
 13 | from hypothesis.strategies import composite, integers, lists
 14 | 
 15 | from ETC.NSRWS.x1D import onestep
 16 | from ETC.NSRWS.x1D import etc as cetc
 17 | from ETC.NSRWS.x1D import core as cc
 18 | 
 19 | 
 20 | @composite
 21 | def generate_sequence(draw, elements=[lists, integers]):
 22 |     """
 23 |     Generate a list of integers as sequence input, and an integer >= 2 for order param
 24 |     """
 25 |     seq = draw(lists(integers(min_value=1, max_value=100), min_size=3, max_size=10_000))
 26 | 
 27 |     order = draw(integers(min_value=2, max_value=len(seq) - 1))
 28 | 
 29 |     return seq, order
 30 | 
 31 | 
 32 | @composite
 33 | def generate_sequence_identical(draw, elements=[lists, integers]):
 34 |     """
 35 |     Generate a list with all identical integers, and an integer >= 2 for order param
 36 |     """
 37 |     seq = draw(lists(integers(min_value=1, max_value=1), min_size=3, max_size=10_000))
 38 | 
 39 |     order = draw(integers(min_value=2, max_value=len(seq) - 1))
 40 | 
 41 |     return seq, order
 42 | 
 43 | 
 44 | @given(generate_sequence())
 45 | def test_onestep(inputs):
 46 |     """
 47 |     Test the outermost onestep function exposed for direct estimation
 48 |     """
 49 |     seq, order = inputs
 50 |     output1, signal = onestep.onestep(seq, order, verbose=False, check=False)
 51 |     output2verbose = onestep.onestep(seq, order, verbose=True, check=False)
 52 | 
 53 |     # Substituted sequence should be shorter than input
 54 |     assert len(output1) < len(seq)
 55 | 
 56 |     # Highest value in substituted sequence should be greater than that in input
 57 |     assert max(output1) > max(seq)
 58 | 
 59 |     # Smallest value in substituted sequence should be at least as large as that in input
 60 |     assert min(output1) >= min(seq)
 61 | 
 62 |     # Number of unique symbols in output should be one less than that in input
 63 |     assert len(set(output1) - set(seq)) == 1
 64 | 
 65 |     # Changing verbosity parameter should not alter the substituted sequence
 66 |     assert output1 == output2verbose[0]
 67 | 
 68 | 
 69 | @given(generate_sequence())
 70 | def test_onestep_invalid(inputs):
 71 |     """
 72 |     Test the outermost onestep function for invalid input: sequence shorter than order
 73 |     """
 74 |     seq, order = inputs
 75 | 
 76 |     output = onestep.onestep(seq[: order - 1], order, verbose=False, check=False)
 77 | 
 78 |     assert output is None
 79 | 
 80 | 
 81 | def test_onestep_invalid_str():
 82 |     """
 83 |     Test the outermost onestep function for invalid input: string input
 84 |     """
 85 |     output = onestep.onestep("abcdef", 6, verbose=False, check=False)
 86 | 
 87 |     assert output is None
 88 | 
 89 | 
 90 | @given(generate_sequence_identical())
 91 | def test_onestep_identical(inputs):
 92 |     """
 93 |     Test the outermost onestep function for sequence with identical symbols
 94 |     """
 95 |     seq, order = inputs
 96 | 
 97 |     output = onestep.onestep(seq, order, verbose=False, check=True)
 98 | 
 99 |     assert output is None
100 | 
101 | 
@given(generate_sequence())
def test_onestep_pairs_vs_windows(inputs):
    """
    NSRPS (pairs) and NSRWS with order=2 must agree on random sequences
    """
    # The drawn order is irrelevant here: both routes work on pairs
    data = array("I", inputs[0])

    # Each route receives its own copy of the data
    via_pairs = onestep._onestep_pairs(data[:], verbose=True)
    via_windows = onestep._onestep_windows(data[:], 2, verbose=True)

    # Only the first 4 parts of the output are compared; timings follow after them
    for position in range(4):
        assert via_pairs[position] == via_windows[position]
119 | 
120 | 
@given(generate_sequence_identical())
def test_onestep_pairs_vs_windows_identical(inputs):
    """
    NSRPS (pairs) and NSRWS with order=2 must agree on all-identical sequences
    """
    # The drawn order is irrelevant here: both routes work on pairs
    data = array("I", inputs[0])

    # Each route receives its own copy of the data
    via_pairs = onestep._onestep_pairs(data[:], verbose=True)
    via_windows = onestep._onestep_windows(data[:], 2, verbose=True)

    # Only the first 4 parts of the output are compared; timings follow after them
    for position in range(4):
        assert via_pairs[position] == via_windows[position]
138 | 
139 | 
@given(generate_sequence())
def test_get_mask_general(inputs):
    """
    Test the get_mask function for random sequences and orders
    """
    seq, order = inputs
    seq = array("I", seq)

    # Get mask depending on order
    if order == 2:
        mask = cc.get_mask_pairs(seq)
    else:
        mask = cc.get_mask_windows(seq, order)

    # Mask should be precisely shorter than input sequence
    assert len(mask) == len(seq) - order + 1

    # Mask should only contain 0s and 1s
    assert set(mask).issubset({0, 1})

    # First element must be 1
    assert mask[0] == 1

    # Locate the first masked window, if any (array.index raises ValueError if absent)
    try:
        idx0 = mask.index(0)
    except ValueError:
        return

    # A 0 in the mask marks a window that repeats an earlier, overlapping window.
    # Comparing only the first and last element of the masked window is not a valid
    # check: e.g. in 1,2,1,2,1,2 with order 4 the window at index 2 is masked
    # although seq[2] != seq[5]. Hence compare whole windows, over every shift that
    # still overlaps (shift < order) and stays inside the sequence (shift <= idx0).
    masked_window = seq[idx0 : idx0 + order]
    assert any(
        seq[idx0 - shift : idx0 - shift + order] == masked_window
        for shift in range(1, min(order, idx0 + 1))
    )
170 | 
171 | 
@given(generate_sequence_identical())
def test_get_mask_identical(inputs):
    """
    Check the mask functions on sequences made up of a single repeated symbol
    """
    values, order = inputs
    seq = array("I", values)
    reversed_seq = seq[::-1]

    # Pairs have a dedicated function, every other order uses the general one;
    # the mask is built for the sequence as given and for its reverse
    if order == 2:
        mask = cc.get_mask_pairs(seq)
        mask_rev = cc.get_mask_pairs(reversed_seq)
    else:
        mask = cc.get_mask_windows(seq, order)
        mask_rev = cc.get_mask_windows(reversed_seq, order)

    # Identical symbols guarantee overlapping repeats, so a 0 must exist
    first_zero = mask.index(0)

    # Both masks hold one entry per window
    expected_length = len(seq) - order + 1
    assert len(mask) == len(mask_rev) == expected_length

    # Both masks are strictly binary
    assert set(mask).issubset({0, 1})
    assert set(mask_rev).issubset({0, 1})

    # The very first window is never masked
    assert mask[0] == 1

    # Ends of the masked window hold the same symbol
    assert seq[first_zero] == seq[first_zero + order - 1]
203 | 
204 | 
def test_mask_and_count():
    """
    Check that applying a mask and counting windows reports the expected window
    """
    # Each case: (sequence, mask, window expected in the result); order is 3 and
    # the expected count is 1 throughout
    cases = (
        ((1, 2, 3, 4, 5, 6, 7), (1, 0, 0, 1, 1), (1, 2, 3)),
        ((1, 2, 3, 4, 5, 6, 7), (1, 1, 1, 1, 1), (1, 2, 3)),
        ((1, 2, 3, 4, 5, 6, 7), (0, 1, 1, 1, 1), (2, 3, 4)),
        ((1, 1, 1, 1, 1, 2, 1), (1, 0, 0, 1, 1), (1, 1, 1)),
    )

    for seq, mask, expected_window in cases:
        assert onestep._mask_and_count(seq, mask, 3) == (array("I", expected_window), 1)
224 | 
225 | 
@given(generate_sequence())
def test_substitution(inputs):
    """
    Check that pair and window substitution agree and behave sensibly
    """
    # The drawn order is not needed: substitution is exercised on pairs only
    seq = array("I", inputs[0])

    # The substitute is a fresh symbol, one above the current maximum
    sub_value = max(seq) + 1

    # Pick a symbol at random (never the last position, so it has a successor) and
    # form a pair from the first occurrence of that symbol
    picked = choice(seq[:-1])
    idx = seq.index(picked)
    pair = array("I", [seq[idx], seq[idx + 1]])

    # Run both implementations, each on its own copy of the data
    from_pairs = cc.substitute_pairs(seq[:], pair, sub_value)
    from_windows = cc.substitute_windows(seq[:], 2, pair, sub_value)

    # Both implementations must produce the same sequence
    assert from_pairs == from_windows

    # Substitution shortens the sequence
    assert len(from_pairs) < len(seq)

    # The largest output symbol exceeds the largest input symbol ...
    assert max(from_pairs) > max(seq)

    # ... and is exactly the substitute that was supplied
    assert max(from_pairs) == sub_value
256 | 
257 | 
@given(generate_sequence())
def test_truncation(inputs):
    """
    ETC estimates must not depend on the verbosity or truncation settings
    """
    seq, order = inputs

    # Same evaluation order as spelled out: verbose first, truncate toggled within
    settings = ((True, False), (True, True), (False, False), (False, True))

    estimates = []
    for verbose, truncate in settings:
        result = cetc.compute(seq, order, verbose=verbose, truncate=truncate)
        estimates.append(result["ETC1D"])

    etc_vf, etc_vt, etc_cf, etc_ct = estimates

    # All 4 estimates should be identical
    assert etc_vf == etc_vt == etc_cf == etc_ct
272 | 
273 | 
def test_compute_save(tmp_path):
    """
    Check ETC estimation combined with writing results to disk
    """
    # 200 symbols alternating between 2 and 4
    data = array("I", [2, 4] * 100)

    # Output location inside pytest's temporary directory (Path object)
    target = tmp_path / "test.csv"

    # Without truncation
    full_run = cetc.compute_save(data, target, order=2, truncate=False)
    assert isinstance(full_run, dict)

    # With truncation
    truncated_run = cetc.compute_save(data, target, order=2, truncate=True)
    assert isinstance(truncated_run, dict)

    # Truncation must not change the estimate
    assert full_run["ETC1D"] == truncated_run["ETC1D"]
293 | 


--------------------------------------------------------------------------------
/ETC/NCA/parallelize_jl.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Parallelized NCA estimation using joblib
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | from ETC.CCMC.pairs import ETC_causality as ETC_compute
  9 | from ETC.CCMC.pairs import LZ_causality as LZ_compute
 10 | from ETC.CCC.compute_CCC import compute as CCC_compute
 11 | from joblib import Parallel, delayed
 12 | from functools import partial
 13 | from itertools import combinations
 14 | 
 15 | 
 16 | def _kernel_CCC(inputs, CCC_params):
 17 |     """
 18 |     Wrapper for computing causality estimates on a sequence pair
 19 | 
 20 |     Used for causal discovery and estimation from CCM based methods as well as CCC.
 21 | 
 22 |     The function unpacks inputs into an index element and a sequence pair and runs the
 23 |     estimator function on the sequence pair, returning various estimates in a dict
 24 | 
 25 |     Parameters
 26 |     ----------
 27 |     inputs : tuple
 28 |         Tuple of two elements - (a, b) where a is an index, b is a tuple of two. a can
 29 |         be produced manually or more typically using enumerate; b holds the two sequences
 30 |         usually passed in by zip-ping larger iterables or itertools' product/combinations.
 31 |         a, the index, is passed to keep track of order in case of asynchronous execution
 32 |         Should look like this: (index, (sequence_x, sequence_y)
 33 |    CCC_params : dict
 34 |         The following 3 parameters for CCC as key-value pairs:
 35 |         "LEN_past" : int
 36 |             Parameter "L": Window length of immediate past values of seq_x and seq_y.
 37 |         "ADD_meas" : int
 38 |             Parameter "w": Window length of present values of seq_x. Minimal data length
 39 |             over which CC rate can be reliably estimated, application/domain-specific
 40 |         "STEP_size" : int
 41 |             Parameter "delta": Step-size for sliding chunks across both sequences. An overlap
 42 |             of 20-50% between successive chunks or windows suggested.
 43 |         The dictionary can be generated interactively using CCC.get_params()
 44 | 
 45 |     Returns
 46 |     -------
 47 |     out : dict
 48 |         Estimates obtained by running estimator on inputs.
 49 | 
 50 |     """
 51 | 
 52 |     # Unpack inputs
 53 |     idx, seqs = inputs
 54 | 
 55 |     # Unpack sequences
 56 |     idx_x, idx_y, seq_x, seq_y = seqs
 57 | 
 58 |     # Initialize dictionary of output estimates with index
 59 |     out = {"index_pair": idx, "index_x": idx_x, "index_y": idx_y}
 60 | 
 61 |     # Execute CCC_compute on the sequence pair in one direction
 62 |     out.update({"CCC_y_to_x": CCC_compute(seq_x, seq_y, **CCC_params)})
 63 | 
 64 |     # Execute CCC_compute on the sequence pair in the other direction
 65 |     out.update({"CCC_x_to_y": CCC_compute(seq_y, seq_x, **CCC_params)})
 66 | 
 67 |     return out
 68 | 
 69 | 
 70 | def _kernel_ETC(inputs):
 71 |     """
 72 |     Wrapper for computing causality estimates on a sequence pair
 73 | 
 74 |     Used for causal discovery and estimation from CCM based methods as well as CCC.
 75 | 
 76 |     The function unpacks inputs into an index element and a sequence pair and runs the
 77 |     estimator function on the sequence pair, returning various estimates in a dict
 78 | 
 79 |     Parameters
 80 |     ----------
 81 |     inputs : tuple
 82 |         Tuple of two elements - (a, b) where a is an index, b is a tuple of two. a can
 83 |         be produced manually or more typically using enumerate; b holds the two sequences
 84 |         usually passed in by zip-ping larger iterables or itertools' product/combinations.
 85 |         a, the index, is passed to keep track of order in case of asynchronous execution
 86 |         Should look like this: (index, (sequence_x, sequence_y)
 87 | 
 88 |     Returns
 89 |     -------
 90 |     out : dict
 91 |         Estimates obtained by running estimator on inputs.
 92 | 
 93 |     """
 94 | 
 95 |     # Unpack inputs
 96 |     idx, seqs = inputs
 97 | 
 98 |     # Unpack sequences
 99 |     idx_x, idx_y, seq_x, seq_y = seqs
100 | 
101 |     # Initialize dictionary of output estimates with index
102 |     out = {"index_pair": idx, "index_x": idx_x, "index_y": idx_y}
103 | 
104 |     # Execute ETC_compute on the sequence pair
105 |     out.update(ETC_compute(seq_x, seq_y))
106 | 
107 |     return out
108 | 
109 | 
def _kernel_LZ(inputs):
    """
    Wrapper for computing LZ-based causality estimates on a sequence pair

    The function unpacks inputs into a pair index and a 4-element record (two row
    indices and the two sequences), runs LZ_compute on the sequence pair and
    returns indices and estimates in a dict

    Parameters
    ----------
    inputs : tuple
        Tuple of two elements - (a, b). a is an index, produced manually or more
        typically using enumerate, passed along to keep track of order in case of
        asynchronous execution. b is a tuple of four:
        (index_x, index_y, sequence_x, sequence_y)

    Returns
    -------
    out : dict
        Keys "index_pair", "index_x" and "index_y", updated with the output of
        LZ_compute for the sequence pair.

    """

    # Unpack the pair index together with row indices and sequences
    pair_index, (row_x, row_y, data_x, data_y) = inputs

    # Start from the bookkeeping fields
    estimates = {"index_pair": pair_index, "index_x": row_x, "index_y": row_y}

    # Merge in the estimates for this pair
    estimates.update(LZ_compute(data_x, data_y))

    return estimates
148 | 
149 | 
def get_rowpairs(matrix):
    """
    Lazily iterate over all unordered pairs of rows of an input matrix

    Pairs are produced in the order given by itertools.combinations over the row
    indices, so the first index is always smaller than the second.

    Parameters
    ----------
    matrix : numpy array, int or float, 2D
        Each row representing a different sequence. (Columns as time)

    Yields
    ------
    row1 : int
        Index of first row in the pair.
    row2 : int
        Index of second row in the pair.
    np.array, 1D
        Data of first row in the pair.
    np.array, 1D
        Data of second row in the pair.

    """
    n_rows = matrix.shape[0]

    for first, second in combinations(range(n_rows), 2):
        yield first, second, matrix[first, :], matrix[second, :]
173 | 
174 | 
def parallelized_CCC(pairs, CCC_params):
    """
    Compute CCC estimates for a collection of sequence pairs in parallel.

    Every pair is tagged with its position (via enumerate) and handed to
    _kernel_CCC through joblib, using all available cores (n_jobs=-1).

    CAUTION: main module is unguarded, do not run these functions as is,
        particularly on Windows!

    Parameters
    ----------
    pairs : list/tuple/generator
        Collection of records, one per sequence pair, in the layout expected by
        _kernel_CCC: (index_x, index_y, sequence_x, sequence_y).
    CCC_params : dict
        The following 3 parameters for CCC as key-value pairs:
        "LEN_past" : int
            Parameter "L": Window length of immediate past values of seq_x and seq_y.
        "ADD_meas" : int
            Parameter "w": Window length of present values of seq_x. Minimal data length
            over which CC rate can be reliably estimated, application/domain-specific
        "STEP_size" : int
            Parameter "delta": Step-size for sliding chunks across both sequences. An overlap
            of 20-50% between successive chunks or windows suggested.
        The dictionary can be generated interactively using CCC.get_params()

    Returns
    -------
    list of dict elements
        One dictionary per pair, as returned by _kernel_CCC.

    """

    # Freeze the CCC parameters so that the kernel takes a single argument
    kernel = partial(_kernel_CCC, CCC_params=CCC_params)

    # Confirm to stdout
    print("Computing CCC estimates in parallel ... ")

    # joblib's parallelization: one delayed kernel call per indexed pair
    runner = Parallel(n_jobs=-1, verbose=50)
    tasks = (delayed(kernel)(indexed_pair) for indexed_pair in enumerate(pairs))

    # Return collected results
    return runner(tasks)
221 | 
222 | 
def parallelized_CCM(pairs, kernel="LZ"):
    """
    Compute CCM-style causal estimates for a collection of sequence pairs using
    joblib, with a selectable estimator kernel.

    CAUTION: main module is unguarded, do not run these functions as is,
        particularly on Windows!

    Parameters
    ----------
    pairs : list/tuple/generator
        Collection of sequence pairs; each element is forwarded as-is (paired with
        its running index from enumerate) to the chosen kernel.
    kernel : str, optional
        Name of an estimator function. Currently available: "ETC" and "LZ". The
        default is "LZ".

    Returns
    -------
    list or None
        One result per input pair as returned by the selected kernel, or None
        (after printing a message) when kernel is not a recognised name.

    """

    # Reject unknown kernels up front
    if kernel not in ("LZ", "ETC"):
        print("Invalid kernel selected")
        return None

    worker = _kernel_LZ if kernel == "LZ" else _kernel_ETC

    # Confirm to stdout
    print(f"Computing CCM estimates in parallel using {kernel} ... ")

    # n_jobs=-1 asks joblib for all available CPUs; verbose=50 reports progress
    runner = Parallel(n_jobs=-1, verbose=50)

    # Tasks are produced lazily, one per (index, pair) element
    return runner(delayed(worker)(item) for item in enumerate(pairs))
267 | 


--------------------------------------------------------------------------------
/ETC/seq/markov.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Module for computing markov transition probability matrices from
  5 | nucleotide sequence stored as text files.
  6 | 
  7 | compute() is the main function that wraps around smaller modular functions.
  8 | 
  9 | @author: pranay
 10 | """
 11 | 
 12 | # Import calls
 13 | from pathlib import Path
 14 | from random import choices, seed
 15 | import numpy as np
 16 | import pandas as pd
 17 | 
 18 | # Function Definitions
 19 | def _read_sequence(filepath):
 20 |     """
 21 |     This function reads a file & returns it as a string.
 22 |     Uses pathlib's functionality & is called by the wrapper compute()
 23 | 
 24 |     Parameters
 25 |     ----------
 26 |     filepath : Path object
 27 |         Valid path to file containing nucleotide sequence.
 28 | 
 29 |     Returns
 30 |     -------
 31 |     string
 32 |         String containing nucleotide sequence.
 33 | 
 34 |     """
 35 |     # Use Path object's hook to read file as text and return
 36 |     return filepath.read_text()
 37 | 
 38 | 
 39 | def _generate_overlaps(sequence, order):
 40 |     """
 41 |     This function takes an input sequence & generates overlapping subsequences
 42 |     of length order + 1, returned as a tuple. Has no dependencies & is called
 43 |     by the wrapper compute()
 44 | 
 45 |     Parameters
 46 |     ----------
 47 |     sequence : string
 48 |         String containing nucleotide sequence.
 49 |     order : int
 50 |         Order of Markov Transition Probability Matrix for computing overlaps
 51 | 
 52 |     Returns
 53 |     -------
 54 |     tuple
 55 |         Contains all overlapping sub-sequences (of length order + 1) for given
 56 |         order. Total number of sub-sequences is len(sequence) - (order + 1)
 57 | 
 58 |     """
 59 |     # Initialize aggregator
 60 |     aggregator = []
 61 | 
 62 |     # Increment order by 1 as the Markov model includes the current state, such
 63 |     # that the length of sequence corresponding to a state is order + 1
 64 |     order += 1
 65 | 
 66 |     # Iteratively store sequences shifted to the left by 1 step
 67 |     for idx in range(order):
 68 |         aggregator.append(sequence[idx : idx - order])
 69 | 
 70 |     # Join the shifted sequences through element-wise concatenation & return
 71 |     return tuple(map("".join, zip(*aggregator)))
 72 | 
 73 | 
 74 | def _compute_transition_probs(sequence, compact=True, flatten=False):
 75 |     """
 76 |     This function takes a tuple of strings (overlapping subsequences) as input
 77 |     and computes transition probabilities, returned as a dataframe. Switches
 78 |     control the form (compactness & wide-form vs long-form) of output dataframe.
 79 | 
 80 |     Parameters
 81 |     ----------
 82 |     sequence : string
 83 |         String containing nucleotide sequence.
 84 |     compact : bool, optional
 85 |         Whether to return the full sparse matrix or to return a more compact
 86 |         representation of it. If False, returns the a square matrix with most
 87 |         elements zero. If True, returns non-zero columns only for all rows.
 88 |             The default is True.
 89 |     flatten : bool, optional
 90 |         Whether to flatten (or tidy / long-form) the matrix or not (wide-form).
 91 |         If True, returns only one column containing probabilities through a
 92 |         row-wise representation. If False, returns multiple columns with
 93 |         probabilities.
 94 |             The default is False.
 95 | 
 96 |     Returns
 97 |     -------
 98 |     pandas DataFrame
 99 |         Tabulated transition probabilities with row & column labels describing
100 |         the (n-1)th and nth state respectively. If flatten is True, column
101 |         labels for nth state are transposed into a column such that there are
102 |         2 columns, 1 each for the (n-1)th and nth state.
103 | 
104 |     """
105 |     # If compact requested, use only the last alphabet of the next subsequence.
106 |     # The Nth element will only differ from the (N-1)th in shift by 1
107 |     if compact:
108 | 
109 |         # Convert tuple to numpy array, excluding last element
110 |         temp = np.array(sequence[:-1])
111 | 
112 |         # Extract the last alphabet from each word except the first
113 |         next_alphabet = np.array([x[-1] for x in sequence[1:]])
114 | 
115 |         # Compute normalized frequencies via cross-tabulation
116 |         df = pd.crosstab(temp, next_alphabet, normalize="index")
117 | 
118 |     # If full matrix to be returned, cross-tabulate shifted sequences
119 |     else:
120 | 
121 |         # Convert tuple containing overlapping subsequences to numpy array
122 |         temp = np.array(sequence)
123 | 
124 |         # Compute normalized frequencies via cross-tabulation
125 |         df = pd.crosstab(temp[:-1], temp[1:], normalize="index")
126 | 
127 |     # Set proper identifier labels
128 |     df.index.name = "previous"
129 |     df.columns.name = "next"
130 | 
131 |     # If flatten requested, pivot all columns into a single column
132 |     if flatten:
133 | 
134 |         # Stack all columns
135 |         df = df.stack()
136 | 
137 |         # Set name for the Series object
138 |         df.name = "probability"
139 | 
140 |         # Return a DataFrame by resetting the Series index
141 |         return df.reset_index()
142 | 
143 |     # If flatten not requested, return DataFrame
144 |     return df
145 | 
146 | 
def _check_inputs(filepath, order, compact, flatten):
    """
    Validate the arguments of compute(), reporting the first problem found on
    stdout.

    Parameters
    ----------
    filepath : Path object
        Must be a pathlib Path that exists and points to a file.
    order : int
        Must be an integer greater than or equal to zero.
    compact : bool
        Must be a boolean.
    flatten : bool
        Must be a boolean.

    Returns
    -------
    bool
        True if all inputs are valid, False as soon as one check fails (an error
        message is printed for the failing check).

    """
    # The path must be a Path object before it can be queried
    if not isinstance(filepath, Path):
        print("> ERROR: Input should be a Path object ...")
        return False

    # The path must exist and point to a file
    if not filepath.exists() or not filepath.is_file():
        print("> ERROR: Path does not exist ...")
        return False

    # The order must be a non-negative integer
    if not isinstance(order, int) or order < 0:
        print("> ERROR: order should be a non-negative integer ...")
        return False

    # Both switches must be genuine booleans
    if not isinstance(compact, bool) or not isinstance(flatten, bool):
        print("> ERROR: compact and flatten args should be a boolean ...")
        return False

    # Nothing failed
    return True
192 | 
193 | 
def compute(filepath, order, compact=True, flatten=False):
    """
    Read a nucleotide sequence from a text file and return its transition
    probability table of the given order as a labelled dataframe.

    The work is delegated to four helpers, applied in this order:
        _check_inputs - for validating input arguments
        _read_sequence - for reading text file containing nucleotide sequence
        _generate_overlaps - for creating overlapped subsequences
        _compute_transition_probs - for creating transition probability matrix

    Parameters
    ----------
    filepath : Path object
        Valid path to file containing nucleotide sequence.
    order : int
        Order of Markov Transition Probability Matrix for computing overlaps
    compact : bool, optional
        Whether to return the full sparse matrix or to return a more compact
        representation of it. If False, returns a square matrix with most
        elements zero. If True, returns non-zero columns only for all rows.
            The default is True.
    flatten : bool, optional
        Whether to flatten (or tidy / long-form) the matrix or not. If True,
        returns only one column containing probabilities through a row-wise
        representation. If False, returns multiple columns with probabilities.
            The default is False.

    Returns
    -------
    pandas DataFrame or None
        Tabulated transition probabilities with row & column labels describing
        the (n-1)th and nth state respectively. If flatten is True, column
        labels for nth state are transposed into a column such that there are
        2 columns, 1 each for the (n-1)th and nth state. None is returned when
        input validation fails.

    """
    # Bail out early when validation reports a problem
    inputs_ok = _check_inputs(filepath, order, compact, flatten)
    if not inputs_ok:
        return None

    # Load the raw text, then cut it into overlapping subsequences
    raw_text = _read_sequence(filepath)
    overlaps = _generate_overlaps(raw_text, order)

    # Tabulate and hand back the transition probabilities
    return _compute_transition_probs(overlaps, compact=compact, flatten=flatten)
240 | 
241 | 
def sample_sequence(sequence, order, size, sampler_seed=0):
    """
    Extend a sequence by sampling new symbols from its own transition
    probability table.

    The compact transition table of the input is estimated first. Starting from
    the last order + 1 symbols of the input, one symbol is drawn per step using
    the row of the table that matches the current trailing order + 1 symbols.

    Parameters
    ----------
    sequence : string
        Sequence from which the transition table is estimated and whose tail
        seeds the sampling.
    order : int
        Order passed to _generate_overlaps(); states span order + 1 symbols.
    size : int
        Number of sampling steps to perform.
    sampler_seed : int, optional
        Value passed to random.seed() before sampling. Note that this reseeds
        the module-level generator of the random module.
            The default is 0.

    Returns
    -------
    string
        The sampled symbols only; the seeding tail of the input is stripped.

    """
    # Estimate the compact, wide-form transition table of the input sequence
    windows = _generate_overlaps(sequence, order)
    table = _compute_transition_probs(windows, compact=True, flatten=False)

    # A state spans order + 1 symbols; start the chain from the tail of the input
    width = order + 1
    chain = sequence[-width:]

    # Make the draws reproducible
    seed(sampler_seed)

    for _ in range(size):
        # Row of the table belonging to the current trailing state
        # NOTE(review): assumes every state reached is present as a row label
        state = chain[-width:]
        row = table.loc[state, :]

        # Draw the next symbol with the row's probabilities as weights
        drawn = choices(population=row.index, weights=row.values, k=1)
        chain += "".join(drawn)

    # Drop the seeding state, keep only what was sampled
    return chain[width:]
264 | 


--------------------------------------------------------------------------------
/ETC/NCA/parallelize_mp.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Parallelized NCA estimation using multiprocessing
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | from ETC.CCMC.pairs import ETC_causality as ETC_compute
  9 | from ETC.CCMC.pairs import LZ_causality as LZ_compute
 10 | from ETC.CCC.compute_CCC import compute as CCC_compute
 11 | from multiprocessing import Pool
 12 | from functools import partial
 13 | from itertools import combinations
 14 | 
 15 | 
 16 | def _kernel_CCC(inputs, CCC_params):
 17 |     """
 18 |     Wrapper for computing causality estimates on a sequence pair
 19 | 
 20 |     Used for causal discovery and estimation from CCM based methods as well as CCC.
 21 | 
 22 |     The function unpacks inputs into an index element and a sequence pair and runs the
 23 |     estimator function on the sequence pair, returning various estimates in a dict
 24 | 
 25 |     Parameters
 26 |     ----------
 27 |     inputs : tuple
 28 |         Tuple of two elements - (a, b) where a is an index, b is a tuple of two. a can
 29 |         be produced manually or more typically using enumerate; b holds the two sequences
 30 |         usually passed in by zip-ping larger iterables or itertools' product/combinations.
 31 |         a, the index, is passed to keep track of order in case of asynchronous execution
 32 |         Should look like this: (index, (sequence_x, sequence_y)
 33 |    CCC_params : dict
 34 |         The following 3 parameters for CCC as key-value pairs:
 35 |         "LEN_past" : int
 36 |             Parameter "L": Window length of immediate past values of seq_x and seq_y.
 37 |         "ADD_meas" : int
 38 |             Parameter "w": Window length of present values of seq_x. Minimal data length
 39 |             over which CC rate can be reliably estimated, application/domain-specific
 40 |         "STEP_size" : int
 41 |             Parameter "delta": Step-size for sliding chunks across both sequences. An overlap
 42 |             of 20-50% between successive chunks or windows suggested.
 43 |         The dictionary can be generated interactively using CCC.get_params()
 44 | 
 45 |     Returns
 46 |     -------
 47 |     out : dict
 48 |         Estimates obtained by running estimator on inputs.
 49 | 
 50 |     """
 51 | 
 52 |     # Unpack inputs
 53 |     idx, seqs = inputs
 54 | 
 55 |     # Unpack sequences
 56 |     idx_x, idx_y, seq_x, seq_y = seqs
 57 | 
 58 |     # Initialize dictionary of output estimates with index
 59 |     out = {"index_pair": idx, "index_x": idx_x, "index_y": idx_y}
 60 | 
 61 |     # Execute CCC_compute on the sequence pair in one direction
 62 |     out.update({"CCC_y_to_x": CCC_compute(seq_x, seq_y, **CCC_params)})
 63 | 
 64 |     # Execute CCC_compute on the sequence pair in the other direction
 65 |     out.update({"CCC_x_to_y": CCC_compute(seq_y, seq_x, **CCC_params)})
 66 | 
 67 |     return out
 68 | 
 69 | 
 70 | def _kernel_ETC(inputs):
 71 |     """
 72 |     Wrapper for computing causality estimates on a sequence pair
 73 | 
 74 |     Used for causal discovery and estimation from CCM based methods as well as CCC.
 75 | 
 76 |     The function unpacks inputs into an index element and a sequence pair and runs the
 77 |     estimator function on the sequence pair, returning various estimates in a dict
 78 | 
 79 |     Parameters
 80 |     ----------
 81 |     inputs : tuple
 82 |         Tuple of two elements - (a, b) where a is an index, b is a tuple of two. a can
 83 |         be produced manually or more typically using enumerate; b holds the two sequences
 84 |         usually passed in by zip-ping larger iterables or itertools' product/combinations.
 85 |         a, the index, is passed to keep track of order in case of asynchronous execution
 86 |         Should look like this: (index, (sequence_x, sequence_y)
 87 | 
 88 |     Returns
 89 |     -------
 90 |     out : dict
 91 |         Estimates obtained by running estimator on inputs.
 92 | 
 93 |     """
 94 | 
 95 |     # Unpack inputs
 96 |     idx, seqs = inputs
 97 | 
 98 |     # Unpack sequences
 99 |     idx_x, idx_y, seq_x, seq_y = seqs
100 | 
101 |     # Initialize dictionary of output estimates with index
102 |     out = {"index_pair": idx, "index_x": idx_x, "index_y": idx_y}
103 | 
104 |     # Execute ETC_compute on the sequence pair
105 |     out.update(ETC_compute(seq_x, seq_y))
106 | 
107 |     return out
108 | 
109 | 
def _kernel_LZ(inputs):
    """
    Compute LZ-based causality estimates for a single indexed sequence pair.

    Intended as the per-task worker for the parallel drivers: it unpacks one
    enumerated element, runs LZ_compute on the two sequences and returns the
    estimates together with the identifiers of the pair.

    Parameters
    ----------
    inputs : tuple
        Tuple of two elements - (a, b). a is the running index of the pair
        (typically from enumerate), kept to track order in case of asynchronous
        execution. b is a 4-element collection holding the identifiers and data
        of the two sequences.
        Should look like this: (index, (index_x, index_y, sequence_x, sequence_y))

    Returns
    -------
    out : dict
        Keys "index_pair", "index_x" and "index_y", updated with whatever
        LZ_compute returns for the pair.

    """

    # Split the element into its running index and the pair payload
    pair_id, (id_x, id_y, series_x, series_y) = inputs

    # Start from the identifiers of the pair
    result = {"index_pair": pair_id, "index_x": id_x, "index_y": id_y}

    # Merge in the estimates produced by LZ_compute
    estimates = LZ_compute(series_x, series_y)
    result.update(estimates)

    return result
148 | 
149 | 
def get_rowpairs(matrix):
    """
    Generate all unordered row pairs of a 2D matrix, one at a time.

    Parameters
    ----------
    matrix : numpy array, int or float, 2D
        One sequence per row (columns index time).

    Yields
    ------
    tuple
        (index_a, index_b, data_a, data_b): the indices of the two rows, in the
        order produced by itertools.combinations, followed by the data of the
        first and of the second row of the pair.

    """
    row_indices = range(matrix.shape[0])

    # Each pair of distinct rows is visited once, with index_a < index_b
    for index_a, index_b in combinations(row_indices, 2):
        data_a, data_b = matrix[index_a, :], matrix[index_b, :]
        yield (index_a, index_b, data_a, data_b)
173 | 
174 | 
def parallelized_CCC(pairs, CCC_params):
    """
    Compute CCC estimates for a collection of sequence pairs using a
    multiprocessing Pool.

    Every element of pairs is enumerated and handed, together with the shared CCC
    parameters, to _kernel_CCC in a worker process.

    CAUTION: main module is unguarded, do not run these functions as is,
        particularly on Windows!

    Parameters
    ----------
    pairs : list/tuple/generator
        Collection of sequence pairs; each element is forwarded as-is (paired with
        its running index from enumerate) to _kernel_CCC.
    CCC_params : dict
        The following 3 parameters for CCC as key-value pairs:
        "LEN_past" : int
            Parameter "L": Window length of immediate past values of seq_x and seq_y.
        "ADD_meas" : int
            Parameter "w": Window length of present values of seq_x. Minimal data length
            over which CC rate can be reliably estimated, application/domain-specific
        "STEP_size" : int
            Parameter "delta": Step-size for sliding chunks across both sequences. An overlap
            of 20-50% between successive chunks or windows suggested.
        The dictionary can be generated interactively using CCC.get_params()

    Returns
    -------
    list of dict elements
        One dictionary per input pair, as returned by _kernel_CCC, in input order.

    Raises
    ------
    Exception
        Any exception raised inside a worker is re-raised here. The pool is
        shut down before the exception propagates and "Done!" is not printed.

    """

    exec_kernel = partial(_kernel_CCC, CCC_params=CCC_params)

    # The context manager terminates the workers if anything below raises, so a
    # failed run no longer leaves worker processes behind
    with Pool() as pool:

        # Confirm to stdout
        print("Computing CCC estimates in parallel on input ... ", end="")

        # Map-execute function across sequences; blocks until all results are in
        out = pool.map(exec_kernel, enumerate(pairs))

        # Graceful exit on the success path
        pool.close()
        pool.join()

    # Confirm completion
    print("Done!")

    # Return collected results
    return out
229 | 
230 | 
def parallelized_CCM(pairs, kernel="LZ"):
    """
    Compute CCM-style causal estimates for a collection of sequence pairs using
    a multiprocessing Pool, with a selectable estimator kernel.

    CAUTION: main module is unguarded, do not run these functions as is,
        particularly on Windows!

    Parameters
    ----------
    pairs : list/tuple/generator
        Collection of sequence pairs; each element is forwarded as-is (paired with
        its running index from enumerate) to the chosen kernel.
    kernel : str, optional
        Name of an estimator function. Currently available: "ETC" and "LZ". The
        default is "LZ".

    Returns
    -------
    list of dict elements or None
        One dictionary per input pair, as returned by the selected kernel, in
        input order. None (after printing a message) when kernel is not a
        recognised name; no worker pool is started in that case.

    Raises
    ------
    Exception
        Any exception raised inside a worker is re-raised here. The pool is
        shut down before the exception propagates and "Done!" is not printed.

    """

    if kernel == "LZ":
        exec_kernel = _kernel_LZ
    elif kernel == "ETC":
        exec_kernel = _kernel_ETC
    else:
        print("Invalid kernel selected")
        return None

    # The context manager terminates the workers if anything below raises, so a
    # failed run no longer leaves worker processes behind
    with Pool() as pool:

        # Confirm to stdout
        print("Computing CCM estimates in parallel on input ... ", end="")

        # Map-execute function across sequences; blocks until all results are in
        out = pool.map(exec_kernel, enumerate(pairs))

        # Graceful exit on the success path
        pool.close()
        pool.join()

    # Confirm completion
    print("Done!")

    # Return collected results
    return out
283 | 


--------------------------------------------------------------------------------
/ETC/tests/test_NSRWS2D.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | 
  9 | from array import array
 10 | from random import choice
 11 | 
 12 | from hypothesis import given
 13 | from hypothesis.strategies import composite, integers, lists
 14 | 
 15 | from ETC.NSRWS.x2D import onestep
 16 | from ETC.NSRWS.x2D import core as cc
 17 | from ETC.NSRWS.x2D import etc as cetc
 18 | 
 19 | 
 20 | @composite
 21 | def generate_sequences(draw, elements=[lists, integers]):
 22 |     """
 23 |     Generate 2 lists of integers as sequence input with equal lengths
 24 |     """
 25 |     seq_x = draw(
 26 |         lists(integers(min_value=1, max_value=100), min_size=3, max_size=10_000)
 27 |     )
 28 |     seq_y = draw(
 29 |         lists(
 30 |             integers(min_value=1, max_value=100),
 31 |             min_size=len(seq_x),
 32 |             max_size=len(seq_x),
 33 |         )
 34 |     )
 35 | 
 36 |     return seq_x, seq_y
 37 | 
 38 | 
 39 | @composite
 40 | def generate_sequences_identical(draw, elements=[lists, integers]):
 41 |     """
 42 |     Generate 2 lists with all identical integers with equal lengths
 43 |     """
 44 |     seq_x = draw(lists(integers(min_value=1, max_value=1), min_size=3, max_size=10_000))
 45 |     seq_y = draw(
 46 |         lists(
 47 |             integers(min_value=2, max_value=2), min_size=len(seq_x), max_size=len(seq_x)
 48 |         )
 49 |     )
 50 | 
 51 |     return seq_x, seq_y
 52 | 
 53 | 
 54 | @given(generate_sequences())
 55 | def test_onestep(inputs):
 56 |     """
 57 |     Test the outermost onestep function exposed for direct estimation
 58 |     """
 59 |     seq_x, seq_y = inputs
 60 | 
 61 |     out_x1, out_y1, _ = onestep.onestep(
 62 |         seq_x, seq_y, order=2, verbose=False, check=False
 63 |     )
 64 |     outputs = onestep.onestep(seq_x, seq_y, order=2, verbose=True, check=False)
 65 |     out_x2, out_y2 = outputs[0], outputs[1]
 66 | 
 67 |     # Substituted sequence should be shorter than input
 68 |     assert len(out_x1) < len(seq_x) and len(out_y1) < len(seq_y)
 69 | 
 70 |     # Highest value in substituted sequence should be greater than that in input
 71 |     assert max(out_x1) > max(seq_x) and max(out_y1) > max(seq_y)
 72 | 
 73 |     # Smallest value in substituted sequence should be at least as large as that in input
 74 |     assert min(out_x1) >= min(seq_x) and min(out_y1) >= min(seq_y)
 75 | 
 76 |     # Number of unique symbols in output should be one less than that in input
 77 |     assert len(set(out_x1) - set(seq_x)) == 1
 78 |     assert len(set(out_y1) - set(seq_y)) == 1
 79 | 
 80 |     # Number of symbols in output that are not in input should be between 1 and 3
 81 |     assert 1 <= len(set(out_x1) ^ set(seq_x)) <= 3
 82 |     assert 1 <= len(set(out_y1) ^ set(seq_y)) <= 3
 83 | 
 84 |     # Changing verbosity parameter should not alter the substituted sequence
 85 |     assert out_x1 == out_x2 and out_y1 == out_y2
 86 | 
 87 | 
 88 | @given(generate_sequences())
 89 | def test_onestep_unequal(inputs):
 90 |     """
 91 |     Test the outermost onestep function for invalid input: sequences shorter than order
 92 |     """
 93 |     seq_x, seq_y = inputs
 94 | 
 95 |     output = onestep.onestep(seq_x, seq_y[:-11], order=2, verbose=False, check=False)
 96 |     assert output is None
 97 | 
 98 | 
 99 | @given(generate_sequences())
100 | def test_onestep_invalid(inputs):
101 |     """
102 |     Test the outermost onestep function for invalid input: sequences shorter than order
103 |     """
104 |     seq_x, seq_y = inputs
105 | 
106 |     output = onestep.onestep(seq_x[:1], seq_y[:1], order=2, verbose=False, check=False)
107 |     assert output is None
108 | 
109 | 
def test_onestep_invalid_str():
    """
    Test the outermost onestep function for invalid input: string inputs
    """
    # String as second input, as first input, and as both inputs
    argument_pairs = (
        ([1, 2, 3, 4, 5, 6], "abcdef"),
        ("abcdef", [1, 2, 3, 4, 5, 6]),
        ("abcdef", "abcdef"),
    )

    for arg_x, arg_y in argument_pairs:
        result = onestep.onestep(arg_x, arg_y, 2, verbose=False, check=False)
        assert result is None
126 | 
127 | 
@given(generate_sequences_identical())
def test_onestep_identical(inputs):
    """
    Test the outermost onestep function for sequences made of a single repeated symbol
    """
    seq_x, seq_y = inputs

    # With input checking enabled, constant sequences should yield no output
    result = onestep.onestep(seq_x, seq_y, order=2, verbose=False, check=True)
    assert result is None
138 | 
139 | 
@given(generate_sequences())
def test_get_mask_general(inputs):
    """
    Test the get_mask_pairs function on random sequence pairs
    """
    seq_x, seq_y = (array("I", values) for values in inputs)

    # Get mask
    mask = cc.get_mask_pairs(seq_x, seq_y)

    # The mask has one element fewer than either input
    assert len(mask) == len(seq_x) - 1 == len(seq_y) - 1

    # Only 0s and 1s may occur
    assert set(mask).issubset({0, 1})

    # The leading element is always 1
    assert mask[0] == 1

    # A 0 in the mask marks an overlap: at that position, consecutive elements
    # must be equal in both sequences
    try:
        idx0 = mask.index(0)
    except ValueError:
        # No 0 present, nothing further to verify
        pass
    else:
        assert seq_x[idx0] == seq_x[idx0 + 1]
        assert seq_y[idx0] == seq_y[idx0 + 1]
168 | 
169 | 
@given(generate_sequences_identical())
def test_get_mask_identical(inputs):
    """
    Test the get_mask_pairs function on sequences made of a single repeated symbol
    """
    seq_x, seq_y = inputs
    seq_x = array("I", seq_x)
    seq_y = array("I", seq_y)

    # Masks for the sequences as given and for their reversals
    mask = cc.get_mask_pairs(seq_x, seq_y)
    mask_rev = cc.get_mask_pairs(seq_x[::-1], seq_y[::-1])

    # Constant sequences must produce at least one 0; locate the first
    idx0 = mask.index(0)

    # Each mask has one element fewer than either input
    expected_length = len(seq_x) - 1
    assert len(mask) == len(mask_rev) == expected_length == len(seq_y) - 1

    # Only 0s and 1s may occur in either mask
    for candidate in (mask, mask_rev):
        assert set(candidate).issubset({0, 1})

    # The number of 0s (overlaps) and 1s is fixed by the input length
    assert mask.count(0) == mask_rev.count(0) == (len(seq_x) - 1) // 2
    assert mask.count(1) == mask_rev.count(1) == len(seq_x) // 2

    # At the first 0, consecutive elements are equal in both sequences
    assert seq_x[idx0] == seq_x[idx0 + 1]
    assert seq_y[idx0] == seq_y[idx0 + 1]
199 | 
200 | 
def test_mask_and_count():
    """
    Check onestep._mask_and_count: applying a mask and reporting the most
    frequent pair together with its count, for a few hand-written masks.
    """
    seq_x = (1, 2, 3, 4, 5, 6, 7)
    seq_y = (3, 4, 5, 6, 7, 8, 9)

    # (mask, expected frequent pair in x, expected frequent pair in y)
    # NOTE(review): these masks hold 5 entries for 7-element sequences, while
    # get_mask_pairs produces len(seq) - 1 entries - confirm this is intended.
    cases = (
        ((1, 0, 0, 1, 1), (1, 2), (3, 4)),
        ((1, 1, 1, 1, 1), (1, 2), (3, 4)),
        ((0, 1, 1, 1, 1), (2, 3), (4, 5)),
    )

    for mask, expected_x, expected_y in cases:
        freq_pair_x, freq_pair_y, count = onestep._mask_and_count(
            seq_x, seq_y, mask, 2
        )
        assert freq_pair_x == array("I", expected_x)
        assert freq_pair_y == array("I", expected_y)
        assert count == 1
232 | 
233 | 
@given(generate_sequences())
def test_substitution(inputs):
    """
    Check the pair substitution step on randomly generated sequences.
    """
    raw_x, raw_y = inputs
    seq_x = array("I", raw_x)
    seq_y = array("I", raw_y)

    # Substitutes are one larger than anything already in each sequence
    sub_value_x = max(seq_x) + 1
    sub_value_y = max(seq_y) + 1

    # Choose a symbol (never the last one) and use the pair that starts at
    # its first occurrence
    start = seq_x.index(choice(seq_x[:-1]))
    pair_x = array("I", (seq_x[start], seq_x[start + 1]))
    pair_y = array("I", (seq_y[start], seq_y[start + 1]))

    # Copies are passed so the original arrays stay available for comparison
    out_x, out_y = cc.substitute_pairs(
        seq_x[:], seq_y[:], pair_x, pair_y, sub_value_x, sub_value_y
    )

    # Substitution must shorten both sequences ...
    assert len(out_x) < len(seq_x)
    assert len(out_y) < len(seq_y)

    # ... by the same amount
    assert len(out_x) == len(out_y)

    # The new maximum exceeds the old one in each sequence
    assert max(out_x) > max(seq_x)
    assert max(out_y) > max(seq_y)

    # And the new maximum is exactly the supplied substitute
    assert max(out_x) == sub_value_x
    assert max(out_y) == sub_value_y
268 | 
269 | 
@given(generate_sequences())
def test_truncation(inputs):
    """
    Check that the ETC2D estimate is the same for every combination of the
    verbose and truncate flags.
    """
    seq_x, seq_y = inputs

    # Same call order as before: verbose first, then compact; each without
    # and with truncation
    estimates = [
        cetc.compute(seq_x, seq_y, order=2, verbose=verbose, truncate=truncate)[
            "ETC2D"
        ]
        for verbose in (True, False)
        for truncate in (False, True)
    ]

    # All 4 estimates should be identical (compared as a chain of neighbours)
    assert all(left == right for left, right in zip(estimates, estimates[1:]))
284 | 
285 | 
def test_compute_save(tmp_path):
    """
    Check ETC estimation through compute_save, which is given a file path to
    write to, with and without truncation.
    """
    seq_x = array("I", [2, 4] * 100)
    seq_y = array("I", [1, 3, 5, 7] * 50)

    # Path object inside pytest's per-test temporary directory
    outfile = tmp_path / "test.csv"

    # Run without truncation first, then with truncation
    results = {}
    for truncate in (False, True):
        results[truncate] = cetc.compute_save(
            seq_x, seq_y, outfile, order=2, truncate=truncate
        )
        assert isinstance(results[truncate], dict)

    # Truncation must not change the estimate
    assert results[False]["ETC2D"] == results[True]["ETC2D"]
306 | 


--------------------------------------------------------------------------------
/ETC/NSRWS/x2D/parallel.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | from functools import partial
  9 | from itertools import islice
 10 | from collections import Counter
 11 | from random import choices
 12 | 
 13 | # Import functions from standard library modules
 14 | from multiprocessing import Pool
 15 | 
 16 | # Import local modules
 17 | import ETC
 18 | from ETC.seq.process import entropy
 19 | from ETC.helper.compute_markov_transition_probs import sample_sequence
 20 | 
 21 | # Function definitions
 22 | def _compute_two_files_truncated(files, order=2):
 23 |     """
 24 |     This function operates on a single file - reads sequence, computes ETC
 25 |     and writes to disk.
 26 | 
 27 |     Parameters
 28 |     ----------
 29 |     filepath : str or Path object
 30 |         Valid path to a file containing sequence.
 31 | 
 32 |     Returns
 33 |     -------
 34 |     out : dict
 35 |         filename, length of sequence and ETC estimate.
 36 | 
 37 |     """
 38 |     # Read file as a sequence
 39 |     filepath1, filepath2 = files
 40 |     seq1 = ETC.helper.IO.read(filepath1)
 41 |     seq2 = ETC.helper.IO.read(filepath2)
 42 | 
 43 |     if len(seq1) > len(seq2):
 44 |         seq1 = seq1[: len(seq2)]
 45 |     else:
 46 |         seq2 = seq2[: len(seq1)]
 47 | 
 48 |     # Prepare output dictionary
 49 |     out = {"seq1": filepath1.stem, "seq2": filepath2.stem, "length": len(seq1)}
 50 | 
 51 |     # Compute ETC, write to file and update output dictionary
 52 |     out.update(ETC.compute_2D(seq1, seq2, order=order, truncate=True, verbose=False))
 53 |     seq1etc = ETC.compute_1D(seq1, order=order, truncate=True, verbose=False)["ETC1D"]
 54 |     out.update({"ETC1D_seq1": seq1etc})
 55 | 
 56 |     seq2etc = ETC.compute_1D(seq2, order=order, truncate=True, verbose=False)["ETC1D"]
 57 |     out.update({"ETC1D_seq2": seq2etc})
 58 | 
 59 |     return out
 60 | 
 61 | 
 62 | # Function definitions
 63 | def _compute_two_files_markov(files, markov_order, order=2):
 64 |     """
 65 |     This function operates on a single file - reads sequence, computes ETC
 66 |     and writes to disk.
 67 | 
 68 |     Parameters
 69 |     ----------
 70 |     filepath : str or Path object
 71 |         Valid path to a file containing sequence.
 72 | 
 73 |     Returns
 74 |     -------
 75 |     out : dict
 76 |         filename, length of sequence and ETC estimate.
 77 | 
 78 |     """
 79 |     # Read file as a sequence
 80 |     filepath1, filepath2 = files
 81 |     seq1 = ETC.helper.IO.read(filepath1, recode=False)
 82 |     seq2 = ETC.helper.IO.read(filepath2, recode=False)
 83 | 
 84 |     lseq1 = len(seq1)
 85 |     lseq2 = len(seq2)
 86 | 
 87 |     if lseq1 > lseq2:
 88 |         diff = lseq1 - lseq2
 89 |         extra_tail = sample_sequence(
 90 |             seq2, order=markov_order, size=diff, sampler_seed=64
 91 |         )
 92 |         seq2 += extra_tail
 93 | 
 94 |     elif lseq1 < lseq2:
 95 |         diff = lseq2 - lseq1
 96 |         extra_tail = sample_sequence(
 97 |             seq1, order=markov_order, size=diff, sampler_seed=64
 98 |         )
 99 |         seq1 += extra_tail
100 | 
101 |     assert len(seq1) == len(seq2)
102 | 
103 |     seq1 = ETC.helper.IO.recode_to_int(seq1)
104 |     seq2 = ETC.helper.IO.recode_to_int(seq2)
105 | 
106 |     # Filename for writing output of ETC computation
107 |     # fname = filepath1.with_name(filepath1.stem + '_&_'+ filepath2.stem + f"_etc_order{order}_markov_order{markov_order}.csv")
108 | 
109 |     # Prepare output dictionary
110 |     out = {"seq1": filepath1.stem, "seq2": filepath2.stem, "length": len(seq1)}
111 | 
112 |     # Compute ETC, write to file and update output dictionary
113 |     out.update(ETC.compute_2D(seq1, seq2, order=order, truncate=True, verbose=False))
114 |     seq1etc = ETC.compute_1D(seq1, order=order, truncate=True, verbose=False)["ETC1D"]
115 |     out.update({"ETC1D_seq1": seq1etc})
116 | 
117 |     seq2etc = ETC.compute_1D(seq2, order=order, truncate=True, verbose=False)["ETC1D"]
118 |     out.update({"ETC1D_seq2": seq2etc})
119 | 
120 |     return out
121 | 
122 | 
def pcompute_files_markov(filelist, markov_order, order=2):
    """
    Run _compute_two_files_markov concurrently over a collection of file
    pairs and collect the results.

    CAUTION: main module is unguarded, do not run these functions as is,
        particularly on Windows.

    Parameters
    ----------
    filelist : list/tuple/generator
        Collection of 2-element items, each holding the paths of the two
        files that make up one pair.
    markov_order : int
        Forwarded to _compute_two_files_markov for every pair.
    order : int, optional
        Forwarded to _compute_two_files_markov for every pair. The default
        is 2.

    Returns
    -------
    list of dict elements
        One dictionary per pair, in input order, as returned by
        _compute_two_files_markov.

    """
    # Fix the per-call settings so the pool only has to supply the file pair
    func = partial(_compute_two_files_markov, markov_order=markov_order, order=order)

    # The context manager tears the workers down even when dispatching or
    # collecting results raises; map() blocks until every result is available
    with Pool() as pool:
        return pool.map(func, filelist)
154 | 
155 | 
def pcompute_files_truncated(filelist, order=2):
    """
    Run _compute_two_files_truncated concurrently over a collection of file
    pairs and collect the results.

    CAUTION: main module is unguarded, do not run these functions as is,
        particularly on Windows.

    Parameters
    ----------
    filelist : list/tuple/generator
        Collection of 2-element items, each holding the paths of the two
        files that make up one pair.
    order : int, optional
        Forwarded to _compute_two_files_truncated for every pair. The default
        is 2.

    Returns
    -------
    list of dict elements
        One dictionary per pair, in input order, as returned by
        _compute_two_files_truncated.

    """
    # Fix the per-call settings so the pool only has to supply the file pair
    func = partial(_compute_two_files_truncated, order=order)

    # The context manager tears the workers down even when dispatching or
    # collecting results raises; map() blocks until every result is available
    with Pool() as pool:
        return pool.map(func, filelist)
187 | 
188 | 
def _compute_single_seq(seq):
    """
    Compute the entropy and the ETC estimate of one indexed sequence.

    Parameters
    ----------
    seq : tuple of 2 elements
        1st element is an index used for tracking.
        2nd element is the sequence used for the computation.
        Matches the items produced by enumerate.

    Returns
    -------
    out : dict
        Index ("item"), length ("length") and entropy ("entropy") of the
        sequence, extended with every entry returned by ETC.compute.

    """
    index = seq[0]
    values = seq[1]

    # Bookkeeping entries come first in the output dictionary
    out = {"item": index, "length": len(values), "entropy": entropy(values)}

    # Merge in the ETC results
    etc_result = ETC.compute(values, order=2, verbose=False, truncate=True)
    out.update(etc_result)

    return out
213 | 
214 | 
def pcompute_multiple_seq(iterable):
    """
    Run _compute_single_seq concurrently over a collection of sequences and
    collect the results.

    CAUTION: main module is unguarded, do not run these functions as is,
        particularly on Windows.

    Parameters
    ----------
    iterable : list/tuple/generator
        Collection of sequences. Each one is paired with its position via
        enumerate before being handed to a worker.

    Returns
    -------
    list of dict elements
        One dictionary per sequence, in input order, as returned by
        _compute_single_seq.

    """
    # The context manager tears the workers down even when dispatching or
    # collecting results raises; map() blocks until every result is available
    with Pool() as pool:
        return pool.map(_compute_single_seq, enumerate(iterable))
246 | 
247 | 
248 | def _overlapping_chunks(seq, size, offset=1):
249 |     """
250 |     This function takes an input sequence and produces chunks of chosen size.
251 |     Offset can be used to control degree of overlap (or distance between chunks
252 |     that don't overlap)
253 | 
254 |     Parameters
255 |     ----------
256 |     seq : tuple or list
257 |         Sequence of integers.
258 |     size : int
259 |         Length of each produced chunk.
260 |     offset : int, optional
261 |         Number of elements to shift each chunk by. The default is 1.
262 |         Setting this to any value less than size allows control of overlap.
263 |         Setting this >= size produces non-overlapping chunks.
264 | 
265 |     Returns
266 |     -------
267 |     zip
268 |         zip object that produces chunks of specified size, one at a time.
269 | 
270 |     """
271 | 
272 |     return zip(*(islice(seq, i, None, offset) for i in range(size)))
273 | 
274 | 
275 | def _non_overlapping_chunks(seq, size):
276 |     """
277 |     This function takes an input sequence and produces chunks of chosen size
278 |     that strictly do not overlap. This is a much faster implemetnation than
279 |     _overlapping_chunks and should be preferred if running on very large seq.
280 | 
281 |     Parameters
282 |     ----------
283 |     seq : tuple or list
284 |         Sequence of integers.
285 |     size : int
286 |         Length of each produced chunk.
287 | 
288 |     Returns
289 |     -------
290 |     zip
291 |         zip object that produces chunks of specified size, one at a time.
292 | 
293 |     """
294 | 
295 |     return zip(*[iter(seq)] * size)
296 | 
297 | 
def pcompute_single(seq, size, offset=1):
    """
    Split a sequence into chunks and compute ETC for the chunks concurrently
    via pcompute_multiple_seq. The offset parameter controls the degree of
    overlap (or non-overlap) between chunks.

    CAUTION: main module is unguarded, do not run these functions as is,
        particularly on Windows.

    Parameters
    ----------
    seq : tuple or list
        Sequence of integers.
    size : int
        Length of each produced chunk.
    offset : int, optional
        Number of elements to shift each chunk by. The default is 1.
        Values less than size give overlapping chunks.
        Values >= size give chunks that do not overlap.

    Returns
    -------
    list of dict elements
        Whatever pcompute_multiple_seq returns for the chunks: one dictionary
        per chunk.

    """
    # offset == size means back-to-back chunks, which the dedicated helper
    # handles; any other offset goes through the strided chunker
    chunks = (
        _non_overlapping_chunks(seq, size)
        if offset == size
        else _overlapping_chunks(seq, size, offset)
    )

    # Execute parallel computation over chunks
    return pcompute_multiple_seq(chunks)
334 | 


--------------------------------------------------------------------------------
/ETC/NSRWS/x2D/core.pyx:
--------------------------------------------------------------------------------
  1 | # cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, emit_code_comments=True, cdivision=True, embedsignature=True
  2 | #!/usr/bin/env python3
  3 | # -*- coding: utf-8 -*-
  4 | """
  5 | 
  6 | 
  7 | @author: Pranay S. Yadav
  8 | """
  9 | # Import stuff
 10 | from cpython cimport array, bool
 11 | cimport cython
 12 | import array
 13 | 
 14 | # Function for getting mask for pairs
 15 | cpdef array.array get_mask_pairs(const unsigned int[::1] x, const unsigned int[::1] y):
 16 |     """
 17 |     INPUT
 18 |     -----
 19 |     x : array.array
 20 |         Array object containing 32-bit integers.
 21 | 
 22 |     OUTPUT
 23 |     ------
 24 |     mask : array.array
 25 |         Array object containing 32-bit integers - 0s or 1s corresponding to values in
 26 |         x for which successive overlapping pairs occur.
 27 |     """
 28 |     # Get size of input
 29 |     cdef Py_ssize_t x_size = len(x)
 30 | 
 31 |     # Initialize a mask of Falses
 32 |     cdef array.array int_template = array.array('I', [])
 33 |     cdef array.array mask = array.clone(int_template, x_size-1, zero=True)
 34 |     cdef unsigned int[:] mask_view = mask
 35 | 
 36 |     # Initialize bounds for iteration
 37 |     cdef Py_ssize_t n = 0
 38 | 
 39 |     # Turn all values in mask to Trues
 40 |     for n in range(x_size-1):
 41 |         mask_view[n] += 1
 42 | 
 43 |     # Iterate over all values of input
 44 |     n = 0
 45 |     while n < x_size - 2:
 46 | 
 47 |         # If successive pairs match
 48 |         if x[n] == x[n+1] and x[n+1] == x[n+2] and y[n] == y[n+1] and y[n+1] == y[n+2]:
 49 | 
 50 |             # Mask out the second one
 51 |             mask_view[n+1] = 0
 52 | 
 53 |             # And slide over it
 54 |             n += 1
 55 | 
 56 |         # Increment while loop index
 57 |         n += 1
 58 | 
 59 |     return mask
 60 | 
 61 | # Function for substituting pairs (old non-ideal version, uses for loop)
 62 | # cpdef substitute_pairs_old(unsigned int[::1] x, unsigned int[::1] y, unsigned int[::1] pair_x, unsigned int[::1] pair_y, unsigned int value_x, unsigned int value_y):
 63 | #     """
 64 | #     INPUT
 65 | #     -----
 66 | #     x : array.array
 67 | #         Array object containing 32-bit unsigned integers.
 68 | 
 69 | #     pair : array.array, length = 2
 70 | #         Array object containing 2 32-bit unsigned integers.
 71 | 
 72 | #     value : unsigned 32-bit int
 73 | #         Value to substitute the first element of pair with
 74 | 
 75 | #     OUTPUT
 76 | #     ------
 77 | #     out : list
 78 | #         Array object containing 32-bit integers, with supplied pair replaced everywhere
 79 | #         by the supplied value.
 80 | #     """
 81 | #     # Initialize looping variables and output list
 82 | #     cdef Py_ssize_t n
 83 | #     cdef Py_ssize_t x_size = len(x)
 84 | #     cdef list out_x = []
 85 | #     cdef list out_y = []
 86 | 
 87 | #     # Loop over input and replace pair
 88 | #     for n in range(x_size-1):
 89 | 
 90 | #         # Check for match with supplied pair
 91 | #         if x[n] == pair_x[0] and x[n+1] == pair_x[1] and y[n] == pair_y[0] and y[n+1] == pair_y[1]:
 92 | 
 93 | #             # Replace first value with supplied value
 94 | #             x[n] = value_x
 95 | #             y[n] = value_y
 96 | 
 97 | #             # Replace second value with 0
 98 | #             x[n+1] = 0
 99 | #             y[n+1] = 0
100 | 
101 | #     # Reset indexing variable
102 | #     n = 0
103 | 
104 | #     # Loop over mutated input and append non-zero values to list
105 | #     for n in range(x_size):
106 | 
107 | #         if x[n]: # Check only for x as both x & y can only be simultaneously 0
108 | 
109 | #             out_x.append(x[n])
110 | #             out_y.append(y[n])
111 | 
112 | #     return out_x, out_y
113 | 
114 | # Function for substituting pairs (new version, uses while loop)
cpdef substitute_pairs(unsigned int[::1] x, unsigned int[::1] y, unsigned int[::1] pair_x, unsigned int[::1] pair_y, unsigned int value_x, unsigned int value_y):
    """
    Replace every non-overlapping occurrence of a joint pair in (x, y) by a single
    pair of substitute values.

    INPUT
    -----
    x : array.array
        Array object containing unsigned integers (typecode 'I'). MODIFIED IN PLACE:
        pass a copy if the original is still needed.

    y : array.array
        Array object containing unsigned integers (typecode 'I'), with at least as
        many elements as x. MODIFIED IN PLACE.

    pair_x, pair_y : array.array, length = 2
        The two successive values to look for in x and in y respectively. A match
        requires both pairs to occur at the same position.

    value_x, value_y : unsigned int
        Values that replace a matched pair in x and in y respectively.

    OUTPUT
    ------
    out_x, out_y : list, list
        Contents of x and y after substitution, each matched pair collapsed to one
        element. Every position where x holds 0 is dropped from both outputs, since
        0 is used internally to mark removed elements; inputs are therefore expected
        to contain no zeros in x.

    RAISES
    ------
    ValueError
        If y has fewer elements than x, or if x has at least 2 elements and pair_x
        or pair_y has fewer than 2 elements.
    """
    # Initialize looping variables and output lists
    cdef Py_ssize_t n = 0
    cdef Py_ssize_t x_size = len(x)
    cdef list out_x = []
    cdef list out_y = []

    # Bounds checking is disabled for this module, so undersized inputs would be
    # read past their end; reject them explicitly instead
    if len(y) < x_size:
        raise ValueError("y must contain at least as many elements as x")

    # The pairs are only consulted when x holds at least one pair
    if x_size > 1 and (len(pair_x) < 2 or len(pair_y) < 2):
        raise ValueError("pair_x and pair_y must each contain 2 elements")

    # Loop over input and replace pair
    while n < x_size-1:

        # Check for match with supplied pair
        if x[n] == pair_x[0] and x[n+1] == pair_x[1] and y[n] == pair_y[0] and y[n+1] == pair_y[1]:

            # Replace first value with supplied value
            x[n] = value_x
            y[n] = value_y

            # Replace second value with 0 to mark it for removal
            x[n+1] = 0
            y[n+1] = 0

            # Skip the consumed element so matches never overlap
            n += 1

        n += 1

    # Loop over mutated input and append non-zero values to list
    for n in range(x_size):

        if x[n]: # Check only for x as both x & y can only be simultaneously 0

            out_x.append(x[n])
            out_y.append(y[n])

    return out_x, out_y
170 | 
171 | # Function for checking whether all elements in input are identical
cpdef bint check_equality(const unsigned int[::1] x, const unsigned int[::1] y):
    """
    INPUT
    -----
    x : array.array
        Array object containing unsigned integers (typecode 'I').

    y : array.array
        Array object containing unsigned integers (typecode 'I'); read at the same
        positions as x.

    OUTPUT
    ------
    bool
        True if every element of x equals x[0] and every element of y (over the
        length of x) equals y[0]; also True when x is empty. False otherwise.
    """
    # Loop index and number of elements to scan
    cdef Py_ssize_t idx
    cdef Py_ssize_t total = len(x)

    # Position 0 always equals itself, so the scan begins at position 1
    for idx in range(1, total):

        # Stop at the first element that departs from the leading one
        if x[idx] != x[0] or y[idx] != y[0]:
            return False

    return True
197 | 
198 | # # Function for getting mask for windows of any length
199 | # cpdef array.array get_mask_windows(const unsigned int[::1] x, unsigned int order):
200 | #     """
201 | #     INPUT
202 | #     -----
203 | #     x : array.array
204 | #         Array object containing 32-bit integers.
205 | 
206 | #     order: unsigned 32-bit int
207 | #         Length of the window to slide across input
208 | 
209 | #     OUTPUT
210 | #     ------
211 | #     mask : array.array
212 | #         Array object containing 32-bit integers - 0s or 1s corresponding to values in
213 | #         x for which successive overlapping pairs occur.
214 | #     """
215 | #     # Get size of input
216 | #     cdef Py_ssize_t x_size = len(x)
217 | 
218 | #     # Initialize a mask of Falses
219 | #     cdef array.array int_template = array.array('I', [])
220 | #     cdef array.array mask = array.clone(int_template, x_size, zero=True)
221 | #     cdef unsigned int[:] mask_view = mask
222 | 
223 | #      # Initialize variable for iteration
224 | #     cdef Py_ssize_t n = 0
225 | 
226 | #     # Turn all values in mask to Trues
227 | #     for n in range(x_size):
228 | #         mask_view[n] += 1
229 | 
230 | #     # Initialize variables for iteration across input
231 | #     cdef Py_ssize_t k = 0 # Outer loop
232 | #     cdef Py_ssize_t m = 0 # Inner loop
233 | 
234 | #     # Tracking variable for counting matching elements in pairwise window comparison
235 | #     cdef unsigned int track = 0
236 | 
237 | #     # Iterate over input values except the last 'order' values [Outermost master loop]
238 | #     n = 0
239 | #     for n in range(x_size-order):
240 | 
241 | #         # proceed only if mask is True for current element
242 | #         if mask_view[n]:
243 | 
244 | #             # Outer loop for sliding the 'next' window by unit step (current vs next)
245 | #             for k in range(1,order):  # Start from 1 - begin comparing from next window
246 | 
247 | #                 # Inner loop for comparing elements in current and next windows
248 | #                 for m in range(order):
249 | 
250 | #                     # If elements match, increment tracker
251 | #                     if x[n+m] == x[n+m+k]:
252 | #                         track += 1
253 | 
254 | #                     # Else stop iteration over this comparison of windows
255 | #                     else:
256 | #                         break
257 | 
258 | #                 # Trick: preserve mask only if track doesn't equal order
259 | #                 # If track == order, short-circuit eval takes precedence, returning 0
260 | #                 mask_view[n+k] = track!=order and mask_view[n+k]
261 | 
262 | #                 # Reset tracker
263 | #                 track = 0
264 | 
265 | #     return mask
266 | 
267 | # # Function for substituting windows of any length
268 | # cpdef list substitute_windows(unsigned int[::1] x, unsigned int order, unsigned int[::1] window, unsigned int value):
269 | #     """
270 | #     INPUT
271 | #     -----
272 | #     x : array.array
273 | #         Array object containing 32-bit unsigned integers.
274 | 
275 | #     order: unsigned 32-bit int
276 | #         Length of the window to slide across input
277 | 
278 | #     window : array.array, length = 2
279 | #         Array object containing 2 32-bit unsigned integers.
280 | 
281 | #     value : unsigned 32-bit int
282 | #         Value to substitute the first element of pair with
283 | 
284 | #     OUTPUT
285 | #     ------
286 | #     out : list
287 | #         Array object containing 32-bit integers, with supplied pair replaced everywhere
288 | #         by the supplied value.
289 | #     """
290 | #     # Initialize looping variables and output list
291 | #     cdef Py_ssize_t n = 0 # Outer loop
292 | #     cdef Py_ssize_t m = 0 # Inner loop
293 | #     cdef Py_ssize_t x_size = len(x)
294 | #     cdef list out = []
295 | 
296 | #     # Tracking variable for counting matching elements in pairwise window comparison
297 | #     cdef unsigned int track = 0
298 | 
299 | #     # Iterate over input values except one less than the last 'order' values
300 | #     # Logic: last window, say triplet must begin from 3rd-last index, leaving 2 values
301 | #     for n in range(x_size-order+1):
302 | 
303 | #         # Slide window of given order and do element-wise comparison
304 | #         for m in range(order):
305 | 
306 | #             # Reset tracker
307 | 
308 | 
309 | #             # Track comparison of input elements with window elements
310 | #             if x[n+m] == window[m]:
311 | #                 track += 1
312 | #             # # If mismatch, break
313 | #             else:
314 | #                 break
315 | 
316 | #         # If all compared elements match for current window
317 | #         if track == order:
318 | 
319 | #             # Replace the first element with provided value
320 | #             x[n] = value
321 | 
322 | #             # Replace the remaining subsequent values with zeros
323 | #             for m in range(1, order):
324 | #                 x[n+m] = 0
325 | 
326 | #         # Reset tracker
327 | #         track = 0
328 | #     # Reset indexing variable
329 | #     n = 0
330 | 
331 | #     # Loop over mutated input and append non-zero values to list
332 | #     for n in range(x_size):
333 | 
334 | #         if x[n]:
335 | 
336 | #             out.append(x[n])
337 | 
338 | #     return out


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # ETCPy
  2 | **E**ffort-**T**o-**C**ompress in **Py**thon
  3 |  - [What is this](https://github.com/pranaysy/ETCPy#what-is-this)
  4 |    - [References](https://github.com/pranaysy/ETCPy#references)
  5 |  - [What can it do](https://github.com/pranaysy/ETCPy#what-can-it-do)
  6 |    - [Study Haemodynamics, Heart-Rate Variability and Cardiac Aging using ECG/EKG](https://github.com/pranaysy/ETCPy#study-haemodynamics-heart-rate-variability-and-cardiac-aging-using-ecgekg)
  7 |    - [Network Neuroscience, Psychophysics and Scientific Study of Consciousness](https://github.com/pranaysy/ETCPy#network-neuroscience-psychophysics-and-scientific-study-of-consciousness)
  8 |    - [Genome Complexity Analysis and Classification of Nucleotide Sequences](https://github.com/pranaysy/ETCPy#genome-complexity-analysis-and-classification-of-nucleotide-sequences)
  9 |    - [Audio Signal Processing and Denoising](https://github.com/pranaysy/ETCPy#audio-signal-processing-and-denoising)
 10 |  - [How to use it](https://github.com/pranaysy/ETCPy#how-to-use-it)
 11 |    - [Dependencies](https://github.com/pranaysy/ETCPy#dependencies)
 12 |    - [Installation](https://github.com/pranaysy/ETCPy#installation)
 13 |    - [Updating](https://github.com/pranaysy/ETCPy#updating)
 14 |    - [Usage](https://github.com/pranaysy/ETCPy#usage)
 15 |    - [Testing](https://github.com/pranaysy/ETCPy#testing)
 16 |    - [MATLAB Implementation](https://github.com/pranaysy/ETCPy#matlab-implementation)
 17 |  - [TODO](https://github.com/pranaysy/ETCPy#todo)
 18 |  - [License](https://github.com/pranaysy/ETCPy#license)
 19 | 
 20 | ---
 21 | 
 22 | ## What is this
 23 | A Python implementation of the compression-complexity measure called Effort-To-Compress or ETC. ETC captures the compressibility and complexity of discrete symbolic sequences using lossless compression. It has been shown to robustly estimate complexity, comparing favorably for short and noisy time series in comparison with entropy and Lempel-Ziv complexity.
 24 | 
 25 | Using ETC, causal information flow between multiple discrete symbolic sequences can be assessed and recently, such a use has been presented, rigorously proven and demonstrated to be an effective model-free measure of causality. Introduced as Compression-Complexity Causality or CCC, this measure is robust to numerous data contaminants, noise sources and pre-processing artifacts. On comparison with Granger Causality and Transfer Entropy, CCC compares favorably and outperforms them on synthetic as well as real world causal interactions. An implementation of CCC is included in this repository.
 26 | 
 27 | While any lossless compressor may be used with ETC and subsequently with CCC, a grammar-based lossless compression algorithm called Non-Sequential Recursive Pair Substitution or NSRPS is used presently. NSRPS has been rigorously studied and shown to be an effective tool for data compression and entropy estimation. This repository also contains a fast Cython implementation of NSRPS for use with ETC and CCC.
 28 | 
 29 | #### References
 30 |  - Benedetto, Dario, Emanuele Caglioti, and Davide Gabrielli. “Non-Sequential Recursive Pair Substitution: Some Rigorous Results.” Journal of Statistical Mechanics: Theory and Experiment 2006, no. 09 (September 25, 2006): P09011–P09011. https://doi.org/10.1088/1742-5468/2006/09/P09011.
 31 |  - Balasubramanian, Karthi, Gayathri R. Prabhu, Lakshmipriya V. K. , Maneesha Krishnan, Praveena R. , and Nithin Nagaraj. “Classification of Periodic, Chaotic and Random Sequences Using NSRPS Complexity Measure.” ArXiv:1205.4886 [Nlin], May 22, 2012. http://arxiv.org/abs/1205.4886.
 32 |  - Nagaraj, Nithin, Karthi Balasubramanian, and Sutirth Dey. “A New Complexity Measure for Time Series Analysis and Classification.” The European Physical Journal Special Topics 222, no. 3–4 (July 2013): 847–60. https://doi.org/10.1140/epjst/e2013-01888-9.
 33 |  - Nagaraj, Nithin, and Karthi Balasubramanian. “Dynamical Complexity of Short and Noisy Time Series: Compression-Complexity vs. Shannon Entropy.” The European Physical Journal Special Topics 226, no. 10 (July 2017): 2191–2204. https://doi.org/10.1140/epjst/e2016-60397-x.
 34 |  - Kathpalia, Aditi, and Nithin Nagaraj. “Data-Based Intervention Approach for Complexity-Causality Measure.” PeerJ Computer Science 5 (May 27, 2019): e196. https://doi.org/10.7717/peerj-cs.196.
 35 | 
 36 | 
 37 | ## What can it do
 38 | #### Study Haemodynamics, Heart-Rate Variability and Cardiac Aging using ECG/EKG
 39 |    - Balasubramanian, Karthi, Nithin Nagaraj, and Sandipan Pati. “Chaos or Randomness? Effect of Vagus Nerve Stimulation During Sleep on Heart-Rate Variability.” IETE Journal of Research, June 30, 2020, 1–7. https://doi.org/10.1080/03772063.2020.1780165.
 40 |    - Srilakshmi, P, Karthi Balasubramanian, Nithin Nagaraj, and Sandipan Pati. “Multiscale Analysis of Heart Rate Variability Using Subsymmetry and Effort-to-Compress Complexity Measures.” In 2018 15th IEEE India Council International Conference (INDICON), 1–5. Coimbatore, India: IEEE, 2018. https://doi.org/10.1109/INDICON45594.2018.8986972.
 41 |    - Thanaj, Marjola, Andrew J. Chipperfield, and Geraldine F. Clough. “Analysis of Microvascular Blood Flow and Oxygenation: Discrimination between Two Haemodynamic Steady States Using Nonlinear Measures and Multiscale Analysis.” Computers in Biology and Medicine 102 (November 2018): 157–67. https://doi.org/10.1016/j.compbiomed.2018.09.026.
 42 |    - Balasubramanian, Karthi, K Harikumar, Nithin Nagaraj, and Sandipan Pati. “Vagus Nerve Stimulation Modulates Complexity of Heart Rate Variability Differently during Sleep and Wakefulness.” Annals of Indian Academy of Neurology 20, no. 4 (2017): 403. https://doi.org/10.4103/aian.AIAN_148_17.
 43 |    - Balasubramanian, Karthi, and Nithin Nagaraj. “Aging and Cardiovascular Complexity: Effect of the Length of RR Tachograms.” PeerJ 4 (2016): e2755. https://doi.org/10.7717/peerj.2755.
 44 | 
 45 | #### Network Neuroscience, Psychophysics and Scientific Study of Consciousness
 46 |    - Ashley J. Funkhouser. "The Role of Action in Affordance Perception Using Virtual Reality" 2020. Honors College Thesis with Dr. Alen Hajnal, Department of Psychology, The University of Southern Mississippi. https://aquila.usm.edu/honors_theses/714/
 47 |    - Agarwal, Nikita, Aditi Kathpalia, and Nithin Nagaraj. “Distinguishing Different Levels of Consciousness Using a Novel Network Causal Activity Measure.” In 2019 Global Conference for Advancement in Technology (GCAT), 1–5. BANGALURU, India: IEEE, 2019. https://doi.org/10.1109/GCAT47503.2019.8978424.
 48 |    - Virmani, Mohit, and Nithin Nagaraj. “A Novel Perturbation Based Compression Complexity Measure for Networks.” Heliyon 5, no. 2 (February 2019): e01181. https://doi.org/10.1016/j.heliyon.2019.e01181.
 49 |    - Kondo, Fumika. “Can Alterations in the Temporal Structure of Spontaneous Brain Activity Serve as a Disease-Specific Biomarker for Schizophrenia? A Multi Cohort FMRI Study,” 2017. https://doi.org/10.20381/RUOR-20801.
 50 |    - Kimiskidis, Vasilios K., Christos Koutlis, Alkiviadis Tsimpiris, Reetta Kälviäinen, Philippe Ryvlin, and Dimitris Kugiumtzis. “Transcranial Magnetic Stimulation Combined with EEG Reveals Covert States of Elevated Excitability in the Human Epileptic Brain.” International Journal of Neural Systems 25, no. 05 (August 2015): 1550018. https://doi.org/10.1142/S0129065715500185.
 51 | 
 52 | #### Genome Complexity Analysis and Classification of Nucleotide Sequences
 53 |    - Balasubramanian, Karthi, and Nithin Nagaraj. “Automatic Identification of SARS Coronavirus Using Compression-Complexity Measures.” Preprint. Bioinformatics, March 27, 2020. https://doi.org/10.1101/2020.03.24.006007.
 54 | 
 55 | #### Audio Signal Processing and Denoising
 56 |    - Kiefer, Chris, Overholt, Dan and Eldridge, Alice (2020) Shaping the behaviour of feedback instruments with complexity-controlled gain dynamics. New Interfaces for Musical Expression, Birmingham, UK, 21-25 July 2020. Published in: Proceedings of the International Conference on New Interfaces for Musical Expression. 343-348. NIME, Birmingham, UK. ISSN 2220-4806. https://sro.sussex.ac.uk/id/eprint/91009/
 57 |    - Li, Guohui, Qianru Guan, and Hong Yang. “Noise Reduction Method of Underwater Acoustic Signals Based on CEEMDAN, Effort-To-Compress Complexity, Refined Composite Multiscale Dispersion Entropy and Wavelet Threshold Denoising.” Entropy 21, no. 1 (December 24, 2018): 11. https://doi.org/10.3390/e21010011.
 58 | 
 59 | 
 60 | ## How to use it
 61 | The simplest way right now is to use `pip` to clone this repository and install locally inside a `conda` or a `virtualenv` environment. This way several functions implemented in Cython will be automatically compiled natively on the host system. Instructions below.
 62 | 
 63 | While the repository is called `ETCPy`, the package namespace available for use is `ETC`. All functionality is available through the `ETC` namespace.
 64 | 
 65 | For running tests (strongly recommended), additional packages need to be installed.
 66 | 
 67 | ### Operating System Support
 68 |  - GNU/Linux-based distributions (tested on Ubuntu 16.04, 18.04, 20.04)
 69 |  - **Currently does not work out of the box on Windows.** Cython and a C/C++ build toolchain need to be set up properly for compilation on Windows to work. It may work with some gymnastics using MinGW + Visual Studio Build Tools, but this is **currently untested.** It does, however, work on WSL!
 70 | 
 71 | ### Dependencies
 72 | For core functionality:
 73 |  - `numpy`
 74 |  - `pandas`
 75 |  - `joblib`
 76 |  - `cython`
 77 |    - Note: Cython needs a working C/C++ compiler such as GCC/Clang and associated build-utils/toolchain. While it should work out of the box on any modern Linux distribution, ensure a proper installation as instructed in the [official documentation](https://cython.readthedocs.io/en/latest/src/quickstart/install.html).
 78 | 
 79 | For tests:
 80 |  - `pytest`
 81 |  - `hypothesis`
 82 | 
 83 | ### Installation
 84 | Skip the first step if an environment is already available:
 85 | 1. Create a fresh `conda` or `pip`/`virtualenv`-based environment with the core dependencies (`numpy`, `pandas`, `joblib` and `cython`). Choose an appropriate name instead of `myenv`.
 86 |     ```bash
 87 |     $ conda create -n myenv python numpy pandas joblib cython
 88 |     ```
 89 | 2. Activate environment using `conda activate myenv` or virtualenv equivalent.
 90 | 
 91 |    If `git` is not installed, then:
 92 |      - either install it at a system level directly from the [official website](https://git-scm.com/download) or via your preferred package manager
 93 |      - or install it within the newly created conda environment using `conda install git`
 94 | 
 95 | 3. Use `pip`* to install directly from GitHub using the `git` VCS backend
 96 |     ```bash
 97 |     $ python -m pip install git+https://github.com/pranaysy/ETCPy.git
 98 |     ```
 99 | 4. Done! Open a Python shell, execute `import ETC` and proceed to the [demo](./demo.py)
100 | 
101 | ---
102 | *mixing `pip` and `conda` is not generally advised, but it can be done by following [certain recommendations](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#pip-in-env)
103 | 
104 | ### Updating
105 | Use the `-U` flag with pip for updating to the most current version available from this repository:
106 | ```
107 | $ python -m pip install -U git+https://github.com/pranaysy/ETCPy.git
108 | ```
109 | This will rebuild the compiled Cython functions as well.
110 | 
111 | ### Usage
112 | Please check out [`demo.py`](./demo.py) to see ETC in action. [Functions for dealing with NumPy arrays](https://github.com/pranaysy/ETCPy/blob/master/demo.py#L121) are also available. In addition to the core functionality of ETC, a [brief demo of Compression-Complexity Causality (CCC)](https://github.com/pranaysy/ETCPy/blob/master/demo.py#L158) is also included for uncoupled as well as coupled first-order auto-regressive processes.
113 | 
114 | The implementations of ETC as well as CCC include multicore parallelization (using [`joblib`](https://joblib.readthedocs.io/en/latest/index.html)) and can benefit from more available CPU cores for multiple sequences.
115 | 
116 | ### Testing
117 | Most of the tests are property-based or behavior-based, and are implemented using the awesome [`hypothesis` framework](https://hypothesis.readthedocs.io/en/latest/).
118 | Make sure dependencies are satisfied within the working environment:
119 | ```bash
120 | $ python -m pip install -U pytest hypothesis
121 | ```
122 | Grab a copy of this repository using git and enter the local directory:
123 | ```bash
124 | $ git clone https://github.com/pranaysy/ETCPy.git
125 | $ cd ETCPy
126 | ```
127 | Run tests:
128 | ```bash
129 | $ pytest ETC/
130 | ```
131 | 
132 | ### MATLAB Implementation
133 |  - The original ETC implementation in MATLAB can be found here: https://sites.google.com/site/nithinnagaraj2/journal/etc
134 | 
135 | 
136 | ## TODO
137 |  - Hyperparameter optimization for CCC
138 |  - Add performance metrics
139 |  - Automated tests with `tox`
140 |  - Better packaging: `pip` vs `conda`
141 |  - Visualizations
142 |  - Improve test coverage
143 |  - Documentation using Sphinx/MkDocs
144 |  - Windows support
145 | 
146 | ## License
147 | Copyright 2021 Pranay S. Yadav and Nithin Nagaraj
148 | 
149 |    Licensed under the Apache License, Version 2.0 (the "License");
150 |    you may not use this file except in compliance with the License.
151 |    You may obtain a copy of the License at
152 | 
153 |        http://www.apache.org/licenses/LICENSE-2.0
154 | 
155 |    Unless required by applicable law or agreed to in writing, software
156 |    distributed under the License is distributed on an "AS IS" BASIS,
157 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
158 |    See the License for the specific language governing permissions and
159 |    limitations under the License.
160 | 


--------------------------------------------------------------------------------
/ETC/NSRWS/x2D/onestep.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | from collections import Counter
  9 | from itertools import compress, islice
 10 | from time import perf_counter
 11 | 
 12 | from ETC.NSRWS.x2D import core
 13 | from ETC.seq.recode import cast
 14 | from ETC.seq.check import arraytype
 15 | 
 16 | 
 17 | def _mask_and_count(seq_x, seq_y, mask, order):
 18 |     """
 19 |     Apply binary mask to a pair of sequences & count most frequently jointly occurring windows
 20 | 
 21 |     This function does 3 things in the following sequence:
 22 |         1. Create sliding windows of a given size (order) - using zip and islice
 23 |         2. Apply a supplied mask to the sliding windows - using compress
 24 |         3. Count most frequently occurring window - using Counter
 25 | 
 26 |     In the NSRWS algorithm, this is the most time consuming step. Essentially expands
 27 |     two 1D sequences to a 2D sequence - where the sequences follows along row-wise &
 28 |     the columnar expansion encodes a sliding window for each row jointly from both
 29 |     sequences:
 30 |         1D sequences:
 31 |             (1,2,3,4,5,6,7)
 32 |             (3,4,5,6,7,8,9)
 33 | 
 34 |         2D expansion for window order=3:
 35 |             (((1,3),(2,4),(3,5)),
 36 |              ((2,4),(3,5),(4,6)),
 37 |              ((3,5),(4,6),(5,7)),
 38 |              ((4,6),(5,7),(6,8)),
 39 |              ((5,7),(6,8),(7,9)))
 40 | 
 41 |         The mask is applied row-wise & must be of the same length as the number of rows
 42 |         in this 2D expansion. This is given by:
 43 |             len(mask) = len(seq) - (order - 1)
 44 | 
 45 |         Example application of the mask (1,0,0,1,1):
 46 |             1 -> (((1,3),(2,4),(3,5)),
 47 |             0 ->  ((2,4),(3,5),(4,6)),          (((1,3),(2,4),(3,5)),
 48 |             0 ->  ((3,5),(4,6),(5,7)),    --->   ((4,6),(5,7),(6,8)),
 49 |             1 ->  ((4,6),(5,7),(6,8)),           ((5,7),(6,8),(7,9)))
 50 |             1 ->  ((5,7),(6,8),(7,9)))
 51 | 
 52 |         Unique windows (rows of 2D expansion) are counted and most frequently occurring
 53 |         row is returned with counts.
 54 | 
 55 |     Parameters
 56 |     ----------
 57 |     seq_x : array.array
 58 |         Discrete symbolic sequence containing 32-bit unsigned integers.
 59 |     seq_y : array.array
 60 |         Discrete symbolic sequence containing 32-bit unsigned integers.
 61 |     mask : array.array
 62 |         Collection of Booleans, where 0s indicate locations on "seq" to mask out.
 63 |         0s correspond to overlapping windows.
 64 |     order : int
 65 |         Size of window for NSRWS, 2 or greater.
 66 | 
 67 |     Returns
 68 |     -------
 69 |     pair_x : array.array
 70 |         Most frequently occurring non-overlapping "window" of size "order" in seq_x
 71 |     pair_y : array.array
 72 |         Most frequently occurring non-overlapping "window" of size "order" in seq_y
 73 |     count : int
 74 |         Number of times the most frequently occurring window occurs.
 75 | 
 76 |     """
 77 |     # Create overlapped sliding windows (each window a tuple of size order) & apply mask
 78 |     filtered = compress(
 79 |         zip(*(islice(zip(seq_x, seq_y), i, None) for i in range(order))), mask
 80 |     )
 81 | 
 82 |     # Count sliding windows (tuples are hashable!) & get the one most common with counts
 83 |     freq_pair, count = Counter(filtered).most_common(1)[0]
 84 | 
 85 |     # Assign array type and return
 86 |     pair_x = cast([freq_pair[0][0], freq_pair[1][0]])
 87 |     pair_y = cast([freq_pair[0][1], freq_pair[1][1]])
 88 | 
 89 |     return pair_x, pair_y, count
 90 | 
 91 | 
 92 | def _onestep_pairs(seq_x, seq_y, verbose=True):
 93 |     """
 94 |     Execute one full step of NSRPS (NSRWS with order=2) for a given sequence
 95 | 
 96 |     Makes use of 2 functions written in Cython & _mask_and_count in the following steps:
 97 |         1. Find overlapping pairs & store their indices for masking -> get_mask_pairs()
 98 |         2. Apply the mask and find most frequent pair -> _mask_and_count()
 99 |         3. Substitute all occurrences of the most frequent pair -> substitute_pairs()
100 | 
101 |     This function is different from _onestep_windows because:
102 |         1. It is *much* faster due to fewer nested loops
103 |         2. It targets a more common use case scenario: for distances, for CCC, etc
104 |         3. For higher window orders, correctness needs to be proved outside of tests
105 | 
106 |     The implementation will benefit from:
107 |         1. Decorators for timing
108 |         2. Decorators for verbosity of output
109 |         3. Cython implementation of the slowest part: _mask_and_count
110 |             problem: counting windows in C?
111 | 
112 |     Parameters
113 |     ----------
114 |     seq_x : array.array
115 |         Discrete symbolic sequence containing 32-bit unsigned integers.
116 |     seq_y : array.array
117 |         Discrete symbolic sequence containing 32-bit unsigned integers.
118 |     verbose : bool, optional
119 |         Whether to report extra details. These include the frequent pair that was
120 |         substituted, its counts & total time taken. The default is True.
121 | 
122 |     Returns
123 |     -------
124 |     tuple, of the following fixed elements:
125 |         seq_x : array.array
126 |             Discrete symbolic sequence containing 32-bit unsigned integers, with most
127 |             frequently occurring non-sequentially overlapping pair substituted.
128 | 
129 |         seq_y : array.array
130 |             Discrete symbolic sequence containing 32-bit unsigned integers, with most
131 |             frequently occurring non-sequentially overlapping pair substituted.
132 | 
133 |         signal : bool
134 |             indicator for the state of sequence with all distinct pairs (count=1)
135 | 
136 |     optional elements of tuple that depend on verbosity:
137 |         freq_pair_x : array.array
138 |             Frequent pair substituted in seq_x
139 | 
140 |         freq_pair_y : array.array
141 |             Frequent pair substituted in seq_y
142 | 
143 |         count : int
144 |             Number of times the frequent pair occurred in the sequence
145 | 
146 |         time_taken : float
147 |             Time taken to execute step
148 | 
149 | 
150 |     """
151 |     # Initialize timer
152 |     before = perf_counter()
153 | 
154 |     # Initialize signal for tracking sequence state with all distinct pairs
155 |     signal = False
156 | 
157 |     # Compute mask for overlapping pairs
158 |     mask = core.get_mask_pairs(seq_x, seq_y)
159 | 
160 |     # Apply mask and find most frequent pair
161 |     pair_x, pair_y, count = _mask_and_count(seq_x, seq_y, mask, 2)
162 | 
163 |     # Get values for substitution of the most frequent pair with
164 |     sub_value_x = 1 + max(seq_x)
165 |     sub_value_y = 1 + max(seq_y)
166 | 
167 |     # If all distinct pairs, substitute the first one & set signal to True
168 |     if count == 1:
169 |         out_x = cast(seq_x[1:])
170 |         out_x[0] = sub_value_x
171 | 
172 |         out_y = cast(seq_y[1:])
173 |         out_y[0] = sub_value_y
174 | 
175 |         signal = True
176 |     # Else, substitute all instances of the frequent pair
177 |     else:
178 |         out_x, out_y = core.substitute_pairs(
179 |             seq_x, seq_y, pair_x, pair_y, sub_value_x, sub_value_y
180 |         )
181 |         out_x = cast(out_x)
182 |         out_y = cast(out_y)
183 | 
184 |     # Completion timer
185 |     after = perf_counter()
186 | 
187 |     # If verbose, return more things
188 |     if verbose:
189 |         time_taken = after - before
190 |         return out_x, out_y, signal, pair_x, pair_y, count, time_taken
191 | 
192 |     # Else return bare essentials
193 |     return out_x, out_y, signal
194 | 
195 | 
196 | # def _onestep_windows(seq, order, verbose=True):
197 | 
198 | #     before = perf_counter()
199 | #     mask = core.get_mask_windows(seq, order)[: -(order - 1)]
200 | #     z_windowed = compress(zip(*(islice(seq, i, None) for i in range(order))), mask)
201 | #     z_windowed = tuple(z_windowed)
202 | #     freq_window, count = Counter(z_windowed).most_common(1)[0]
203 | #     sub_value = 1 + max(seq)
204 | #     window = array("I", freq_window)
205 | #     if count == 1:
206 | #         out = array("I", seq[order - 1 :])
207 | #         out[0] = sub_value
208 | #         signal = True
209 | #     else:
210 | #         out = array("I", core.substitute_windows(seq, order, window, sub_value))
211 | #         signal = False
212 | #     out = array("I", core.substitute_windows(seq, order, window, sub_value))
213 | #     after = perf_counter()
214 | #     if verbose:
215 | #         return out, freq_window, count, after - before, signal
216 | #     return out, signal
217 | 
218 | 
219 | # def _onestep_windows(seq, order, verbose=True):
220 | #     pass
221 | 
222 | 
223 | def _onestep(seq_x, seq_y, order, verbose=True):
224 |     """
225 |     Wrapper that switches routine (pairs vs windows) depending on order
226 | 
227 |     For pairs (order=2), execute _onestep_pairs which is faster
228 |     For higher orders, execute _onestep_windows
229 | 
230 |     Parameters
231 |     ----------
232 |     seq_x : array.array
233 |         Discrete symbolic sequence containing 32-bit unsigned integers.
234 |     seq_y : array.array
235 |         Discrete symbolic sequence containing 32-bit unsigned integers.
236 |     order : int
237 |         Size of window for NSRWS, 2 or greater.
238 |     verbose : bool, optional
239 |         Whether to report extra details. These include the frequent pair that was
240 |         substituted, its counts & total time taken. The default is True.
241 | 
242 |     Returns
243 |     -------
244 |     tuple, of the following fixed elements:
245 |         seq_x : array.array
246 |             Discrete symbolic sequence containing 32-bit unsigned integers, with most
247 |             frequently occurring non-sequentially overlapping pair substituted.
248 | 
249 |         seq_y : array.array
250 |             Discrete symbolic sequence containing 32-bit unsigned integers, with most
251 |             frequently occurring non-sequentially overlapping pair substituted.
252 | 
253 |         signal : bool
254 |             indicator for the state of sequence with all distinct pairs (count=1)
255 | 
256 |     optional elements of tuple that depend on verbosity:
257 |         freq_pair_x : array.array
258 |             Frequent pair substituted in seq_x
259 | 
260 |         freq_pair_y : array.array
261 |             Frequent pair substituted in seq_y
262 | 
263 |         count : int
264 |             Number of times the frequent pair occurred in the sequence
265 | 
266 |         time_taken : float
267 |             Time taken to execute step
268 | 
269 |     """
270 |     if order == 2:
271 |         return _onestep_pairs(seq_x[:], seq_y[:], verbose)
272 |     # if order > 2:
273 |     #     return _onestep_windows(seq_x[:], seq_y[:], order, verbose)
274 | 
275 | 
def onestep(seq_x, seq_y, order, verbose=True, check=True):
    """
    Execute one step of NSRWS on a given pair of sequences and window size.

    Validates the inputs and then delegates to _onestep. Every rejected input leads
    to a return value of None; the checks performed in this function also print a
    short message explaining the reason.

    Parameters
    ----------
    seq_x : array.array
        Discrete symbolic sequence containing 32-bit unsigned integers.
    seq_y : array.array
        Discrete symbolic sequence containing 32-bit unsigned integers.
    order : int
        Size of window for NSRWS. Only order=2 (pairs) is available for two
        sequences; any other value is rejected.
    verbose : bool, optional
        Whether to report extra details. These include the frequent pair that was
        substituted, its counts & total time taken. The default is True.
    check : bool, optional
        Whether to run core.check_equality on the two sequences and exit if it
        reports True. The default is True.

    Returns
    -------
    None
        If any of the input checks fails.

    tuple, otherwise, of the following fixed elements:
        seq_x : array.array
            First sequence after the substitution step.

        seq_y : array.array
            Second sequence after the substitution step.

        signal : bool
            indicator for the state of sequence with all distinct pairs (count=1)

    optional elements of tuple that depend on verbosity:
        freq_pair_x : array.array
            Frequent pair substituted in seq_x

        freq_pair_y : array.array
            Frequent pair substituted in seq_y

        count : int
            Number of times the frequent pair occurred in the sequence

        time_taken : float
            Time taken to execute step

    """
    # Check if both sequences are of same length, if not then exit
    if len(seq_x) != len(seq_y):
        print("> Both inputs must be of the same length!")
        return None

    # _onestep only dispatches order=2 for two sequences; report any other order
    # explicitly instead of letting it fall through to a silent None
    if order != 2:
        print("> Only order=2 is supported for a pair of sequences!")
        return None

    # Coerce input 1 to appropriate array type
    if not arraytype(seq_x):
        seq_x = cast(seq_x)

    # Coerce input 2 to appropriate array type
    if not arraytype(seq_y):
        seq_y = cast(seq_y)

    # Exit if either input could not be coerced
    # NOTE(review): relies on cast returning None on failure - confirm in ETC.seq.recode
    if seq_x is None or seq_y is None:
        return None

    # Check if size of sequence is shorter than order, exit if True
    if len(seq_x) < order or len(seq_y) < order:
        print("> Sequence input shorter than order!\n> Can't perform substitution ...")
        return None

    # Check whether all elements are equal, if requested, & exit if True
    if check and core.check_equality(seq_x, seq_y):
        print("> All elements in sequence x are equal!")
        return None

    # Else execute one step of NSRWS and return
    return _onestep(seq_x, seq_y, order, verbose)
354 | 


--------------------------------------------------------------------------------
/ETC/NSRWS/x1D/onestep.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 
  5 | 
  6 | @author: Pranay S. Yadav
  7 | """
  8 | from collections import Counter
  9 | from itertools import compress, islice
 10 | from time import perf_counter
 11 | 
 12 | from ETC.NSRWS.x1D import core
 13 | from ETC.seq.recode import cast
 14 | from ETC.seq.check import arraytype
 15 | 
 16 | 
 17 | def _mask_and_count(seq, mask, order):
 18 |     """
 19 |     Apply binary mask to a sequence and count most frequently occurring windows
 20 | 
 21 |     This function does 3 things in the following sequence:
 22 |         1. Create sliding windows of a given size (order) - using zip and islice
 23 |         2. Apply a supplied mask to the sliding windows - using compress
 24 |         3. Count most frequently occurring window - using Counter
 25 | 
 26 |     In the NSRWS algorithm, this is the most time consuming step. Essentially expands
 27 |     a 1D sequence to a 2D sequence - where the sequence follows row-wise & the columnar
 28 |     expansion encodes a sliding window for each row:
 29 |         1D sequence:
 30 |             (1,2,3,4,5,6,7)
 31 | 
 32 |         2D expansion for window order=3:
 33 |             ((1,2,3),
 34 |              (2,3,4),
 35 |              (3,4,5),
 36 |              (4,5,6),
 37 |              (5,6,7))
 38 | 
 39 |         The mask is applied row-wise & must be of the same length as the number of rows
 40 |         in this 2D expansion. This is given by:
 41 |             len(mask) = len(seq) - (order - 1)
 42 | 
 43 |         Example application of the mask (1,0,0,1,1):
 44 |             1 -> ((1,2,3),
 45 |             0 ->  (2,3,4),    ---->      ((1,2,3),
 46 |             0 ->  (3,4,5),                (4,5,6),
 47 |             1 ->  (4,5,6),                (5,6,7))
 48 |             1 ->  (5,6,7))
 49 | 
 50 |         Unique windows (rows of 2D expansion) are counted and most frequently occurring
 51 |         row is returned with counts.
 52 | 
 53 |         1D sequence with overlap:
 54 |             (1,1,1,1,1,2,1)
 55 | 
 56 |         2D expansion for window order=3:
 57 |             ((1,1,1),
 58 |              (1,1,1),    ----> overlap
 59 |              (1,1,1),    ----> overlap
 60 |              (1,1,2),
 61 |              (1,2,1))
 62 | 
 63 |         mask will be (1,0,0,1,1) and its application will yield:
 64 |             ((1,1,1),
 65 |              (1,1,2),
 66 |              (1,2,1))
 67 | 
 68 |         Here, each window occurs once and the first one is returned -> (1,1,1)
 69 | 
 70 |     Parameters
 71 |     ----------
 72 |     seq : array.array
 73 |         Discrete symbolic sequence containing 32-bit unsigned integers.
 74 |     mask : array.array
 75 |         Collection of Booleans, where 0s indicate locations on "seq" to mask out.
 76 |         0s correspond to overlapping windows.
 77 |     order : int
 78 |         Size of window for NSRWS, 2 or greater.
 79 | 
 80 |     Returns
 81 |     -------
 82 |     freq_window : array.array
 83 |         Most frequently occurring non-overlapping "window" of size "order".
 84 |     count : int
 85 |         Number of times the most frequently occurring window occurs.
 86 | 
 87 |     """
 88 | 
 89 |     # Create overlapped sliding windows (each window a tuple of size order) & apply mask
 90 |     filtered = compress(zip(*(islice(seq, i, None) for i in range(order))), mask)
 91 | 
 92 |     # Count sliding windows (tuples are hashable!) & get the one most common with counts
 93 |     freq_window, count = Counter(filtered).most_common(1)[0]
 94 | 
 95 |     # Assign array type and return
 96 |     freq_window = cast(freq_window)
 97 | 
 98 |     return freq_window, count
 99 | 
100 | 
def _onestep_pairs(seq, verbose=True):
    """
    Execute one full step of NSRPS (NSRWS with order=2) for a given sequence

    The step is carried out as follows:
        1. A mask over the pairs of the sequence is obtained -> core.get_mask_pairs()
        2. The mask is applied & the most frequent pair found -> _mask_and_count()
        3. A new symbol, one greater than the current maximum, is chosen
        4. If the most frequent pair occurs more than once, its occurrences are
           substituted -> core.substitute_pairs()
           If it occurs exactly once (all pairs distinct), only the first pair of
           the sequence is replaced by the new symbol and the signal is raised

    Compared to _onestep_windows, this routine is specialised for order=2.

    Parameters
    ----------
    seq : array.array
        Discrete symbolic sequence containing 32-bit unsigned integers.
    verbose : bool, optional
        Whether to report extra details. These include the frequent pair that was
        substituted, its counts & total time taken. The default is True.

    Returns
    -------
    tuple, of the following fixed elements:
        seq : array.array
            Sequence after the substitution step, shorter than the input.

        signal : bool
            True when the most frequent pair occurred exactly once (count=1)

    optional elements of tuple that depend on verbosity:
        freq_pair : array.array
            Most frequent pair found

        count : int
            Number of times the frequent pair occurred in the sequence

        time_taken : float
            Time taken to execute step, in seconds as measured by perf_counter


    """

    # Start the clock
    started = perf_counter()

    # Mask for the pairs, then the most frequent unmasked pair & its count
    pair_mask = core.get_mask_pairs(seq)
    freq_pair, count = _mask_and_count(seq, pair_mask, 2)

    # New symbol to substitute with: one beyond the largest symbol present
    sub_value = 1 + max(seq)

    # A top count of 1 means that every unmasked pair is distinct
    signal = count == 1

    if signal:
        # Replace just the first pair: drop one element, overwrite the next
        out = cast(seq[1:])
        out[0] = sub_value
    else:
        # Replace all instances of the frequent pair
        out = cast(core.substitute_pairs(seq, freq_pair, sub_value))

    # Stop the clock
    time_taken = perf_counter() - started

    # Bare essentials unless more was asked for
    if not verbose:
        return out, signal

    return out, signal, freq_pair, count, time_taken
186 | 
187 | 
def _onestep_windows(seq, order, verbose=True):
    """
    Execute one full step of NSRWS for a given sequence and window size

    The step is carried out as follows:
        1. A mask over the windows is obtained -> core.get_mask_windows()
        2. The mask is applied & the most frequent window found -> _mask_and_count()
        3. A new symbol, one greater than the current maximum, is chosen
        4. If the most frequent window occurs more than once, its occurrences are
           substituted -> core.substitute_windows()
           If it occurs exactly once (all windows distinct), only the first window
           of the sequence is replaced by the new symbol and the signal is raised

    This is the generalised counterpart of _onestep_pairs for arbitrary window
    sizes.

    Parameters
    ----------
    seq : array.array
        Discrete symbolic sequence containing 32-bit unsigned integers.
    order : int
        Size of window for NSRWS, 2 or greater.
    verbose : bool, optional
        Whether to report extra details. These include the frequent window that was
        substituted, its counts & total time taken. The default is True.

    Returns
    -------
    tuple, of the following fixed elements:
        seq : array.array
            Sequence after the substitution step, shorter than the input.

        signal : bool
            True when the most frequent window occurred exactly once (count=1)

    optional elements of tuple that depend on verbosity:
        freq_window : array.array
            Most frequent window found

        count : int
            Number of times the frequent window occurred in the sequence

        time_taken : float
            Time taken to execute step, in seconds as measured by perf_counter


    """

    # Start the clock
    started = perf_counter()

    # Mask for the windows, then the most frequent unmasked window & its count
    window_mask = core.get_mask_windows(seq, order)
    freq_window, count = _mask_and_count(seq, window_mask, order)

    # New symbol to substitute with: one beyond the largest symbol present
    sub_value = 1 + max(seq)

    # A top count of 1 means that every unmasked window is distinct
    signal = count == 1

    if signal:
        # Replace just the first window: drop order-1 elements, overwrite the next
        out = cast(seq[order - 1 :])
        out[0] = sub_value
    else:
        # Replace all instances of the frequent window
        out = cast(core.substitute_windows(seq, order, freq_window, sub_value))

    # Stop the clock
    time_taken = perf_counter() - started

    # Bare essentials unless more was asked for
    if not verbose:
        return out, signal

    return out, signal, freq_window, count, time_taken
274 | 
275 | 
276 | def _onestep(seq, order, verbose=True):
277 |     """
278 |     Wrapper that switches routine (pairs vs windows) depending on order
279 | 
280 |     For pairs (order=2), execute _onestep_pairs which is faster
281 |     For higher orders, execute _onestep_windows
282 | 
283 |     Parameters
284 |     ----------
285 |     seq : array.array
286 |         Discrete symbolic sequence containing 32-bit unsigned integers.
287 |     order : int
288 |         Size of window for NSRWS, 2 or greater.
289 |     verbose : bool, optional
290 |         Whether to report extra details. These include the frequent pair that was
291 |         substituted, its counts & total time taken. The default is True.
292 | 
293 |     Returns
294 |     -------
295 |     tuple, of the following fixed elements:
296 |         seq : array.array
297 |             Discrete symbolic sequence containing 32-bit unsigned integers, with most
298 |             frequently occurring non-sequentially overlapping window substituted.
299 | 
300 |         signal : bool
301 |             indicator for the state of sequence with all distinct pairs (count=1)
302 | 
303 |     optional elements of tuple that depend on verbosity:
304 |         freq_pair : array.array
305 |             Frequent window substituted
306 | 
307 |         count : int
308 |             Number of times the frequent window occurred in the sequence
309 | 
310 |         time_taken : float
311 |             Time taken to execute step
312 | 
313 |     """
314 | 
315 |     if order == 2:
316 |         return _onestep_pairs(seq[:], verbose)
317 | 
318 |     if order > 2:
319 |         return _onestep_windows(seq[:], order, verbose)
320 | 
321 | 
def onestep(seq, order, verbose=True, check=True):
    """
    Execute one step of NSRWS on given sequence and window size.

    Validates the inputs and then delegates to _onestep. Every rejected input leads
    to a return value of None; the checks performed in this function also print a
    short message explaining the reason.

    Parameters
    ----------
    seq : array.array
        Discrete symbolic sequence containing 32-bit unsigned integers. Other inputs
        are passed through cast first.
    order : int
        Size of window for NSRWS, 2 or greater. Smaller values are rejected.
    verbose : bool, optional
        Whether to report extra details. These include the frequent window that was
        substituted, its counts & total time taken. The default is True.
    check : bool, optional
        Check for equality of all symbols in sequence. The default is True.

    Returns
    -------
    None
        If any of the input checks fails.

    tuple, otherwise, of the following fixed elements in this order:
        array.array
            Sequence after the substitution step.

        bool
            indicator for the state of sequence with all distinct windows (count=1)

    optional elements of tuple that depend on verbosity:
        array.array
            Most frequent window found

        int
            Number of times the frequent window occurred in the sequence

        float
            Time taken to execute step

    """

    # _onestep has no routine for window sizes below 2 and would return None
    # without any explanation; reject such orders explicitly here
    if order < 2:
        print("> Order must be 2 or greater!\n> Can't perform substitution ...")
        return None

    # Coerce input to appropriate array type, exit if that yields None
    # NOTE(review): relies on cast returning None on failure - confirm in ETC.seq.recode
    if not arraytype(seq):
        seq = cast(seq)
        if seq is None:
            return None

    # Check whether all elements are equal, if requested, & exit if True
    if check and core.check_equality(seq):
        print("> All elements in sequence are equal!")
        return None

    # Check if size of sequence is shorter than order, exit if True
    if len(seq) < order:
        print("> Sequence input shorter than order!\n> Can't perform substitution ...")
        return None

    # Else execute one step of NSRWS and return
    return _onestep(seq, order, verbose)
381 | 


--------------------------------------------------------------------------------