├── ETC ├── CCMC │ ├── __init__.py │ └── pairs_parallel.py ├── LZ76 │ ├── __init__.py │ ├── lzc.py │ └── core.pyx ├── tests │ ├── __init__.py │ ├── test_recode.py │ ├── test_NSRWS1D.py │ └── test_NSRWS2D.py ├── NSRWS │ ├── __init__.py │ ├── x1D │ │ ├── __init__.py │ │ ├── distance.py │ │ ├── core.pyx │ │ ├── parallel.py │ │ └── onestep.py │ └── x2D │ │ ├── __init__.py │ │ ├── parallel.py │ │ ├── core.pyx │ │ └── onestep.py ├── seq │ ├── __init__.py │ ├── IO.py │ ├── check.py │ ├── estimates.pyx │ ├── process.py │ ├── recode.py │ └── markov.py ├── NCA │ ├── __init__.py │ ├── compute.py │ ├── parallelize_jl.py │ └── parallelize_mp.py ├── CCC │ ├── __init__.py │ ├── simulate_AR.py │ ├── compute_CCC.py │ ├── simulate_TentMap.py │ ├── calibrate_CCC.py │ └── _calibrate_CCC.py └── __init__.py ├── setup.py ├── .gitignore ├── demo.py ├── LICENSE └── README.md /ETC/CCMC/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /ETC/LZ76/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /ETC/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ETC/NSRWS/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /ETC/seq/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /ETC/NSRWS/x1D/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /ETC/NSRWS/x2D/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /ETC/NCA/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | from ETC.NCA.compute import compute_CCC, compute_CCM, get_causal, get_NCA 3 | -------------------------------------------------------------------------------- /ETC/CCC/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | from ETC.CCC.compute_CCC import compute, get_params 3 | from ETC.CCC.simulate_AR import coupled_AR 4 | -------------------------------------------------------------------------------- /ETC/LZ76/lzc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | from ETC.LZ76 import core 9 | from ETC.seq.recode import cast 10 | from ETC.seq.check import arraytype 11 | 12 | 13 | def compute_complexity(seq): 14 | 15 | # Coerce input to appropriate array type, if not possible throw a fit & exit 16 | if not arraytype(seq): 17 | seq = cast(seq) 18 | if seq is None: 19 | return None 20 | 21 | # Check whether all elements are equal, & exit if True (LZ76 of such inputs is 2) 22 | if core.check_equality(seq): 23 | print("> All elements in sequence are equal!") 24 | return 2 25 | 26 | # Else execute Cython function for computing LZ complexity 27 | return core.lzc_a(seq) 28 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | 9 | from setuptools import setup, find_packages 10 | from Cython.Build import cythonize 11 | import numpy 12 | 13 | setup( 14 | ext_modules=cythonize( 15 | [ 16 | "./ETC/NSRWS/x1D/core.pyx", 17 | "./ETC/NSRWS/x2D/core.pyx", 18 | "./ETC/seq/estimates.pyx", 19 | "./ETC/LZ76/core.pyx", 20 | ], 21 | annotate=False, 22 | compiler_directives={"language_level": "3"}, 23 | ), 24 | include_dirs=[numpy.get_include()], 25 | name="ETCPy", 26 | version="1.3.5", 27 | author_email="mail@pranaysy.com", 28 | description="Compute the Effort-To-Compress (ETC) of a symbolic sequence", 29 | packages=find_packages(), 30 | license="Apache License, Version 2.0", 31 | ) 32 | -------------------------------------------------------------------------------- /ETC/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __import__("pkg_resources").declare_namespace(__name__) 4 | 5 | from ETC.seq.IO import read, save 6 | from ETC.seq.process import generate, entropy 7 | from ETC.seq.recode import cast, recode_lexical, partition, partition_numpy 8 | from ETC.seq import check 9 | 10 | from ETC.NSRWS.x1D.etc import compute as compute_1D 11 | 12 | # from ETC.NSRWS.x1D.etc import compute_save as compute_save_1D 13 | # from ETC.NSRWS.x1D.onestep import onestep as onestep_1D 14 | 15 | from ETC.NSRWS.x2D.etc import compute as compute_2D 16 | 17 | # from ETC.NSRWS.x2D.etc import compute_save as compute_save_2D 18 | # from ETC.NSRWS.x2D.onestep import onestep as onestep_2D 19 | 20 | # from ETC.CCC.compute_CCC import compute as compute_CCC 21 | 22 | 23 | from ETC.NSRWS.x1D.parallel import ( 24 | pcompute_multiple_seq, 25 | pcompute_single, 26 | pcompute_files, 27 | pcompute_numpy, 28 | ) 29 | 30 | from ETC.LZ76.lzc import compute_complexity as LZC 31 | from ETC.CCMC.pairs import CCM_causality 32 | from ETC.CCMC.pairs_parallel import parallelized as CCM_causality_parallel 33 | from ETC.CCMC.pairs_parallel import get_rowpairs 34 | -------------------------------------------------------------------------------- /ETC/seq/IO.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This module contains helper functions for reading and writing files. 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | 9 | from csv import DictWriter 10 | 11 | # Import functions from standard library modules 12 | from pathlib import Path 13 | 14 | 15 | def populate_files(filepath, suffix="*.txt"): 16 | if not isinstance(filepath, Path): 17 | filepath = Path(filepath) 18 | 19 | if filepath.exists() and filepath.is_dir(): 20 | return filepath.rglob(suffix) 21 | 22 | print("Invalid path") 23 | return None 24 | 25 | 26 | def read(filepath, delimiter=None): 27 | if not isinstance(filepath, Path): 28 | filepath = Path(filepath) 29 | text = filepath.read_text() 30 | 31 | if delimiter: 32 | text = "".join(text.split(delimiter)) 33 | 34 | return text 35 | 36 | 37 | def save(out, filename): 38 | 39 | with open(filename, "w") as fileout: 40 | writer = DictWriter(fileout, fieldnames=out[0].keys(), delimiter=",") 41 | writer.writeheader() 42 | writer.writerows(out) 43 | print(f">> Data successfully stored to disk as {filename}") 44 | -------------------------------------------------------------------------------- /ETC/seq/check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | 9 | from ETC.seq import estimates 10 | from array import array 11 | import numpy as np 12 | 13 | 14 | def zeroes(seq): 15 | if 0 in seq: 16 | return True 17 | return False 18 | 19 | 20 | def equality(seq, legacy=False): 21 | """ 22 | This function checks if all elements of a collection are equal. 23 | Parameters 24 | ---------- 25 | seq : list or tuple 26 | Sequence of integers. 27 | Returns 28 | ------- 29 | bool 30 | True if all elements equal. 31 | """ 32 | 33 | if arraytype(seq) and not legacy: 34 | return estimates.equality(seq) 35 | 36 | # Iterate over all elements in sequence 37 | for element in seq: 38 | 39 | # Break at first inequality 40 | if seq[0] != element: 41 | return False 42 | 43 | # Else all equal 44 | return True 45 | 46 | 47 | def arraytype(seq): 48 | if isinstance(seq, array) and seq.typecode == "I": 49 | return True 50 | if isinstance(seq, np.ndarray) and seq.dtype == "uint32": 51 | return True 52 | return False 53 | -------------------------------------------------------------------------------- /ETC/tests/test_recode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | from array import array 9 | from hypothesis import given 10 | from hypothesis import strategies as st 11 | from ETC.seq import recode 12 | from collections import Counter 13 | from string import ascii_lowercase 14 | 15 | invalid_types = ( 16 | st.fractions(), 17 | st.characters(), 18 | st.floats(), 19 | st.text(), 20 | st.complex_numbers(), 21 | st.integers(max_value=0), 22 | st.integers(min_value=2 ** 32), 23 | ) 24 | valid_types = st.integers(min_value=1, max_value=2 ** 32 - 1) 25 | 26 | def counts(x): 27 | return tuple(Counter(x).values()) 28 | 29 | @given( 30 | x=st.one_of(st.tuples(st.one_of(invalid_types)), st.lists(st.one_of(invalid_types))) 31 | ) 32 | def test_cast_invalid(x): 33 | 34 | x = recode.cast(x) 35 | 36 | assert x is None 37 | 38 | 39 | def test_cast_zeroes(): 40 | 41 | x = recode.cast([0, 0, 0, 0]) 42 | 43 | assert x is None 44 | 45 | 46 | @given(x=st.one_of(st.tuples(valid_types), st.lists(valid_types, min_size=1))) 47 | def test_cast_valid(x): 48 | 49 | x = recode.cast(x) 50 | 51 | assert isinstance(x, array) and x.typecode == "I" 52 | 53 | @given(x=st.text(min_size=1, alphabet=list(ascii_lowercase))) 54 | def test_all_recodes(x): 55 | 56 | x1 = recode.recode_lexical(x) 57 | x2 = recode.recode_alphabetical(x) 58 | x3 = recode.recode_randint(x) 59 | x4 = recode.recode_random(x) 60 | 61 | assert counts(x1) == counts(x2) == counts(x3) == counts(x4) 62 | assert len(set(x1)) == len(set(x2)) == len(set(x3)) == len(set(x4)) -------------------------------------------------------------------------------- /ETC/seq/estimates.pyx: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | cimport cython 9 | import numpy as np 10 | from libc.math cimport log2 11 | cimport numpy as np 12 | 13 | cpdef double entropy(unsigned int[::1] x): 14 | 15 | # cdef np.ndarray[np.npy_int64, ndim=1] counts = np.bincount(x) 16 | cdef long int[:] counts_view = np.bincount(x) 17 | # cdef long int[:] counts_view = counts 18 | cdef double counts_total = 0 19 | cdef Py_ssize_t counts_size = counts_view.shape[0] 20 | cdef Py_ssize_t m 21 | 22 | for m in range(counts_size): 23 | counts_total += counts_view[m] 24 | 25 | cdef double E = 0.0 26 | cdef double prob, logprob 27 | 28 | m = 0 29 | for n in range(counts_size): 30 | if counts_view[n]!=0: 31 | prob = counts_view[n] / counts_total 32 | logprob = log2(prob) 33 | E = E-prob*logprob 34 | 35 | return E 36 | 37 | 38 | # Function for checking whether all elements in input are identical 39 | cpdef bint equality(const unsigned int[::1] x): 40 | """ 41 | INPUT 42 | ----- 43 | x : array.array 44 | Array object containing 32-bit unsigned integers. 45 | 46 | 47 | OUTPUT 48 | ------ 49 | bool 50 | True if all elements are identical 51 | """ 52 | # Intialize loop bounds 53 | cdef Py_ssize_t n 54 | cdef Py_ssize_t x_size = len(x) 55 | 56 | # Iterate over values from input 57 | for n in range(x_size): 58 | 59 | # Short-circuit the loop: check for any element that doesn't equal the first 60 | if x[0] != x[n]: 61 | return False 62 | 63 | return True -------------------------------------------------------------------------------- /ETC/seq/process.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | from collections import Counter 9 | from math import log2 10 | from random import choices 11 | from random import seed as seedvalue 12 | from ETC.seq import estimates, recode 13 | from array import array 14 | import re 15 | 16 | 17 | def sanitize(text, whitespace=False, lowercase=False): 18 | 19 | if whitespace: 20 | joiner = " " 21 | 22 | else: 23 | joiner = "" 24 | 25 | if lowercase: 26 | text = text.lower() 27 | 28 | text = joiner.join(re.findall("[a-zA-Z]+", text)) 29 | 30 | return text 31 | 32 | 33 | def generate(size=10, partitions=2, seed=None): 34 | """ 35 | This function generates discrete random data of desired size and bins. 36 | 37 | Parameters 38 | ---------- 39 | size : int, optional 40 | Length of sequence to generate. The default is 10. 41 | partitions : int, optional 42 | Number of bins/paritions to create. 43 | seed : int, optional 44 | Seed value for initializing the random number generator. The default is None 45 | 46 | Returns 47 | ------- 48 | list 49 | Collection of integers sampled from discrete uniform. 50 | 51 | """ 52 | if not (isinstance(partitions, int) and isinstance(size, int) and partitions >= 2): 53 | print(partitions, size) 54 | print(">> Number of bins is invalid ...") 55 | return None 56 | 57 | if seed: 58 | seedvalue(seed) 59 | 60 | return recode.cast(choices(range(1, partitions + 1), k=size)) 61 | 62 | 63 | def frequencies(seq): 64 | 65 | return Counter(seq).most_common() 66 | 67 | 68 | def entropy(seq, legacy=False): 69 | """ 70 | This function computes Shannon Entropy of a given sequence. 71 | 72 | Parameters 73 | ---------- 74 | seq : list or tuple 75 | Sequence of integers. 76 | 77 | Returns 78 | ------- 79 | float 80 | Shannon entropy of sequence. 81 | 82 | """ 83 | 84 | if isinstance(seq, array) and seq.typecode == "I" and not legacy: 85 | return estimates.entropy(seq) 86 | 87 | # Get counts from Counter, normalize by total, transform each and sum all 88 | return sum( 89 | -seq * log2(seq) for seq in (elem / len(seq) for elem in Counter(seq).values()) 90 | ) 91 | -------------------------------------------------------------------------------- /ETC/CCC/simulate_AR.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | 9 | import numpy as np 10 | 11 | 12 | def coupled_AR(length=1000, a=0.9, b=0.8, c=0.8, e=0.01, burn=100, seed=1): 13 | """ 14 | Generate discrete-time coupled AR processes with known parameters. 15 | 16 | Dependent process defined as: 17 | x[n] = a * x[n - 1] + b * y[n - 1] + e * noise_x[n] 18 | 19 | Independent process defined as: 20 | y[n] = c * y[n - 1] + e * noise_y[n] 21 | 22 | Parameters 23 | ---------- 24 | length : int, optional 25 | Legnth of samples drawn from the process. The default is 1000. 26 | a : float, optional 27 | Coefficient for dependent process, capturing dependency on its own past. 28 | The default is 0.9. 29 | b : float, optional 30 | Coefficient for dependent process, capturing dependency on the independent 31 | process - causal interaction from independent to dependent. The default is 0.8. 32 | c : float, optional 33 | Coefficient for independent process, capturing dependency on its own past. 34 | The default is 0.8. 35 | e : float, optional 36 | Coefficient for uniform random noise mixture. The default is 0.01 37 | burn : int, optional 38 | Number of initial samples to burn. The default is 100. 39 | seed: int, optional 40 | Seed value for initialization of random number generator. The default is 1 41 | 42 | Returns 43 | ------- 44 | dict 45 | Two key-value pairs: 46 | "dependent": Samples of the dependent process. 47 | "independent": Samples of the independent process. 48 | 49 | """ 50 | # Anchor seed for reproducibility 51 | np.random.seed(seed) 52 | 53 | # AR processes: initialize 54 | x = np.zeros(length, dtype="float64") 55 | y = np.zeros(length, dtype="float64") 56 | 57 | # Generate noise vector of appropriate length 58 | noise_x = e * np.random.normal(0, 1, length + burn) 59 | noise_y = e * np.random.normal(0, 1, length + burn) 60 | 61 | # Initialize starting points 62 | x[0] = np.random.uniform() 63 | y[0] = np.random.uniform() 64 | 65 | # Burn initial samples 66 | if burn: 67 | for n in range(burn): 68 | x[0] = a * x[0] + b * y[0] + noise_x[n] 69 | y[0] = c * y[0] + noise_y[n] 70 | 71 | # Store further samples 72 | for n in range(1, length): 73 | x[n] = a * x[n - 1] + b * y[n - 1] + noise_x[n] 74 | y[n] = c * y[n - 1] + noise_y[n] 75 | 76 | return {"dependent": x, "independent": y} 77 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Custom 138 | /dull 139 | -------------------------------------------------------------------------------- /ETC/LZ76/core.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, emit_code_comments=True, cdivision=True, embedsignature=True 2 | #!/usr/bin/env python3 3 | # -*- coding: utf-8 -*- 4 | """ 5 | 6 | 7 | @author: Pranay S. Yadav 8 | """ 9 | # Import stuff 10 | # cimport cython 11 | 12 | cpdef unsigned int lzc_a(const unsigned int[::1] intarray): 13 | """ 14 | Lempel-Ziv (LZ76) complexity on 32-bit integer arrays 15 | """ 16 | 17 | # Variables Initialization 18 | cdef Py_ssize_t arraylength = len(intarray) 19 | cdef unsigned int complexity = 1 20 | cdef Py_ssize_t prefix_len = 1 21 | cdef Py_ssize_t len_substring = 1 22 | cdef Py_ssize_t max_len_substring = 1 23 | cdef unsigned int pointer = 0 24 | 25 | # While we haven't decoded the full string we continue 26 | while prefix_len + len_substring <= arraylength: 27 | 28 | # Given a prefix length, find the largest substring 29 | if ( 30 | intarray[pointer + len_substring - 1] 31 | == intarray[prefix_len + len_substring - 1] 32 | ): 33 | len_substring += 1 # increase the length of the substring 34 | else: 35 | 36 | max_len_substring = max(len_substring, max_len_substring) 37 | pointer += 1 38 | 39 | # all the pointers have been investigated, we pick the largest for the jump 40 | if pointer == prefix_len: 41 | 42 | # Increment complexity 43 | complexity += 1 44 | 45 | # Increase the prefix length by the maximum substring size found so far 46 | prefix_len += max_len_substring 47 | 48 | # Reset the variables 49 | pointer = 0 50 | max_len_substring = 1 51 | 52 | # reset the length of the substring 53 | len_substring = 1 54 | 55 | # Check final repetition if we were in the middle of a substring 56 | if len_substring != 1: 57 | complexity += 1 58 | 59 | return complexity 60 | 61 | cpdef unsigned int lzc_b(const unsigned char[:] bytestring): 62 | """ 63 | Lempel-Ziv (LZ76) complexity on bytestrings 64 | """ 65 | 66 | # Variables Initialization 67 | cdef Py_ssize_t stringlength = len(bytestring) 68 | cdef unsigned int complexity = 1 69 | cdef Py_ssize_t prefix_len = 1 70 | cdef Py_ssize_t len_substring = 1 71 | cdef Py_ssize_t max_len_substring = 1 72 | cdef unsigned int pointer = 0 73 | 74 | # While we haven't decoded the full string we continue 75 | while prefix_len + len_substring <= stringlength: 76 | 77 | # Given a prefix length, find the largest substring 78 | if ( 79 | bytestring[pointer + len_substring - 1] 80 | == bytestring[prefix_len + len_substring - 1] 81 | ): 82 | len_substring += 1 # increase the length of the substring 83 | else: 84 | 85 | max_len_substring = max(len_substring, max_len_substring) 86 | pointer += 1 87 | 88 | # all the pointers have been investigated, we pick the largest for the jump 89 | if pointer == prefix_len: 90 | # Increase the complexity 91 | complexity += 1 92 | 93 | # Increase the prefix length by the maximum substring size found so far 94 | prefix_len += max_len_substring 95 | 96 | # Reset the variables 97 | pointer = 0 98 | max_len_substring = 1 99 | 100 | # reset the length of the substring 101 | len_substring = 1 102 | 103 | # Check final repetition if we were in the middle of a substring 104 | if len_substring != 1: 105 | complexity += 1 106 | 107 | return complexity 108 | 109 | # Function for checking whether all elements in input are identical 110 | cpdef bint check_equality(const unsigned int[::1] x): 111 | """ 112 | INPUT 113 | ----- 114 | x : array.array 115 | Array object containing 32-bit unsigned integers. 116 | 117 | 118 | OUTPUT 119 | ------ 120 | bool 121 | True if all elements are identical 122 | """ 123 | # Intialize loop bounds 124 | cdef Py_ssize_t n 125 | cdef Py_ssize_t x_size = len(x) 126 | 127 | # Iterate over values from input 128 | for n in range(x_size): 129 | 130 | # Short-circuit the loop: check for any element that doesn't equal the first 131 | if x[0] != x[n]: 132 | return False 133 | 134 | return True -------------------------------------------------------------------------------- /ETC/CCMC/pairs_parallel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | from ETC.CCMC.pairs import ETC_causality, LZ_causality, CCM_causality 9 | from multiprocessing import Pool 10 | from functools import partial 11 | from itertools import combinations 12 | 13 | 14 | def _kernel_seq(inputs, estimator): 15 | """ 16 | Wrapper around a function that computes anything on two sequences and returns a dict 17 | 18 | While it is written as a general purpose kernel for anything, here it is used for 19 | causal discovery and estimation from CCM based methods. 20 | 21 | The function unpacks inputs into an index element and a sequence pair and runs the 22 | estimator function on the sequence pair, returning various estimates in a dict 23 | 24 | Parameters 25 | ---------- 26 | inputs : tuple 27 | Tuple of two elements - (a, b) where a is an index, b is a tuple of two. a can 28 | be produced manually or more typically using enumerate; b holds the two sequences 29 | usually passed in by zip-ping larger iterables or itertools' product/combinations. 30 | a, the index, is passed to keep track of order in case of asynchronous execution 31 | Should look like this: (index, (sequence_x, sequence_y) 32 | estimator : function 33 | A function that can compute something on two arrays and return a dict. Preferably 34 | one that can compute something meaningful, like causal discovery 35 | 36 | Returns 37 | ------- 38 | out : dict 39 | Estimates obtained by running estimator on inputs. 40 | 41 | """ 42 | # Unpack inputs 43 | idx, seqs = inputs 44 | 45 | # Unpack sequences 46 | idx_x, idx_y, seq_x, seq_y = seqs 47 | 48 | # Initialize dictionary of output estimates with index 49 | out = {"index_pair": idx, "index_x": idx_x, "index_y": idx_y} 50 | 51 | # Execute the estimator function on the sequence pair 52 | out.update(estimator(seq_x, seq_y)) 53 | 54 | # Some feedback to console 55 | # print(".", end="") 56 | 57 | return out 58 | 59 | 60 | def get_rowpairs(matrix): 61 | """ 62 | Create a generator for iterating over pairs of rows of an input matrix 63 | 64 | Parameters 65 | ---------- 66 | matrix : numpy array, int or float, 2D 67 | Each row representing a different sequence. (Columns as time) 68 | 69 | Yields 70 | ------ 71 | row1 : int 72 | Index of first row in the pair. 73 | row2 : int 74 | Index of second row in the pair. 75 | np.array, 1D, int 76 | Data of first row in the pair. 77 | np.array, 1D, int 78 | Data of first row in the pair. 79 | 80 | """ 81 | for row1, row2 in combinations(range(0, matrix.shape[0]), 2): 82 | yield (row1, row2, matrix[row1, :], matrix[row2, :]) 83 | 84 | 85 | def parallelized(pairs, kernel="CCM"): 86 | """ 87 | This function operates concurrently on a collection of sequence pairs and computes 88 | estimates using the chosen kernel function. 89 | 90 | Here used for computing causal estimates from sequences pairs in batch, each pair 91 | runs on a separate CPU core as a process. 92 | 93 | CAUTION: main module is unguarded, do not run these functions as is, 94 | particularly on Windows! 95 | 96 | Parameters 97 | ---------- 98 | pairs : list/tuple/generator 99 | Collection of pairs of integer sequences. 100 | kernel : str, optional 101 | Name of an estimator function. Currently available: "CCM", "ETC" and "LZ". The 102 | default is "CCM". 103 | 104 | Returns 105 | ------- 106 | list of dict elements 107 | Each dictionary element contains index, length of sequence & ETC. 108 | 109 | """ 110 | 111 | if kernel == "CCM": 112 | exec_kernel = partial(_kernel_seq, estimator=CCM_causality) 113 | elif kernel == "ETC": 114 | exec_kernel = partial(_kernel_seq, estimator=ETC_causality) 115 | elif kernel == "LZ": 116 | exec_kernel = partial(_kernel_seq, estimator=LZ_causality) 117 | else: 118 | print("> ERROR: Invalid kernel specified") 119 | return None 120 | 121 | # Initialize pool of parallel workers 122 | pool = Pool() 123 | 124 | # Confirm to stdout 125 | print(f"Running kernel={kernel} in parallel on input ... ", end="") 126 | 127 | # Map-execute function across sequences 128 | out = pool.map_async(exec_kernel, enumerate(pairs)) 129 | 130 | # Graceful exit 131 | pool.close() 132 | pool.join() 133 | 134 | # Confirm completion 135 | print("Done!") 136 | 137 | # Return collected results 138 | return out.get() 139 | -------------------------------------------------------------------------------- /ETC/seq/recode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | 9 | from string import ascii_lowercase 10 | from random import shuffle, choices 11 | from array import array 12 | import numpy as np 13 | from ETC.seq.check import zeroes 14 | 15 | 16 | def cast(seq): 17 | 18 | if seq is not None: 19 | if isinstance(seq, np.ndarray): 20 | try: 21 | out = seq.astype("uint32") 22 | if zeroes(out): 23 | print("> Input contains 0!") 24 | print("> Symbols shifted up by 1 ") 25 | return out + 1 26 | return out 27 | 28 | except TypeError as error: 29 | print("ERROR:", error) 30 | print("> Input must be a list/tuple/array of positive integers!") 31 | print('> Recode or partition using "ETC.seq.recode"') 32 | return None 33 | 34 | except OverflowError as error: 35 | print("ERROR:", error) 36 | print("> Input must be a list/tuple/array of positive integers!") 37 | print('> Recode or partition using "ETC.seq.recode"') 38 | return None 39 | 40 | else: 41 | try: 42 | out = array("I", seq) 43 | if zeroes(out): 44 | print("> Input contains 0!") 45 | print('> Recode or partition using "ETC.seq.recode" ') 46 | return None 47 | return out 48 | 49 | except TypeError as error: 50 | print("ERROR:", error) 51 | print("> Input must be a list/tuple/array of positive integers!") 52 | print('> Recode or partition using "ETC.seq.recode"') 53 | return None 54 | 55 | except OverflowError as error: 56 | print("ERROR:", error) 57 | print("> Input must be a list/tuple/array of positive integers!") 58 | print('> Recode or partition using "ETC.seq.recode"') 59 | return None 60 | 61 | print("No input sequence provided.") 62 | return None 63 | 64 | 65 | def recode_lexical(text, case_sensitive=True): 66 | 67 | if not isinstance(text, str): 68 | print("ERROR: Input is not a string.") 69 | return None 70 | if not case_sensitive: 71 | text = text.lower() 72 | alphabets = sorted(set(text)) 73 | replacer = dict((y, x + 1) for x, y in enumerate(alphabets)) 74 | text = cast([replacer[x] for x in text]) 75 | return text 76 | 77 | 78 | def recode_alphabetical(text): 79 | 80 | text = text.lower() 81 | if not set(text).issubset(ascii_lowercase): 82 | print("> Input contains non alphabetical characters!") 83 | return None 84 | replacer = dict((y, x + 1) for x, y in enumerate(ascii_lowercase)) 85 | text = cast([replacer[x] for x in text]) 86 | return text 87 | 88 | 89 | def recode_dna(text): 90 | 91 | replacer = {"A": 1, "G": 1, "C": 2, "T": 2} 92 | text = cast([replacer[x] for x in text.upper()]) 93 | return text 94 | 95 | 96 | def recode_random(text): 97 | 98 | alphabets = list(set(text)) 99 | shuffle(alphabets) 100 | replacer = dict((y, x + 1) for x, y in enumerate(alphabets)) 101 | text = cast([replacer[x] for x in text]) 102 | return text 103 | 104 | 105 | def recode_randint(text): 106 | 107 | alphabets = list(set(text)) 108 | numbers = choices(range(1, 2 ** 20), k=len(alphabets)) 109 | replacer = dict(zip(alphabets, numbers)) 110 | text = cast([replacer[x] for x in text]) 111 | return text 112 | 113 | 114 | def partition(seq, n_bins): 115 | """ 116 | This function takes an input sequence and bins it into discrete points. 117 | 118 | Parameters 119 | ---------- 120 | seq : list/tuple of float 121 | Collection of floats. 122 | n_bins : int 123 | Number of bins/paritions to create. 124 | 125 | Returns 126 | ------- 127 | list 128 | Collection of integers. Contains unique integers from 1 to n_bins. 129 | 130 | """ 131 | assert ( 132 | isinstance(n_bins, int) and n_bins > 1 133 | ), "ERROR: Number of bins should be a positive integer" 134 | 135 | # Get smallest value 136 | a = min(seq) 137 | 138 | # Compute reciprocal of peak-to-peak per bin 139 | delta_inv = n_bins / (max(seq) - a + 1e-6) 140 | 141 | # Transform each element and return 142 | return [1 + int((elem - a) * delta_inv) for elem in seq] 143 | 144 | 145 | def partition_numpy(nparr, n_bins): 146 | """ 147 | This function takes an input sequence & partitions it into equiwidth discrete bins. 148 | 149 | Min-max scaling, followed by equiwidth binning for each row 150 | 151 | Parameters 152 | ---------- 153 | nparr : numpy array, int or float, 2D 154 | Each row representing a different sequence. (Columns as time) 155 | n_bins : int 156 | Number of bins/paritions to create. 157 | 158 | Returns 159 | ------- 160 | list 161 | Collection of integers. Contains unique integers from 1 to n_bins. 162 | 163 | """ 164 | assert ( 165 | isinstance(n_bins, int) and n_bins > 1 166 | ), "ERROR: Number of bins should be a positive integer" 167 | 168 | assert ( 169 | isinstance(nparr, np.ndarray) and nparr.ndim == 2 170 | ), ">ERROR: Input must be 2D NumPy array of numbers" 171 | 172 | # Get smallest value 173 | a = nparr.min(axis=1)[:, np.newaxis] 174 | 175 | # Compute reciprocal of peak-to-peak per bin 176 | delta_inv = n_bins / (nparr.max(axis=1)[:, np.newaxis] - a + 1e-6) 177 | 178 | # Transform each element and return 179 | return 1 + ((nparr - a) * delta_inv).astype("uint32") 180 | -------------------------------------------------------------------------------- /ETC/NSRWS/x1D/distance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | 9 | 10 | from functools import partial 11 | from itertools import islice 12 | 13 | # Import functions from standard library modules 14 | from multiprocessing import Pool 15 | 16 | # Import local modules 17 | import ETC 18 | 19 | get1D = partial(ETC.compute_1D, order=2, verbose=False, truncate=True) 20 | 21 | 22 | def _compute_distance(inputs): 23 | """ 24 | This function operates on a single sequence and computes ETC. 25 | 26 | Parameters 27 | ---------- 28 | seq : tuple of 2 elements 29 | 1st element is index for tracking. 30 | 2nd element is a sequence of integers used for ETC computation. 31 | Output of enumerate. 32 | 33 | Returns 34 | ------- 35 | out : dict 36 | index of sequence, length of sequence and ETC estimate. 37 | 38 | """ 39 | idx, seqs = inputs 40 | 41 | S1 = ETC.seq.recode.recode_lexical(seqs[0]) 42 | S2 = ETC.seq.recode.recode_lexical(seqs[1]) 43 | 44 | # Prepare output dictionary 45 | out = {"item": idx, "length_seq1": len(S1), "length_seq2": len(S2)} 46 | 47 | 48 | # Compute ETC and update output dictionary 49 | ETC1D_seq1 = get1D(S1)["ETC1D"] 50 | out.update({"ETC1D_seq1": ETC1D_seq1}) 51 | 52 | ETC1D_seq2 = get1D(S2)["ETC1D"] 53 | out.update({"ETC1D_seq2": ETC1D_seq2}) 54 | 55 | ETC1D_seq1seq2 = get1D(S1 + S2)["ETC1D"] 56 | out.update({"ETC1D_seq1seq2": ETC1D_seq1seq2}) 57 | 58 | ETC1D_seq2seq1 = get1D(S2 + S1)["ETC1D"] 59 | out.update({"ETC1D_seq2seq1": ETC1D_seq2seq1}) 60 | 61 | dETC = 0.5 * (ETC1D_seq1seq2 + ETC1D_seq2seq1 - ETC1D_seq1 - ETC1D_seq2) 62 | 63 | out.update({"distance": dETC}) 64 | 65 | return out 66 | 67 | 68 | def _overlapping_chunks(seq, size, offset=1): 69 | """ 70 | This function takes an input sequence and produces chunks of chosen size. 71 | Offset can be used to control degree of overlap (or distance between chunks 72 | that don't overlap) 73 | 74 | Parameters 75 | ---------- 76 | seq : tuple or list 77 | Sequence of integers. 78 | size : int 79 | Length of each produced chunk. 80 | offset : int, optional 81 | Number of elements to shift each chunk by. The default is 1. 82 | Setting this to any value less than size allows control of overlap. 83 | Setting this >= size produces non-overlapping chunks. 84 | 85 | Returns 86 | ------- 87 | zip 88 | zip object that produces chunks of specified size, one at a time. 89 | 90 | """ 91 | 92 | return zip(*(islice(seq, i, None, offset) for i in range(size))) 93 | 94 | 95 | def _non_overlapping_chunks(seq, size): 96 | """ 97 | This function takes an input sequence and produces chunks of chosen size 98 | that strictly do not overlap. This is a much faster implemetnation than 99 | _overlapping_chunks and should be preferred if running on very large seq. 100 | 101 | Parameters 102 | ---------- 103 | seq : tuple or list 104 | Sequence of integers. 105 | size : int 106 | Length of each produced chunk. 107 | 108 | Returns 109 | ------- 110 | zip 111 | zip object that produces chunks of specified size, one at a time. 112 | 113 | """ 114 | 115 | return zip(*[iter(seq)] * size) 116 | 117 | 118 | def pcompute_multiple_seq(iterable): 119 | """ 120 | This function operates concurrently on a collection of sequences. Loads 121 | each sequence and computes ETC. 122 | 123 | CAUTION: main module is unguarded, do not run these functions as is, 124 | particularly on Windows. 125 | 126 | Parameters 127 | ---------- 128 | iterable : list/tuple/generator 129 | Collection of integer sequences. 130 | 131 | Returns 132 | ------- 133 | list of dict elements 134 | Each dictionary element contains index, length of sequence & ETC. 135 | 136 | """ 137 | # Initialize pool of parallel workers 138 | pool = Pool() 139 | 140 | # Map-execute function across sequences 141 | out = pool.map(_compute_distance, enumerate(iterable)) 142 | 143 | # Graceful exit 144 | pool.close() 145 | pool.join() 146 | 147 | # Return collected results 148 | return out 149 | 150 | 151 | def truncate(seq1, seq2): 152 | 153 | # Truncate the longer sequence 154 | if len(seq1) == len(seq2): 155 | return seq1, seq2 156 | 157 | if len(seq1) > len(seq2): 158 | seq1 = seq1[: len(seq2)] 159 | else: 160 | seq2 = seq2[: len(seq1)] 161 | 162 | return seq1, seq2 163 | 164 | 165 | def pcompute_single(seq1, seq2, size, offset=1): 166 | """ 167 | This function operates concurrently on chunks of a given sequence. Gets 168 | each chunk and computes ETC one-by-one. Offset parameter controls degree of 169 | overlap (or non-overlap) 170 | 171 | CAUTION: main module is unguarded, do not run these functions as is, 172 | particularly on Windows. 173 | 174 | Parameters 175 | ---------- 176 | seq : tuple or list 177 | Sequence of integers. 178 | size : int 179 | Length of each produced chunk. 180 | offset : int, optional 181 | Number of elements to shift each chunk by. The default is 1. 182 | Setting this to any value less than size allows control of overlap. 183 | Setting this >= size produces non-overlapping chunks. 184 | 185 | Returns 186 | ------- 187 | list of dict elements 188 | Each dictionary element contains index, length of sequence & ETC. 189 | 190 | """ 191 | 192 | seq1, seq2 = truncate(seq1, seq2) 193 | 194 | # If offset equals size, get non-overlapping chunks of given size 195 | if offset == size: 196 | iterable1 = _non_overlapping_chunks(seq1, size) 197 | iterable2 = _non_overlapping_chunks(seq2, size) 198 | 199 | # Else get overlapping chunks of given size and offset 200 | else: 201 | iterable1 = _overlapping_chunks(seq1, size, offset) 202 | iterable2 = _overlapping_chunks(seq2, size, offset) 203 | 204 | # Execute parallel computation over chunks 205 | return pcompute_multiple_seq(zip(iterable1, iterable2)) 206 | -------------------------------------------------------------------------------- /ETC/NCA/compute.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This module has functions for multicore computation of NCA from a 2D matrix 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | import pandas as pd 9 | from ETC.NCA import parallelize_jl as NCAP 10 | 11 | 12 | def compute_CCC(matrix, CCC_params): 13 | """ 14 | Compute causal complexity estimates for all pairs of rows of input matrix 15 | 16 | Estimates are derived as per 4 models: ETCP, ETCE, LZP and CCC 17 | 18 | Parameters 19 | ---------- 20 | matrix : np.ndarray, 2d, uint32 21 | MxN matrix, C(M,2) rowpairs, with each row of length N. 22 | CCC_params : dict 23 | CCC parameters with the following names for keys: 24 | "LEN_past", "ADD_meas", "STEP_size" 25 | 26 | Returns 27 | ------- 28 | pd.DataFrame 29 | DataFrame containing causal estimates from all 4 models. 30 | 31 | """ 32 | # Create a generator that produces rowpairs one at a time 33 | rowpairs = NCAP.get_rowpairs(matrix) 34 | 35 | # Compute causal estimates in parallel across rowpairs 36 | estimates = NCAP.parallelized_CCC(rowpairs, CCC_params) 37 | 38 | # Convert estimates to a DataFrame and return 39 | return pd.DataFrame(estimates) 40 | 41 | 42 | def compute_CCM(matrix, kernel="LZ"): 43 | """ 44 | Compute causal complexity estimates for all pairs of rows of input matrix 45 | 46 | Estimates are derived as per 4 models: ETCP, ETCE, LZP and CCC 47 | 48 | Parameters 49 | ---------- 50 | matrix : np.ndarray, 2d, uint32 51 | MxN matrix, C(M,2) rowpairs, with each row of length N. 52 | CCC_params : dict 53 | CCC parameters with the following names for keys: 54 | "LEN_past", "ADD_meas", "STEP_size" 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | DataFrame containing causal estimates from all 4 models. 60 | 61 | """ 62 | # Create a generator that produces rowpairs one at a time 63 | rowpairs = NCAP.get_rowpairs(matrix) 64 | 65 | # Compute causal estimates in parallel across rowpairs 66 | estimates = NCAP.parallelized_CCM(rowpairs, kernel) 67 | 68 | # Convert estimates to a DataFrame and return 69 | return pd.DataFrame(estimates) 70 | 71 | 72 | def get_causal(df): 73 | """ 74 | Extract causal strengths from all estimates as a long-form (tidy) DataFrame 75 | 76 | Parameters 77 | ---------- 78 | df : pd.DataFrame 79 | DataFrame containing raw causal estimates from all 4 models. 80 | As returned by get_estimates() 81 | 82 | Returns 83 | ------- 84 | pd.DataFrame 85 | DataFrame containing only causal strengths from the 4 models in both directions. 86 | 87 | """ 88 | # Fix pair identifiers 89 | identifiers = ["index_pair", "index_x", "index_y"] 90 | 91 | # Initialize aggregator of melted / tidied dataframes 92 | dfs = list() 93 | 94 | # Iterate over each model and tidy up 95 | for model in ["ETCP", "ETCE", "LZP", "CCC"]: 96 | 97 | if df.filter(like=model).shape[-1] != 0: 98 | dat = df.melt( 99 | id_vars=identifiers, 100 | value_vars=[f"{model}_x_to_y", f"{model}_y_to_x"], 101 | var_name="direction", 102 | value_name=model, 103 | ) 104 | 105 | # Strip column names to make it generic -> easier to concatenate 106 | dat["direction"] = dat["direction"].str.replace(f"{model}_", "") 107 | 108 | # Set identifiers as indices and append to aggregator 109 | dfs.append(dat.set_index(identifiers + ["direction"])) 110 | 111 | # Concatenate all tidied dataframes and return 112 | return pd.concat(dfs, axis=1).reset_index() 113 | 114 | 115 | def get_NCA(df, k=0.1): 116 | """ 117 | Compute NCA from top k causal strengths (top k pairs) 118 | 119 | Parameters 120 | ---------- 121 | df : pd.DataFrame 122 | DataFrame containing only causal strengths from the 4 models in both directions 123 | in tidy/long-form, as returned by get_causal() 124 | 125 | k : float, optional, 0 < k < 1 126 | Top proportion of causal strengths. The default is 0.1. 127 | 128 | Returns 129 | ------- 130 | pd.DataFrame 131 | DataFrame containing NCA estimates with summary statistics. 132 | 133 | """ 134 | # Convert k to numeric based on available causal pairs 135 | k_int = round(k * len(df)) 136 | 137 | # Initialize aggregator of estimates from each model 138 | agg = list() 139 | 140 | # Iterate over each model, compute summary statistics and aggregate 141 | for model in ["ETCP", "ETCE", "LZP", "CCC"]: 142 | 143 | if df.filter(like=model).shape[-1] != 0: 144 | dat = df[model].nlargest(k_int) 145 | agg.append( 146 | { 147 | "model": model, 148 | "mean": dat.mean(), 149 | "median": dat.median(), 150 | "max": dat.max(), 151 | "min": dat.min(), 152 | "mad": dat.mad(), 153 | "std": dat.std(), 154 | } 155 | ) 156 | 157 | # Combine all estimates into a DataFrame and return 158 | return pd.DataFrame(agg).set_index("model") 159 | 160 | 161 | # Function for earlier versions, preserved for later 162 | # def get_complexity(df): 163 | # """ 164 | # Extract CCM estimates of complexity - ETC and LZ of each row element 165 | 166 | # Parameters 167 | # ---------- 168 | # df : pd.DataFrame 169 | # DataFrame containing raw causal estimates from all 4 models. 170 | # As returned by get_estimates() 171 | 172 | # Returns 173 | # ------- 174 | # pd.DataFrame 175 | # DataFrame containing unique ETC and LZ estimates for each row. 176 | 177 | # """ 178 | # # Store estimates for one row of the pair 179 | # dfx = df[["index_x", "ETC_x", "LZ_x"]] 180 | # dfx = dfx.rename(columns=lambda m: m.replace("_x", "")) 181 | 182 | # # Store estimates for the other row of the pair 183 | # dfy = df[["index_y", "ETC_y", "LZ_y"]] 184 | # dfy = dfy.rename(columns=lambda m: m.replace("_y", "")) 185 | 186 | # # Combine the two, drop duplicates and return 187 | # return pd.concat([dfx, dfy]).drop_duplicates().set_index("index") 188 | -------------------------------------------------------------------------------- /ETC/CCC/compute_CCC.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Compute the Compression-Complexity based Causality for two sequences. 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | 9 | # Import functions 10 | from functools import partial 11 | from ETC import compute_1D, compute_2D 12 | from ETC.seq.recode import partition, cast 13 | from ETC.seq.check import arraytype 14 | 15 | import numpy as np 16 | import array 17 | 18 | # Curry the functions for computing 1D and 2D ETC estimates 19 | get1D = partial(compute_1D, order=2, verbose=False, truncate=True) 20 | get2D = partial(compute_2D, order=2, verbose=False, truncate=True) 21 | 22 | 23 | def get_params(): 24 | """ 25 | Helper function for creating a dictionary of CCC params interactively 26 | 27 | Returns 28 | ------- 29 | dict 30 | The main 3 parameters for passing directly into CCC.compute 31 | 32 | """ 33 | print("#" * 80) 34 | print("-" * 80) 35 | print("Initialize CCC Parameters: The thorns") 36 | print("-" * 80, "\n") 37 | print("All the following 3 should be integers\n") 38 | LEN_past = int(input("1. Window length of immediate past values: ")) 39 | ADD_meas = int(input("2. Window length of present values: ")) 40 | STEP_size = int(input("3. Step-size for sliding window ahead: ")) 41 | print("\n", "-" * 80) 42 | print("#" * 80) 43 | return {"LEN_past": LEN_past, "ADD_meas": ADD_meas, "STEP_size": STEP_size} 44 | 45 | 46 | def compute(seq_x, seq_y, LEN_past, ADD_meas, STEP_size, n_partitions=False): 47 | """ 48 | Estimate the Compression-Complexity based Causality for two sequences. 49 | 50 | The direction of causality being assessed is from seq_y -> seq_x. Various other 51 | parameters need to be specified, a brief description is offered below. 52 | 53 | For detailed explanations regarding the parameters, interpretations and of the inner 54 | workings, please refer to the research article along with the supplementary: 55 | Kathpalia, Aditi, and Nithin Nagaraj. “Data-Based Intervention Approach for 56 | Complexity-Causality Measure.” PeerJ Computer Science 5 (May 2019): e196. 57 | https://doi.org/10.7717/peerj-cs.196. 58 | 59 | Parameters 60 | ---------- 61 | seq_x : list or tuple 62 | Sequence of numbers, if not integers specify n_partitions for binning. 63 | seq_y : list or tuple 64 | Sequence of numbers, if not integers specify n_partitions for binnings. 65 | LEN_past : int 66 | Parameter "L": Window length of immediate past values of seq_x and seq_y. 67 | ADD_meas : int 68 | Parameter "w": Window length of present values of seq_x. Minimal data length 69 | over which CC rate can be reliably estimated, application/domain-specific 70 | STEP_size : int 71 | Parameter "delta": Step-size for sliding chunks across both sequences. An overlap 72 | of 20-50% between successive chunks or windows suggested. 73 | n_partitions : int or bool, optional 74 | Parameter "B": Number of bins. Smalles number of symbols that capture the time 75 | series dynamics. The default is False indicating that the data is already in the 76 | form of discrete symbolic sequences. 77 | 78 | Returns 79 | ------- 80 | CCC : float 81 | Estimated Compression-Complexity based Causality for direction seq_y -> seq_x. 82 | 83 | """ 84 | # Sanity checks 85 | assert len(seq_x) == len(seq_y), "ERROR: Sequences must have the same length!" 86 | assert ( 87 | isinstance(LEN_past, int) and LEN_past > 1 88 | ), "ERROR: LEN_past must be a positive integer!" 89 | assert ( 90 | isinstance(ADD_meas, int) and ADD_meas > 1 91 | ), "ERROR: ADD_meas must be a positive integer!" 92 | assert ( 93 | isinstance(STEP_size, int) and STEP_size > 1 94 | ), "ERROR: STEP_size must be a positive integer!" 95 | 96 | # Partition data if requested with the specificed number of bins 97 | if n_partitions: 98 | seq_x = partition(seq_x, n_partitions) 99 | seq_y = partition(seq_y, n_partitions) 100 | 101 | # Check whether input is a discrete symbolic sequence 102 | if not arraytype(seq_x): 103 | seq_x = cast(seq_x) 104 | if not arraytype(seq_y): 105 | seq_y = cast(seq_y) 106 | 107 | # Set switch for operating differently on native vs numpy arrays 108 | if type(seq_x) == np.ndarray or type(seq_y) == np.ndarray: 109 | combine = lambda x, y: np.hstack([x, y]) 110 | if type(seq_x) == array.array or type(seq_y) == array.array: 111 | combine = lambda x, y: x + y 112 | 113 | # Setup variables 114 | LEN = len(seq_x) 115 | LEN_to_check = LEN_past + ADD_meas 116 | 117 | # Initialize aggregators 118 | l_1D = [] 119 | l_2D = [] 120 | 121 | # Iterate over chunks of both sequences 122 | for k in range(0, LEN - LEN_to_check, STEP_size): 123 | 124 | ## Compression-Complexity of past values of seq_x 125 | # 1D ETC of a chunk of seq_x of length LEN_past 126 | ETC1D_ini = get1D(seq_x[k : k + LEN_past])["NETC1D"] 127 | 128 | ## Compression-Complexity of past values of seq_x and seq_y 129 | # 2D ETC of chunks of both seq_x,seq_y of length LEN_past at the same locus 130 | ETC2D_ini = get2D(seq_x[k : k + LEN_past], seq_y[k : k + LEN_past],)["NETC2D"] 131 | 132 | ## Compression-Complexity of present values of seq_x 133 | # 1D ETC of a chunk of seq_x of length LEN_to_check 134 | ETC1D_fin = get1D(seq_x[k : k + LEN_to_check])["NETC1D"] 135 | 136 | ## Compression-Complexity of values of seq_x & past of seq_y + present of seq_x 137 | # 2D ETC of chunks of both seq_x, seq_y of length LEN_to_check at the same locus 138 | ETC2D_fin = get2D( 139 | seq_x[k : k + LEN_to_check], 140 | combine(seq_y[k : k + LEN_past], seq_x[k + LEN_past : k + LEN_to_check]), 141 | )["NETC2D"] 142 | 143 | # Dynamic Compression-Complexity of seq_x 144 | ETC1D_delta = ETC1D_fin - ETC1D_ini 145 | 146 | # Dynamic Compression Complexity of seq_x conditional on seq_y 147 | ETC2D_delta = ETC2D_fin - ETC2D_ini 148 | 149 | # Aggregate Dynamic CCs 150 | l_1D.append(ETC1D_delta) 151 | l_2D.append(ETC2D_delta) 152 | 153 | ## Compute Compession-Complexity Causality 154 | # Average of the difference: CC(X | X_past) - CC(X | Y_past + X_present) 155 | CCC = (sum(l_1D) - sum(l_2D)) / len(l_1D) 156 | # print(f"CCC for seq_y -> seq_x = {CCC}") 157 | return CCC 158 | -------------------------------------------------------------------------------- /ETC/CCC/simulate_TentMap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | 9 | # Import calls 10 | import numpy as np 11 | from numba import vectorize, float64, njit 12 | 13 | # Compute single step of iteration through skew-tent map 14 | @vectorize([float64(float64, float64)]) 15 | def _skewtent_onestep(value, threshold): 16 | """ 17 | Computes a single step of iteration through the skew-tent map given an 18 | input (previous) value and a threshold. Returns the next value as output. 19 | This function is called by _iterate_skewtent for iterating repeatedly. 20 | 21 | Parameters 22 | ---------- 23 | value : scalar, float64 24 | Input value to the skew-tent map. 25 | threshold : scalar, float64 26 | Threshold value of the skew-tent map. 27 | 28 | Returns 29 | ------- 30 | Output value as float64 from the skew-tent map. 31 | Computed conditionally as follows: 32 | If value < threshold, then output is value / threshold 33 | Else, output is (1 - value)/(1 - threshold) 34 | 35 | """ 36 | if value < threshold: 37 | return value / threshold 38 | return (1 - value) / (1 - threshold) 39 | 40 | 41 | # Multiple iterations along skew-tent map 42 | @njit 43 | def _iterate_skewtent(threshold, traj_vec, coupling): 44 | """ 45 | Computes multiple steps of iteration through the skew-tent map given a 46 | starting condition, as the first element of an array full of zeros, and 47 | a threshold for the skew-tent map. This function calls _skewtent_onestep 48 | for running a single step, and is itself called by _compute_trajectory, 49 | which initializes the trajectory array. 50 | 51 | Parameters 52 | ---------- 53 | threshold : vector of size 2, float64 54 | Threshold value of the skew-tent map. 55 | traj_vec : array, 2D, float64 56 | Pre-allocated array of zeroes with the 1st element containing a 57 | value corresponding to initial condition of the skew-tent map 58 | 59 | Returns 60 | ------- 61 | traj_vec : array, 2D, float64 62 | Array populated with values corresponding to the trajectory taken by 63 | recursive iteration through a skew-tent map. Length of this trajectory 64 | is inferred from the array shape. 65 | 66 | """ 67 | # Iteration using for-loop over indices 68 | for idx in range(1, max(traj_vec.shape)): 69 | 70 | # Execute single step of iteration using previous value and threshold 71 | traj_vec[0, idx] = _skewtent_onestep(traj_vec[0, idx - 1], threshold[0]) 72 | 73 | # Linearly dependent tent map 74 | buffer = _skewtent_onestep(traj_vec[1, idx - 1], threshold[1]) 75 | traj_vec[1, idx] = coupling[0] * traj_vec[0, idx] + (1 - coupling[0]) * buffer 76 | 77 | # Nonlinearly dependent tent map 78 | buffer = ( 79 | coupling[1] * traj_vec[0, idx - 1] 80 | + (1 - coupling[1]) * traj_vec[2, idx - 1] 81 | ) 82 | traj_vec[2, idx] = _skewtent_onestep(buffer, threshold[2]) 83 | 84 | # Return populated array 85 | return traj_vec 86 | 87 | 88 | # Compute trajectory given initial conditions, threshold and size 89 | @njit 90 | def _compute_trajectory(init_cond, threshold, length, coupling): 91 | """ 92 | Computes the trajectory along a skew-tent map with given threshold and an 93 | initial condition for a given distance. Doesn't validate input. This is 94 | called by compute_trajectory after checking inputs. 95 | 96 | Parameters 97 | ---------- 98 | init_cond : vector of size 3, float64 99 | Initial value for iterating through the skew-tent map. 100 | threshold : vector of size 3, float64 101 | Threshold value of the skew-tent map. 102 | length : scalar, integer 103 | Size of the trajectory to compute through iteration. 104 | 105 | Returns 106 | ------- 107 | array, 2D, float64 108 | Array of demanded size filled with values corresponding to the 109 | trajectory. 110 | 111 | """ 112 | # Pre-allocate array for trajectory with known size 113 | traj_vec = np.zeros((3, length), dtype=np.float64) 114 | 115 | # Assign initial condition to first elements of Y, Xlin, Xnonlin 116 | traj_vec[:, 0] = init_cond 117 | 118 | # Run iterations and return populated array 119 | return _iterate_skewtent(threshold, traj_vec, coupling) 120 | 121 | 122 | # Warmup for Numba cache initialization 123 | def warmup(): 124 | """ 125 | Runs all the Numba-optimized functions to initialize Numba's JIT. 126 | Returns nothing and only prints to stdout. 127 | 128 | Returns 129 | ------- 130 | None. 131 | 132 | """ 133 | initials = np.array([0.1] * 3) 134 | threshs = np.array([0.2] * 3) 135 | couplings = np.array([0] * 2) 136 | expected = np.array([0.625] * 3) 137 | # Test for a known value 138 | if (_compute_trajectory(initials, threshs, 3, couplings)[:, -1] == expected).all(): 139 | print("> Numba JIT warmup successful for chaotic_sampler ...") 140 | else: 141 | print("> Numba JIT warmup failed for chaotic_sampler ...") 142 | 143 | 144 | def compute_trajectory(init_cond, threshold, length, burn, coupling): 145 | """ 146 | Computes the trajectory along a skew-tent map with given threshold and an 147 | initial condition for a given distance. Wrapper around _compute_trajectory 148 | and checks inputs for sanity 149 | 150 | Parameters 151 | ---------- 152 | init_cond : vector of size 3, float64 153 | Initial value for iterating through the skew-tent map. 154 | range: 0 < init_cond < 1 155 | threshold : vector of size 3, float64 156 | Threshold value of the skew-tent map. 157 | range: 0 < threshold < 1 158 | length : scalar, integer 159 | Size of the trajectory to compute through iteration. 160 | range: 10^2 < length < 10^7 161 | 162 | Returns 163 | ------- 164 | array, 2D, float64 165 | Array of demanded size filled with values corresponding to the 166 | trajectory. 167 | 168 | """ 169 | # Return trajectory if inputs are valid 170 | 171 | return _compute_trajectory(init_cond, threshold, length + burn, coupling)[:, burn:] 172 | 173 | 174 | def coupled_TM(threshold, length, burn, coupling, seed): 175 | 176 | np.random.seed(seed) 177 | 178 | # Initialize starting points 179 | init_cond = np.random.uniform(size=(3)) 180 | 181 | # Initialize thresholds 182 | thresholds = np.array([threshold] * 3) 183 | 184 | # Initialize couplings 185 | couplings = np.array([coupling] * 2) 186 | 187 | trajectories = compute_trajectory(init_cond, thresholds, length, burn, couplings) 188 | 189 | return { 190 | "independent": trajectories[0, :], 191 | "dependent_linear": trajectories[1, :], 192 | "dependent_nonlinear": trajectories[2, :], 193 | } 194 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This is a demo script for showcasing this package's functionality in brief. 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | 9 | # Import call 10 | import ETC 11 | 12 | # ------------------------ 13 | # IO & SEQUENCE MANAGEMENT 14 | # ------------------------ 15 | # Read data to a list 16 | text = ETC.read(filepath="somefile.txt", delimiter=",") # Pick any file 17 | 18 | # Check validity of input and automatically cast to the right form if valid 19 | ETC.cast(text) 20 | 21 | # Recode data to integers in lexicographic order 22 | ETC.recode_lexical("bbacdbedf", case_sensitive=False) 23 | 24 | # Partition real-valued data to integer-valued discrete data 25 | ETC.partition([0.1, 0.34, 0.68, -1.9, 25.3], n_bins=2) 26 | 27 | # Generate synthetic data from the discrete uniform distribution 28 | ETC.generate(size=1000, partitions=4) 29 | 30 | # Reproducibility of random generation can be controlled by passing the same seed value 31 | ETC.generate(size=1000, partitions=4, seed=101) 32 | 33 | # Compute Shannon Entropy for a sequence 34 | ETC.entropy(seq=[1, 2, 1, 1, 1, 2, 1]) 35 | 36 | 37 | # --------------------------------------- 38 | # 1D ETC ESTIMATION FOR A SINGLE SEQUENCE 39 | # --------------------------------------- 40 | # Generate a random discrete symbolic sequence 41 | seq = ETC.generate(size=1000, partitions=2, seed=31) 42 | 43 | # Simplest way to run 44 | out = ETC.compute_1D(seq) 45 | 46 | # The result is a dict of 2 key-value pairs: the raw and normalized ETC estimates 47 | print(out) 48 | 49 | # Get whichever is needed by using their respective keys 50 | print(out.get('ETC1D')) 51 | # [Out]: 225 52 | 53 | print(out.get('NETC1D')) 54 | # [Out]: 0.22522522522522523 55 | 56 | # The normalization is done over one less than the overall length 57 | print(out.get('ETC1D') / (len(seq) - 1)) 58 | # [Out]: 0.22522522522522523 59 | 60 | # If more details about the trajectory are desired, set verbosity to True 61 | out = ETC.compute_1D(seq, verbose=True) 62 | 63 | # The result is now a dict of 3 elements: the 2 ETC estimates and the Trajectory 64 | print(out.get('Trajectory')) # List of dicts - one dict for each step 65 | 66 | # The default behavior is to truncate the iteration process until the sequence gets 67 | # saturated to have all unique pairs occurring just once. This speeds up computation as 68 | # the remaining steps don't need to be computed and ETC reduces to an analytic expression. 69 | # However, the substitution table or features may be of interest and this truncation can 70 | # then be turned off so that the iteration continues till entropy of 0 is reached: 71 | out = ETC.compute_1D(seq, verbose=True, truncate=False) 72 | 73 | print(out.get('Trajectory')) # Last step has length 1 and entropy 0 74 | 75 | # This Trajectory can be saved to CSV for later use through a convenience function: 76 | ETC.save(out.get('Trajectory'), filename="ETC_results.csv") 77 | 78 | # -------------------------------------------------------------------------------------# 79 | # Additionally, instead of pair-substitution (NSRPS), a window of any size may be 80 | # substituted using the order switch, for example substitute triplets: 81 | out = ETC.compute_1D(seq, order=3, verbose=True, truncate=False) 82 | 83 | print(out.get("Trajectory")) 84 | 85 | # The default function call ETC.compute_1D(seq) is the same as: 86 | # ETC.compute_1D(seq, order=2, verbose=False, truncate=True) 87 | 88 | # -------------------------------------------------------------- 89 | # PARALLELIZED 1D ETC ESTIMATION FOR CHUNKS OF A SINGLE SEQUENCE 90 | # -------------------------------------------------------------- 91 | # Generate a long sequence 92 | seq = ETC.generate(size=20000, partitions=2) 93 | 94 | # Compute ETC on overlapping chunks of 1000 elements offsetted by 100, in parallel 95 | if __name__ == "__main__": 96 | outp = ETC.pcompute_single(seq, size=1000, offset=100) 97 | 98 | # The output is a list of dictionaries with estimates, one dict for each ordered chunk 99 | print(outp) 100 | 101 | # Compute ETC on non-overlapping chunks of 1000 elements (set offset = size), in parallel 102 | if __name__ == "__main__": 103 | outp = ETC.pcompute_single(seq, size=1000, offset=1000) 104 | 105 | # Similarly, 106 | print(outp) 107 | 108 | # ------------------------------------------------------------------ 109 | # PARALLELIZED 1D ETC ESTIMATION FOR MULTIPLE SEQUENCES IN PARALLEL 110 | # ------------------------------------------------------------------ 111 | # Generate 10 long sequences 112 | seqs = [ETC.generate(10000, 2) for _ in range(10)] 113 | 114 | # Compute ETC estimates for each sequence in parallel 115 | if __name__ == "__main__": 116 | outp = ETC.pcompute_multiple_seq(seqs) 117 | 118 | print(outp) 119 | 120 | 121 | # -------------------------------- 122 | # WORKS WITH NUMPY OUT OF THE BOX! 123 | # -------------------------------- 124 | # Generate a random discrete symbolic sequence and compute 1D ETC on it 125 | import numpy as np 126 | np.random.seed(10) 127 | seq = np.random.randint(1, 3, size=5000) 128 | out = ETC.compute_1D(seq) 129 | 130 | print(out) 131 | # {'ETC1D': 884, 'NETC1D': 0.17683536707341468} 132 | 133 | # Parallelized ETC estimation - row-wise for 2D numpy arrays 134 | seq = np.random.normal(1, 3, size=[10,5000]) # Each row is a distinct sequence 135 | seq = ETC.partition_numpy(nparr=seq, n_bins=2) 136 | out = ETC.pcompute_numpy(nparr=seq) 137 | 138 | print(out) 139 | # One estimate per row 140 | 141 | # ----------------------------------------- 142 | # 2D ETC ESTIMATION FOR A PAIR OF SEQUENCES 143 | # ----------------------------------------- 144 | # Generate two random sequences 145 | seq_x = ETC.generate(size=1000, partitions=2, seed=17) 146 | seq_y = ETC.generate(size=1000, partitions=2, seed=19) 147 | 148 | # Compute Effort To Compress using Non-Sequential Recursive Pair Substitution 149 | out = ETC.compute_2D(seq_x, seq_y, order=2, verbose=True, truncate=False) 150 | 151 | # View estimates 152 | print(out.get('ETC2D')) 153 | print(out.get('NETC2D')) 154 | 155 | # View trajectory 156 | print(out.get('Trajectory')) 157 | 158 | # ----------------------------------------- 159 | # CAUSALITY TESTING USING THE CCC FRAMEWORK 160 | # ----------------------------------------- 161 | # Import call for CCC sub-package 162 | from ETC import CCC 163 | 164 | # Compute CCC for the above two sequences 165 | ccc_est = CCC.compute( 166 | seq_x, seq_y, LEN_past=150, ADD_meas=15, STEP_size=20, n_partitions=False 167 | ) 168 | # [Out]: CCC for seq_y -> seq_x = -0.00301035257856264 169 | 170 | # See docstrings for more information on CCC estimation 171 | # ?CCC.compute 172 | 173 | # Simulate a pair of coupled first-order AR processes 174 | ar = CCC.coupled_AR(length=10000, a=0.8, b=0.9, c=0.8, e=0.01, burn=1000, seed=1) 175 | # ar is a dictionary of two key-value pairs with the following keys: 176 | # "dependent" and "independent", each with their respective values in float arrays 177 | # ?CCC.coupled_AR for more information on sampling from AR processes 178 | 179 | # Estimate CCC for the direction independent -> dependent with binning 180 | ccc_ar = CCC.compute( 181 | seq_x=ar["dependent"], 182 | seq_y=ar["independent"], 183 | LEN_past=150, 184 | ADD_meas=15, 185 | STEP_size=20, 186 | n_partitions=2, 187 | ) 188 | # [Out]: CCC for seq_y -> seq_x = 0.005755172746030292 189 | 190 | # And for the opposite direction 191 | ccc_ar = CCC.compute( 192 | seq_x=ar["independent"], 193 | seq_y=ar["dependent"], 194 | LEN_past=150, 195 | ADD_meas=15, 196 | STEP_size=20, 197 | n_partitions=2, 198 | ) 199 | # [Out]: CCC for seq_y -> seq_x = 0.0002971309733327245 200 | -------------------------------------------------------------------------------- /ETC/CCC/calibrate_CCC.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | 9 | from ETC import compute_1D, compute_2D, generate 10 | from ETC.seq.recode import partition 11 | from itertools import product, chain 12 | from functools import partial 13 | from multiprocessing import Pool 14 | from time import perf_counter 15 | from matplotlib import pyplot as plt 16 | import pandas as pd 17 | import seaborn as sns 18 | 19 | sns.set() 20 | 21 | get1D = partial(compute_1D, order=2, verbose=False, truncate=True) 22 | get2D = partial(compute_2D, order=2, verbose=False, truncate=True) 23 | 24 | 25 | def test(seq_x, seq_y, past_win_size, delta, step_size, partitions=False): 26 | if partitions: 27 | seq_x = partition(seq_x, partitions) 28 | seq_y = partition(seq_y, partitions) 29 | 30 | aggregator = [] 31 | 32 | length = len(seq_x) 33 | total_win_size = past_win_size + delta 34 | 35 | for n, k in enumerate(range(0, length - total_win_size, step_size)): 36 | out = {} 37 | out["partitions"] = partitions 38 | out["window"] = n + 1 39 | out["begin"] = k 40 | out["past_win_size"] = past_win_size 41 | out["end_past"] = k + past_win_size 42 | out["delta"] = delta 43 | out["total_win_size"] = total_win_size 44 | out["end_total"] = k + total_win_size 45 | out["step_size"] = step_size 46 | 47 | ## CC 1D for X -------------------------------------------------------- 48 | # ETC 1D for past and current+past=total values of X 49 | ETC1D_X_total = get1D(seq_x[k : k + total_win_size])["ETC1D"] 50 | out["ETC_1D_X_total_raw"] = ETC1D_X_total 51 | 52 | ETC1D_X_total /= total_win_size - 1 53 | out["ETC_1D_X_total_norm"] = ETC1D_X_total 54 | 55 | # ETC 1D for past of Y and current of X 56 | segment = tuple( 57 | chain( 58 | seq_y[k : k + past_win_size], 59 | seq_x[k + past_win_size : k + total_win_size], 60 | ) 61 | ) 62 | ETC1D_X_Ypast = get1D(segment)["ETC1D"] 63 | out["ETC_1D_X_YpastXcurr_raw"] = ETC1D_X_Ypast 64 | 65 | ETC1D_X_Ypast /= total_win_size - 1 66 | out["ETC_1D_X_YpastXcurr_norm"] = ETC1D_X_Ypast 67 | 68 | ## -------------------------------------------------------------------- 69 | 70 | ## CC 1D for Y -------------------------------------------------------- 71 | # ETC 1D for past values of Y 72 | # ETC1D_Y_ini = get1D(seq_y[k : k + past_win_size])["ETC1D"] 73 | # out["ETC_1D_Y_past_raw"] = ETC1D_Y_ini 74 | 75 | # ETC1D_Y_ini /= past_win_size - 1 76 | # out["ETC_1D_Y_past_norm"] = ETC1D_Y_ini 77 | 78 | # # ETC 1D for past and current+past=total values of Y 79 | # ETC1D_Y_fin = get1D(seq_y[k : k + total_win_size])["ETC1D"] 80 | # out["ETC_1D_Y_total_raw"] = ETC1D_Y_fin 81 | 82 | # ETC1D_Y_fin /= total_win_size - 1 83 | # out["ETC_1D_Y_total_norm"] = ETC1D_Y_fin 84 | 85 | # # CC 1D for past and total values of Y 86 | # CC1D_Y_past = ETC1D_Y_fin - ETC1D_Y_ini 87 | # out["CC_1D_Y"] = CC1D_Y_past 88 | ## -------------------------------------------------------------------- 89 | 90 | # ETC 2D for past values of X and Y ----------------------------------- 91 | ETC2D_ini = get2D(seq_x[k : k + past_win_size], seq_y[k : k + past_win_size])[ 92 | "ETC2D" 93 | ] 94 | out["ETC_2D_X_past_Y_past_raw"] = ETC2D_ini 95 | out["ETC_2D_Y_past_X_past_raw"] = ETC2D_ini 96 | 97 | ETC2D_ini /= past_win_size - 1 98 | out["ETC_2D_X_past_Y_past_norm"] = ETC2D_ini 99 | out["ETC_2D_Y_past_X_past_norm"] = ETC2D_ini 100 | ## -------------------------------------------------------------------- 101 | 102 | # ETC 2D for current+past=total values of X and past values of Y plus current values of X 103 | ETC2D_X_fin = get2D(seq_x[k : k + total_win_size], segment,)["ETC2D"] 104 | out["ETC_2D_X_total_Y_pastX_curr_raw"] = ETC2D_X_fin 105 | 106 | ETC2D_X_fin /= total_win_size - 1 107 | out["ETC_2D_X_total_Y_pastX_curr_norm"] = ETC2D_X_fin 108 | 109 | ## -------------------------------------------------------------------- 110 | 111 | # ETC 2D for current+past=total values of Y and past values of X plus current values of Y 112 | # ETC2D_Y_fin = get2D( 113 | # seq_y[k : k + total_win_size], 114 | # seq_x[k : k + past_win_size] 115 | # + seq_y[k + past_win_size : k + total_win_size], 116 | # )["ETC2D"] 117 | # out["ETC_2D_Y_total_X_past_raw"] = ETC2D_Y_fin 118 | 119 | # ETC2D_Y_fin /= total_win_size - 1 120 | # out["ETC_2D_Y_total_X_past_norm"] = ETC2D_Y_fin 121 | 122 | # # CC 2D for past and total values of X 123 | # CC2D_Y_total_X_past = ETC2D_Y_fin - ETC2D_ini 124 | # out["CC_2D_Y_by_X_past"] = CC2D_Y_total_X_past 125 | ## -------------------------------------------------------------------- 126 | aggregator.append(out) 127 | 128 | return pd.DataFrame(aggregator) 129 | 130 | 131 | def test_multiple(seq_x, seq_y): 132 | 133 | # Past window size 134 | PWS = [100, 150, 175, 200] 135 | 136 | # Current window size 137 | CWS = [10, 15, 20, 25] 138 | 139 | # Jump step size 140 | SS = [10, 15, 20, 25, 30] 141 | 142 | before = perf_counter() 143 | 144 | results = [] 145 | 146 | for past_win_size, delta, step_size in product(PWS, CWS, SS): 147 | results.append(test(seq_x, seq_y, past_win_size, delta, step_size)) 148 | 149 | results = pd.concat(results) 150 | 151 | after = perf_counter() 152 | 153 | return results, after - before 154 | 155 | 156 | def unpack(function, params): 157 | past_win_size, delta, step_size = params 158 | return function(past_win_size, delta, step_size) 159 | 160 | 161 | def test_multiple_parallel(seq_x, seq_y): 162 | 163 | # Past window size 164 | PWS = [50, 75] 165 | 166 | # Current window size 167 | CWS = range(10, 51, 5) 168 | 169 | # Jump step size 170 | SS = [25, 50] 171 | 172 | func = partial(test, seq_x, seq_y) 173 | func = partial(unpack, func) 174 | 175 | before = perf_counter() 176 | # Initialize pool of parallel workers 177 | pool = Pool() 178 | 179 | # Map-execute function across files 180 | results = pool.map(func, product(PWS, CWS, SS)) 181 | 182 | # Graceful exit 183 | pool.close() 184 | pool.join() 185 | 186 | results = pd.concat(results) 187 | 188 | after = perf_counter() 189 | 190 | return results, after - before 191 | 192 | 193 | # x = generate(1000) 194 | # y = generate(1000) 195 | # a2, timings = test_multiple_parallel(x, y) 196 | 197 | 198 | # %% 199 | # fig, ax = plt.subplots(1,1) 200 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_1D_X_past_norm', ax=ax) 201 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_1D_X_total_norm', ax=ax) 202 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_2D_X_past_Y_past_norm', ax=ax) 203 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_2D_X_total_Y_past_norm', ax=ax) 204 | 205 | # # %% 206 | # fig, ax = plt.subplots(1, 1) 207 | # sns.lineplot( 208 | # data=a2, 209 | # hue="past_win_size", 210 | # y="ETC_1D_X_past_norm", 211 | # x="delta", 212 | # ci=None, 213 | # ax=ax, 214 | # palette="viridis", 215 | # ) 216 | # sns.lineplot( 217 | # data=a2, 218 | # hue="past_win_size", 219 | # y="ETC_1D_X_total_norm", 220 | # x="delta", 221 | # ci=None, 222 | # ax=ax, 223 | # palette="viridis", 224 | # ) 225 | -------------------------------------------------------------------------------- /ETC/CCC/_calibrate_CCC.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | 9 | from ETC import compute_1D, compute_2D, generate 10 | from ETC.seq.recode import partition 11 | from itertools import product 12 | from functools import partial 13 | from multiprocessing import Pool 14 | from time import perf_counter 15 | from matplotlib import pyplot as plt 16 | import pandas as pd 17 | import seaborn as sns 18 | 19 | sns.set() 20 | 21 | get1D = partial(compute_1D, order=2, verbose=False, truncate=True) 22 | get2D = partial(compute_2D, order=2, verbose=False, truncate=True) 23 | 24 | 25 | def test(seq_x, seq_y, past_win_size, delta, step_size, partitions=False): 26 | if partitions: 27 | seq_x = partition(seq_x, partitions) 28 | seq_y = partition(seq_y, partitions) 29 | 30 | aggregator = [] 31 | 32 | length = len(seq_x) 33 | total_win_size = past_win_size + delta 34 | 35 | for n, k in enumerate(range(0, length - total_win_size, step_size)): 36 | out = {} 37 | out["partitions"] = partitions 38 | out["window"] = n + 1 39 | out["begin"] = k 40 | out["past_win_size"] = past_win_size 41 | out["end_past"] = k + past_win_size 42 | out["delta"] = delta 43 | out["total_win_size"] = total_win_size 44 | out["end_total"] = k + total_win_size 45 | out["step_size"] = step_size 46 | 47 | ## CC 1D for X -------------------------------------------------------- 48 | # ETC 1D for past values of X 49 | ETC1D_X_ini = get1D(seq_x[k : k + past_win_size])["ETC1D"] 50 | out["ETC_1D_X_past_raw"] = ETC1D_X_ini 51 | 52 | ETC1D_X_ini /= past_win_size - 1 53 | out["ETC_1D_X_past_norm"] = ETC1D_X_ini 54 | 55 | # ETC 1D for past and current+past=total values of X 56 | ETC1D_X_fin = get1D(seq_x[k : k + total_win_size])["ETC1D"] 57 | out["ETC_1D_X_total_raw"] = ETC1D_X_fin 58 | 59 | ETC1D_X_fin /= total_win_size - 1 60 | out["ETC_1D_X_total_norm"] = ETC1D_X_fin 61 | 62 | # CC 1D for past and total values of X 63 | CC1D_X_past = ETC1D_X_fin - ETC1D_X_ini 64 | out["CC_1D_X"] = CC1D_X_past 65 | ## -------------------------------------------------------------------- 66 | 67 | ## CC 1D for Y -------------------------------------------------------- 68 | # ETC 1D for past values of Y 69 | ETC1D_Y_ini = get1D(seq_y[k : k + past_win_size])["ETC1D"] 70 | out["ETC_1D_Y_past_raw"] = ETC1D_Y_ini 71 | 72 | ETC1D_Y_ini /= past_win_size - 1 73 | out["ETC_1D_Y_past_norm"] = ETC1D_Y_ini 74 | 75 | # ETC 1D for past and current+past=total values of Y 76 | ETC1D_Y_fin = get1D(seq_y[k : k + total_win_size])["ETC1D"] 77 | out["ETC_1D_Y_total_raw"] = ETC1D_Y_fin 78 | 79 | ETC1D_Y_fin /= total_win_size - 1 80 | out["ETC_1D_Y_total_norm"] = ETC1D_Y_fin 81 | 82 | # CC 1D for past and total values of Y 83 | CC1D_Y_past = ETC1D_Y_fin - ETC1D_Y_ini 84 | out["CC_1D_Y"] = CC1D_Y_past 85 | ## -------------------------------------------------------------------- 86 | 87 | # ETC 2D for past values of X and Y ----------------------------------- 88 | ETC2D_ini = get2D(seq_x[k : k + past_win_size], seq_y[k : k + past_win_size])[ 89 | "ETC2D" 90 | ] 91 | out["ETC_2D_X_past_Y_past_raw"] = ETC2D_ini 92 | out["ETC_2D_Y_past_X_past_raw"] = ETC2D_ini 93 | 94 | ETC2D_ini /= past_win_size - 1 95 | out["ETC_2D_X_past_Y_past_norm"] = ETC2D_ini 96 | out["ETC_2D_Y_past_X_past_norm"] = ETC2D_ini 97 | ## -------------------------------------------------------------------- 98 | 99 | # ETC 2D for current+past=total values of X and past values of Y plus current values of X 100 | ETC2D_X_fin = get2D( 101 | seq_x[k : k + total_win_size], 102 | seq_y[k : k + past_win_size] 103 | + seq_x[k + past_win_size : k + total_win_size], 104 | )["ETC2D"] 105 | out["ETC_2D_X_total_Y_past_raw"] = ETC2D_X_fin 106 | 107 | ETC2D_X_fin /= total_win_size - 1 108 | out["ETC_2D_X_total_Y_past_norm"] = ETC2D_X_fin 109 | 110 | # CC 2D for past and total values of X 111 | CC2D_X_total_Y_past = ETC2D_X_fin - ETC2D_ini 112 | out["CC_2D_X_by_Y_past"] = CC2D_X_total_Y_past 113 | ## -------------------------------------------------------------------- 114 | 115 | # ETC 2D for current+past=total values of Y and past values of X plus current values of Y 116 | ETC2D_Y_fin = get2D( 117 | seq_y[k : k + total_win_size], 118 | seq_x[k : k + past_win_size] 119 | + seq_y[k + past_win_size : k + total_win_size], 120 | )["ETC2D"] 121 | out["ETC_2D_Y_total_X_past_raw"] = ETC2D_Y_fin 122 | 123 | ETC2D_Y_fin /= total_win_size - 1 124 | out["ETC_2D_Y_total_X_past_norm"] = ETC2D_Y_fin 125 | 126 | # CC 2D for past and total values of X 127 | CC2D_Y_total_X_past = ETC2D_Y_fin - ETC2D_ini 128 | out["CC_2D_Y_by_X_past"] = CC2D_Y_total_X_past 129 | ## -------------------------------------------------------------------- 130 | aggregator.append(out) 131 | 132 | return pd.DataFrame(aggregator) 133 | 134 | 135 | def test_multiple(seq_x, seq_y): 136 | 137 | # Past window size 138 | PWS = [100, 150, 175, 200] 139 | 140 | # Current window size 141 | CWS = [10, 15, 20, 25] 142 | 143 | # Jump step size 144 | SS = [10, 15, 20, 25, 30] 145 | 146 | before = perf_counter() 147 | 148 | results = [] 149 | 150 | for past_win_size, delta, step_size in product(PWS, CWS, SS): 151 | results.append(test(seq_x, seq_y, past_win_size, delta, step_size)) 152 | 153 | results = pd.concat(results) 154 | 155 | after = perf_counter() 156 | 157 | return results, after - before 158 | 159 | 160 | def unpack(function, params): 161 | past_win_size, delta, step_size = params 162 | return function(past_win_size, delta, step_size) 163 | 164 | 165 | def test_multiple_parallel(seq_x, seq_y): 166 | 167 | # Past window size 168 | PWS = [50, 75] 169 | 170 | # Current window size 171 | CWS = range(10, 51, 5) 172 | 173 | # Jump step size 174 | SS = [25, 50] 175 | 176 | func = partial(test, seq_x, seq_y) 177 | func = partial(unpack, func) 178 | 179 | before = perf_counter() 180 | # Initialize pool of parallel workers 181 | pool = Pool() 182 | 183 | # Map-execute function across files 184 | results = pool.map(func, product(PWS, CWS, SS)) 185 | 186 | # Graceful exit 187 | pool.close() 188 | pool.join() 189 | 190 | results = pd.concat(results) 191 | 192 | after = perf_counter() 193 | 194 | return results, after - before 195 | 196 | 197 | # x = generate(1000) 198 | # y = generate(1000) 199 | # a2, timings = test_multiple_parallel(x, y) 200 | 201 | 202 | # %% 203 | # fig, ax = plt.subplots(1,1) 204 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_1D_X_past_norm', ax=ax) 205 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_1D_X_total_norm', ax=ax) 206 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_2D_X_past_Y_past_norm', ax=ax) 207 | # sns.lineplot(data=a2, x='past_win_size', y='ETC_2D_X_total_Y_past_norm', ax=ax) 208 | 209 | # # %% 210 | # fig, ax = plt.subplots(1, 1) 211 | # sns.lineplot( 212 | # data=a2, 213 | # hue="past_win_size", 214 | # y="ETC_1D_X_past_norm", 215 | # x="delta", 216 | # ci=None, 217 | # ax=ax, 218 | # palette="viridis", 219 | # ) 220 | # sns.lineplot( 221 | # data=a2, 222 | # hue="past_win_size", 223 | # y="ETC_1D_X_total_norm", 224 | # x="delta", 225 | # ci=None, 226 | # ax=ax, 227 | # palette="viridis", 228 | # ) 229 | -------------------------------------------------------------------------------- /ETC/NSRWS/x1D/core.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, emit_code_comments=True, cdivision=True, embedsignature=True 2 | #!/usr/bin/env python3 3 | # -*- coding: utf-8 -*- 4 | """ 5 | 6 | 7 | @author: Pranay S. Yadav 8 | """ 9 | # Import stuff 10 | from cpython cimport array, bool 11 | cimport cython 12 | import array 13 | 14 | # Function for getting mask for pairs 15 | cpdef array.array get_mask_pairs(const unsigned int[::1] x): 16 | """ 17 | INPUT 18 | ----- 19 | x : array.array 20 | Array object containing 32-bit integers. 21 | 22 | OUTPUT 23 | ------ 24 | mask : array.array 25 | Array object containing 32-bit integers - 0s or 1s corresponding to values in 26 | x for which successive overlapping pairs occur. 27 | """ 28 | # Get size of input 29 | cdef Py_ssize_t x_size = len(x) 30 | 31 | # Initialize a mask of Falses 32 | cdef array.array int_template = array.array('I', []) 33 | cdef array.array mask = array.clone(int_template, x_size-1, zero=True) 34 | cdef unsigned int[:] mask_view = mask 35 | 36 | # Initialize bounds for iteration 37 | cdef Py_ssize_t n = 0 38 | 39 | # Turn all values in mask to Trues 40 | for n in range(x_size-1): 41 | mask_view[n] += 1 42 | 43 | # Iterate over all values of input 44 | n = 0 45 | while n < x_size - 2: 46 | 47 | # If successive pairs match 48 | if x[n] == x[n+1] and x[n+1] == x[n+2]: 49 | 50 | # Mask out the second one 51 | mask_view[n+1] = 0 52 | 53 | # And slide over it 54 | n += 1 55 | 56 | # Increment while loop index 57 | n += 1 58 | 59 | return mask 60 | 61 | # Function for substituting pairs 62 | cpdef list substitute_pairs(unsigned int[::1] x, unsigned int[::1] pair, unsigned int value): 63 | """ 64 | INPUT 65 | ----- 66 | x : array.array 67 | Array object containing 32-bit unsigned integers. 68 | 69 | pair : array.array, length = 2 70 | Array object containing 2 32-bit unsigned integers. 71 | 72 | value : unsigned 32-bit int 73 | Value to substitute the first element of pair with 74 | 75 | OUTPUT 76 | ------ 77 | out : list 78 | Array object containing 32-bit integers, with supplied pair replaced everywhere 79 | by the supplied value. 80 | """ 81 | # Initialize looping variables and output list 82 | cdef Py_ssize_t n = 0 83 | cdef Py_ssize_t x_size = len(x) 84 | cdef list out = [] 85 | 86 | # Loop over input and replace pair 87 | while n < x_size-1: 88 | 89 | # Check for match with supplied pair 90 | if x[n] == pair[0] and x[n+1] == pair[1]: 91 | 92 | # Replace first value with supplied value 93 | x[n] = value 94 | 95 | # Replace second value with 0 96 | x[n+1] = 0 97 | 98 | n += 1 99 | 100 | n += 1 101 | 102 | # Reset indexing variable 103 | n = 0 104 | 105 | # Loop over mutated input and append non-zero values to list 106 | for n in range(x_size): 107 | 108 | if x[n]: 109 | 110 | out.append(x[n]) 111 | 112 | return out 113 | 114 | # Function for checking whether all elements in input are identical 115 | cpdef bint check_equality(const unsigned int[::1] x): 116 | """ 117 | INPUT 118 | ----- 119 | x : array.array 120 | Array object containing 32-bit unsigned integers. 121 | 122 | 123 | OUTPUT 124 | ------ 125 | bool 126 | True if all elements are identical 127 | """ 128 | # Intialize loop bounds 129 | cdef Py_ssize_t n 130 | cdef Py_ssize_t x_size = len(x) 131 | 132 | # Iterate over values from input 133 | for n in range(x_size): 134 | 135 | # Short-circuit the loop: check for any element that doesn't equal the first 136 | if x[0] != x[n]: 137 | return False 138 | 139 | return True 140 | 141 | # Function for getting mask for windows of any length 142 | cpdef array.array get_mask_windows(const unsigned int[::1] x, unsigned int order): 143 | """ 144 | INPUT 145 | ----- 146 | x : array.array 147 | Array object containing 32-bit integers. 148 | 149 | order: unsigned 32-bit int 150 | Length of the window to slide across input 151 | 152 | OUTPUT 153 | ------ 154 | mask : array.array 155 | Array object containing 32-bit integers - 0s or 1s corresponding to values in 156 | x for which successive overlapping pairs occur. 157 | """ 158 | # Get size of input 159 | cdef Py_ssize_t x_size = len(x) 160 | 161 | # Initialize a mask of Falses 162 | cdef array.array int_template = array.array('I', []) 163 | cdef array.array mask = array.clone(int_template, x_size - (order-1), zero=True) 164 | cdef unsigned int[:] mask_view = mask 165 | 166 | # Initialize variable for iteration 167 | cdef Py_ssize_t n = 0 168 | 169 | # Turn all values in mask to Trues 170 | for n in range(x_size - order + 1): 171 | mask_view[n] += 1 172 | 173 | # Initialize variables for iteration across input 174 | cdef Py_ssize_t k = 0 # Outer loop 175 | cdef Py_ssize_t m = 0 # Inner loop 176 | 177 | # Tracking variable for counting matching elements in pairwise window comparison 178 | cdef unsigned int track = 0 179 | 180 | # Iterate over input values except the last 'order' values [Outermost master loop] 181 | n = 0 182 | for n in range(x_size - (order-1)): 183 | 184 | # proceed only if mask is True for current element 185 | if mask_view[n]: 186 | 187 | # Outer loop for sliding the 'next' window by unit step (current vs next) 188 | for k in range(1,order): # Start from 1 - begin comparing from next window 189 | 190 | # Inner loop for comparing elements in current and next windows 191 | for m in range(order): 192 | 193 | if n+m+k >= x_size: 194 | return mask 195 | 196 | # If elements match, increment tracker 197 | if x[n+m] == x[n+m+k]: 198 | track += 1 199 | 200 | # Else stop iteration over this comparison of windows 201 | # else: 202 | # break 203 | 204 | # Trick: preserve mask only if track doesn't equal order 205 | # If track == order, short-circuit eval takes precedence, returning 0 206 | mask_view[n+k] = track!=order and mask_view[n+k] 207 | 208 | # Reset tracker 209 | track = 0 210 | 211 | return mask 212 | 213 | # Function for substituting windows of any length 214 | cpdef list substitute_windows(unsigned int[::1] x, unsigned int order, unsigned int[::1] window, unsigned int value): 215 | """ 216 | INPUT 217 | ----- 218 | x : array.array 219 | Array object containing 32-bit unsigned integers. 220 | 221 | order: unsigned 32-bit int 222 | Length of the window to slide across input 223 | 224 | window : array.array, length = 2 225 | Array object containing 2 32-bit unsigned integers. 226 | 227 | value : unsigned 32-bit int 228 | Value to substitute the first element of pair with 229 | 230 | OUTPUT 231 | ------ 232 | out : list 233 | Array object containing 32-bit integers, with supplied pair replaced everywhere 234 | by the supplied value. 235 | """ 236 | # Initialize looping variables and output list 237 | cdef Py_ssize_t n = 0 # Outer loop 238 | cdef Py_ssize_t m = 0 # Inner loop 239 | cdef Py_ssize_t x_size = len(x) 240 | cdef list out = [] 241 | 242 | # Tracking variable for counting matching elements in pairwise window comparison 243 | cdef unsigned int track = 0 244 | 245 | # Iterate over input values except one less than the last 'order' values 246 | # Logic: last window, say triplet must begin from 3rd-last index, leaving 2 values 247 | for n in range(x_size-order+1): 248 | 249 | # Slide window of given order and do element-wise comparison 250 | for m in range(order): 251 | 252 | # Reset tracker 253 | 254 | 255 | # Track comparison of input elements with window elements 256 | if x[n+m] == window[m]: 257 | track += 1 258 | # # If mismatch, break 259 | else: 260 | break 261 | 262 | # If all compared elements match for current window 263 | if track == order: 264 | 265 | # Replace the first element with provided value 266 | x[n] = value 267 | 268 | # Replace the remaining subsequent values with zeros 269 | for m in range(1, order): 270 | x[n+m] = 0 271 | 272 | # Reset tracker 273 | track = 0 274 | # Reset indexing variable 275 | n = 0 276 | 277 | # Loop over mutated input and append non-zero values to list 278 | for n in range(x_size): 279 | 280 | if x[n]: 281 | 282 | out.append(x[n]) 283 | 284 | return out -------------------------------------------------------------------------------- /ETC/NSRWS/x1D/parallel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | from functools import partial 9 | from itertools import islice 10 | 11 | # Import functions from standard library modules 12 | from multiprocessing import Pool 13 | 14 | # Import local modules 15 | import ETC 16 | from ETC.seq.process import entropy 17 | 18 | # from ETC.seq.process import entropy 19 | import numpy as np 20 | 21 | # Function definitions 22 | def _compute_single_file(filepath, order=2): 23 | """ 24 | This function operates on a single file - reads sequence, computes ETC 25 | and writes to disk. 26 | 27 | Parameters 28 | ---------- 29 | filepath : str or Path object 30 | Valid path to a file containing sequence. 31 | 32 | Returns 33 | ------- 34 | out : dict 35 | filename, length of sequence and ETC estimate. 36 | 37 | """ 38 | # Read file as a sequence 39 | seq = ETC.seq.IO.read(filepath) 40 | seq = ETC.seq.recode.recode_lexical(seq) 41 | 42 | # Filename for writing output of ETC computation 43 | fname = filepath.with_name(filepath.stem + f"_ETC_order{order}.csv") 44 | 45 | # Prepare output dictionary 46 | out = {"file": filepath.name, "length": len(seq), "entropy": entropy(seq)} 47 | 48 | # Compute ETC, write to file and update output dictionary 49 | out.update(ETC.NSRWS.x1D.etc.compute_save(seq, fname, order=order, truncate=True)) 50 | 51 | return out 52 | 53 | 54 | def pcompute_files(filelist, order=2): 55 | """ 56 | This function operates concurrently on a list of files. Reads each as a 57 | sequence, computes ETC and writes output to disk. 58 | 59 | CAUTION: main module is unguarded, do not run these functions as is, 60 | particularly on Windows. 61 | 62 | Parameters 63 | ---------- 64 | filelist : list/tuple/generator 65 | Collection of filenames of files containing sequence data. 66 | 67 | Returns 68 | ------- 69 | list of dict elements 70 | Each dictionary element contains filename, length of sequence & ETC. 71 | 72 | """ 73 | # Initialize pool of parallel workers 74 | pool = Pool() 75 | func = partial(_compute_single_file, order=order) 76 | # Map-execute function across files 77 | out = pool.map_async(func, filelist) 78 | 79 | # Graceful exit 80 | pool.close() 81 | pool.join() 82 | 83 | # Return collected results 84 | return out.get() 85 | 86 | 87 | def _compute_single_seq(seq): 88 | """ 89 | This function operates on a single sequence and computes ETC. 90 | 91 | Parameters 92 | ---------- 93 | seq : tuple of 2 elements 94 | 1st element is index for tracking. 95 | 2nd element is a sequence of integers used for ETC computation. 96 | Output of enumerate. 97 | 98 | Returns 99 | ------- 100 | out : dict 101 | index of sequence, length of sequence and ETC estimate. 102 | 103 | """ 104 | # Prepare output dictionary 105 | out = {"index": seq[0], "length": len(seq[1]), "entropy": entropy(seq[1])} 106 | 107 | # Compute ETC and update output dictionary 108 | out.update(ETC.compute_1D(seq[1], order=2, verbose=False, truncate=True)) 109 | 110 | return out 111 | 112 | 113 | # TEMPORARY MOD FOR ASSEMBLY-FREE 114 | # def _compute_single_seq(seq): 115 | # """ 116 | # This function operates on a single sequence and computes ETC. 117 | 118 | # Parameters 119 | # ---------- 120 | # seq : tuple of 2 elements 121 | # 1st element is index for tracking. 122 | # 2nd element is a sequence of integers used for ETC computation. 123 | # Output of enumerate. 124 | 125 | # Returns 126 | # ------- 127 | # out : dict 128 | # index of sequence, length of sequence and ETC estimate. 129 | 130 | # """ 131 | # data = ETC.seq.recode.recode_lexical(seq[1]) 132 | 133 | # # Prepare output dictionary 134 | # out = {"item": seq[0], "length": len(data)} 135 | 136 | # # Compute ETC and update output dictionary 137 | # out.update(ETC.compute_1D(data, order=2, verbose=False, truncate=True)) 138 | # out.update({"Entropy": entropy(data, legacy=False)}) 139 | 140 | # return out 141 | 142 | 143 | def pcompute_multiple_seq(iterable): 144 | """ 145 | This function operates concurrently on a collection of sequences. Loads 146 | each sequence and computes ETC. 147 | 148 | CAUTION: main module is unguarded, do not run these functions as is, 149 | particularly on Windows. 150 | 151 | Parameters 152 | ---------- 153 | iterable : list/tuple/generator 154 | Collection of integer sequences. 155 | 156 | Returns 157 | ------- 158 | list of dict elements 159 | Each dictionary element contains index, length of sequence & ETC. 160 | 161 | """ 162 | # Initialize pool of parallel workers 163 | pool = Pool() 164 | 165 | # Map-execute function across sequences 166 | out = pool.map_async(_compute_single_seq, enumerate(iterable)) 167 | 168 | # Graceful exit 169 | pool.close() 170 | pool.join() 171 | 172 | # Return collected results 173 | return out.get() 174 | 175 | 176 | def _overlapping_chunks(seq, size, offset=1): 177 | """ 178 | This function takes an input sequence and produces chunks of chosen size. 179 | Offset can be used to control degree of overlap (or distance between chunks 180 | that don't overlap) 181 | 182 | Parameters 183 | ---------- 184 | seq : tuple or list 185 | Sequence of integers. 186 | size : int 187 | Length of each produced chunk. 188 | offset : int, optional 189 | Number of elements to shift each chunk by. The default is 1. 190 | Setting this to any value less than size allows control of overlap. 191 | Setting this >= size produces non-overlapping chunks. 192 | 193 | Returns 194 | ------- 195 | zip 196 | zip object that produces chunks of specified size, one at a time. 197 | 198 | """ 199 | 200 | return zip(*(islice(seq, i, None, offset) for i in range(size))) 201 | 202 | 203 | def _non_overlapping_chunks(seq, size): 204 | """ 205 | This function takes an input sequence and produces chunks of chosen size 206 | that strictly do not overlap. This is a much faster implemetnation than 207 | _overlapping_chunks and should be preferred if running on very large seq. 208 | 209 | Parameters 210 | ---------- 211 | seq : tuple or list 212 | Sequence of integers. 213 | size : int 214 | Length of each produced chunk. 215 | 216 | Returns 217 | ------- 218 | zip 219 | zip object that produces chunks of specified size, one at a time. 220 | 221 | """ 222 | 223 | return zip(*[iter(seq)] * size) 224 | 225 | 226 | def pcompute_single(seq, size, offset=1): 227 | """ 228 | This function operates concurrently on chunks of a given sequence. Gets 229 | each chunk and computes ETC one-by-one. Offset parameter controls degree of 230 | overlap (or non-overlap) 231 | 232 | CAUTION: main module is unguarded, do not run these functions as is, 233 | particularly on Windows. 234 | 235 | Parameters 236 | ---------- 237 | seq : tuple or list 238 | Sequence of integers. 239 | size : int 240 | Length of each produced chunk. 241 | offset : int, optional 242 | Number of elements to shift each chunk by. The default is 1. 243 | Setting this to any value less than size allows control of overlap. 244 | Setting this >= size produces non-overlapping chunks. 245 | 246 | Returns 247 | ------- 248 | list of dict elements 249 | Each dictionary element contains index, length of sequence & ETC. 250 | 251 | """ 252 | # If offset equals size, get non-overlapping chunks of given size 253 | if offset == size: 254 | iterable = _non_overlapping_chunks(seq, size) 255 | 256 | # Else get overlapping chunks of given size and offset 257 | else: 258 | iterable = _overlapping_chunks(seq, size, offset) 259 | 260 | # Execute parallel computation over chunks 261 | return pcompute_multiple_seq(iterable) 262 | 263 | 264 | def pcompute_numpy(nparr): 265 | """ 266 | This function operates concurrently row-wise on a 2D NumPy array. Loads 267 | each sequence and computes ETC. 268 | 269 | CAUTION: main module is unguarded, do not run these functions as is, 270 | particularly on Windows. 271 | 272 | Parameters 273 | ---------- 274 | nparr : numpy array, int, 2D 275 | Sequence present as column, each row representing a different sequence 276 | 277 | Returns 278 | ------- 279 | list of dict elements 280 | Each dictionary element contains index, length of sequence & ETC. 281 | 282 | """ 283 | assert ( 284 | isinstance(nparr, np.ndarray) and nparr.ndim == 2 and nparr.dtype == np.uint32 285 | ), ">ERROR: Input must be 2D NumPy array of 32-bit unsigned integers (np.uint32)" 286 | # Initialize pool of parallel workers 287 | pool = Pool() 288 | 289 | # Map-execute function across sequences 290 | out = pool.map_async( 291 | _compute_single_seq, enumerate([nparr[idx] for idx in range(nparr.shape[0])]) 292 | ) 293 | 294 | # Graceful exit 295 | pool.close() 296 | pool.join() 297 | 298 | # Return collected results 299 | return out.get() 300 | -------------------------------------------------------------------------------- /ETC/tests/test_NSRWS1D.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | 9 | from array import array 10 | from random import choice 11 | 12 | from hypothesis import given 13 | from hypothesis.strategies import composite, integers, lists 14 | 15 | from ETC.NSRWS.x1D import onestep 16 | from ETC.NSRWS.x1D import etc as cetc 17 | from ETC.NSRWS.x1D import core as cc 18 | 19 | 20 | @composite 21 | def generate_sequence(draw, elements=[lists, integers]): 22 | """ 23 | Generate a list of integers as sequence input, and an integer >= 2 for order param 24 | """ 25 | seq = draw(lists(integers(min_value=1, max_value=100), min_size=3, max_size=10_000)) 26 | 27 | order = draw(integers(min_value=2, max_value=len(seq) - 1)) 28 | 29 | return seq, order 30 | 31 | 32 | @composite 33 | def generate_sequence_identical(draw, elements=[lists, integers]): 34 | """ 35 | Generate a list with all identical integers, and an integer >= 2 for order param 36 | """ 37 | seq = draw(lists(integers(min_value=1, max_value=1), min_size=3, max_size=10_000)) 38 | 39 | order = draw(integers(min_value=2, max_value=len(seq) - 1)) 40 | 41 | return seq, order 42 | 43 | 44 | @given(generate_sequence()) 45 | def test_onestep(inputs): 46 | """ 47 | Test the outermost onestep function exposed for direct estimation 48 | """ 49 | seq, order = inputs 50 | output1, signal = onestep.onestep(seq, order, verbose=False, check=False) 51 | output2verbose = onestep.onestep(seq, order, verbose=True, check=False) 52 | 53 | # Substituted sequence should be shorter than input 54 | assert len(output1) < len(seq) 55 | 56 | # Highest value in substituted sequence should be greater than that in input 57 | assert max(output1) > max(seq) 58 | 59 | # Smallest value in substituted sequence should be at least as large as that in input 60 | assert min(output1) >= min(seq) 61 | 62 | # Number of unique symbols in output should be one less than that in input 63 | assert len(set(output1) - set(seq)) == 1 64 | 65 | # Changing verbosity parameter should not alter the substituted sequence 66 | assert output1 == output2verbose[0] 67 | 68 | 69 | @given(generate_sequence()) 70 | def test_onestep_invalid(inputs): 71 | """ 72 | Test the outermost onestep function for invalid input: sequence shorter than order 73 | """ 74 | seq, order = inputs 75 | 76 | output = onestep.onestep(seq[: order - 1], order, verbose=False, check=False) 77 | 78 | assert output is None 79 | 80 | 81 | def test_onestep_invalid_str(): 82 | """ 83 | Test the outermost onestep function for invalid input: string input 84 | """ 85 | output = onestep.onestep("abcdef", 6, verbose=False, check=False) 86 | 87 | assert output is None 88 | 89 | 90 | @given(generate_sequence_identical()) 91 | def test_onestep_identical(inputs): 92 | """ 93 | Test the outermost onestep function for sequence with identical symbols 94 | """ 95 | seq, order = inputs 96 | 97 | output = onestep.onestep(seq, order, verbose=False, check=True) 98 | 99 | assert output is None 100 | 101 | 102 | @given(generate_sequence()) 103 | def test_onestep_pairs_vs_windows(inputs): 104 | """ 105 | Test the parity of output for NSRWS with order=2 & NSRPS for random sequences 106 | """ 107 | seq, _ = inputs 108 | seq = array("I", seq) 109 | 110 | # Pairs (NSRPS) 111 | out_pairs = onestep._onestep_pairs(seq[:], verbose=True) 112 | 113 | # Order = 2 (NSRWS) 114 | out_windows = onestep._onestep_windows(seq[:], 2, verbose=True) 115 | 116 | # Check equality of all but last (timings) part of output 117 | for n in range(4): 118 | assert out_pairs[n] == out_windows[n] 119 | 120 | 121 | @given(generate_sequence_identical()) 122 | def test_onestep_pairs_vs_windows_identical(inputs): 123 | """ 124 | Test the parity of output for NSRWS with order=2 & NSRPS for all identical sequence 125 | """ 126 | seq, _ = inputs 127 | seq = array("I", seq) 128 | 129 | # Pairs (NSRPS) 130 | out_pairs = onestep._onestep_pairs(seq[:], verbose=True) 131 | 132 | # Order = 2 (NSRWS) 133 | out_windows = onestep._onestep_windows(seq[:], 2, verbose=True) 134 | 135 | # Check equality of all but last (timings) part of output 136 | for n in range(4): 137 | assert out_pairs[n] == out_windows[n] 138 | 139 | 140 | @given(generate_sequence()) 141 | def test_get_mask_general(inputs): 142 | """ 143 | Test the get_mask function for random sequences and orders 144 | """ 145 | seq, order = inputs 146 | seq = array("I", seq) 147 | 148 | # Get mask depending on order 149 | if order == 2: 150 | mask = cc.get_mask_pairs(seq) 151 | else: 152 | mask = cc.get_mask_windows(seq, order) 153 | 154 | # Mask should be precisely shorter than input sequence 155 | assert len(mask) == len(seq) - order + 1 156 | 157 | # Mask should only contain 0s and 1s 158 | assert set(mask).issubset({0, 1}) 159 | 160 | # First element must be 1 161 | assert mask[0] == 1 162 | 163 | # If mask contains a 0, then that position in the sequence indicates an overlap 164 | try: 165 | idx0 = mask.index(0) 166 | # Check if consecutive elements equal where 0 found in mask 167 | assert seq[idx0] == seq[idx0 + order - 1] 168 | except ValueError: 169 | pass 170 | 171 | 172 | @given(generate_sequence_identical()) 173 | def test_get_mask_identical(inputs): 174 | """ 175 | Test the get_mask function for sequences with identical symbols 176 | """ 177 | seq, order = inputs 178 | seq = array("I", seq) 179 | 180 | # Get mask depending on order from left-to-right and reversed sequence 181 | if order == 2: 182 | mask = cc.get_mask_pairs(seq) 183 | mask_rev = cc.get_mask_pairs(seq[::-1]) 184 | else: 185 | mask = cc.get_mask_windows(seq, order) 186 | mask_rev = cc.get_mask_windows(seq[::-1], order) 187 | 188 | # Find zeroes for they must be present 189 | idx0 = mask.index(0) 190 | 191 | # Both masks should be precisely shorter than input sequence 192 | assert len(mask) == len(mask_rev) == len(seq) - order + 1 193 | 194 | # Both masks should only contain 0s and 1s 195 | assert set(mask).issubset({0, 1}) 196 | assert set(mask_rev).issubset({0, 1}) 197 | 198 | # First element has to be a 1 199 | assert mask[0] == 1 200 | 201 | # Check if consecutive elements equal where 0 found in mask 202 | assert seq[idx0] == seq[idx0 + order - 1] 203 | 204 | 205 | def test_mask_and_count(): 206 | """ 207 | Test the function for applying mask and counting frequent windows 208 | """ 209 | seq = (1, 2, 3, 4, 5, 6, 7) 210 | mask = (1, 0, 0, 1, 1) 211 | assert onestep._mask_and_count(seq, mask, 3) == (array("I", (1, 2, 3)), 1) 212 | 213 | seq = (1, 2, 3, 4, 5, 6, 7) 214 | mask = (1, 1, 1, 1, 1) 215 | assert onestep._mask_and_count(seq, mask, 3) == (array("I", (1, 2, 3)), 1) 216 | 217 | seq = (1, 2, 3, 4, 5, 6, 7) 218 | mask = (0, 1, 1, 1, 1) 219 | assert onestep._mask_and_count(seq, mask, 3) == (array("I", (2, 3, 4)), 1) 220 | 221 | seq = (1, 1, 1, 1, 1, 2, 1) 222 | mask = (1, 0, 0, 1, 1) 223 | assert onestep._mask_and_count(seq, mask, 3) == (array("I", (1, 1, 1)), 1) 224 | 225 | 226 | @given(generate_sequence()) 227 | def test_substitution(inputs): 228 | """ 229 | Test the substitution step for random sequences 230 | """ 231 | seq, order = inputs 232 | seq = array("I", seq) 233 | 234 | # Get value to substitute 235 | sub_value = 1 + max(seq) 236 | 237 | # Pick a random pair for substitution 238 | idx = seq.index(choice(seq[:-1])) 239 | pair = array("I", [seq[idx], seq[idx + 1]]) 240 | 241 | # Substitute the pair using both functions 242 | out1 = cc.substitute_pairs(seq[:], pair, sub_value) 243 | out2 = cc.substitute_windows(seq[:], 2, pair, sub_value) 244 | 245 | # The 2 outputs should be equal 246 | assert out1 == out2 247 | 248 | # The length of the substituted sequence should be less than the input sequence 249 | assert len(out1) < len(seq) 250 | 251 | # The highest value in the substituted sequence should be more than that in the input sequence 252 | assert max(out1) > max(seq) 253 | 254 | # The highest value in the substitute sequence should match the provided value 255 | assert max(out1) == sub_value 256 | 257 | 258 | @given(generate_sequence()) 259 | def test_truncation(inputs): 260 | """ 261 | Test ETC estimation from all 4 methods based on verbosity and truncation 262 | """ 263 | seq, order = inputs 264 | 265 | etc_vf = cetc.compute(seq, order, verbose=True, truncate=False)["ETC1D"] 266 | etc_vt = cetc.compute(seq, order, verbose=True, truncate=True)["ETC1D"] 267 | etc_cf = cetc.compute(seq, order, verbose=False, truncate=False)["ETC1D"] 268 | etc_ct = cetc.compute(seq, order, verbose=False, truncate=True)["ETC1D"] 269 | 270 | # All 4 estimates should be identical 271 | assert etc_vf == etc_vt == etc_cf == etc_ct 272 | 273 | 274 | def test_compute_save(tmp_path): 275 | """ 276 | Test ETC estimation with write-to-disk functionality 277 | """ 278 | seq = array("I", [2, 4] * 100) 279 | 280 | # Temporary file (Path object) for use 281 | file = tmp_path / "test.csv" 282 | 283 | # Test without truncation 284 | etc_vf = cetc.compute_save(seq, file, order=2, truncate=False) 285 | assert isinstance(etc_vf, dict) 286 | 287 | # Test with truncation 288 | etc_vt = cetc.compute_save(seq, file, order=2, truncate=True) 289 | assert isinstance(etc_vt, dict) 290 | 291 | # Values should be same of course 292 | assert etc_vf["ETC1D"] == etc_vt["ETC1D"] 293 | -------------------------------------------------------------------------------- /ETC/NCA/parallelize_jl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Parallelized NCA estimation using joblib 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | from ETC.CCMC.pairs import ETC_causality as ETC_compute 9 | from ETC.CCMC.pairs import LZ_causality as LZ_compute 10 | from ETC.CCC.compute_CCC import compute as CCC_compute 11 | from joblib import Parallel, delayed 12 | from functools import partial 13 | from itertools import combinations 14 | 15 | 16 | def _kernel_CCC(inputs, CCC_params): 17 | """ 18 | Wrapper for computing causality estimates on a sequence pair 19 | 20 | Used for causal discovery and estimation from CCM based methods as well as CCC. 21 | 22 | The function unpacks inputs into an index element and a sequence pair and runs the 23 | estimator function on the sequence pair, returning various estimates in a dict 24 | 25 | Parameters 26 | ---------- 27 | inputs : tuple 28 | Tuple of two elements - (a, b) where a is an index, b is a tuple of two. a can 29 | be produced manually or more typically using enumerate; b holds the two sequences 30 | usually passed in by zip-ping larger iterables or itertools' product/combinations. 31 | a, the index, is passed to keep track of order in case of asynchronous execution 32 | Should look like this: (index, (sequence_x, sequence_y) 33 | CCC_params : dict 34 | The following 3 parameters for CCC as key-value pairs: 35 | "LEN_past" : int 36 | Parameter "L": Window length of immediate past values of seq_x and seq_y. 37 | "ADD_meas" : int 38 | Parameter "w": Window length of present values of seq_x. Minimal data length 39 | over which CC rate can be reliably estimated, application/domain-specific 40 | "STEP_size" : int 41 | Parameter "delta": Step-size for sliding chunks across both sequences. An overlap 42 | of 20-50% between successive chunks or windows suggested. 43 | The dictionary can be generated interactively using CCC.get_params() 44 | 45 | Returns 46 | ------- 47 | out : dict 48 | Estimates obtained by running estimator on inputs. 49 | 50 | """ 51 | 52 | # Unpack inputs 53 | idx, seqs = inputs 54 | 55 | # Unpack sequences 56 | idx_x, idx_y, seq_x, seq_y = seqs 57 | 58 | # Initialize dictionary of output estimates with index 59 | out = {"index_pair": idx, "index_x": idx_x, "index_y": idx_y} 60 | 61 | # Execute CCC_compute on the sequence pair in one direction 62 | out.update({"CCC_y_to_x": CCC_compute(seq_x, seq_y, **CCC_params)}) 63 | 64 | # Execute CCC_compute on the sequence pair in the other direction 65 | out.update({"CCC_x_to_y": CCC_compute(seq_y, seq_x, **CCC_params)}) 66 | 67 | return out 68 | 69 | 70 | def _kernel_ETC(inputs): 71 | """ 72 | Wrapper for computing causality estimates on a sequence pair 73 | 74 | Used for causal discovery and estimation from CCM based methods as well as CCC. 75 | 76 | The function unpacks inputs into an index element and a sequence pair and runs the 77 | estimator function on the sequence pair, returning various estimates in a dict 78 | 79 | Parameters 80 | ---------- 81 | inputs : tuple 82 | Tuple of two elements - (a, b) where a is an index, b is a tuple of two. a can 83 | be produced manually or more typically using enumerate; b holds the two sequences 84 | usually passed in by zip-ping larger iterables or itertools' product/combinations. 85 | a, the index, is passed to keep track of order in case of asynchronous execution 86 | Should look like this: (index, (sequence_x, sequence_y) 87 | 88 | Returns 89 | ------- 90 | out : dict 91 | Estimates obtained by running estimator on inputs. 92 | 93 | """ 94 | 95 | # Unpack inputs 96 | idx, seqs = inputs 97 | 98 | # Unpack sequences 99 | idx_x, idx_y, seq_x, seq_y = seqs 100 | 101 | # Initialize dictionary of output estimates with index 102 | out = {"index_pair": idx, "index_x": idx_x, "index_y": idx_y} 103 | 104 | # Execute ETC_compute on the sequence pair 105 | out.update(ETC_compute(seq_x, seq_y)) 106 | 107 | return out 108 | 109 | 110 | def _kernel_LZ(inputs): 111 | """ 112 | Wrapper for computing causality estimates on a sequence pair 113 | 114 | Used for causal discovery and estimation from CCM based methods as well as CCC. 115 | 116 | The function unpacks inputs into an index element and a sequence pair and runs the 117 | estimator function on the sequence pair, returning various estimates in a dict 118 | 119 | Parameters 120 | ---------- 121 | inputs : tuple 122 | Tuple of two elements - (a, b) where a is an index, b is a tuple of two. a can 123 | be produced manually or more typically using enumerate; b holds the two sequences 124 | usually passed in by zip-ping larger iterables or itertools' product/combinations. 125 | a, the index, is passed to keep track of order in case of asynchronous execution 126 | Should look like this: (index, (sequence_x, sequence_y) 127 | 128 | Returns 129 | ------- 130 | out : dict 131 | Estimates obtained by running estimator on inputs. 132 | 133 | """ 134 | 135 | # Unpack inputs 136 | idx, seqs = inputs 137 | 138 | # Unpack sequences 139 | idx_x, idx_y, seq_x, seq_y = seqs 140 | 141 | # Initialize dictionary of output estimates with index 142 | out = {"index_pair": idx, "index_x": idx_x, "index_y": idx_y} 143 | 144 | # Execute LZ_compute on the sequence pair 145 | out.update(LZ_compute(seq_x, seq_y)) 146 | 147 | return out 148 | 149 | 150 | def get_rowpairs(matrix): 151 | """ 152 | Create a generator for iterating over pairs of rows of an input matrix 153 | 154 | Parameters 155 | ---------- 156 | matrix : numpy array, int or float, 2D 157 | Each row representing a different sequence. (Columns as time) 158 | 159 | Yields 160 | ------ 161 | row1 : int 162 | Index of first row in the pair. 163 | row2 : int 164 | Index of second row in the pair. 165 | np.array, 1D, int 166 | Data of first row in the pair. 167 | np.array, 1D, int 168 | Data of first row in the pair. 169 | 170 | """ 171 | for row1, row2 in combinations(range(0, matrix.shape[0]), 2): 172 | yield (row1, row2, matrix[row1, :], matrix[row2, :]) 173 | 174 | 175 | def parallelized_CCC(pairs, CCC_params): 176 | """ 177 | This function operates concurrently on a collection of sequence pairs and computes 178 | estimates using the chosen kernel function. 179 | 180 | Here used for computing causal estimates from sequences pairs in batch, each pair 181 | runs on a separate CPU core as a process. 182 | 183 | CAUTION: main module is unguarded, do not run these functions as is, 184 | particularly on Windows! 185 | 186 | Parameters 187 | ---------- 188 | pairs : list/tuple/generator 189 | Collection of pairs of integer sequences. 190 | CCC_params : dict 191 | The following 3 parameters for CCC as key-value pairs: 192 | "LEN_past" : int 193 | Parameter "L": Window length of immediate past values of seq_x and seq_y. 194 | "ADD_meas" : int 195 | Parameter "w": Window length of present values of seq_x. Minimal data length 196 | over which CC rate can be reliably estimated, application/domain-specific 197 | "STEP_size" : int 198 | Parameter "delta": Step-size for sliding chunks across both sequences. An overlap 199 | of 20-50% between successive chunks or windows suggested. 200 | The dictionary can be generated interactively using CCC.get_params() 201 | 202 | Returns 203 | ------- 204 | list of dict elements 205 | Each dictionary element contains index, length of sequence & ETC. 206 | 207 | """ 208 | 209 | exec_kernel = partial(_kernel_CCC, CCC_params=CCC_params) 210 | 211 | # Confirm to stdout 212 | print("Computing CCC estimates in parallel ... ") 213 | 214 | # joblib's paralellization 215 | out = Parallel(n_jobs=-1, verbose=50)( 216 | delayed(exec_kernel)(rowElem) for rowElem in enumerate(pairs) 217 | ) 218 | 219 | # Return collected results 220 | return out 221 | 222 | 223 | def parallelized_CCM(pairs, kernel="LZ"): 224 | """ 225 | This function operates concurrently on a collection of sequence pairs and computes 226 | estimates using the chosen kernel function. 227 | 228 | Here used for computing causal estimates from sequences pairs in batch, each pair 229 | runs on a separate CPU core as a process. 230 | 231 | CAUTION: main module is unguarded, do not run these functions as is, 232 | particularly on Windows! 233 | 234 | Parameters 235 | ---------- 236 | pairs : list/tuple/generator 237 | Collection of pairs of integer sequences. 238 | kernel : str, optional 239 | Name of an estimator function. Currently available: "ETC" and "LZ". The 240 | default is "LZ". 241 | 242 | Returns 243 | ------- 244 | list of dict elements 245 | Each dictionary element contains various estimates and identifiers. 246 | 247 | """ 248 | 249 | if kernel == "LZ": 250 | exec_kernel = _kernel_LZ 251 | elif kernel == "ETC": 252 | exec_kernel = _kernel_ETC 253 | else: 254 | print("Invalid kernel selected") 255 | return None 256 | 257 | # Confirm to stdout 258 | print(f"Computing CCM estimates in parallel using {kernel} ... ") 259 | 260 | # joblib's paralellization 261 | out = Parallel(n_jobs=-1, verbose=50)( 262 | delayed(exec_kernel)(rowElem) for rowElem in enumerate(pairs) 263 | ) 264 | 265 | # Return collected results 266 | return out 267 | -------------------------------------------------------------------------------- /ETC/seq/markov.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Module for computing markov transition probability matrices from 5 | nucleotide sequence stored as text files. 6 | 7 | compute() is the main function that wraps around smaller modular functions. 8 | 9 | @author: pranay 10 | """ 11 | 12 | # Import calls 13 | from pathlib import Path 14 | from random import choices, seed 15 | import numpy as np 16 | import pandas as pd 17 | 18 | # Function Definitions 19 | def _read_sequence(filepath): 20 | """ 21 | This function reads a file & returns it as a string. 22 | Uses pathlib's functionality & is called by the wrapper compute() 23 | 24 | Parameters 25 | ---------- 26 | filepath : Path object 27 | Valid path to file containing nucleotide sequence. 28 | 29 | Returns 30 | ------- 31 | string 32 | String containing nucleotide sequence. 33 | 34 | """ 35 | # Use Path object's hook to read file as text and return 36 | return filepath.read_text() 37 | 38 | 39 | def _generate_overlaps(sequence, order): 40 | """ 41 | This function takes an input sequence & generates overlapping subsequences 42 | of length order + 1, returned as a tuple. Has no dependencies & is called 43 | by the wrapper compute() 44 | 45 | Parameters 46 | ---------- 47 | sequence : string 48 | String containing nucleotide sequence. 49 | order : int 50 | Order of Markov Transition Probability Matrix for computing overlaps 51 | 52 | Returns 53 | ------- 54 | tuple 55 | Contains all overlapping sub-sequences (of length order + 1) for given 56 | order. Total number of sub-sequences is len(sequence) - (order + 1) 57 | 58 | """ 59 | # Initialize aggregator 60 | aggregator = [] 61 | 62 | # Increment order by 1 as the Markov model includes the current state, such 63 | # that the length of sequence corresponding to a state is order + 1 64 | order += 1 65 | 66 | # Iteratively store sequences shifted to the left by 1 step 67 | for idx in range(order): 68 | aggregator.append(sequence[idx : idx - order]) 69 | 70 | # Join the shifted sequences through element-wise concatenation & return 71 | return tuple(map("".join, zip(*aggregator))) 72 | 73 | 74 | def _compute_transition_probs(sequence, compact=True, flatten=False): 75 | """ 76 | This function takes a tuple of strings (overlapping subsequences) as input 77 | and computes transition probabilities, returned as a dataframe. Switches 78 | control the form (compactness & wide-form vs long-form) of output dataframe. 79 | 80 | Parameters 81 | ---------- 82 | sequence : string 83 | String containing nucleotide sequence. 84 | compact : bool, optional 85 | Whether to return the full sparse matrix or to return a more compact 86 | representation of it. If False, returns the a square matrix with most 87 | elements zero. If True, returns non-zero columns only for all rows. 88 | The default is True. 89 | flatten : bool, optional 90 | Whether to flatten (or tidy / long-form) the matrix or not (wide-form). 91 | If True, returns only one column containing probabilities through a 92 | row-wise representation. If False, returns multiple columns with 93 | probabilities. 94 | The default is False. 95 | 96 | Returns 97 | ------- 98 | pandas DataFrame 99 | Tabulated transition probabilities with row & column labels describing 100 | the (n-1)th and nth state respectively. If flatten is True, column 101 | labels for nth state are transposed into a column such that there are 102 | 2 columns, 1 each for the (n-1)th and nth state. 103 | 104 | """ 105 | # If compact requested, use only the last alphabet of the next subsequence. 106 | # The Nth element will only differ from the (N-1)th in shift by 1 107 | if compact: 108 | 109 | # Convert tuple to numpy array, excluding last element 110 | temp = np.array(sequence[:-1]) 111 | 112 | # Extract the last alphabet from each word except the first 113 | next_alphabet = np.array([x[-1] for x in sequence[1:]]) 114 | 115 | # Compute normalized frequencies via cross-tabulation 116 | df = pd.crosstab(temp, next_alphabet, normalize="index") 117 | 118 | # If full matrix to be returned, cross-tabulate shifted sequences 119 | else: 120 | 121 | # Convert tuple containing overlapping subsequences to numpy array 122 | temp = np.array(sequence) 123 | 124 | # Compute normalized frequencies via cross-tabulation 125 | df = pd.crosstab(temp[:-1], temp[1:], normalize="index") 126 | 127 | # Set proper identifier labels 128 | df.index.name = "previous" 129 | df.columns.name = "next" 130 | 131 | # If flatten requested, pivot all columns into a single column 132 | if flatten: 133 | 134 | # Stack all columns 135 | df = df.stack() 136 | 137 | # Set name for the Series object 138 | df.name = "probability" 139 | 140 | # Return a DataFrame by resetting the Series index 141 | return df.reset_index() 142 | 143 | # If flatten not requested, return DataFrame 144 | return df 145 | 146 | 147 | def _check_inputs(filepath, order, compact, flatten): 148 | """ 149 | This function checks the input arguments to compute() for validity based 150 | on descriptions below. 151 | 152 | Parameters 153 | ---------- 154 | filepath : Path object 155 | Valid path to file containing nucleotide sequence. 156 | order : int 157 | Order of Markov Transition Probability Matrix for computing overlaps 158 | compact : bool, optional 159 | Whether to return the full sparse matrix or to return a more compact 160 | representation of it 161 | flatten : bool, optional 162 | Whether to flatten (or tidy / long-form) the matrix or not. 163 | 164 | Returns 165 | ------- 166 | bool 167 | True if all inputs are valid. 168 | 169 | """ 170 | # Check type of input path 171 | if not isinstance(filepath, Path): 172 | print("> ERROR: Input should be a Path object ...") 173 | return False 174 | 175 | # Check if path exists and points to a file 176 | if not (filepath.exists() and filepath.is_file()): 177 | print("> ERROR: Path does not exist ...") 178 | return False 179 | 180 | # Check if order is a non-negative integer 181 | if not (isinstance(order, int) and order >= 0): 182 | print("> ERROR: order should be a non-negative integer ...") 183 | return False 184 | 185 | # Check if other args are boolean types 186 | if not (isinstance(compact, bool) and isinstance(flatten, bool)): 187 | print("> ERROR: compact and flatten args should be a boolean ...") 188 | return False 189 | 190 | # If all inputs are valid, yay 191 | return True 192 | 193 | 194 | def compute(filepath, order, compact=True, flatten=False): 195 | """ 196 | This function takes a text file containing a nucleotide sequence, computes 197 | computes the transition probability matrix of given order and returns it as 198 | a labelled dataframe. Output can be tuned through optional switches. This 199 | function is modular and wraps around the following 4 functions: 200 | _check_inputs - for validating input arguments 201 | _read_sequence - for reading text file containing nucleotide sequence 202 | _generate_overlaps - for creating overlapped subsequences 203 | _compute_transition_probs - for creating transition probability matrix 204 | 205 | Parameters 206 | ---------- 207 | filepath : Path object 208 | Valid path to file containing nucleotide sequence. 209 | order : int 210 | Order of Markov Transition Probability Matrix for computing overlaps 211 | compact : bool, optional 212 | Whether to return the full sparse matrix or to return a more compact 213 | representation of it. If False, returns the a square matrix with most 214 | elements zero. If True, returns non-zero columns only for all rows. 215 | The default is True. 216 | flatten : bool, optional 217 | Whether to flatten (or tidy / long-form) the matrix or not. If True, 218 | returns only one column containing probabilities through a row-wise 219 | representation. If False, returns multiple columns with probabilities. 220 | The default is False. 221 | 222 | Returns 223 | ------- 224 | pandas DataFrame 225 | Tabulated transition probabilities with row & column labels describing 226 | the (n-1)th and nth state respectively. If flatten is True, column 227 | labels for nth state are transposed into a column such that there are 228 | 2 columns, 1 each for the (n-1)th and nth state. 229 | 230 | """ 231 | # If any input is not valid, break 232 | if not _check_inputs(filepath, order, compact, flatten): 233 | return None 234 | 235 | # Read sequence file and get tuple of overlapping subsequences 236 | sequence = _generate_overlaps(_read_sequence(filepath), order) 237 | 238 | # Compute and return transition probability matrix 239 | return _compute_transition_probs(sequence, compact=compact, flatten=flatten) 240 | 241 | 242 | def sample_sequence(sequence, order, size, sampler_seed=0): 243 | 244 | # Read sequence file and get tuple of overlapping subsequences 245 | overlapped = _generate_overlaps(sequence, order) 246 | 247 | # Compute and return transition probability matrix 248 | transition_probs = _compute_transition_probs( 249 | overlapped, compact=True, flatten=False 250 | ) 251 | 252 | order += 1 253 | chain = sequence[-order:] 254 | 255 | seed(sampler_seed) 256 | 257 | for n in range(size): 258 | last = chain[-order:] 259 | probs = transition_probs.loc[last, :] 260 | new = "".join(choices(population=probs.index, weights=probs.values, k=1)) 261 | chain += new 262 | 263 | return chain[order:] 264 | -------------------------------------------------------------------------------- /ETC/NCA/parallelize_mp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Parallelized NCA estimation using multiprocessing 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | from ETC.CCMC.pairs import ETC_causality as ETC_compute 9 | from ETC.CCMC.pairs import LZ_causality as LZ_compute 10 | from ETC.CCC.compute_CCC import compute as CCC_compute 11 | from multiprocessing import Pool 12 | from functools import partial 13 | from itertools import combinations 14 | 15 | 16 | def _kernel_CCC(inputs, CCC_params): 17 | """ 18 | Wrapper for computing causality estimates on a sequence pair 19 | 20 | Used for causal discovery and estimation from CCM based methods as well as CCC. 21 | 22 | The function unpacks inputs into an index element and a sequence pair and runs the 23 | estimator function on the sequence pair, returning various estimates in a dict 24 | 25 | Parameters 26 | ---------- 27 | inputs : tuple 28 | Tuple of two elements - (a, b) where a is an index, b is a tuple of two. a can 29 | be produced manually or more typically using enumerate; b holds the two sequences 30 | usually passed in by zip-ping larger iterables or itertools' product/combinations. 31 | a, the index, is passed to keep track of order in case of asynchronous execution 32 | Should look like this: (index, (sequence_x, sequence_y) 33 | CCC_params : dict 34 | The following 3 parameters for CCC as key-value pairs: 35 | "LEN_past" : int 36 | Parameter "L": Window length of immediate past values of seq_x and seq_y. 37 | "ADD_meas" : int 38 | Parameter "w": Window length of present values of seq_x. Minimal data length 39 | over which CC rate can be reliably estimated, application/domain-specific 40 | "STEP_size" : int 41 | Parameter "delta": Step-size for sliding chunks across both sequences. An overlap 42 | of 20-50% between successive chunks or windows suggested. 43 | The dictionary can be generated interactively using CCC.get_params() 44 | 45 | Returns 46 | ------- 47 | out : dict 48 | Estimates obtained by running estimator on inputs. 49 | 50 | """ 51 | 52 | # Unpack inputs 53 | idx, seqs = inputs 54 | 55 | # Unpack sequences 56 | idx_x, idx_y, seq_x, seq_y = seqs 57 | 58 | # Initialize dictionary of output estimates with index 59 | out = {"index_pair": idx, "index_x": idx_x, "index_y": idx_y} 60 | 61 | # Execute CCC_compute on the sequence pair in one direction 62 | out.update({"CCC_y_to_x": CCC_compute(seq_x, seq_y, **CCC_params)}) 63 | 64 | # Execute CCC_compute on the sequence pair in the other direction 65 | out.update({"CCC_x_to_y": CCC_compute(seq_y, seq_x, **CCC_params)}) 66 | 67 | return out 68 | 69 | 70 | def _kernel_ETC(inputs): 71 | """ 72 | Wrapper for computing causality estimates on a sequence pair 73 | 74 | Used for causal discovery and estimation from CCM based methods as well as CCC. 75 | 76 | The function unpacks inputs into an index element and a sequence pair and runs the 77 | estimator function on the sequence pair, returning various estimates in a dict 78 | 79 | Parameters 80 | ---------- 81 | inputs : tuple 82 | Tuple of two elements - (a, b) where a is an index, b is a tuple of two. a can 83 | be produced manually or more typically using enumerate; b holds the two sequences 84 | usually passed in by zip-ping larger iterables or itertools' product/combinations. 85 | a, the index, is passed to keep track of order in case of asynchronous execution 86 | Should look like this: (index, (sequence_x, sequence_y) 87 | 88 | Returns 89 | ------- 90 | out : dict 91 | Estimates obtained by running estimator on inputs. 92 | 93 | """ 94 | 95 | # Unpack inputs 96 | idx, seqs = inputs 97 | 98 | # Unpack sequences 99 | idx_x, idx_y, seq_x, seq_y = seqs 100 | 101 | # Initialize dictionary of output estimates with index 102 | out = {"index_pair": idx, "index_x": idx_x, "index_y": idx_y} 103 | 104 | # Execute ETC_compute on the sequence pair 105 | out.update(ETC_compute(seq_x, seq_y)) 106 | 107 | return out 108 | 109 | 110 | def _kernel_LZ(inputs): 111 | """ 112 | Wrapper for computing causality estimates on a sequence pair 113 | 114 | Used for causal discovery and estimation from CCM based methods as well as CCC. 115 | 116 | The function unpacks inputs into an index element and a sequence pair and runs the 117 | estimator function on the sequence pair, returning various estimates in a dict 118 | 119 | Parameters 120 | ---------- 121 | inputs : tuple 122 | Tuple of two elements - (a, b) where a is an index, b is a tuple of two. a can 123 | be produced manually or more typically using enumerate; b holds the two sequences 124 | usually passed in by zip-ping larger iterables or itertools' product/combinations. 125 | a, the index, is passed to keep track of order in case of asynchronous execution 126 | Should look like this: (index, (sequence_x, sequence_y) 127 | 128 | Returns 129 | ------- 130 | out : dict 131 | Estimates obtained by running estimator on inputs. 132 | 133 | """ 134 | 135 | # Unpack inputs 136 | idx, seqs = inputs 137 | 138 | # Unpack sequences 139 | idx_x, idx_y, seq_x, seq_y = seqs 140 | 141 | # Initialize dictionary of output estimates with index 142 | out = {"index_pair": idx, "index_x": idx_x, "index_y": idx_y} 143 | 144 | # Execute LZ_compute on the sequence pair 145 | out.update(LZ_compute(seq_x, seq_y)) 146 | 147 | return out 148 | 149 | 150 | def get_rowpairs(matrix): 151 | """ 152 | Create a generator for iterating over pairs of rows of an input matrix 153 | 154 | Parameters 155 | ---------- 156 | matrix : numpy array, int or float, 2D 157 | Each row representing a different sequence. (Columns as time) 158 | 159 | Yields 160 | ------ 161 | row1 : int 162 | Index of first row in the pair. 163 | row2 : int 164 | Index of second row in the pair. 165 | np.array, 1D, int 166 | Data of first row in the pair. 167 | np.array, 1D, int 168 | Data of first row in the pair. 169 | 170 | """ 171 | for row1, row2 in combinations(range(0, matrix.shape[0]), 2): 172 | yield (row1, row2, matrix[row1, :], matrix[row2, :]) 173 | 174 | 175 | def parallelized_CCC(pairs, CCC_params): 176 | """ 177 | This function operates concurrently on a collection of sequence pairs and computes 178 | estimates using the chosen kernel function. 179 | 180 | Here used for computing causal estimates from sequences pairs in batch, each pair 181 | runs on a separate CPU core as a process. 182 | 183 | CAUTION: main module is unguarded, do not run these functions as is, 184 | particularly on Windows! 185 | 186 | Parameters 187 | ---------- 188 | pairs : list/tuple/generator 189 | Collection of pairs of integer sequences. 190 | CCC_params : dict 191 | The following 3 parameters for CCC as key-value pairs: 192 | "LEN_past" : int 193 | Parameter "L": Window length of immediate past values of seq_x and seq_y. 194 | "ADD_meas" : int 195 | Parameter "w": Window length of present values of seq_x. Minimal data length 196 | over which CC rate can be reliably estimated, application/domain-specific 197 | "STEP_size" : int 198 | Parameter "delta": Step-size for sliding chunks across both sequences. An overlap 199 | of 20-50% between successive chunks or windows suggested. 200 | The dictionary can be generated interactively using CCC.get_params() 201 | 202 | Returns 203 | ------- 204 | list of dict elements 205 | Each dictionary element contains index, length of sequence & ETC. 206 | 207 | """ 208 | 209 | exec_kernel = partial(_kernel_CCC, CCC_params=CCC_params) 210 | 211 | # Initialize pool of parallel workers 212 | pool = Pool() 213 | 214 | # Confirm to stdout 215 | print("Computing CCC estimates in parallel on input ... ", end="") 216 | 217 | # Map-execute function across sequences 218 | out = pool.map_async(exec_kernel, enumerate(pairs)) 219 | 220 | # Graceful exit 221 | pool.close() 222 | pool.join() 223 | 224 | # Confirm completion 225 | print("Done!") 226 | 227 | # Return collected results 228 | return out.get() 229 | 230 | 231 | def parallelized_CCM(pairs, kernel="LZ"): 232 | """ 233 | This function operates concurrently on a collection of sequence pairs and computes 234 | estimates using the chosen kernel function. 235 | 236 | Here used for computing causal estimates from sequences pairs in batch, each pair 237 | runs on a separate CPU core as a process. 238 | 239 | CAUTION: main module is unguarded, do not run these functions as is, 240 | particularly on Windows! 241 | 242 | Parameters 243 | ---------- 244 | pairs : list/tuple/generator 245 | Collection of pairs of integer sequences. 246 | kernel : str, optional 247 | Name of an estimator function. Currently available: "ETC" and "LZ". The 248 | default is "LZ". 249 | 250 | Returns 251 | ------- 252 | list of dict elements 253 | Each dictionary element contains various estimates and identifiers. 254 | 255 | """ 256 | 257 | if kernel == "LZ": 258 | exec_kernel = _kernel_LZ 259 | elif kernel == "ETC": 260 | exec_kernel = _kernel_ETC 261 | else: 262 | print("Invalid kernel selected") 263 | return None 264 | 265 | # Initialize pool of parallel workers 266 | pool = Pool() 267 | 268 | # Confirm to stdout 269 | print("Computing CCM estimates in parallel on input ... ", end="") 270 | 271 | # Map-execute function across sequences 272 | out = pool.map_async(exec_kernel, enumerate(pairs)) 273 | 274 | # Graceful exit 275 | pool.close() 276 | pool.join() 277 | 278 | # Confirm completion 279 | print("Done!") 280 | 281 | # Return collected results 282 | return out.get() 283 | -------------------------------------------------------------------------------- /ETC/tests/test_NSRWS2D.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | 9 | from array import array 10 | from random import choice 11 | 12 | from hypothesis import given 13 | from hypothesis.strategies import composite, integers, lists 14 | 15 | from ETC.NSRWS.x2D import onestep 16 | from ETC.NSRWS.x2D import core as cc 17 | from ETC.NSRWS.x2D import etc as cetc 18 | 19 | 20 | @composite 21 | def generate_sequences(draw, elements=[lists, integers]): 22 | """ 23 | Generate 2 lists of integers as sequence input with equal lengths 24 | """ 25 | seq_x = draw( 26 | lists(integers(min_value=1, max_value=100), min_size=3, max_size=10_000) 27 | ) 28 | seq_y = draw( 29 | lists( 30 | integers(min_value=1, max_value=100), 31 | min_size=len(seq_x), 32 | max_size=len(seq_x), 33 | ) 34 | ) 35 | 36 | return seq_x, seq_y 37 | 38 | 39 | @composite 40 | def generate_sequences_identical(draw, elements=[lists, integers]): 41 | """ 42 | Generate 2 lists with all identical integers with equal lengths 43 | """ 44 | seq_x = draw(lists(integers(min_value=1, max_value=1), min_size=3, max_size=10_000)) 45 | seq_y = draw( 46 | lists( 47 | integers(min_value=2, max_value=2), min_size=len(seq_x), max_size=len(seq_x) 48 | ) 49 | ) 50 | 51 | return seq_x, seq_y 52 | 53 | 54 | @given(generate_sequences()) 55 | def test_onestep(inputs): 56 | """ 57 | Test the outermost onestep function exposed for direct estimation 58 | """ 59 | seq_x, seq_y = inputs 60 | 61 | out_x1, out_y1, _ = onestep.onestep( 62 | seq_x, seq_y, order=2, verbose=False, check=False 63 | ) 64 | outputs = onestep.onestep(seq_x, seq_y, order=2, verbose=True, check=False) 65 | out_x2, out_y2 = outputs[0], outputs[1] 66 | 67 | # Substituted sequence should be shorter than input 68 | assert len(out_x1) < len(seq_x) and len(out_y1) < len(seq_y) 69 | 70 | # Highest value in substituted sequence should be greater than that in input 71 | assert max(out_x1) > max(seq_x) and max(out_y1) > max(seq_y) 72 | 73 | # Smallest value in substituted sequence should be at least as large as that in input 74 | assert min(out_x1) >= min(seq_x) and min(out_y1) >= min(seq_y) 75 | 76 | # Number of unique symbols in output should be one less than that in input 77 | assert len(set(out_x1) - set(seq_x)) == 1 78 | assert len(set(out_y1) - set(seq_y)) == 1 79 | 80 | # Number of symbols in output that are not in input should be between 1 and 3 81 | assert 1 <= len(set(out_x1) ^ set(seq_x)) <= 3 82 | assert 1 <= len(set(out_y1) ^ set(seq_y)) <= 3 83 | 84 | # Changing verbosity parameter should not alter the substituted sequence 85 | assert out_x1 == out_x2 and out_y1 == out_y2 86 | 87 | 88 | @given(generate_sequences()) 89 | def test_onestep_unequal(inputs): 90 | """ 91 | Test the outermost onestep function for invalid input: sequences shorter than order 92 | """ 93 | seq_x, seq_y = inputs 94 | 95 | output = onestep.onestep(seq_x, seq_y[:-11], order=2, verbose=False, check=False) 96 | assert output is None 97 | 98 | 99 | @given(generate_sequences()) 100 | def test_onestep_invalid(inputs): 101 | """ 102 | Test the outermost onestep function for invalid input: sequences shorter than order 103 | """ 104 | seq_x, seq_y = inputs 105 | 106 | output = onestep.onestep(seq_x[:1], seq_y[:1], order=2, verbose=False, check=False) 107 | assert output is None 108 | 109 | 110 | def test_onestep_invalid_str(): 111 | """ 112 | Test the outermost onestep function for invalid input: string inputs 113 | """ 114 | output = onestep.onestep( 115 | [1, 2, 3, 4, 5, 6], "abcdef", 2, verbose=False, check=False 116 | ) 117 | assert output is None 118 | 119 | output = onestep.onestep( 120 | "abcdef", [1, 2, 3, 4, 5, 6], 2, verbose=False, check=False 121 | ) 122 | assert output is None 123 | 124 | output = onestep.onestep("abcdef", "abcdef", 2, verbose=False, check=False) 125 | assert output is None 126 | 127 | 128 | @given(generate_sequences_identical()) 129 | def test_onestep_identical(inputs): 130 | """ 131 | Test the outermost onestep function for sequence with identical symbols 132 | """ 133 | seq_x, seq_y = inputs 134 | 135 | output = onestep.onestep(seq_x, seq_y, order=2, verbose=False, check=True) 136 | 137 | assert output is None 138 | 139 | 140 | @given(generate_sequences()) 141 | def test_get_mask_general(inputs): 142 | """ 143 | Test the get_mask function for random sequences and orders 144 | """ 145 | seq_x, seq_y = inputs 146 | seq_x = array("I", seq_x) 147 | seq_y = array("I", seq_y) 148 | 149 | # Get mask 150 | mask = cc.get_mask_pairs(seq_x, seq_y) 151 | 152 | # Mask should be precisely shorter than input sequence 153 | assert len(mask) == len(seq_x) - 1 == len(seq_y) - 1 154 | 155 | # Mask should only contain 0s and 1s 156 | assert set(mask).issubset({0, 1}) 157 | 158 | # First element must be 1 159 | assert mask[0] == 1 160 | 161 | # If mask contains a 0, then that position in the sequence indicates an overlap 162 | try: 163 | idx0 = mask.index(0) 164 | # Check if consecutive elements equal where 0 found in mask 165 | assert seq_x[idx0] == seq_x[idx0 + 1] and seq_y[idx0] == seq_y[idx0 + 1] 166 | except ValueError: 167 | pass 168 | 169 | 170 | @given(generate_sequences_identical()) 171 | def test_get_mask_identical(inputs): 172 | """ 173 | Test the get_mask function for sequences with identical symbols 174 | """ 175 | seq_x, seq_y = inputs 176 | seq_x = array("I", seq_x) 177 | seq_y = array("I", seq_y) 178 | 179 | # Get mask from left-to-right and reversed sequence 180 | mask = cc.get_mask_pairs(seq_x, seq_y) 181 | mask_rev = cc.get_mask_pairs(seq_x[::-1], seq_y[::-1]) 182 | 183 | # Find zeroes for they must be present 184 | idx0 = mask.index(0) 185 | 186 | # Both masks should be precisely shorter than input sequence 187 | assert len(mask) == len(mask_rev) == len(seq_x) - 1 == len(seq_y) - 1 188 | 189 | # Both masks should only contain 0s and 1s 190 | assert set(mask).issubset({0, 1}) 191 | assert set(mask_rev).issubset({0, 1}) 192 | 193 | # Both masks should have an exact number of zeros corresponding to overlaps 194 | assert mask.count(0) == mask_rev.count(0) == (len(seq_x) - 1) // 2 195 | assert mask.count(1) == mask_rev.count(1) == len(seq_x) // 2 196 | 197 | # Check if consecutive elements equal where 0 found in mask 198 | assert seq_x[idx0] == seq_x[idx0 + 1] and seq_y[idx0] == seq_y[idx0 + 1] 199 | 200 | 201 | def test_mask_and_count(): 202 | """ 203 | Test the function for applying mask and counting frequent windows 204 | """ 205 | seq_x = (1, 2, 3, 4, 5, 6, 7) 206 | seq_y = (3, 4, 5, 6, 7, 8, 9) 207 | mask = (1, 0, 0, 1, 1) 208 | 209 | freq_pair_x, freq_pair_y, count = onestep._mask_and_count(seq_x, seq_y, mask, 2) 210 | assert ( 211 | freq_pair_x == array("I", (1, 2)) 212 | and freq_pair_y == array("I", (3, 4)) 213 | and count == 1 214 | ) 215 | 216 | mask = (1, 1, 1, 1, 1) 217 | 218 | freq_pair_x, freq_pair_y, count = onestep._mask_and_count(seq_x, seq_y, mask, 2) 219 | assert ( 220 | freq_pair_x == array("I", (1, 2)) 221 | and freq_pair_y == array("I", (3, 4)) 222 | and count == 1 223 | ) 224 | 225 | mask = (0, 1, 1, 1, 1) 226 | freq_pair_x, freq_pair_y, count = onestep._mask_and_count(seq_x, seq_y, mask, 2) 227 | assert ( 228 | freq_pair_x == array("I", (2, 3)) 229 | and freq_pair_y == array("I", (4, 5)) 230 | and count == 1 231 | ) 232 | 233 | 234 | @given(generate_sequences()) 235 | def test_substitution(inputs): 236 | """ 237 | Test the substitution step for random sequences 238 | """ 239 | seq_x, seq_y = inputs 240 | seq_x = array("I", seq_x) 241 | seq_y = array("I", seq_y) 242 | 243 | # Get values to substitute 244 | sub_value_x = 1 + max(seq_x) 245 | sub_value_y = 1 + max(seq_y) 246 | 247 | # Pick a random pair for substitution 248 | idx = seq_x.index(choice(seq_x[:-1])) 249 | pair_x = array("I", [seq_x[idx], seq_x[idx + 1]]) 250 | pair_y = array("I", [seq_y[idx], seq_y[idx + 1]]) 251 | 252 | # Substitute the pairs 253 | out_x, out_y = cc.substitute_pairs( 254 | seq_x[:], seq_y[:], pair_x, pair_y, sub_value_x, sub_value_y 255 | ) 256 | 257 | # The length of the substituted sequence should be less than the input sequence 258 | assert len(out_x) < len(seq_x) and len(out_y) < len(seq_y) 259 | 260 | # The lengths of the 2 substituted sequences should be identical 261 | assert len(out_x) == len(out_y) 262 | 263 | # The highest value in the substituted sequence should be more than that in the input sequence 264 | assert max(out_x) > max(seq_x) and max(out_y) > max(seq_y) 265 | 266 | # The highest value in the substitute sequence should match the provided value 267 | assert max(out_x) == sub_value_x and max(out_y) == sub_value_y 268 | 269 | 270 | @given(generate_sequences()) 271 | def test_truncation(inputs): 272 | """ 273 | Test ETC estimation from all 4 methods based on verbosity and truncation 274 | """ 275 | seq_x, seq_y = inputs 276 | 277 | etc_vf = cetc.compute(seq_x, seq_y, order=2, verbose=True, truncate=False)["ETC2D"] 278 | etc_vt = cetc.compute(seq_x, seq_y, order=2, verbose=True, truncate=True)["ETC2D"] 279 | etc_cf = cetc.compute(seq_x, seq_y, order=2, verbose=False, truncate=False)["ETC2D"] 280 | etc_ct = cetc.compute(seq_x, seq_y, order=2, verbose=False, truncate=True)["ETC2D"] 281 | 282 | # All 4 estimates should be identical 283 | assert etc_vf == etc_vt == etc_cf == etc_ct 284 | 285 | 286 | def test_compute_save(tmp_path): 287 | """ 288 | Test ETC estimation with write-to-disk functionality 289 | """ 290 | seq_x = array("I", [2, 4] * 100) 291 | seq_y = array("I", [1, 3, 5, 7] * 50) 292 | 293 | # Temporary file (Path object) for use 294 | file = tmp_path / "test.csv" 295 | 296 | # Test without truncation 297 | etc_vf = cetc.compute_save(seq_x, seq_y, file, order=2, truncate=False) 298 | assert isinstance(etc_vf, dict) 299 | 300 | # Test with truncation 301 | etc_vt = cetc.compute_save(seq_x, seq_y, file, order=2, truncate=True) 302 | assert isinstance(etc_vt, dict) 303 | 304 | # Values should be same of course 305 | assert etc_vf["ETC2D"] == etc_vt["ETC2D"] 306 | -------------------------------------------------------------------------------- /ETC/NSRWS/x2D/parallel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | from functools import partial 9 | from itertools import islice 10 | from collections import Counter 11 | from random import choices 12 | 13 | # Import functions from standard library modules 14 | from multiprocessing import Pool 15 | 16 | # Import local modules 17 | import ETC 18 | from ETC.seq.process import entropy 19 | from ETC.helper.compute_markov_transition_probs import sample_sequence 20 | 21 | # Function definitions 22 | def _compute_two_files_truncated(files, order=2): 23 | """ 24 | This function operates on a single file - reads sequence, computes ETC 25 | and writes to disk. 26 | 27 | Parameters 28 | ---------- 29 | filepath : str or Path object 30 | Valid path to a file containing sequence. 31 | 32 | Returns 33 | ------- 34 | out : dict 35 | filename, length of sequence and ETC estimate. 36 | 37 | """ 38 | # Read file as a sequence 39 | filepath1, filepath2 = files 40 | seq1 = ETC.helper.IO.read(filepath1) 41 | seq2 = ETC.helper.IO.read(filepath2) 42 | 43 | if len(seq1) > len(seq2): 44 | seq1 = seq1[: len(seq2)] 45 | else: 46 | seq2 = seq2[: len(seq1)] 47 | 48 | # Prepare output dictionary 49 | out = {"seq1": filepath1.stem, "seq2": filepath2.stem, "length": len(seq1)} 50 | 51 | # Compute ETC, write to file and update output dictionary 52 | out.update(ETC.compute_2D(seq1, seq2, order=order, truncate=True, verbose=False)) 53 | seq1etc = ETC.compute_1D(seq1, order=order, truncate=True, verbose=False)["ETC1D"] 54 | out.update({"ETC1D_seq1": seq1etc}) 55 | 56 | seq2etc = ETC.compute_1D(seq2, order=order, truncate=True, verbose=False)["ETC1D"] 57 | out.update({"ETC1D_seq2": seq2etc}) 58 | 59 | return out 60 | 61 | 62 | # Function definitions 63 | def _compute_two_files_markov(files, markov_order, order=2): 64 | """ 65 | This function operates on a single file - reads sequence, computes ETC 66 | and writes to disk. 67 | 68 | Parameters 69 | ---------- 70 | filepath : str or Path object 71 | Valid path to a file containing sequence. 72 | 73 | Returns 74 | ------- 75 | out : dict 76 | filename, length of sequence and ETC estimate. 77 | 78 | """ 79 | # Read file as a sequence 80 | filepath1, filepath2 = files 81 | seq1 = ETC.helper.IO.read(filepath1, recode=False) 82 | seq2 = ETC.helper.IO.read(filepath2, recode=False) 83 | 84 | lseq1 = len(seq1) 85 | lseq2 = len(seq2) 86 | 87 | if lseq1 > lseq2: 88 | diff = lseq1 - lseq2 89 | extra_tail = sample_sequence( 90 | seq2, order=markov_order, size=diff, sampler_seed=64 91 | ) 92 | seq2 += extra_tail 93 | 94 | elif lseq1 < lseq2: 95 | diff = lseq2 - lseq1 96 | extra_tail = sample_sequence( 97 | seq1, order=markov_order, size=diff, sampler_seed=64 98 | ) 99 | seq1 += extra_tail 100 | 101 | assert len(seq1) == len(seq2) 102 | 103 | seq1 = ETC.helper.IO.recode_to_int(seq1) 104 | seq2 = ETC.helper.IO.recode_to_int(seq2) 105 | 106 | # Filename for writing output of ETC computation 107 | # fname = filepath1.with_name(filepath1.stem + '_&_'+ filepath2.stem + f"_etc_order{order}_markov_order{markov_order}.csv") 108 | 109 | # Prepare output dictionary 110 | out = {"seq1": filepath1.stem, "seq2": filepath2.stem, "length": len(seq1)} 111 | 112 | # Compute ETC, write to file and update output dictionary 113 | out.update(ETC.compute_2D(seq1, seq2, order=order, truncate=True, verbose=False)) 114 | seq1etc = ETC.compute_1D(seq1, order=order, truncate=True, verbose=False)["ETC1D"] 115 | out.update({"ETC1D_seq1": seq1etc}) 116 | 117 | seq2etc = ETC.compute_1D(seq2, order=order, truncate=True, verbose=False)["ETC1D"] 118 | out.update({"ETC1D_seq2": seq2etc}) 119 | 120 | return out 121 | 122 | 123 | def pcompute_files_markov(filelist, markov_order, order=2): 124 | """ 125 | This function operates concurrently on a list of files. Reads each as a 126 | sequence, computes ETC and writes output to disk. 127 | 128 | CAUTION: main module is unguarded, do not run these functions as is, 129 | particularly on Windows. 130 | 131 | Parameters 132 | ---------- 133 | filelist : list/tuple/generator 134 | Collection of filenames of files containing sequence data. 135 | 136 | Returns 137 | ------- 138 | list of dict elements 139 | Each dictionary element contains filename, length of sequence & ETC. 140 | 141 | """ 142 | # Initialize pool of parallel workers 143 | pool = Pool() 144 | func = partial(_compute_two_files_markov, markov_order=markov_order, order=order) 145 | # Map-execute function across files 146 | out = pool.map_async(func, filelist) 147 | 148 | # Graceful exit 149 | pool.close() 150 | pool.join() 151 | 152 | # Return collected results 153 | return out.get() 154 | 155 | 156 | def pcompute_files_truncated(filelist, order=2): 157 | """ 158 | This function operates concurrently on a list of files. Reads each as a 159 | sequence, computes ETC and writes output to disk. 160 | 161 | CAUTION: main module is unguarded, do not run these functions as is, 162 | particularly on Windows. 163 | 164 | Parameters 165 | ---------- 166 | filelist : list/tuple/generator 167 | Collection of filenames of files containing sequence data. 168 | 169 | Returns 170 | ------- 171 | list of dict elements 172 | Each dictionary element contains filename, length of sequence & ETC. 173 | 174 | """ 175 | # Initialize pool of parallel workers 176 | pool = Pool() 177 | func = partial(_compute_two_files_truncated, order=order) 178 | # Map-execute function across files 179 | out = pool.map_async(func, filelist) 180 | 181 | # Graceful exit 182 | pool.close() 183 | pool.join() 184 | 185 | # Return collected results 186 | return out.get() 187 | 188 | 189 | def _compute_single_seq(seq): 190 | """ 191 | This function operates on a single sequence and computes ETC. 192 | 193 | Parameters 194 | ---------- 195 | seq : tuple of 2 elements 196 | 1st element is index for tracking. 197 | 2nd element is a sequence of integers used for ETC computation. 198 | Output of enumerate. 199 | 200 | Returns 201 | ------- 202 | out : dict 203 | index of sequence, length of sequence and ETC estimate. 204 | 205 | """ 206 | # Prepare output dictionary 207 | out = {"item": seq[0], "length": len(seq[1]), "entropy": entropy(seq[1])} 208 | 209 | # Compute ETC and update output dictionary 210 | out.update(ETC.compute(seq[1], order=2, verbose=False, truncate=True)) 211 | 212 | return out 213 | 214 | 215 | def pcompute_multiple_seq(iterable): 216 | """ 217 | This function operates concurrently on a collection of sequences. Loads 218 | each sequence and computes ETC. 219 | 220 | CAUTION: main module is unguarded, do not run these functions as is, 221 | particularly on Windows. 222 | 223 | Parameters 224 | ---------- 225 | iterable : list/tuple/generator 226 | Collection of integer sequences. 227 | 228 | Returns 229 | ------- 230 | list of dict elements 231 | Each dictionary element contains index, length of sequence & ETC. 232 | 233 | """ 234 | # Initialize pool of parallel workers 235 | pool = Pool() 236 | 237 | # Map-execute function across sequences 238 | out = pool.map_async(_compute_single_seq, enumerate(iterable)) 239 | 240 | # Graceful exit 241 | pool.close() 242 | pool.join() 243 | 244 | # Return collected results 245 | return out.get() 246 | 247 | 248 | def _overlapping_chunks(seq, size, offset=1): 249 | """ 250 | This function takes an input sequence and produces chunks of chosen size. 251 | Offset can be used to control degree of overlap (or distance between chunks 252 | that don't overlap) 253 | 254 | Parameters 255 | ---------- 256 | seq : tuple or list 257 | Sequence of integers. 258 | size : int 259 | Length of each produced chunk. 260 | offset : int, optional 261 | Number of elements to shift each chunk by. The default is 1. 262 | Setting this to any value less than size allows control of overlap. 263 | Setting this >= size produces non-overlapping chunks. 264 | 265 | Returns 266 | ------- 267 | zip 268 | zip object that produces chunks of specified size, one at a time. 269 | 270 | """ 271 | 272 | return zip(*(islice(seq, i, None, offset) for i in range(size))) 273 | 274 | 275 | def _non_overlapping_chunks(seq, size): 276 | """ 277 | This function takes an input sequence and produces chunks of chosen size 278 | that strictly do not overlap. This is a much faster implemetnation than 279 | _overlapping_chunks and should be preferred if running on very large seq. 280 | 281 | Parameters 282 | ---------- 283 | seq : tuple or list 284 | Sequence of integers. 285 | size : int 286 | Length of each produced chunk. 287 | 288 | Returns 289 | ------- 290 | zip 291 | zip object that produces chunks of specified size, one at a time. 292 | 293 | """ 294 | 295 | return zip(*[iter(seq)] * size) 296 | 297 | 298 | def pcompute_single(seq, size, offset=1): 299 | """ 300 | This function operates concurrently on chunks of a given sequence. Gets 301 | each chunk and computes ETC one-by-one. Offset parameter controls degree of 302 | overlap (or non-overlap) 303 | 304 | CAUTION: main module is unguarded, do not run these functions as is, 305 | particularly on Windows. 306 | 307 | Parameters 308 | ---------- 309 | seq : tuple or list 310 | Sequence of integers. 311 | size : int 312 | Length of each produced chunk. 313 | offset : int, optional 314 | Number of elements to shift each chunk by. The default is 1. 315 | Setting this to any value less than size allows control of overlap. 316 | Setting this >= size produces non-overlapping chunks. 317 | 318 | Returns 319 | ------- 320 | list of dict elements 321 | Each dictionary element contains index, length of sequence & ETC. 322 | 323 | """ 324 | # If offset equals size, get non-overlapping chunks of given size 325 | if offset == size: 326 | iterable = _non_overlapping_chunks(seq, size) 327 | 328 | # Else get overlapping chunks of given size and offset 329 | else: 330 | iterable = _overlapping_chunks(seq, size, offset) 331 | 332 | # Execute parallel computation over chunks 333 | return pcompute_multiple_seq(iterable) 334 | -------------------------------------------------------------------------------- /ETC/NSRWS/x2D/core.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, emit_code_comments=True, cdivision=True, embedsignature=True 2 | #!/usr/bin/env python3 3 | # -*- coding: utf-8 -*- 4 | """ 5 | 6 | 7 | @author: Pranay S. Yadav 8 | """ 9 | # Import stuff 10 | from cpython cimport array, bool 11 | cimport cython 12 | import array 13 | 14 | # Function for getting mask for pairs 15 | cpdef array.array get_mask_pairs(const unsigned int[::1] x, const unsigned int[::1] y): 16 | """ 17 | INPUT 18 | ----- 19 | x : array.array 20 | Array object containing 32-bit integers. 21 | 22 | OUTPUT 23 | ------ 24 | mask : array.array 25 | Array object containing 32-bit integers - 0s or 1s corresponding to values in 26 | x for which successive overlapping pairs occur. 27 | """ 28 | # Get size of input 29 | cdef Py_ssize_t x_size = len(x) 30 | 31 | # Initialize a mask of Falses 32 | cdef array.array int_template = array.array('I', []) 33 | cdef array.array mask = array.clone(int_template, x_size-1, zero=True) 34 | cdef unsigned int[:] mask_view = mask 35 | 36 | # Initialize bounds for iteration 37 | cdef Py_ssize_t n = 0 38 | 39 | # Turn all values in mask to Trues 40 | for n in range(x_size-1): 41 | mask_view[n] += 1 42 | 43 | # Iterate over all values of input 44 | n = 0 45 | while n < x_size - 2: 46 | 47 | # If successive pairs match 48 | if x[n] == x[n+1] and x[n+1] == x[n+2] and y[n] == y[n+1] and y[n+1] == y[n+2]: 49 | 50 | # Mask out the second one 51 | mask_view[n+1] = 0 52 | 53 | # And slide over it 54 | n += 1 55 | 56 | # Increment while loop index 57 | n += 1 58 | 59 | return mask 60 | 61 | # Function for substituting pairs (old non-ideal version, uses for loop) 62 | # cpdef substitute_pairs_old(unsigned int[::1] x, unsigned int[::1] y, unsigned int[::1] pair_x, unsigned int[::1] pair_y, unsigned int value_x, unsigned int value_y): 63 | # """ 64 | # INPUT 65 | # ----- 66 | # x : array.array 67 | # Array object containing 32-bit unsigned integers. 68 | 69 | # pair : array.array, length = 2 70 | # Array object containing 2 32-bit unsigned integers. 71 | 72 | # value : unsigned 32-bit int 73 | # Value to substitute the first element of pair with 74 | 75 | # OUTPUT 76 | # ------ 77 | # out : list 78 | # Array object containing 32-bit integers, with supplied pair replaced everywhere 79 | # by the supplied value. 80 | # """ 81 | # # Initialize looping variables and output list 82 | # cdef Py_ssize_t n 83 | # cdef Py_ssize_t x_size = len(x) 84 | # cdef list out_x = [] 85 | # cdef list out_y = [] 86 | 87 | # # Loop over input and replace pair 88 | # for n in range(x_size-1): 89 | 90 | # # Check for match with supplied pair 91 | # if x[n] == pair_x[0] and x[n+1] == pair_x[1] and y[n] == pair_y[0] and y[n+1] == pair_y[1]: 92 | 93 | # # Replace first value with supplied value 94 | # x[n] = value_x 95 | # y[n] = value_y 96 | 97 | # # Replace second value with 0 98 | # x[n+1] = 0 99 | # y[n+1] = 0 100 | 101 | # # Reset indexing variable 102 | # n = 0 103 | 104 | # # Loop over mutated input and append non-zero values to list 105 | # for n in range(x_size): 106 | 107 | # if x[n]: # Check only for x as both x & y can only be simultaneously 0 108 | 109 | # out_x.append(x[n]) 110 | # out_y.append(y[n]) 111 | 112 | # return out_x, out_y 113 | 114 | # Function for substituting pairs (new version, uses while loop) 115 | cpdef substitute_pairs(unsigned int[::1] x, unsigned int[::1] y, unsigned int[::1] pair_x, unsigned int[::1] pair_y, unsigned int value_x, unsigned int value_y): 116 | """ 117 | INPUT 118 | ----- 119 | x : array.array 120 | Array object containing 32-bit unsigned integers. 121 | 122 | pair : array.array, length = 2 123 | Array object containing 2 32-bit unsigned integers. 124 | 125 | value : unsigned 32-bit int 126 | Value to substitute the first element of pair with 127 | 128 | OUTPUT 129 | ------ 130 | out : list 131 | Array object containing 32-bit integers, with supplied pair replaced everywhere 132 | by the supplied value. 133 | """ 134 | # Initialize looping variables and output list 135 | cdef Py_ssize_t n = 0 136 | cdef Py_ssize_t x_size = len(x) 137 | cdef list out_x = [] 138 | cdef list out_y = [] 139 | 140 | # Loop over input and replace pair 141 | while n < x_size-1: 142 | 143 | # Check for match with supplied pair 144 | if x[n] == pair_x[0] and x[n+1] == pair_x[1] and y[n] == pair_y[0] and y[n+1] == pair_y[1]: 145 | 146 | # Replace first value with supplied value 147 | x[n] = value_x 148 | y[n] = value_y 149 | 150 | # Replace second value with 0 151 | x[n+1] = 0 152 | y[n+1] = 0 153 | 154 | n += 1 155 | 156 | n += 1 157 | 158 | # Reset indexing variable 159 | n = 0 160 | 161 | # Loop over mutated input and append non-zero values to list 162 | for n in range(x_size): 163 | 164 | if x[n]: # Check only for x as both x & y can only be simultaneously 0 165 | 166 | out_x.append(x[n]) 167 | out_y.append(y[n]) 168 | 169 | return out_x, out_y 170 | 171 | # Function for checking whether all elements in input are identical 172 | cpdef bint check_equality(const unsigned int[::1] x, const unsigned int[::1] y): 173 | """ 174 | INPUT 175 | ----- 176 | x : array.array 177 | Array object containing 32-bit unsigned integers. 178 | 179 | 180 | OUTPUT 181 | ------ 182 | bool 183 | True if all elements are identical 184 | """ 185 | # Intialize loop bounds 186 | cdef Py_ssize_t n 187 | cdef Py_ssize_t x_size = len(x) 188 | 189 | # Iterate over values from input 190 | for n in range(x_size): 191 | 192 | # Short-circuit the loop: check for any element that doesn't equal the first 193 | if x[0] != x[n] or y[0] != y[n]: 194 | return False 195 | 196 | return True 197 | 198 | # # Function for getting mask for windows of any length 199 | # cpdef array.array get_mask_windows(const unsigned int[::1] x, unsigned int order): 200 | # """ 201 | # INPUT 202 | # ----- 203 | # x : array.array 204 | # Array object containing 32-bit integers. 205 | 206 | # order: unsigned 32-bit int 207 | # Length of the window to slide across input 208 | 209 | # OUTPUT 210 | # ------ 211 | # mask : array.array 212 | # Array object containing 32-bit integers - 0s or 1s corresponding to values in 213 | # x for which successive overlapping pairs occur. 214 | # """ 215 | # # Get size of input 216 | # cdef Py_ssize_t x_size = len(x) 217 | 218 | # # Initialize a mask of Falses 219 | # cdef array.array int_template = array.array('I', []) 220 | # cdef array.array mask = array.clone(int_template, x_size, zero=True) 221 | # cdef unsigned int[:] mask_view = mask 222 | 223 | # # Initialize variable for iteration 224 | # cdef Py_ssize_t n = 0 225 | 226 | # # Turn all values in mask to Trues 227 | # for n in range(x_size): 228 | # mask_view[n] += 1 229 | 230 | # # Initialize variables for iteration across input 231 | # cdef Py_ssize_t k = 0 # Outer loop 232 | # cdef Py_ssize_t m = 0 # Inner loop 233 | 234 | # # Tracking variable for counting matching elements in pairwise window comparison 235 | # cdef unsigned int track = 0 236 | 237 | # # Iterate over input values except the last 'order' values [Outermost master loop] 238 | # n = 0 239 | # for n in range(x_size-order): 240 | 241 | # # proceed only if mask is True for current element 242 | # if mask_view[n]: 243 | 244 | # # Outer loop for sliding the 'next' window by unit step (current vs next) 245 | # for k in range(1,order): # Start from 1 - begin comparing from next window 246 | 247 | # # Inner loop for comparing elements in current and next windows 248 | # for m in range(order): 249 | 250 | # # If elements match, increment tracker 251 | # if x[n+m] == x[n+m+k]: 252 | # track += 1 253 | 254 | # # Else stop iteration over this comparison of windows 255 | # else: 256 | # break 257 | 258 | # # Trick: preserve mask only if track doesn't equal order 259 | # # If track == order, short-circuit eval takes precedence, returning 0 260 | # mask_view[n+k] = track!=order and mask_view[n+k] 261 | 262 | # # Reset tracker 263 | # track = 0 264 | 265 | # return mask 266 | 267 | # # Function for substituting windows of any length 268 | # cpdef list substitute_windows(unsigned int[::1] x, unsigned int order, unsigned int[::1] window, unsigned int value): 269 | # """ 270 | # INPUT 271 | # ----- 272 | # x : array.array 273 | # Array object containing 32-bit unsigned integers. 274 | 275 | # order: unsigned 32-bit int 276 | # Length of the window to slide across input 277 | 278 | # window : array.array, length = 2 279 | # Array object containing 2 32-bit unsigned integers. 280 | 281 | # value : unsigned 32-bit int 282 | # Value to substitute the first element of pair with 283 | 284 | # OUTPUT 285 | # ------ 286 | # out : list 287 | # Array object containing 32-bit integers, with supplied pair replaced everywhere 288 | # by the supplied value. 289 | # """ 290 | # # Initialize looping variables and output list 291 | # cdef Py_ssize_t n = 0 # Outer loop 292 | # cdef Py_ssize_t m = 0 # Inner loop 293 | # cdef Py_ssize_t x_size = len(x) 294 | # cdef list out = [] 295 | 296 | # # Tracking variable for counting matching elements in pairwise window comparison 297 | # cdef unsigned int track = 0 298 | 299 | # # Iterate over input values except one less than the last 'order' values 300 | # # Logic: last window, say triplet must begin from 3rd-last index, leaving 2 values 301 | # for n in range(x_size-order+1): 302 | 303 | # # Slide window of given order and do element-wise comparison 304 | # for m in range(order): 305 | 306 | # # Reset tracker 307 | 308 | 309 | # # Track comparison of input elements with window elements 310 | # if x[n+m] == window[m]: 311 | # track += 1 312 | # # # If mismatch, break 313 | # else: 314 | # break 315 | 316 | # # If all compared elements match for current window 317 | # if track == order: 318 | 319 | # # Replace the first element with provided value 320 | # x[n] = value 321 | 322 | # # Replace the remaining subsequent values with zeros 323 | # for m in range(1, order): 324 | # x[n+m] = 0 325 | 326 | # # Reset tracker 327 | # track = 0 328 | # # Reset indexing variable 329 | # n = 0 330 | 331 | # # Loop over mutated input and append non-zero values to list 332 | # for n in range(x_size): 333 | 334 | # if x[n]: 335 | 336 | # out.append(x[n]) 337 | 338 | # return out -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ETCPy 2 | **E**ffort-**T**o-**C**ompress in **Py**thon 3 | - [What is this](https://github.com/pranaysy/ETCPy#what-is-this) 4 | - [References](https://github.com/pranaysy/ETCPy#references) 5 | - [What can it do](https://github.com/pranaysy/ETCPy#what-can-it-do) 6 | - [Study Haemodynamics, Heart-Rate Variability and Cardiac Aging using ECG/EKG](https://github.com/pranaysy/ETCPy#study-haemodynamics-heart-rate-variability-and-cardiac-aging-using-ecgekg) 7 | - [Network Neuroscience, Psychophysics and Scientific Study of Consciousness](https://github.com/pranaysy/ETCPy#network-neuroscience-psychophysics-and-scientific-study-of-consciousness) 8 | - [Genome Complexity Analysis and Classification of Nucleotide Sequences](https://github.com/pranaysy/ETCPy#genome-complexity-analysis-and-classification-of-nucleotide-sequences) 9 | - [Audio Signal Processing and Denoising](https://github.com/pranaysy/ETCPy#audio-signal-processing-and-denoising) 10 | - [How to use it](https://github.com/pranaysy/ETCPy#how-to-use-it) 11 | - [Dependencies](https://github.com/pranaysy/ETCPy#dependencies) 12 | - [Installation](https://github.com/pranaysy/ETCPy#installation) 13 | - [Updating](https://github.com/pranaysy/ETCPy#updating) 14 | - [Usage](https://github.com/pranaysy/ETCPy#usage) 15 | - [Testing](https://github.com/pranaysy/ETCPy#testing) 16 | - [MATLAB Implementation](https://github.com/pranaysy/ETCPy#matlab-implementation) 17 | - [TODO](https://github.com/pranaysy/ETCPy#todo) 18 | - [License](https://github.com/pranaysy/ETCPy#license) 19 | 20 | --- 21 | 22 | ## What is this 23 | A Python implementation of the compression-complexity measure called Effort-To-Compress or ETC. ETC captures the compressibility and complexity of discrete symbolic sequences using lossless compression. It has been shown to robustly estimate complexity, comparing favorably for short and noisy time series in comparison with entropy and Lempel-Ziv complexity. 24 | 25 | Using ETC, causal information flow between multiple discrete symbolic sequences can be assessed and recently, such a use has been presented, rigorously proven and demonstrated to be an effective model-free measure of causality. Introduced as Compression-Complexity Causality or CCC, this measure is robust to numerous data contaminants, noise sources and pre-processing artifacts. On comparison with Granger Causality and Transfer Entropy, CCC compares favorably and outperforms them on synthetic as well as real world causal interactions. An implementation of CCC is included in this repository. 26 | 27 | While any lossless compressor may be used with ETC and subsequently with CCC, a grammar-based lossless compression algorithm called Non-Sequential Recursive Pair Substitution or NSRPS is used presently. NSRPS has been rigorously studied and shown to be an effective tool for data compression and entropy estimation. This repository also contains a fast Cython implementation of NSRPS for use with ETC and CCC. 28 | 29 | #### References 30 | - Benedetto, Dario, Emanuele Caglioti, and Davide Gabrielli. “Non-Sequential Recursive Pair Substitution: Some Rigorous Results.” Journal of Statistical Mechanics: Theory and Experiment 2006, no. 09 (September 25, 2006): P09011–P09011. https://doi.org/10.1088/1742-5468/2006/09/P09011. 31 | - Balasubramanian, Karthi, Gayathri R. Prabhu, Lakshmipriya V. K. , Maneesha Krishnan, Praveena R. , and Nithin Nagaraj. “Classification of Periodic, Chaotic and Random Sequences Using NSRPS Complexity Measure.” ArXiv:1205.4886 [Nlin], May 22, 2012. http://arxiv.org/abs/1205.4886. 32 | - Nagaraj, Nithin, Karthi Balasubramanian, and Sutirth Dey. “A New Complexity Measure for Time Series Analysis and Classification.” The European Physical Journal Special Topics 222, no. 3–4 (July 2013): 847–60. https://doi.org/10.1140/epjst/e2013-01888-9. 33 | - Nagaraj, Nithin, and Karthi Balasubramanian. “Dynamical Complexity of Short and Noisy Time Series: Compression-Complexity vs. Shannon Entropy.” The European Physical Journal Special Topics 226, no. 10 (July 2017): 2191–2204. https://doi.org/10.1140/epjst/e2016-60397-x. 34 | - Kathpalia, Aditi, and Nithin Nagaraj. “Data-Based Intervention Approach for Complexity-Causality Measure.” PeerJ Computer Science 5 (May 27, 2019): e196. https://doi.org/10.7717/peerj-cs.196. 35 | 36 | 37 | ## What can it do 38 | #### Study Haemodynamics, Heart-Rate Variability and Cardiac Aging using ECG/EKG 39 | - Balasubramanian, Karthi, Nithin Nagaraj, and Sandipan Pati. “Chaos or Randomness? Effect of Vagus Nerve Stimulation During Sleep on Heart-Rate Variability.” IETE Journal of Research, June 30, 2020, 1–7. https://doi.org/10.1080/03772063.2020.1780165. 40 | - Srilakshmi, P, Karthi Balasubramanian, Nithin Nagaraj, and Sandipan Pati. “Multiscale Analysis of Heart Rate Variability Using Subsymmetry and Effort-to-Compress Complexity Measures.” In 2018 15th IEEE India Council International Conference (INDICON), 1–5. Coimbatore, India: IEEE, 2018. https://doi.org/10.1109/INDICON45594.2018.8986972. 41 | - Thanaj, Marjola, Andrew J. Chipperfield, and Geraldine F. Clough. “Analysis of Microvascular Blood Flow and Oxygenation: Discrimination between Two Haemodynamic Steady States Using Nonlinear Measures and Multiscale Analysis.” Computers in Biology and Medicine 102 (November 2018): 157–67. https://doi.org/10.1016/j.compbiomed.2018.09.026. 42 | - Balasubramanian, Karthi, K Harikumar, Nithin Nagaraj, and Sandipan Pati. “Vagus Nerve Stimulation Modulates Complexity of Heart Rate Variability Differently during Sleep and Wakefulness.” Annals of Indian Academy of Neurology 20, no. 4 (2017): 403. https://doi.org/10.4103/aian.AIAN_148_17. 43 | - Balasubramanian, Karthi, and Nithin Nagaraj. “Aging and Cardiovascular Complexity: Effect of the Length of RR Tachograms.” PeerJ 4 (2016): e2755. https://doi.org/10.7717/peerj.2755. 44 | 45 | #### Network Neuroscience, Psychophysics and Scientific Study of Consciousness 46 | - Ashley J. Funkhouser. "The Role of Action in Affordance Perception Using Virtual Reality" 2020. Honors College Thesis with Dr. Alen Hajnal, Department of Psychology, The University of Southwestern Mississipi. https://aquila.usm.edu/honors_theses/714/ 47 | - Agarwal, Nikita, Aditi Kathpalia, and Nithin Nagaraj. “Distinguishing Different Levels of Consciousness Using a Novel Network Causal Activity Measure.” In 2019 Global Conference for Advancement in Technology (GCAT), 1–5. BANGALURU, India: IEEE, 2019. https://doi.org/10.1109/GCAT47503.2019.8978424. 48 | - Virmani, Mohit, and Nithin Nagaraj. “A Novel Perturbation Based Compression Complexity Measure for Networks.” Heliyon 5, no. 2 (February 2019): e01181. https://doi.org/10.1016/j.heliyon.2019.e01181. 49 | - Kondo, Fumika. “Can Alterations in the Temporal Structure of Spontaneous Brain Activity Serve as a Disease-Specific Biomarker for Schizophrenia? A Multi Cohort FMRI Study,” 2017. https://doi.org/10.20381/RUOR-20801. 50 | - Kimiskidis, Vasilios K., Christos Koutlis, Alkiviadis Tsimpiris, Reetta Kälviäinen, Philippe Ryvlin, and Dimitris Kugiumtzis. “Transcranial Magnetic Stimulation Combined with EEG Reveals Covert States of Elevated Excitability in the Human Epileptic Brain.” International Journal of Neural Systems 25, no. 05 (August 2015): 1550018. https://doi.org/10.1142/S0129065715500185. 51 | 52 | #### Genome Complexity Analysis and Classification of Nucleotide Sequences 53 | - Balasubramanian, Karthi, and Nithin Nagaraj. “Automatic Identification of SARS Coronavirus Using Compression-Complexity Measures.” Preprint. Bioinformatics, March 27, 2020. https://doi.org/10.1101/2020.03.24.006007. 54 | 55 | #### Audio Signal Processing and Denoising 56 | - Kiefer, Chris, Overholt, Dan and Eldridge, Alice (2020) Shaping the behaviour of feedback instruments with complexity-controlled gain dynamics. New Interfaces for Musical Expression, Birmingham, UK, 21-25 July 2020. Published in: Proceedings of the International Conference on New Interfaces for Musical Expression. 343-348. NIME, Birmingham, UK. ISSN 2220-4806. https://sro.sussex.ac.uk/id/eprint/91009/ 57 | - Li, Guohui, Qianru Guan, and Hong Yang. “Noise Reduction Method of Underwater Acoustic Signals Based on CEEMDAN, Effort-To-Compress Complexity, Refined Composite Multiscale Dispersion Entropy and Wavelet Threshold Denoising.” Entropy 21, no. 1 (December 24, 2018): 11. https://doi.org/10.3390/e21010011. 58 | 59 | 60 | ## How to use it 61 | The simplest way right now is to use `pip` to clone this repository and install locally inside a `conda` or a `virtualenv` environment. This way several functions implemented in Cython will be automatically compiled natively on the host system. Instructions below. 62 | 63 | While the repository is called `ETCPy`, the package namespsace available for use is `ETC`. All functionality is available through the `ETC` namespace. 64 | 65 | For running tests (strongly recommended), additional packages need to be installed. 66 | 67 | ### Operating System Support 68 | - GNU/Linux-based distributions (tested on Ubuntu 16.04, 18.04, 20.04) 69 | - **Currently does not work out of the box on Windows.** Cython and C/C++ build toolchain need to be setup properly for compilation on Windows to work. It may work with some gymnastics using MinGW + Visual Studio Build Tools, **currently untested.** Although does work on WSL! 70 | 71 | ### Dependencies 72 | For core functionality: 73 | - `numpy` 74 | - `pandas` 75 | - `joblib` 76 | - `cython` 77 | - Note: Cython needs a working C/C++ compiler such as GCC/Clang and associated build-utils/toolchain. While it should work out of the box on any modern Linux distribution, ensure a proper installation as instructed in the [official documentation.](https://cython.readthedocs.io/en/latest/src/quickstart/install.html). 78 | 79 | For tests: 80 | - `pytest` 81 | - `hypothesis` 82 | 83 | ### Installation 84 | Skip the first step if an environment is already available: 85 | 1. Create a fresh `conda` or `pip`/`virtualenv`-based environment with `numpy` and `cython` packages. Choose an appropriate name instead of `myenv`. 86 | ```bash 87 | $ conda create -n myenv python numpy pandas joblib cython 88 | ``` 89 | 2. Activate environment using `conda activate myenv` or virtualenv equivalent. 90 | 91 | If `git` is not installed, then: 92 | - either install it at a system level directly from the [official website](https://git-scm.com/download) or via prefereed package manager 93 | - or install it within the newly created conda environment using `conda install git` 94 | 95 | 3. Use `pip`* to install directly from GitHub using the `git` VCS backend 96 | ```bash 97 | $ python -m pip install git+https://github.com/pranaysy/ETCPy.git 98 | ``` 99 | 4. Done! Open a Python shell, execute `import ETC` and proceed to the [demo](./demo.py) 100 | 101 | --- 102 | *mixing `pip` and `conda` is not a generally advised but can be used based on [certain recommendations](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#pip-in-env) 103 | 104 | ### Updating 105 | Use the `-U` flag with pip for updating to the most current version available from this repository: 106 | ``` 107 | $ python -m pip install -U git+https://github.com/pranaysy/ETCPy.git 108 | ``` 109 | This will rebuild the compiled Cython functions as well. 110 | 111 | ### Usage 112 | Please check out [`demo.py`](./demo.py) to see ETC in action. [Functions for dealing with NumPy arrays](https://github.com/pranaysy/ETCPy/blob/master/demo.py#L121) are also available. In addition to the core functionality of ETC, a [brief demo of Compression-Complexity Causality (CCC)](https://github.com/pranaysy/ETCPy/blob/master/demo.py#L158) is also included for uncoupled as well as coupled first-order auto-regressive processes. 113 | 114 | The implementations of ETC as well as CCC include multicore parallelization (using [`joblib`](https://joblib.readthedocs.io/en/latest/index.html)) and can benefit from more available CPU cores for multiple sequences. 115 | 116 | ### Testing 117 | Most of the tests are property-based or behavior-based, and are implemented using the awesome [`hypothesis` framework](https://hypothesis.readthedocs.io/en/latest/). 118 | Make sure dependencies are satisfied within the working environment: 119 | ```bash 120 | $ python -m pip install -U pytest hypothesis 121 | ``` 122 | Grab a copy of this repository using git and enter the local directory: 123 | ```bash 124 | $ git clone https://github.com/pranaysy/ETCPy.git 125 | $ cd ETCPy 126 | ``` 127 | Run tests: 128 | ```bash 129 | $ pytest ETC/ 130 | ``` 131 | 132 | ### MATLAB Implementation 133 | - The original ETC implementation in MATLAB can be found here: https://sites.google.com/site/nithinnagaraj2/journal/etc 134 | 135 | 136 | ## TODO 137 | - Hyperparameter optimization for CCC 138 | - Add performance metrics 139 | - Automated tests with `tox` 140 | - Better packaging: `pip` vs `conda` 141 | - Visualizations 142 | - Improve test coverage 143 | - Documentation using Sphinx/MkDocs 144 | - Windows support 145 | 146 | ## License 147 | Copyright 2021 Pranay S. Yadav and Nithin Nagaraj 148 | 149 | Licensed under the Apache License, Version 2.0 (the "License"); 150 | you may not use this file except in compliance with the License. 151 | You may obtain a copy of the License at 152 | 153 | http://www.apache.org/licenses/LICENSE-2.0 154 | 155 | Unless required by applicable law or agreed to in writing, software 156 | distributed under the License is distributed on an "AS IS" BASIS, 157 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 158 | See the License for the specific language governing permissions and 159 | limitations under the License. 160 | -------------------------------------------------------------------------------- /ETC/NSRWS/x2D/onestep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | from collections import Counter 9 | from itertools import compress, islice 10 | from time import perf_counter 11 | 12 | from ETC.NSRWS.x2D import core 13 | from ETC.seq.recode import cast 14 | from ETC.seq.check import arraytype 15 | 16 | 17 | def _mask_and_count(seq_x, seq_y, mask, order): 18 | """ 19 | Apply binary mask to a pair of sequences & count most frequently jointly occurring windows 20 | 21 | This function does 3 things in the following sequence: 22 | 1. Create sliding windows of a given size (order) - using zip and islice 23 | 2. Apply a supplied mask to the sliding windows - using compress 24 | 3. Count most frequently occurring window - using Counter 25 | 26 | In the NSRWS algorithm, this is the most time consuming step. Essentially expands 27 | two 1D sequences to a 2D sequence - where the sequences follows along row-wise & 28 | the columnar expansion encodes a sliding window for each row jointly from both 29 | sequences: 30 | 1D sequences: 31 | (1,2,3,4,5,6,7) 32 | (3,4,5,6,7,8,9) 33 | 34 | 2D expansion for window order=3: 35 | (((1,3),(2,4),(3,5)), 36 | ((2,4),(3,5),(4,6)), 37 | ((3,5),(4,6),(5,7)), 38 | ((4,6),(5,7),(6,8)), 39 | ((5,7),(6,8),(7,9))) 40 | 41 | The mask is applied row-wise & must be of the same length as the number of rows 42 | in this 2D expansion. This is given by: 43 | len(mask) = len(seq) - (order - 1) 44 | 45 | Example application of the mask (1,0,0,1,1): 46 | 1 -> (((1,3),(2,4),(3,5)), 47 | 0 -> ((2,4),(3,5),(4,6)), (((1,3),(2,4),(3,5)), 48 | 0 -> ((3,5),(4,6),(5,7)), ---> ((4,6),(5,7),(6,8)), 49 | 1 -> ((4,6),(5,7),(6,8)), ((5,7),(6,8),(7,9))) 50 | 1 -> ((5,7),(6,8),(7,9))) 51 | 52 | Unique windows (rows of 2D expansion) are counted and most frequently occurring 53 | row is returned with counts. 54 | 55 | Parameters 56 | ---------- 57 | seq_x : array.array 58 | Discrete symbolic sequence containing 32-bit unsigned integers. 59 | seq_y : array.array 60 | Discrete symbolic sequence containing 32-bit unsigned integers. 61 | mask : array.array 62 | Collection of Booleans, where 0s indicate locations on "seq" to mask out. 63 | 0s correspond to overlapping windows. 64 | order : int 65 | Size of window for NSRWS, 2 or greater. 66 | 67 | Returns 68 | ------- 69 | pair_x : array.array 70 | Most frequently occurring non-overlapping "window" of size "order" in seq_x 71 | pair_y : array.array 72 | Most frequently occurring non-overlapping "window" of size "order" in seq_y 73 | count : int 74 | Number of times the most frequently occurring window occurs. 75 | 76 | """ 77 | # Create overlapped sliding windows (each window a tuple of size order) & apply mask 78 | filtered = compress( 79 | zip(*(islice(zip(seq_x, seq_y), i, None) for i in range(order))), mask 80 | ) 81 | 82 | # Count sliding windows (tuples are hashable!) & get the one most common with counts 83 | freq_pair, count = Counter(filtered).most_common(1)[0] 84 | 85 | # Assign array type and return 86 | pair_x = cast([freq_pair[0][0], freq_pair[1][0]]) 87 | pair_y = cast([freq_pair[0][1], freq_pair[1][1]]) 88 | 89 | return pair_x, pair_y, count 90 | 91 | 92 | def _onestep_pairs(seq_x, seq_y, verbose=True): 93 | """ 94 | Execute one full step of NSRPS (NSRWS with order=2) for a given sequence 95 | 96 | Makes use of 2 functions written in Cython & _mask_and_count in the following steps: 97 | 1. Find overlapping pairs & store their indices for masking -> get_mask_pairs() 98 | 2. Apply the mask and find most frequent pair -> _mask_and_count() 99 | 3. Substitute all occurrences of the most frequent pair -> substitute_pairs() 100 | 101 | This function is different from _onestep_windows because: 102 | 1. It is *much* faster due to fewer nested loops 103 | 2. It targets a more common use case scenario: for distances, for CCC, etc 104 | 3. For higher window orders, correctness needs to be proved outside of tests 105 | 106 | The implementation will benefit from: 107 | 1. Decorators for timing 108 | 2. Decorators for verbosity of output 109 | 3. Cython implementation of the slowest part: _mask_and_count 110 | problem: counting windows in C? 111 | 112 | Parameters 113 | ---------- 114 | seq_x : array.array 115 | Discrete symbolic sequence containing 32-bit unsigned integers. 116 | seq_y : array.array 117 | Discrete symbolic sequence containing 32-bit unsigned integers. 118 | verbose : bool, optional 119 | Whether to report extra details. These include the frequent pair that was 120 | substituted, its counts & total time taken. The default is True. 121 | 122 | Returns 123 | ------- 124 | tuple, of the following fixed elements: 125 | seq_x : array.array 126 | Discrete symbolic sequence containing 32-bit unsigned integers, with most 127 | frequently occurring non-sequentially overlapping pair substituted. 128 | 129 | seq_y : array.array 130 | Discrete symbolic sequence containing 32-bit unsigned integers, with most 131 | frequently occurring non-sequentially overlapping pair substituted. 132 | 133 | signal : bool 134 | indicator for the state of sequence with all distinct pairs (count=1) 135 | 136 | optional elements of tuple that depend on verbosity: 137 | freq_pair_x : array.array 138 | Frequent pair substituted in seq_x 139 | 140 | freq_pair_y : array.array 141 | Frequent pair substituted in seq_y 142 | 143 | count : int 144 | Number of times the frequent pair occurred in the sequence 145 | 146 | time_taken : float 147 | Time taken to execute step 148 | 149 | 150 | """ 151 | # Initialize timer 152 | before = perf_counter() 153 | 154 | # Initialize signal for tracking sequence state with all distinct pairs 155 | signal = False 156 | 157 | # Compute mask for overlapping pairs 158 | mask = core.get_mask_pairs(seq_x, seq_y) 159 | 160 | # Apply mask and find most frequent pair 161 | pair_x, pair_y, count = _mask_and_count(seq_x, seq_y, mask, 2) 162 | 163 | # Get values for substitution of the most frequent pair with 164 | sub_value_x = 1 + max(seq_x) 165 | sub_value_y = 1 + max(seq_y) 166 | 167 | # If all distinct pairs, substitute the first one & set signal to True 168 | if count == 1: 169 | out_x = cast(seq_x[1:]) 170 | out_x[0] = sub_value_x 171 | 172 | out_y = cast(seq_y[1:]) 173 | out_y[0] = sub_value_y 174 | 175 | signal = True 176 | # Else, substitute all instances of the frequent pair 177 | else: 178 | out_x, out_y = core.substitute_pairs( 179 | seq_x, seq_y, pair_x, pair_y, sub_value_x, sub_value_y 180 | ) 181 | out_x = cast(out_x) 182 | out_y = cast(out_y) 183 | 184 | # Completion timer 185 | after = perf_counter() 186 | 187 | # If verbose, return more things 188 | if verbose: 189 | time_taken = after - before 190 | return out_x, out_y, signal, pair_x, pair_y, count, time_taken 191 | 192 | # Else return bare essentials 193 | return out_x, out_y, signal 194 | 195 | 196 | # def _onestep_windows(seq, order, verbose=True): 197 | 198 | # before = perf_counter() 199 | # mask = core.get_mask_windows(seq, order)[: -(order - 1)] 200 | # z_windowed = compress(zip(*(islice(seq, i, None) for i in range(order))), mask) 201 | # z_windowed = tuple(z_windowed) 202 | # freq_window, count = Counter(z_windowed).most_common(1)[0] 203 | # sub_value = 1 + max(seq) 204 | # window = array("I", freq_window) 205 | # if count == 1: 206 | # out = array("I", seq[order - 1 :]) 207 | # out[0] = sub_value 208 | # signal = True 209 | # else: 210 | # out = array("I", core.substitute_windows(seq, order, window, sub_value)) 211 | # signal = False 212 | # out = array("I", core.substitute_windows(seq, order, window, sub_value)) 213 | # after = perf_counter() 214 | # if verbose: 215 | # return out, freq_window, count, after - before, signal 216 | # return out, signal 217 | 218 | 219 | # def _onestep_windows(seq, order, verbose=True): 220 | # pass 221 | 222 | 223 | def _onestep(seq_x, seq_y, order, verbose=True): 224 | """ 225 | Wrapper that switches routine (pairs vs windows) depending on order 226 | 227 | For pairs (order=2), execute _onestep_pairs which is faster 228 | For higher orders, execute _onestep_windows 229 | 230 | Parameters 231 | ---------- 232 | seq_x : array.array 233 | Discrete symbolic sequence containing 32-bit unsigned integers. 234 | seq_y : array.array 235 | Discrete symbolic sequence containing 32-bit unsigned integers. 236 | order : int 237 | Size of window for NSRWS, 2 or greater. 238 | verbose : bool, optional 239 | Whether to report extra details. These include the frequent pair that was 240 | substituted, its counts & total time taken. The default is True. 241 | 242 | Returns 243 | ------- 244 | tuple, of the following fixed elements: 245 | seq_x : array.array 246 | Discrete symbolic sequence containing 32-bit unsigned integers, with most 247 | frequently occurring non-sequentially overlapping pair substituted. 248 | 249 | seq_y : array.array 250 | Discrete symbolic sequence containing 32-bit unsigned integers, with most 251 | frequently occurring non-sequentially overlapping pair substituted. 252 | 253 | signal : bool 254 | indicator for the state of sequence with all distinct pairs (count=1) 255 | 256 | optional elements of tuple that depend on verbosity: 257 | freq_pair_x : array.array 258 | Frequent pair substituted in seq_x 259 | 260 | freq_pair_y : array.array 261 | Frequent pair substituted in seq_y 262 | 263 | count : int 264 | Number of times the frequent pair occurred in the sequence 265 | 266 | time_taken : float 267 | Time taken to execute step 268 | 269 | """ 270 | if order == 2: 271 | return _onestep_pairs(seq_x[:], seq_y[:], verbose) 272 | # if order > 2: 273 | # return _onestep_windows(seq_x[:], seq_y[:], order, verbose) 274 | 275 | 276 | def onestep(seq_x, seq_y, order, verbose=True, check=True): 277 | """ 278 | Execute one step of NSRWS on given sequence and window size. 279 | 280 | This function exposes the functionality of NSRWS with various checks for inputs and 281 | sizes. Wraps around _onestep & for convenience, allows disabling of equality check. 282 | 283 | Parameters 284 | ---------- 285 | seq_x : array.array 286 | Discrete symbolic sequence containing 32-bit unsigned integers. 287 | seq_y : array.array 288 | Discrete symbolic sequence containing 32-bit unsigned integers. 289 | order : int 290 | Size of window for NSRWS, 2 or greater. 291 | verbose : bool, optional 292 | Whether to report extra details. These include the frequent pair that was 293 | substituted, its counts & total time taken. The default is True. 294 | check : bool, optional 295 | Check for equality of all symbols in sequence. The default is True. 296 | 297 | Returns 298 | ------- 299 | tuple, of the following fixed elements: 300 | seq_x : array.array 301 | Discrete symbolic sequence containing 32-bit unsigned integers, with most 302 | frequently occurring non-sequentially overlapping pair substituted. 303 | 304 | seq_y : array.array 305 | Discrete symbolic sequence containing 32-bit unsigned integers, with most 306 | frequently occurring non-sequentially overlapping pair substituted. 307 | 308 | signal : bool 309 | indicator for the state of sequence with all distinct pairs (count=1) 310 | 311 | optional elements of tuple that depend on verbosity: 312 | freq_pair_x : array.array 313 | Frequent pair substituted in seq_x 314 | 315 | freq_pair_y : array.array 316 | Frequent pair substituted in seq_y 317 | 318 | count : int 319 | Number of times the frequent pair occurred in the sequence 320 | 321 | time_taken : float 322 | Time taken to execute step 323 | 324 | """ 325 | # Check if both sequences are of same length, if not then exit 326 | if len(seq_x) != len(seq_y): 327 | print("> Both inputs must be of the same length!") 328 | return None 329 | 330 | # Coerce input 1 to appropriate array type, if not possible throw a fit & exit 331 | if not arraytype(seq_x): 332 | seq_x = cast(seq_x) 333 | 334 | # Coerce input 2 to appropriate array type, if not possible throw a fit & exit 335 | if not arraytype(seq_y): 336 | seq_y = cast(seq_y) 337 | 338 | # Exit if neither inputs could be coerced 339 | if seq_x is None or seq_y is None: 340 | return None 341 | 342 | # Check if size of sequence is shorter than order, exit if True 343 | if len(seq_x) < order or len(seq_y) < order: 344 | print("> Sequence input shorter than order!\n> Can't perform substitution ...") 345 | return None 346 | 347 | # Check whether all elements are equal, if requested, & exit if True 348 | if check and core.check_equality(seq_x, seq_y): 349 | print("> All elements in sequence x are equal!") 350 | return None 351 | 352 | # Else execute one step of NSRWS and return 353 | return _onestep(seq_x, seq_y, order, verbose) 354 | -------------------------------------------------------------------------------- /ETC/NSRWS/x1D/onestep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | 6 | @author: Pranay S. Yadav 7 | """ 8 | from collections import Counter 9 | from itertools import compress, islice 10 | from time import perf_counter 11 | 12 | from ETC.NSRWS.x1D import core 13 | from ETC.seq.recode import cast 14 | from ETC.seq.check import arraytype 15 | 16 | 17 | def _mask_and_count(seq, mask, order): 18 | """ 19 | Apply binary mask to a sequence and count most frequently occurring windows 20 | 21 | This function does 3 things in the following sequence: 22 | 1. Create sliding windows of a given size (order) - using zip and islice 23 | 2. Apply a supplied mask to the sliding windows - using compress 24 | 3. Count most frequently occurring window - using Counter 25 | 26 | In the NSRWS algorithm, this is the most time consuming step. Essentially expands 27 | a 1D sequence to a 2D sequence - where the sequence follows row-wise & the columnar 28 | expansion encodes a sliding window for each row: 29 | 1D sequence: 30 | (1,2,3,4,5,6,7) 31 | 32 | 2D expansion for window order=3: 33 | ((1,2,3), 34 | (2,3,4), 35 | (3,4,5), 36 | (4,5,6), 37 | (5,6,7)) 38 | 39 | The mask is applied row-wise & must be of the same length as the number of rows 40 | in this 2D expansion. This is given by: 41 | len(mask) = len(seq) - (order - 1) 42 | 43 | Example application of the mask (1,0,0,1,1): 44 | 1 -> ((1,2,3), 45 | 0 -> (2,3,4), ----> ((1,2,3), 46 | 0 -> (3,4,5), (4,5,6), 47 | 1 -> (4,5,6), (5,6,7)) 48 | 1 -> (5,6,7)) 49 | 50 | Unique windows (rows of 2D expansion) are counted and most frequently occurring 51 | row is returned with counts. 52 | 53 | 1D sequence with overlap: 54 | (1,1,1,1,1,2,1) 55 | 56 | 2D expansion for window order=3: 57 | ((1,1,1), 58 | (1,1,1), ----> overlap 59 | (1,1,1), ----> overlap 60 | (1,1,2), 61 | (1,2,1)) 62 | 63 | mask will be (1,0,0,1,1) and its application will yield: 64 | ((1,1,1), 65 | (1,1,2), 66 | (1,2,1)) 67 | 68 | Here, each window occurs once and the first one is returned -> (1,1,1) 69 | 70 | Parameters 71 | ---------- 72 | seq : array.array 73 | Discrete symbolic sequence containing 32-bit unsigned integers. 74 | mask : array.array 75 | Collection of Booleans, where 0s indicate locations on "seq" to mask out. 76 | 0s correspond to overlapping windows. 77 | order : int 78 | Size of window for NSRWS, 2 or greater. 79 | 80 | Returns 81 | ------- 82 | freq_window : array.array 83 | Most frequently occurring non-overlapping "window" of size "order". 84 | count : int 85 | Number of times the most frequently occurring window occurs. 86 | 87 | """ 88 | 89 | # Create overlapped sliding windows (each window a tuple of size order) & apply mask 90 | filtered = compress(zip(*(islice(seq, i, None) for i in range(order))), mask) 91 | 92 | # Count sliding windows (tuples are hashable!) & get the one most common with counts 93 | freq_window, count = Counter(filtered).most_common(1)[0] 94 | 95 | # Assign array type and return 96 | freq_window = cast(freq_window) 97 | 98 | return freq_window, count 99 | 100 | 101 | def _onestep_pairs(seq, verbose=True): 102 | """ 103 | Execute one full step of NSRPS (NSRWS with order=2) for a given sequence 104 | 105 | Makes use of 2 functions written in Cython & _mask_and_count in the following steps: 106 | 1. Find overlapping pairs & store their indices for masking -> get_mask_pairs() 107 | 2. Apply the mask and find most frequent pair -> _mask_and_count() 108 | 3. Substitute all occurrences of the most frequent pair -> substitute_pairs() 109 | 110 | This function is different from _onestep_windows because: 111 | 1. It is *much* faster due to fewer nested loops 112 | 2. It targets a more common use case scenario: for distances, for CCC, etc 113 | 3. For higher window orders, correctness needs to be proved outside of tests 114 | 115 | The implementation will benefit from: 116 | 1. Decorators for timing 117 | 2. Decorators for verbosity of output 118 | 3. Cython implementation of the slowest part: _mask_and_count 119 | problem: counting windows in C? 120 | 121 | Parameters 122 | ---------- 123 | seq : array.array 124 | Discrete symbolic sequence containing 32-bit unsigned integers. 125 | verbose : bool, optional 126 | Whether to report extra details. These include the frequent pair that was 127 | substituted, its counts & total time taken. The default is True. 128 | 129 | Returns 130 | ------- 131 | tuple, of the following fixed elements: 132 | seq : array.array 133 | Discrete symbolic sequence containing 32-bit unsigned integers, with most 134 | frequently occurring non-sequentially overlapping pair substituted. 135 | 136 | signal : bool 137 | indicator for the state of sequence with all distinct pairs (count=1) 138 | 139 | optional elements of tuple that depend on verbosity: 140 | freq_pair : array.array 141 | Frequent pair substituted 142 | 143 | count : int 144 | Number of times the frequent pair occurred in the sequence 145 | 146 | time_taken : float 147 | Time taken to execute step 148 | 149 | 150 | """ 151 | 152 | # Initialize timer 153 | before = perf_counter() 154 | 155 | # Initialize signal for tracking sequence state with all distinct pairs 156 | signal = False 157 | 158 | # Compute mask for overlapping pairs 159 | mask = core.get_mask_pairs(seq) 160 | 161 | # Apply mask and find most frequent pair 162 | freq_pair, count = _mask_and_count(seq, mask, 2) 163 | 164 | # Get value for substitution of the most frequent pair with 165 | sub_value = 1 + max(seq) 166 | 167 | # If all distinct pairs, substitute the first one & set signal to True 168 | if count == 1: 169 | out = cast(seq[1:]) 170 | out[0] = sub_value 171 | signal = True 172 | # Else, substitute all instances of the frequent pair 173 | else: 174 | out = cast(core.substitute_pairs(seq, freq_pair, sub_value)) 175 | 176 | # Completion timer 177 | after = perf_counter() 178 | 179 | # If verbose, return more things 180 | if verbose: 181 | time_taken = after - before 182 | return out, signal, freq_pair, count, time_taken 183 | 184 | # Else return bare essentials 185 | return out, signal 186 | 187 | 188 | def _onestep_windows(seq, order, verbose=True): 189 | """ 190 | Execute one full step of NSRWS with order>=2 for a given sequence 191 | 192 | Makes use of 2 functions written in Cython & _mask_and_count in the following steps: 193 | 1. Find overlapping windows & store their indices as mask -> get_mask_windows() 194 | 2. Apply the mask and find most frequent window -> _mask_and_count() 195 | 3. Substitute all occurrences of most frequent window -> substitute_windows() 196 | 197 | This function is different from _onestep_pairs because: 198 | 1. This is slower due to more nested loops and checks 199 | 2. Of course, it handles the generalized case for different window orders 200 | 3. For higher window orders, correctness needs to be proved outside of tests 201 | 202 | The implementation will benefit from: 203 | 1. Decorators for timing 204 | 2. Decorators for verbosity of output 205 | 3. Cython implementation of the slowest part: _mask_and_count 206 | problem: counting windows in C? 207 | 208 | Parameters 209 | ---------- 210 | seq : array.array 211 | Discrete symbolic sequence containing 32-bit unsigned integers. 212 | order : int 213 | Size of window for NSRWS, 2 or greater. 214 | verbose : bool, optional 215 | Whether to report extra details. These include the frequent pair that was 216 | substituted, its counts & total time taken. The default is True. 217 | 218 | Returns 219 | ------- 220 | tuple, of the following fixed elements: 221 | seq : array.array 222 | Discrete symbolic sequence containing 32-bit unsigned integers, with most 223 | frequently occurring non-sequentially overlapping window substituted. 224 | 225 | signal : bool 226 | indicator for the state of sequence with all distinct pairs (count=1) 227 | 228 | optional elements of tuple that depend on verbosity: 229 | freq_pair : array.array 230 | Frequent window substituted 231 | 232 | count : int 233 | Number of times the frequent window occurred in the sequence 234 | 235 | time_taken : float 236 | Time taken to execute step 237 | 238 | 239 | """ 240 | 241 | # Initialize timer 242 | before = perf_counter() 243 | 244 | # Initialize signal for tracking sequence state with all distinct windows 245 | signal = False 246 | 247 | # Compute mask for overlapping windows 248 | mask = core.get_mask_windows(seq, order) 249 | 250 | # Apply mask and find most frequent window 251 | freq_window, count = _mask_and_count(seq, mask, order) 252 | 253 | # Get value for substitution of the most frequent window with 254 | sub_value = 1 + max(seq) 255 | 256 | # If all distinct windows, substitute the first one & set signal to True 257 | if count == 1: 258 | out = cast(seq[order - 1 :]) 259 | out[0] = sub_value 260 | signal = True 261 | # Else, substitute all instances of the frequent window 262 | else: 263 | out = cast(core.substitute_windows(seq, order, freq_window, sub_value)) 264 | 265 | # Completion timer 266 | after = perf_counter() 267 | 268 | # If verbose, return more things 269 | if verbose: 270 | return out, signal, freq_window, count, after - before 271 | 272 | # Else return bare essentials 273 | return out, signal 274 | 275 | 276 | def _onestep(seq, order, verbose=True): 277 | """ 278 | Wrapper that switches routine (pairs vs windows) depending on order 279 | 280 | For pairs (order=2), execute _onestep_pairs which is faster 281 | For higher orders, execute _onestep_windows 282 | 283 | Parameters 284 | ---------- 285 | seq : array.array 286 | Discrete symbolic sequence containing 32-bit unsigned integers. 287 | order : int 288 | Size of window for NSRWS, 2 or greater. 289 | verbose : bool, optional 290 | Whether to report extra details. These include the frequent pair that was 291 | substituted, its counts & total time taken. The default is True. 292 | 293 | Returns 294 | ------- 295 | tuple, of the following fixed elements: 296 | seq : array.array 297 | Discrete symbolic sequence containing 32-bit unsigned integers, with most 298 | frequently occurring non-sequentially overlapping window substituted. 299 | 300 | signal : bool 301 | indicator for the state of sequence with all distinct pairs (count=1) 302 | 303 | optional elements of tuple that depend on verbosity: 304 | freq_pair : array.array 305 | Frequent window substituted 306 | 307 | count : int 308 | Number of times the frequent window occurred in the sequence 309 | 310 | time_taken : float 311 | Time taken to execute step 312 | 313 | """ 314 | 315 | if order == 2: 316 | return _onestep_pairs(seq[:], verbose) 317 | 318 | if order > 2: 319 | return _onestep_windows(seq[:], order, verbose) 320 | 321 | 322 | def onestep(seq, order, verbose=True, check=True): 323 | """ 324 | Execute one step of NSRWS on given sequence and window size. 325 | 326 | This function exposes the functionality of NSRWS with various checks for inputs and 327 | sizes. Wraps around _onestep & for convenience, allows disabling of equality check. 328 | 329 | Parameters 330 | ---------- 331 | seq : array.array 332 | Discrete symbolic sequence containing 32-bit unsigned integers. 333 | order : int 334 | Size of window for NSRWS, 2 or greater. 335 | verbose : bool, optional 336 | Whether to report extra details. These include the frequent pair that was 337 | substituted, its counts & total time taken. The default is True. 338 | check : bool, optional 339 | Check for equality of all symbols in sequence. The default is True. 340 | 341 | Returns 342 | ------- 343 | tuple, of the following fixed elements in this order: 344 | array.array 345 | Discrete symbolic sequence containing 32-bit unsigned integers, with most 346 | frequently occurring non-sequentially overlapping window substituted. 347 | 348 | bool 349 | indicator for the state of sequence with all distinct pairs (count=1) 350 | 351 | optional elements of tuple that depend on verbosity: 352 | array.array 353 | Frequent window substituted 354 | 355 | int 356 | Number of times the frequent window occurred in the sequence 357 | 358 | float 359 | Time taken to execute step 360 | 361 | """ 362 | 363 | # Coerce input to appropriate array type, if not possible throw a fit & exit 364 | if not arraytype(seq): 365 | seq = cast(seq) 366 | if seq is None: 367 | return None 368 | 369 | # Check whether all elements are equal, if requested, & exit if True 370 | if check and core.check_equality(seq): 371 | print("> All elements in sequence are equal!") 372 | return None 373 | 374 | # Check if size of sequence is shorter than order, exit if True 375 | if len(seq) < order: 376 | print("> Sequence input shorter than order!\n> Can't perform substitution ...") 377 | return None 378 | 379 | # Else execute one step of NSRWS and return 380 | return _onestep(seq, order, verbose) 381 | --------------------------------------------------------------------------------