├── tests-require.txt ├── docs └── img │ ├── simple_graph.png │ ├── bipartite_graph.png │ ├── directed_selfloop_graph.png │ ├── directed_antiparallel_graph.png │ └── xswap.svg ├── .gitignore ├── ci ├── build-wheels.sh └── deploy.sh ├── xswap ├── __init__.py ├── src │ ├── xswap.h │ ├── xswap.cpp │ ├── xswap_wrapper.cpp │ └── bitset.cpp ├── network_formats.py ├── permute.py ├── preprocessing.py ├── prior.py └── lib │ └── roaring.hh ├── LICENSE ├── tests ├── test_time.py ├── test_permute.py ├── test_formats.py ├── test_roaring.cpp ├── test_prior.py └── test_bitset.cpp ├── setup.py ├── .travis.yml └── README.md /tests-require.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | pytest 4 | requests 5 | scipy 6 | setuptools 7 | -------------------------------------------------------------------------------- /docs/img/simple_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hetio/xswap/HEAD/docs/img/simple_graph.png -------------------------------------------------------------------------------- /docs/img/bipartite_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hetio/xswap/HEAD/docs/img/bipartite_graph.png -------------------------------------------------------------------------------- /docs/img/directed_selfloop_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hetio/xswap/HEAD/docs/img/directed_selfloop_graph.png -------------------------------------------------------------------------------- /docs/img/directed_antiparallel_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hetio/xswap/HEAD/docs/img/directed_antiparallel_graph.png -------------------------------------------------------------------------------- 
"""
Public entry points of the ``xswap`` package.

The submodules are imported eagerly so that ``xswap.network_formats``,
``xswap.preprocessing`` and ``xswap.prior`` are available after a plain
``import xswap``; ``permute_edge_list`` is re-exported at package level.
"""
from xswap import network_formats
from xswap import preprocessing
from xswap import prior
from xswap.permute import permute_edge_list

# NOTE: setup.py extracts the version from this exact assignment with a regex.
__version__ = '0.0.2'

# `__all__` must name attributes that exist in this module's namespace.
# Dotted strings such as 'network_formats.edges_to_matrix' are not attributes
# of the package, so `from xswap import *` raised AttributeError. The functions
# remain reachable through their submodules, e.g.
# `xswap.network_formats.edges_to_matrix`.
__all__ = [
    'network_formats',
    'permute_edge_list',
    'preprocessing',
    'prior',
]
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | -------------------------------------------------------------------------------- /tests/test_time.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import requests 5 | 6 | import xswap 7 | 8 | test_directory = os.path.dirname(os.path.realpath(__file__)) + '/' 9 | 10 | 11 | def load_edges(): 12 | edges_url = "https://github.com/greenelab/xswap/raw/{}/{}".format( 13 | "8c31b4cbdbbf2cfa5018b1277bbd0e9f6263e573", "graphs/GiG_edges_reduced.txt") 14 | response = requests.get(edges_url) 15 | edges = list() 16 | for edge in response.iter_lines(): 17 | edge = str(edge, 'utf-8') 18 | source, target = edge.split(',') 19 | edges.append((int(source), int(target))) 20 | return edges 21 | 22 | 23 | def test_time(): 24 | edges = load_edges() 25 | t1 = time.time() 26 | new_edges, stats = xswap.permute_edge_list(edges) 27 | t2 = time.time() 28 | time_diff = t2 - t1 29 | print("{:.4f} seconds elapsed.".format(time_diff)) 30 | assert edges != new_edges 31 | assert time_diff < 5 32 | 33 | num_repeats = 0 34 | old_set = set(edges) 35 | new_set = set(new_edges) 36 | for edge in old_set: 37 | if edge in new_set: 38 | num_repeats += 1 39 | p_unch = num_repeats / len(edges) 40 | with open(test_directory + 'permutation_stats.txt', 'w') as f: 41 | f.write('Runtime: {:.3f} sec. {:.3f} percent unchanged of {} total edges after ' 42 | '{} swap attempts\n'.format(time_diff, p_unch, len(edges), 10*len(edges))) 43 | -------------------------------------------------------------------------------- /ci/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## deploy.sh: run during a Travis CI build to deploy output directory to the gh-pages branch on GitHub. 
4 | ## References 5 | ## - https://github.com/manubot/rootstock/blob/ddb0288895cd5bc5dab117fb366c52216a717d0e/ci/deploy.sh 6 | ## - https://github.com/wp-cli/wp-cli/issues/3798 7 | ## - https://github.com/manubot/catalog/blob/fd0ef6a999cca38890023eb65f19d1b87e96e83c/deploy.sh#L1-L45 8 | 9 | # Set options for extra caution & debugging 10 | set -o errexit \ 11 | -o nounset \ 12 | -o pipefail 13 | 14 | eval "$(ssh-agent -s)" 15 | # Ensure command traces are disabled while dealing with the private key 16 | [[ "$SHELLOPTS" =~ xtrace ]] && XTRACE_ON=1 17 | [[ "${XTRACE_ON:-}" ]] && set +o xtrace && echo "xtrace disabled" 18 | base64 --decode <<< "$GITHUB_DEPLOY_PRIVATE_KEY" | ssh-add - 19 | [[ "${XTRACE_ON:-}" ]] && set -o xtrace && echo "xtrace reenabled" 20 | 21 | # Configure git 22 | git config --global push.default simple 23 | git config --global user.name "Travis CI" 24 | git config --global user.email "deploy@travis-ci.com" 25 | git checkout "$TRAVIS_BRANCH" 26 | git remote set-url origin "git@github.com:$TRAVIS_REPO_SLUG.git" 27 | 28 | # Fetch and create gh-pages branch 29 | # Travis does a shallow and single branch git clone 30 | git remote set-branches --add origin gh-pages 31 | git fetch origin gh-pages:gh-pages 32 | 33 | commit_message="\ 34 | Generate catalog output on $(date --iso --utc) 35 | 36 | built by $TRAVIS_JOB_WEB_URL 37 | based on https://github.com/$TRAVIS_REPO_SLUG/commit/$TRAVIS_COMMIT 38 | [skip ci] 39 | " 40 | # echo >&2 "$commit_message" 41 | 42 | ghp-import \ 43 | --push --no-jekyll \ 44 | --message="$commit_message" \ 45 | docs/output/xswap 46 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import re 4 | 5 | import setuptools 6 | 7 | os.environ["CC"] = "g++" 8 | 9 | directory = pathlib.Path(__file__).parent.resolve() 10 | 11 | # version 12 | init_path = directory.joinpath('xswap', 
'__init__.py') 13 | text = init_path.read_text() 14 | pattern = re.compile(r"^__version__ = ['\"]([^'\"]*)['\"]", re.MULTILINE) 15 | version = pattern.search(text).group(1) 16 | 17 | # long_description 18 | readme_path = directory.joinpath('README.md') 19 | long_description = readme_path.read_text() 20 | 21 | xswap_cpp_extension = setuptools.Extension( 22 | 'xswap._xswap_backend', 23 | sources=['xswap/src/xswap_wrapper.cpp', 'xswap/src/bitset.cpp', 'xswap/src/xswap.cpp', 'xswap/lib/roaring.c'], 24 | extra_compile_args=["-std=c++11"], 25 | ) 26 | 27 | setuptools.setup( 28 | # Package details 29 | name='xswap', 30 | version=version, 31 | url='https://github.com/greenelab/xswap', 32 | project_urls={ 33 | 'Documentation': 'https://hetio.github.io/xswap/', 34 | 'Source': 'https://github.com/hetio/xswap', 35 | 'Tracker': 'https://github.com/hetio/xswap/issues', 36 | 'Publication': 'https://greenelab.github.io/xswap-manuscript/', 37 | }, 38 | description='Python-wrapped C/C++ library for degree-preserving network randomization', 39 | long_description_content_type='text/markdown', 40 | long_description=long_description, 41 | license='BSD 2-Clause', 42 | 43 | # Author details 44 | author='Michael Zietz', 45 | author_email='michael.zietz@gmail.com', 46 | 47 | # Specify python version 48 | python_requires='>=3.5', 49 | 50 | ext_modules=[xswap_cpp_extension], 51 | packages=setuptools.find_packages(), 52 | ) 53 | -------------------------------------------------------------------------------- /docs/img/xswap.svg: -------------------------------------------------------------------------------- 1 | 2 | 8 | 16 | 24 | 32 | 40 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /tests/test_permute.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import pytest 4 | import requests 5 | 6 | import xswap 7 | 8 
@pytest.mark.parametrize('edges,permutable', [
    ([(0, 0), (1, 1), (1, 2), (2, 3)], True),
    ([(0, 0)], False),
])
def test_xswap_changes_edges(edges, permutable):
    """
    Check that XSwap returns a different set of edges than the ones given if the edges
    are permutable. Check that XSwap does not modify edges in place.
    """
    edges_copy = edges.copy()
    new_edges, stats = xswap.permute_edge_list(
        edges, allow_self_loops=True, allow_antiparallel=True)
    assert edges == edges_copy
    if permutable:
        assert new_edges != edges
    else:
        assert new_edges == edges


def test_roaring_warning():
    """
    Check that a warning is given when using the much slower but far more general
    Roaring bitset rather than the faster fully uncompressed bitset, and that no
    such warning is given when the uncompressed bitset fits in `max_malloc`.
    """
    import warnings

    edges_url = "https://github.com/greenelab/xswap/raw/{}/{}".format(
        "8c31b4cbdbbf2cfa5018b1277bbd0e9f6263e573", "graphs/GiG_edges_reduced.txt")
    response = requests.get(edges_url)
    # Fail here with a clear HTTP error instead of later while parsing an error page
    response.raise_for_status()
    with tempfile.NamedTemporaryFile() as tf:
        tf.write(response.content)
        # The file is re-opened by name below; without a flush the buffered tail
        # of the download may not have reached the disk yet.
        tf.flush()
        edges = xswap.preprocessing.load_processed_edges(tf.name)

    # `pytest.warns(None)` is deprecated since pytest 7 and an error in pytest 8;
    # record warnings with the standard library instead.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        permuted_edges, stats = xswap.permute_edge_list(edges, allow_self_loops=True,
            allow_antiparallel=False, multiplier=0.1, seed=0, max_malloc=4000000000)
    assert not [w for w in caught if issubclass(w.category, RuntimeWarning)]

    with pytest.warns(RuntimeWarning, match="Using Roaring bitset because of the large number of edges."):
        permuted_edges, stats = xswap.permute_edge_list(edges, allow_self_loops=True,
            allow_antiparallel=False, multiplier=0.1, seed=0, max_malloc=10)
@pytest.mark.parametrize('matrix,correct_edges,include_reverse_edges', [ 9 | (numpy.array([[1,0,0,0],[0,0,1,0],[0,0,0,1]]), [(0, 0), (1, 2), (2, 3)], False), 10 | (numpy.array([[1,0,0],[0,0,1],[0,1,1]]), [(0, 0), (1, 2), (2, 2)], False), 11 | (numpy.array([[1,0,0],[0,0,1],[0,1,1]]), [(0, 0), (1, 2), (2, 1), (2, 2)], True), 12 | ]) 13 | def test_matrix_to_edges(matrix, correct_edges, include_reverse_edges): 14 | edges = xswap.network_formats.matrix_to_edges(matrix, include_reverse_edges) 15 | assert sorted(edges) == sorted(correct_edges) 16 | 17 | 18 | @pytest.mark.parametrize('edges,correct_matrix,add_reverse_edges,shape,dtype,sparse', [ 19 | ( 20 | [(0, 1), (0, 3), (2, 2)], 21 | numpy.array([[0,1,0,1], [1,0,0,0], [0,0,1,0], [1,0,0,0]], dtype=int), 22 | True, (4, 4), int, False), 23 | ( 24 | [(0, 1), (0, 3), (2, 2)], 25 | numpy.array([[0,1,0,1], [0,0,0,0], [0,0,1,0], [0,0,0,0]], dtype=int), 26 | False, (4, 4), int, False), 27 | ( 28 | [(0, 1), (0, 3), (2, 2)], 29 | numpy.array([[0,1,0,1], [0,0,0,0], [0,0,1,0]], dtype=int), 30 | False, (3, 4), int, False), 31 | ( 32 | [(0, 1), (0, 3), (2, 2)], 33 | numpy.array([[0,1,0,1], [0,0,0,0], [0,0,1,0]], dtype=float), 34 | False, (3, 4), float, False), 35 | ( 36 | [(0, 1), (0, 3), (2, 2)], 37 | numpy.array([[0,1,0,1], [0,0,0,0], [0,0,1,0]], dtype=numpy.float32), 38 | False, (3, 4), numpy.float32, False), 39 | ( 40 | [(0, 1), (0, 3), (2, 2)], 41 | scipy.sparse.csc_matrix([[0,1,0,1], [0,0,0,0], [0,0,1,0]], dtype=numpy.float32), 42 | False, (3, 4), numpy.float32, True), 43 | ]) 44 | def test_edges_to_matrix(edges, correct_matrix, add_reverse_edges, shape, dtype, sparse): 45 | matrix = xswap.network_formats.edges_to_matrix( 46 | edge_list=edges, add_reverse_edges=add_reverse_edges, shape=shape, 47 | dtype=dtype, sparse=sparse) 48 | 49 | assert matrix.dtype == dtype 50 | assert scipy.sparse.issparse(matrix) == sparse 51 | if sparse: 52 | assert (matrix != correct_matrix).nnz == 0 53 | else: 54 | assert numpy.array_equal(matrix, 
correct_matrix) 55 | -------------------------------------------------------------------------------- /tests/test_roaring.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../xswap/src/xswap.h" 3 | 4 | 5 | main(int argc, char const *argv[]) 6 | { 7 | int counter, incorrect_contains, incorrect_doesnt_contain; 8 | 9 | // Create real edges to be added to the Roaring set 10 | int** real_edges = (int**)malloc(sizeof(int*) * 16); 11 | counter = 0; 12 | for (int i = 4; i < 8; i++) { 13 | for (int j = 4; j < 8; j++) { 14 | real_edges[counter] = (int*)malloc(sizeof(int) * 2); 15 | real_edges[counter][0] = i; 16 | real_edges[counter][1] = j; 17 | counter += 1; 18 | } 19 | } 20 | 21 | Edges edges; 22 | edges.edge_array = real_edges; 23 | edges.num_edges = 16; 24 | RoaringBitSet edges_set = RoaringBitSet(edges); 25 | 26 | // Check that edges added at the creation of the set are contained 27 | incorrect_doesnt_contain = 0; 28 | for (int i = 4; i < 8; i++) { 29 | for (int j = 4; j < 8; j++) { 30 | int edge[2] = {i, j}; 31 | if (!edges_set.contains(edge)) { 32 | incorrect_doesnt_contain += 1; 33 | } 34 | } 35 | } 36 | 37 | // Create fake edges and check that they are not in the set 38 | counter = 0; 39 | incorrect_contains = 0; 40 | for (int i = 0; i < 4; i++) { 41 | for (int j = 0; j < 4; j++) { 42 | int fake_edge[2] = {i, j}; 43 | // Check that this edge is not in the set 44 | if (edges_set.contains(fake_edge)) { 45 | incorrect_contains += 1; 46 | } 47 | // Add the edge and check that it was added 48 | edges_set.add(fake_edge); 49 | if (!edges_set.contains(fake_edge)) { 50 | incorrect_doesnt_contain += 1; 51 | } 52 | // Remove the edge and check that it is removed 53 | edges_set.remove(fake_edge); 54 | if (edges_set.contains(fake_edge)) { 55 | incorrect_contains += 1; 56 | } 57 | counter += 1; 58 | } 59 | } 60 | 61 | free(real_edges); 62 | if (incorrect_contains == 0 && incorrect_doesnt_contain == 0) { 63 | 
std::cout << "All tests passed" << "\n"; 64 | return 0; 65 | } else { 66 | std::cout << "Tests failed " << incorrect_contains << " " << incorrect_doesnt_contain << "\n"; 67 | return 1; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /xswap/src/xswap.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../lib/roaring.hh" 3 | 4 | extern int CHAR_BITS; 5 | 6 | struct Edges { 7 | int** edge_array; 8 | int num_edges; 9 | int max_id; 10 | }; 11 | 12 | // Slower bitset 13 | class RoaringBitSet 14 | { 15 | public: 16 | RoaringBitSet() = default; 17 | RoaringBitSet(Edges edges); 18 | bool contains(int *edge); 19 | void add(int *edge); 20 | void remove(int *edge); 21 | 22 | private: 23 | Roaring bitmap; 24 | }; 25 | 26 | // Faster edge bitset for smaller numbers of edges 27 | class UncompressedBitSet 28 | { 29 | public: 30 | UncompressedBitSet() = default; 31 | UncompressedBitSet(int max_id, unsigned long long int max_malloc); 32 | UncompressedBitSet(Edges edges, unsigned long long int max_malloc); 33 | bool contains(int *edge); 34 | void add(int *edge); 35 | void remove(int *edge); 36 | void free_array(); 37 | 38 | private: 39 | char* bitset; 40 | size_t max_cantor; 41 | void create_bitset(size_t num_elements, unsigned long long int max_malloc); 42 | char get_bit(char word, char bit_position); 43 | void set_bit_true(char* word, char bit_position); 44 | void set_bit_false(char* word, char bit_position); 45 | }; 46 | 47 | // Wrapper class for the two bitset implementations 48 | class BitSet 49 | { 50 | public: 51 | BitSet(Edges edges, unsigned long long int max_malloc); 52 | bool contains(int *edge); 53 | void add(int *edge); 54 | void remove(int *edge); 55 | void free_array(); 56 | PyObject* runtime_warning_roaring(void); 57 | UncompressedBitSet uncompressed_set; 58 | 59 | private: 60 | bool use_compressed; 61 | RoaringBitSet compressed_set; 62 | }; 63 | 64 | struct 
statsCounter { 65 | int num_swaps; 66 | int same_edge = 0; 67 | int self_loop = 0; 68 | int duplicate = 0; 69 | int undir_duplicate = 0; 70 | int excluded = 0; 71 | }; 72 | 73 | struct Conditions { 74 | int seed; 75 | bool allow_antiparallel; 76 | bool allow_self_loop; 77 | Edges excluded_edges; 78 | }; 79 | 80 | size_t cantor_pair(int* edge); 81 | 82 | void swap_edges(Edges edges, int num_swaps, Conditions cond, statsCounter *stats, 83 | unsigned long long int max_malloc); 84 | 85 | bool is_valid_edge(int *edge, BitSet edges_set, Conditions cond, 86 | statsCounter *stats); 87 | 88 | bool is_valid_swap(int **new_edges, BitSet edges_set, Conditions cond, 89 | statsCounter *stats); 90 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | setup_and_test: &setup_and_test 2 | stage: test 3 | language: python 4 | addons: 5 | apt: 6 | packages: 7 | - pkg-config 8 | - python3-dev 9 | before_install: 10 | - pip install -r tests-require.txt 11 | install: 12 | - pkg-config --cflags --libs python3 13 | - python setup.py build 14 | - pip install . 
15 | script: 16 | - pytest tests/ 17 | - > 18 | g++ tests/test_bitset.cpp xswap/src/xswap.h xswap/src/bitset.cpp 19 | xswap/lib/roaring.c -o tests/test_bitset.o -std=c++11 20 | `pkg-config --cflags --libs python3` 21 | - ./tests/test_bitset.o 22 | - > 23 | g++ tests/test_roaring.cpp xswap/src/xswap.h xswap/src/bitset.cpp 24 | xswap/lib/roaring.c -o tests/test_roaring.o -std=c++11 25 | `pkg-config --cflags --libs python3` 26 | - ./tests/test_roaring.o 27 | 28 | build_and_upload: &build_and_upload 29 | stage: deploy 30 | sudo: required 31 | if: tag IS present 32 | services: 33 | - docker 34 | install: 35 | - docker pull $DOCKER_IMAGE 36 | script: 37 | - docker run --rm -e PLAT=$PLAT -v `pwd`:/io $DOCKER_IMAGE /io/ci/build-wheels.sh 38 | - /opt/python/3.6/bin/pip install twine 39 | - /opt/python/3.6/bin/python -m twine upload -u zietzm -p $PYPI_PASSWORD --repository-url https://upload.pypi.org/legacy/ --skip-existing wheelhouse/* 40 | 41 | compiler: 42 | - g++ 43 | matrix: 44 | include: 45 | - <<: *setup_and_test 46 | name: "Test 3.5 on Ubuntu" 47 | dist: xenial 48 | python: 3.5 49 | - <<: *setup_and_test 50 | name: "Test 3.6 on Ubuntu" 51 | dist: xenial 52 | python: 3.6 53 | - <<: *setup_and_test 54 | name: "Test 3.7 on Ubuntu" 55 | dist: xenial 56 | python: 3.7 57 | - <<: *build_and_upload 58 | name: "Build manylinux1_x86_64" 59 | env: 60 | - DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64 61 | - PLAT=manylinux1_x86_64 62 | - <<: *build_and_upload 63 | name: "Build manylinux1_i686" 64 | env: 65 | - DOCKER_IMAGE=quay.io/pypa/manylinux1_i686 66 | - PLAT=manylinux1_i686 67 | - <<: *build_and_upload 68 | name: "Build manylinux2010_x86_64" 69 | env: 70 | - DOCKER_IMAGE=quay.io/pypa/manylinux2010_x86_64 71 | - PLAT=manylinux2010_x86_64 72 | - name: "Build documentation" 73 | dist: xenial 74 | language: python 75 | python: 3.7 76 | install: 77 | - pip install --requirement tests-require.txt 78 | - pip install pdoc3~=0.7.0 ghp-import~=0.5.5 79 | script: 80 | - pdoc --force 
--html 81 | --config="git_link_template=\"https://github.com/$TRAVIS_REPO_SLUG/blob/{commit}/{path}#L{start_line}-L{end_line}\"" 82 | --output-dir=docs/output 83 | xswap 84 | deploy: 85 | provider: script 86 | script: bash ci/deploy.sh 87 | skip_cleanup: true 88 | on: 89 | branch: master 90 | condition: $TRAVIS_EVENT_TYPE = "push" 91 | -------------------------------------------------------------------------------- /xswap/src/xswap.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "xswap.h" 3 | 4 | void swap_edges(Edges edges, int num_swaps, Conditions cond, statsCounter *stats, 5 | unsigned long long int max_malloc) { 6 | // Initialize bitset for possible edges 7 | BitSet edges_set = BitSet(edges, max_malloc); 8 | 9 | // Initialize unbiased random number generator 10 | std::mt19937 rng(cond.seed); 11 | std::uniform_int_distribution uni(0, edges.num_edges - 1); 12 | 13 | // Do XSwap 14 | for (int i = 0; i < num_swaps; i++) { 15 | // Draw edges randomly 16 | int edge_index_a = uni(rng); 17 | int edge_index_b = uni(rng); 18 | 19 | if (edge_index_a == edge_index_b) { 20 | stats->same_edge += 1; 21 | continue; 22 | } 23 | 24 | // Old edges 25 | int* edge_a = edges.edge_array[edge_index_a]; 26 | int* edge_b = edges.edge_array[edge_index_b]; 27 | 28 | // Form potential new edges 29 | int new_edge_a[2] = { edge_a[0], edge_b[1] }; 30 | int new_edge_b[2] = { edge_b[0], edge_a[1] }; 31 | int* new_edges[2] = { new_edge_a, new_edge_b }; 32 | 33 | bool valid = is_valid_swap(new_edges, edges_set, cond, stats); 34 | if (valid) { 35 | edges_set.remove(edge_a); 36 | edges_set.remove(edge_b); 37 | 38 | int temp_target = edge_a[1]; 39 | edge_a[1] = edge_b[1]; 40 | edge_b[1] = temp_target; 41 | 42 | edges_set.add(new_edge_a); 43 | edges_set.add(new_edge_b); 44 | } 45 | } 46 | edges_set.free_array(); 47 | } 48 | 49 | bool is_valid_edge(int *new_edge, BitSet edges_set, Conditions valid_conditions, 50 | statsCounter *stats) { 
from typing import List, Tuple, TypeVar

import numpy
import scipy.sparse


def matrix_to_edges(matrix: numpy.ndarray, include_reverse_edges: bool=True):
    """
    Convert (bi)adjacency matrix to an edge list. Inverse of `edges_to_matrix`.

    Parameters
    ----------
    matrix : numpy.ndarray
        Adjacency matrix or biadjacency matrix of a network. Anything accepted
        by `scipy.sparse.coo_matrix` (including sparse matrices) works.
    include_reverse_edges : bool
        Whether to return edges that are the inverse of existing edges. For
        example, if returning [(0, 1), (1, 0)] is desired or not. If False,
        then only edges where source <= target are returned. This parameter
        should be `True` when passing a biadjacency matrix, as matrix positions
        indicate separate nodes.

    Returns
    -------
    edge_list : List[Tuple[int, int]]
        Edge list with node ids as the corresponding matrix indices. For example,
        if `matrix` has `matrix[0, 2] == 1`, then `(0, 2)` will be among the
        returned edges. Node ids are plain Python `int`s.
    """
    coo = scipy.sparse.coo_matrix(matrix)

    # A sparse input may carry explicitly stored zeros; those are not edges.
    stored = coo.data != 0

    # `.tolist()` converts NumPy integer scalars to the documented Python ints.
    sources = coo.row[stored].tolist()
    targets = coo.col[stored].tolist()

    if include_reverse_edges:
        return list(zip(sources, targets))
    return [(source, target) for source, target in zip(sources, targets)
            if source <= target]


def edges_to_matrix(edge_list: List[Tuple[int, int]], add_reverse_edges: bool,
                    shape: Tuple[int, int], dtype: TypeVar=bool, sparse: bool=True):
    """
    Convert edge list to (bi)adjacency matrix. Inverse of `matrix_to_edges`.

    Parameters
    ----------
    edge_list : List[Tuple[int, int]]
        An edge list mapped such that node ids correspond to desired matrix
        positions. For example, (0, 0) will mean that the resulting matrix has
        a positive value of type `dtype` in that position. May be empty, in
        which case an all-zero matrix of the requested shape is returned.
    add_reverse_edges : bool
        Whether to include the reverse of edges in the matrix. For example,
        if `edge_list = [(1, 0)]` and `add_reverse_edges = True`, then the
        returned matrix has `matrix[1, 0]` = `matrix[0, 1]` = 1. Else, the matrix
        only has `matrix[1, 0]` = 1. If a biadjacency matrix is desired, then
        set `add_reverse_edges = False`.
    shape : Tuple[int, int]
        Shape of the matrix to be returned. Allows edges to be converted to
        a matrix even when there are nodes without edges.
    dtype : data-type
        Dtype of the returned matrix. For example, `int`, `bool`, `float`, etc.
    sparse : bool
        Whether a sparse matrix should be returned. If `False`, returns a dense
        numpy.ndarray

    Returns
    -------
    matrix : scipy.sparse.csc_matrix or numpy.ndarray

    Raises
    ------
    ValueError
        If `edge_list` does not consist of (source, target) pairs, or if
        `add_reverse_edges` is requested for a non-square `shape`.
    """
    # Materialize the indices as real arrays. Handing SciPy a one-shot `zip`
    # iterator breaks for an empty edge list and whenever the constructor needs
    # to look at the coordinates more than once.
    edge_array = numpy.asarray(list(edge_list), dtype=int)
    if edge_array.size == 0:
        edge_array = edge_array.reshape(0, 2)
    elif edge_array.ndim != 2 or edge_array.shape[1] != 2:
        raise ValueError("edge_list must contain (source, target) pairs.")
    rows, cols = edge_array[:, 0], edge_array[:, 1]

    matrix = scipy.sparse.csc_matrix(
        (numpy.ones(len(rows)), (rows, cols)), dtype=dtype, shape=shape,
    )

    if add_reverse_edges:
        if shape[0] != shape[1]:
            raise ValueError(
                "add_reverse_edges requires a square matrix, got shape {}.".format(shape))
        # Symmetrize, then collapse back to 0/1 entries of the requested dtype
        matrix = (matrix + matrix.T) > 0
        matrix = matrix.astype(dtype)

    if not sparse:
        matrix = matrix.toarray()

    return matrix
19 | """ 20 | occurrence_matrix = xswap.prior.compute_xswap_occurrence_matrix( 21 | edges, n_permutations=num_swaps, shape=shape, allow_self_loops=True, 22 | allow_antiparallel=True) 23 | if num_swaps: 24 | edge_prior = (occurrence_matrix / num_swaps).toarray() 25 | else: 26 | edge_prior = occurrence_matrix.toarray() 27 | assert numpy.abs(edge_prior - true_prior).max() == pytest.approx(0, abs=0.01) 28 | 29 | 30 | @pytest.mark.parametrize('edges,dtypes,source_degrees,target_degrees,shape,allow_antiparallel', [ 31 | ( 32 | [(0, 2), (0, 3), (1, 2), (2, 3), (3, 4)], 33 | {'id': numpy.uint16, 'edge': bool, 'degree': numpy.uint32, 'xswap_prior': float}, 34 | {0: 2, 1: 1, 2: 3, 3: 3, 4: 1}, {0: 2, 1: 1, 2: 3, 3: 3, 4: 1}, (5, 5), False 35 | ), 36 | ( 37 | [(0, 2), (0, 3), (1, 2), (2, 3), (3, 4)], 38 | {'id': numpy.int8, 'edge': int, 'degree': numpy.float, 'xswap_prior': numpy.float64}, 39 | {0: 2, 1: 1, 2: 3, 3: 3, 4: 1}, {0: 2, 1: 1, 2: 3, 3: 3, 4: 1}, (5, 5), False 40 | ), 41 | ( 42 | [(0, 2), (0, 3), (1, 2), (1, 3)], 43 | {'id': numpy.float16, 'edge': float, 'degree': float, 'xswap_prior': numpy.float32}, 44 | {0: 2, 1: 2, 2: 0, 3: 0}, {0: 0, 1: 0, 2: 2, 3: 2}, (4, 4), True 45 | ), 46 | ]) 47 | def test_prior_dataframe(edges, dtypes, source_degrees, target_degrees, shape, allow_antiparallel): 48 | """ 49 | Check that the `xswap.prior.compute_xswap_priors` performs correctly 50 | """ 51 | prior_df = xswap.prior.compute_xswap_priors(edges, n_permutations=1000, 52 | shape=shape, allow_self_loops=False, allow_antiparallel=allow_antiparallel, dtypes=dtypes) 53 | 54 | assert isinstance(prior_df, pandas.DataFrame) 55 | assert list(prior_df.columns) == ['source_id', 'target_id', 'edge', 'source_degree', 56 | 'target_degree', 'xswap_prior'] 57 | assert dict(prior_df.dtypes) == { 58 | 'source_id': dtypes['id'], 'target_id': dtypes['id'], 'edge': dtypes['edge'], 59 | 'source_degree': dtypes['degree'], 'target_degree': dtypes['degree'], 60 | 'xswap_prior': dtypes['xswap_prior'] 61 
def permute_edge_list(edge_list: List[Tuple[int, int]], allow_self_loops: bool = False,
                      allow_antiparallel: bool = False, multiplier: float = 10,
                      excluded_edges: Set[Tuple[int, int]] = set(), seed: int = 0,
                      max_malloc: int = 4000000000):
    """
    Permute the edges of a graph using the XSwap method given by Hanhijärvi,
    et al. (doi.org/f3mn58). XSwap is a degree-preserving network randomization
    technique that selects edges, checks the validity of the swap, and exchanges
    the target nodes between the edges. For guidance on choosing
    `allow_self_loops` and `allow_antiparallel` for bipartite, directed, and
    undirected networks, please see README.md.

    Parameters
    ----------
    edge_list : List[Tuple[int, int]]
        Edge list representing the graph to be randomized. Tuples can contain
        integer values representing nodes. No value should be greater than C++'s
        `INT_MAX`, in this case 2_147_483_647.
    allow_self_loops : bool
        Whether to allow edges like (0, 0). In the case of bipartite graphs,
        such an edge represents a connection between two distinct nodes, while
        in other graphs it may represent an edge from a node to itself, in which
        case an edge may or may not be meaningful depending on context.
    allow_antiparallel : bool
        Whether to allow simultaneous edges like (0, 1) and (1, 0). In the case
        of bipartite graphs, these edges represent two connections between four
        distinct nodes, while for other graphs, these may be connections between
        the same two nodes.
    multiplier : float
        The number of edge swap attempts is determined by the product of the
        number of existing edges and multiplier. For example, if five edges are
        passed and multiplier is set to 10, 50 swaps will be attempted. Non-integer
        products will be rounded down to the nearest integer.
    excluded_edges : Set[Tuple[int, int]]
        Specific edges which should never be created by the network randomization
    seed : int
        Random seed that will be passed to the C++ Mersenne Twister 19937 random
        number generator.
    max_malloc : int (`unsigned long long int` in C)
        The maximum amount of memory to be allocated using `malloc` when making
        a bitset to hold edges. An uncompressed bitset is implemented for
        holding edges that is significantly faster than alternatives. However,
        it is memory-inefficient and will not be used if more memory is required
        than `max_malloc`. Above the threshold, a Roaring bitset will be used.

    Returns
    -------
    new_edges : List[Tuple[int, int]]
        Edge list of a permutation of the network given as `edge_list`
    stats : Dict[str, int]
        Information about the permutation performed. Gives the following information:
        `swap_attempts` - number of attempted swaps
        `same_edge` - number of swaps rejected because one edge was chosen twice
        `self_loop` - number of swaps rejected because new edge is a self-loop
        `duplicate` - number of swaps rejected because new edge already exists
        `undir_duplicate` - number of swaps rejected because the network is
            undirected and the reverse of the new edge already exists
        `excluded` - number of swaps rejected because new edge was among excluded

    Raises
    ------
    ValueError
        If `edge_list` contains the same edge more than once.
    """
    # Validate before touching the compiled backend so bad input fails fast.
    if len(edge_list) != len(set(edge_list)):
        raise ValueError("Edge list contained duplicate edges.")

    # An empty network has exactly one (empty) permutation. Returning it here
    # avoids the opaque "max() arg is an empty sequence" error below.
    if not edge_list:
        return [], {
            'swap_attempts': 0, 'same_edge': 0, 'self_loop': 0,
            'duplicate': 0, 'undir_duplicate': 0, 'excluded': 0,
        }

    # Imported lazily: the compiled extension is only needed to do real work.
    import xswap._xswap_backend

    # Number of attempted XSwap swaps
    num_swaps = int(multiplier * len(edge_list))

    # Compute the maximum node ID (for creating the bitset)
    max_id = max(map(max, edge_list))

    # `excluded_edges` is only read (copied into a list), never mutated, so
    # the shared default set is safe here.
    new_edges, stats = xswap._xswap_backend._xswap(
        edge_list, list(excluded_edges), max_id, allow_self_loops,
        allow_antiparallel, num_swaps, seed, max_malloc)

    return new_edges, stats
def _map_nodes_to_int(nodes):
    """
    Return a dict mapping each distinct node name to its index in the sorted
    order of the distinct names. Nodes should be a list of strings.

    Returns:
    --------
    Dict[str, int]
    """
    return {name: i for i, name in enumerate(sorted(set(nodes)))}


def _apply_map(edges, source_mapping, target_mapping):
    """
    Maps edges according to new node names specified by source and target maps.
    A node that is missing from its mapping is mapped to `None`.

    edges : List[Tuple[str, str]]
    source_mapping : Dict[str, int]
    target_mapping : Dict[str, int]

    Returns:
    --------
    List[Tuple[int, int]]
    """
    return [
        (source_mapping.get(edge[0]), target_mapping.get(edge[1]))
        for edge in edges
    ]


def map_str_edges(edges, bipartite):
    """
    Maps a list of edge tuples containing strings to a minimal set of
    integer edges.

    edges : List[Tuple[str, str]]
    bipartite : bool
        Whether source and target nodes are mapped separately.
        For example, an edge like ('1', '1') may refer to a connection between
        separate nodes, or it may be a self-loop. If `bipartite=True`, source
        and target names are numbered independently, so equal ids on the two
        sides (e.g. the mapped edge (0, 0)) do not indicate the same node. To
        ensure that names are consistently mapped between source and target,
        put `bipartite=False`; both returned mappings are then the same dict.

    Returns:
    --------
    Tuple[List[Tuple[int, int]], Dict[str, int], Dict[str, int]]
        The mapped edges, the source name -> id mapping, and the target
        name -> id mapping.

    Example:
    --------
    >>> map_str_edges([('a', 'b'), ('b', 'c')], bipartite=False)
    ([(0, 1), (1, 2)], {'a': 0, 'b': 1, 'c': 2}, {'a': 0, 'b': 1, 'c': 2})
    """
    source_nodes = [edge[0] for edge in edges]
    target_nodes = [edge[1] for edge in edges]

    if bipartite:
        # Two separate mappings to be used for source and target nodes
        source_map = _map_nodes_to_int(source_nodes)
        target_map = _map_nodes_to_int(target_nodes)
    else:
        # One single mapping to be used for both source and target nodes
        # (_map_nodes_to_int de-duplicates, so no set() is needed here)
        source_map = target_map = _map_nodes_to_int(source_nodes + target_nodes)

    mapped_edges = _apply_map(edges, source_map, target_map)
    return (mapped_edges, source_map, target_map)
12 | Permuted networks can be used for a number of purposes in network analysis, including for generating counterfactual distributions of features when only the network's degree sequence is maintained or for computing a prior probability of an edge given only the network's degree sequence. 13 | Overall, permuted networks allow one to quantify the effects of degree on analysis and prediction methods. 14 | Understanding this effect is useful when a network's degree sequence is subject to biases. 15 | This implementation is a modified version of the algorithm due to Hanhijärvi et al. with two additional parameters (`allow_self_loops` and `allow_antiparallel`), which enable greater generalizability to bipartite, directed, and undirected networks. 16 | 17 | 1. **Randomization Techniques for Graphs** 18 | Sami Hanhijärvi, Gemma C. Garriga, Kai Puolamäki 19 | *Proceedings of the 2009 SIAM International Conference on Data Mining* (2009-04-30) 20 | DOI: [10.1137/1.9781611972795.67](https://doi.org/10.1137/1.9781611972795.67) 21 | 22 | ## Usage examples 23 | 24 | #### Permuting an edge list 25 | 26 | ```python 27 | >>> edges = [(0, 1), (1, 0)] 28 | >>> permuted_edges, permutation_statistics = xswap.permute_edge_list( 29 | edges, allow_self_loops=True, allow_antiparallel=True, 30 | multiplier=10) 31 | >>> permuted_edges 32 | [(0, 0), (1, 1)] 33 | >>> permutation_statistics 34 | {'swap_attempts': 20, 'same_edge': 10, 'self_loop': 0, 'duplicate': 1, 35 | 'undir_duplicate': 0, 'excluded': 0} 36 | ``` 37 | 38 | #### Computing degree-sequence based prior probabilities of edges existing 39 | 40 | ```python 41 | >>> edges = [(0, 1), (1, 0)] 42 | >>> prior_prob_df = xswap.prior.compute_xswap_priors( 43 | edges, n_permutations=10000, shape=(2, 2), allow_self_loops=True, 44 | allow_antiparallel=True) 45 | >>> prior_prob_df 46 | source_id target_id edge source_degree target_degree xswap_prior 47 | 0 0 0 False 1 1 0.5 48 | 1 0 1 True 1 1 0.5 49 | 2 1 0 True 1 1 0.5 50 | 3 1 1 False 1 1 
0.5 51 | ``` 52 | 53 | ## Choice of parameters 54 | 55 | #### Bipartite networks 56 | 57 | Bipartite networks should be indexed using the bi-adjacency matrix, meaning that the edge `(0, 0)` is from source node 0 to target node 0, and is not a self-loop. 58 | Moreover, bipartite networks should be permuted using `allow_self_loops=False` and `allow_antiparallel=True`. 59 | 60 | #### Directed and undirected networks 61 | 62 | For non-bipartite networks, the decisions of `allow_self_loops` and `allow_antiparallel` are not always the same. 63 | For undirected networks, set `allow_antiparallel=False`, as otherwise the edges (1, 0) and (0, 1), which represent the same edge, will be treated as separate. 64 | Antiparallel edges may or may not be allowed for directed networks, depending on context. 65 | Similarly, self-loops may or may not be allowed for directed or undirected networks, depending on the specific network being permuted. 66 | 67 | ## Libraries 68 | 69 | The XSwap library includes [Roaring Bitmaps](https://github.com/RoaringBitmap/CRoaring), available under the [Apache 2.0 license](https://github.com/RoaringBitmap/CRoaring/blob/LICENSE). 70 | 71 | ## Acknowledgments 72 | 73 | Development of this project has largely taken place in the [Greene Lab](http://www.greenelab.com/) at the University of Pennsylvania. As an open source project under the `hetio` organization, this repository is grateful for its community of maintainers, contributors, and users. 74 | 75 | This work is funded in part by the Gordon and Betty Moore Foundation’s Data-Driven Discovery Initiative through Grants [GBMF4552](https://www.moore.org/grant-detail?grantId=GBMF4552) to Casey Greene, [GBMF4560](https://www.moore.org/grant-detail?grantId=GBMF4560) to Blair Sullivan, and the National Institutes of Health’s National Human Genome Research Institute [R01 HG010067](http://grantome.com/grant/NIH/R01-HG010067-02). 
76 | -------------------------------------------------------------------------------- /xswap/src/xswap_wrapper.cpp: -------------------------------------------------------------------------------- 1 | #include "xswap.h" 2 | 3 | #define XSWAP_MODULE 4 | 5 | static Edges py_list_to_edges(PyObject *py_list) { 6 | int num_edges = (int)PyList_Size(py_list); 7 | int** edges_array = (int**)malloc(sizeof(int*) * num_edges); 8 | 9 | for (int i = 0; i < num_edges; i++) { 10 | edges_array[i] = (int*)malloc(sizeof(int) * 2); 11 | PyObject* py_tuple = PyList_GetItem(py_list, i); 12 | for (int j = 0; j < 2; j++) { 13 | PyObject* temp = PyTuple_GetItem(py_tuple, j); 14 | int value = (int)PyLong_AsLong(temp); 15 | edges_array[i][j] = value; 16 | } 17 | } 18 | Edges return_object; 19 | return_object.edge_array = edges_array; 20 | return_object.num_edges = num_edges; 21 | return return_object; 22 | } 23 | 24 | static PyObject* edge_to_py_tuple(int *edge) { 25 | PyObject* edge_tuple = PyTuple_New(2); 26 | for (int j = 0; j < 2; j++) { 27 | PyObject* node_id = PyLong_FromLong(edge[j]); 28 | PyTuple_SET_ITEM(edge_tuple, j, node_id); 29 | } 30 | return edge_tuple; 31 | } 32 | 33 | static PyObject* edges_to_py_list(Edges edges) { 34 | int num_edges = edges.num_edges; 35 | PyObject* py_list = PyList_New(num_edges); 36 | 37 | for (int i = 0; i < num_edges; i++) { 38 | PyObject* edge_tuple = edge_to_py_tuple(edges.edge_array[i]); 39 | PyList_SET_ITEM(py_list, i, edge_tuple); 40 | } 41 | return py_list; 42 | } 43 | 44 | static PyObject* stats_to_py_dict(statsCounter& stats) { 45 | PyObject* py_num_swaps = PyLong_FromLong(stats.num_swaps); 46 | PyObject* py_same_edge = PyLong_FromLong(stats.same_edge); 47 | PyObject* py_self_loop = PyLong_FromLong(stats.self_loop); 48 | PyObject* py_duplicate = PyLong_FromLong(stats.duplicate); 49 | PyObject* py_undir_duplicate = PyLong_FromLong(stats.undir_duplicate); 50 | PyObject* py_excluded = PyLong_FromLong(stats.excluded); 51 | 52 | PyObject* dict = 
PyDict_New(); 53 | PyDict_SetItemString(dict, "swap_attempts", py_num_swaps); 54 | PyDict_SetItemString(dict, "same_edge", py_same_edge); 55 | PyDict_SetItemString(dict, "self_loop", py_self_loop); 56 | PyDict_SetItemString(dict, "duplicate", py_duplicate); 57 | PyDict_SetItemString(dict, "undir_duplicate", py_undir_duplicate); 58 | PyDict_SetItemString(dict, "excluded", py_excluded); 59 | return dict; 60 | } 61 | 62 | static PyObject* wrap_xswap(PyObject *self, PyObject *args) { 63 | // Get arguments from python and compute quantities where needed 64 | PyObject *py_edges, *py_excluded_edges; 65 | int max_id, num_swaps, seed, allow_self_loop, allow_antiparallel; 66 | unsigned long long int max_malloc; 67 | int parsed_successfully = PyArg_ParseTuple(args, "OOippiiK", &py_edges, 68 | &py_excluded_edges, &max_id, &allow_self_loop, 69 | &allow_antiparallel, &num_swaps, &seed, &max_malloc); 70 | if (!parsed_successfully) 71 | return NULL; 72 | 73 | // Load edges from python list 74 | Edges edges = py_list_to_edges(py_edges); 75 | edges.max_id = max_id; 76 | Edges excluded_edges = py_list_to_edges(py_excluded_edges); 77 | 78 | // Set the conditions under which new edges are accepted 79 | Conditions valid_cond; 80 | valid_cond.seed = seed; 81 | valid_cond.allow_self_loop = allow_self_loop; 82 | valid_cond.allow_antiparallel = allow_antiparallel; 83 | valid_cond.excluded_edges = excluded_edges; 84 | 85 | // Initialize stats counters for failure reasons 86 | statsCounter stats; 87 | stats.num_swaps = num_swaps; 88 | 89 | // Perform XSwap 90 | swap_edges(edges, num_swaps, valid_cond, &stats, max_malloc); 91 | 92 | // Get new edges as python list 93 | PyObject* py_list = edges_to_py_list(edges); 94 | 95 | // Get stats as python dict 96 | PyObject* stats_py_dict = stats_to_py_dict(stats); 97 | 98 | // Create and return a python tuple of new_edges, stats 99 | PyObject* return_tuple = PyTuple_New(2); 100 | PyTuple_SET_ITEM(return_tuple, 0, py_list); 101 | 
PyTuple_SET_ITEM(return_tuple, 1, stats_py_dict); 102 | for (int i = 0; i < edges.num_edges; i++) { 103 | free(edges.edge_array[i]); 104 | } 105 | free(edges.edge_array); 106 | for (int i = 0; i < valid_cond.excluded_edges.num_edges; i++) { 107 | free(valid_cond.excluded_edges.edge_array[i]); 108 | } 109 | free(valid_cond.excluded_edges.edge_array); 110 | return return_tuple; 111 | } 112 | 113 | static PyMethodDef XSwapMethods[] = { 114 | {"_xswap", wrap_xswap, METH_VARARGS, "Backend for edge permutation"}, 115 | {NULL, NULL, 0, NULL} 116 | }; 117 | 118 | static struct PyModuleDef xswapmodule = { 119 | PyModuleDef_HEAD_INIT, 120 | "_xswap_backend", /* name of module */ 121 | NULL, /* module documentation, NULL */ 122 | -1, /* -1 since the module keeps state in global variables. */ 123 | XSwapMethods 124 | }; 125 | 126 | PyMODINIT_FUNC PyInit__xswap_backend(void) { 127 | return PyModule_Create(&xswapmodule); 128 | } 129 | -------------------------------------------------------------------------------- /tests/test_bitset.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "../xswap/src/xswap.h" 6 | 7 | void handle_eptr(std::exception_ptr eptr) { 8 | try { 9 | if (eptr) { 10 | std::rethrow_exception(eptr); 11 | } 12 | } catch(const std::exception& e) { 13 | std::cout << "Unexpected exception while attempting bad element access " << e.what() << "\n"; 14 | } 15 | } 16 | 17 | bool test_add(UncompressedBitSet edges_set) { 18 | int edge_to_add[2] = {1, 1}; 19 | edges_set.add(edge_to_add); 20 | int** fake_edges = (int**)malloc(sizeof(int*) * 16); 21 | int counter = 0; 22 | for (int i = 0; i < 4; i++) { 23 | for (int j = 0; j < 4; j++) { 24 | fake_edges[counter] = (int*)malloc(sizeof(int) * 2); 25 | fake_edges[counter][0] = i; 26 | fake_edges[counter][1] = j; 27 | counter += 1; 28 | } 29 | } 30 | bool correctly_contains = edges_set.contains(edge_to_add); 31 | int num_incorrect 
= 0; 32 | for (int i = 0; i < 16; i++) { 33 | bool incorrectly_contains = edges_set.contains(fake_edges[i]); 34 | bool was_added = (fake_edges[i][0] == edge_to_add[0] && fake_edges[i][1] == edge_to_add[1]); 35 | if (incorrectly_contains and !was_added) { 36 | num_incorrect += 1; 37 | std::printf("Incorrectly contained: (%d, %d)\n", fake_edges[i][0], fake_edges[i][1]); 38 | } 39 | } 40 | free(fake_edges); 41 | if (num_incorrect == 0 && correctly_contains == true) { 42 | return true; 43 | } else { 44 | return false; 45 | } 46 | } 47 | 48 | bool test_remove(UncompressedBitSet edges_set) { 49 | int edge_to_add[2] = {1, 1}; 50 | edges_set.add(edge_to_add); 51 | bool was_added = edges_set.contains(edge_to_add); 52 | edges_set.remove(edge_to_add); 53 | bool was_removed = !edges_set.contains(edge_to_add); 54 | bool passed = was_added && was_removed; 55 | if (!was_added) 56 | std::printf("Did not add edge properly"); 57 | if (!was_removed) 58 | std::printf("Did not remove edge properly"); 59 | return passed; 60 | } 61 | 62 | bool test_oob_insert(UncompressedBitSet edges_set) { 63 | 64 | int edge_to_add[2] = {4, 4}; 65 | std::exception_ptr eptr; 66 | try { 67 | edges_set.add(edge_to_add); 68 | } catch(std::out_of_range) { 69 | return true; 70 | } catch(...) { 71 | eptr = std::current_exception(); 72 | handle_eptr(eptr); 73 | return true; 74 | } 75 | std::printf("No exception on OOB insert\n"); 76 | return false; 77 | } 78 | 79 | bool test_oob_access(UncompressedBitSet edges_set) { 80 | int edge_to_access[2] = {4, 4}; 81 | std::exception_ptr eptr; 82 | try { 83 | edges_set.add(edge_to_access); 84 | } catch(std::out_of_range) { 85 | return true; 86 | } catch(...) 
{ 87 | eptr = std::current_exception(); 88 | handle_eptr(eptr); 89 | return true; 90 | } 91 | std::printf("No exception on OOB access\n"); 92 | return false; 93 | } 94 | 95 | bool test_oob_remove(UncompressedBitSet edges_set) { 96 | int edge_to_access[2] = {4, 4}; 97 | std::exception_ptr eptr; 98 | try { 99 | edges_set.add(edge_to_access); 100 | } catch(std::out_of_range) { 101 | return true; 102 | } catch(...) { 103 | eptr = std::current_exception(); 104 | handle_eptr(eptr); 105 | return true; 106 | } 107 | std::printf("No exception on OOB removal\n"); 108 | return false; 109 | } 110 | 111 | bool test_remove_nonexistent(UncompressedBitSet edges_set) { 112 | int edge_to_access[2] = {2, 2}; 113 | std::exception_ptr eptr; 114 | try { 115 | edges_set.remove(edge_to_access); 116 | } catch(std::logic_error) { 117 | return true; 118 | } catch(...) { 119 | eptr = std::current_exception(); 120 | handle_eptr(eptr); 121 | return true; 122 | } 123 | std::printf("No exception on removal of nonexisting element\n"); 124 | return false; 125 | } 126 | 127 | bool test_insert_existing(UncompressedBitSet edges_set) { 128 | int edge_to_access[2] = {2, 2}; 129 | edges_set.add(edge_to_access); 130 | std::exception_ptr eptr; 131 | try { 132 | edges_set.add(edge_to_access); 133 | } catch(std::logic_error) { 134 | return true; 135 | } catch(...) 
{ 136 | eptr = std::current_exception(); 137 | handle_eptr(eptr); 138 | return true; 139 | } 140 | std::printf("No exception on addition of existing element\n"); 141 | return false; 142 | } 143 | 144 | main(int argc, char const *argv[]) { 145 | unsigned long long int max_malloc = 4000000; 146 | int num_tests = 7; 147 | bool test_passed[num_tests]; 148 | 149 | UncompressedBitSet edges_set = UncompressedBitSet(3, max_malloc); 150 | test_passed[0] = test_add(edges_set); 151 | edges_set = UncompressedBitSet(3, max_malloc); // Reset so functions don't interfere 152 | test_passed[1] = test_remove(edges_set); 153 | test_passed[2] = test_oob_insert(edges_set); 154 | test_passed[3] = test_oob_access(edges_set); 155 | test_passed[4] = test_oob_remove(edges_set); 156 | edges_set = UncompressedBitSet(3, max_malloc); 157 | test_passed[5] = test_remove_nonexistent(edges_set); 158 | edges_set = UncompressedBitSet(3, max_malloc); 159 | test_passed[6] = test_insert_existing(edges_set); 160 | 161 | bool all_tests_passed = true; 162 | for (int i = 0; i < num_tests; i++) { 163 | all_tests_passed &= test_passed[i]; 164 | } 165 | 166 | if (all_tests_passed) { 167 | std::printf("All tests passed\n"); 168 | return 0; 169 | } else { 170 | std::printf("Test failure\n"); 171 | return 1; 172 | } 173 | edges_set.free_array(); 174 | } 175 | -------------------------------------------------------------------------------- /xswap/src/bitset.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "xswap.h" 4 | 5 | int CHAR_BITS = 8*sizeof(char); 6 | 7 | size_t cantor_pair(int* edge) { 8 | size_t source = edge[0]; 9 | size_t target = edge[1]; 10 | return ((source + target) * (source + target + 1) / 2) + target; 11 | } 12 | 13 | UncompressedBitSet::UncompressedBitSet(int max_id, unsigned long long int max_malloc) { 14 | int max_pair[2] = {max_id, max_id}; 15 | max_cantor = cantor_pair(max_pair); 16 | create_bitset(max_cantor, 
max_malloc); 17 | } 18 | 19 | UncompressedBitSet::UncompressedBitSet(Edges edges, unsigned long long int max_malloc) { 20 | int max_pair[2] = {edges.max_id, edges.max_id}; 21 | max_cantor = cantor_pair(max_pair); 22 | create_bitset(max_cantor, max_malloc); 23 | for (int i = 0; i < edges.num_edges; i++) { 24 | add(edges.edge_array[i]); 25 | } 26 | } 27 | 28 | bool UncompressedBitSet::contains(int *edge) { 29 | size_t edge_cantor = cantor_pair(edge); 30 | if (edge_cantor > max_cantor) 31 | throw std::out_of_range("Attempting to check membership for out-of-bounds element."); 32 | return (bool)get_bit(bitset[edge_cantor / CHAR_BITS], edge_cantor % CHAR_BITS); 33 | } 34 | 35 | void UncompressedBitSet::add(int *edge) { 36 | size_t edge_cantor = cantor_pair(edge); 37 | if (edge_cantor > max_cantor) { 38 | throw std::out_of_range("Attempting to add an out-of-bounds element to the bitset."); 39 | } 40 | if (get_bit(bitset[edge_cantor / CHAR_BITS], edge_cantor % CHAR_BITS)) { 41 | throw std::logic_error("Attempting to add an existing element."); 42 | } 43 | set_bit_true(&bitset[edge_cantor / CHAR_BITS], edge_cantor % CHAR_BITS); 44 | } 45 | 46 | void UncompressedBitSet::remove(int *edge) { 47 | size_t edge_cantor = cantor_pair(edge); 48 | if (edge_cantor > max_cantor) 49 | throw std::out_of_range("Attempting to remove an out-of-bounds element."); 50 | if (!get_bit(bitset[edge_cantor / CHAR_BITS], edge_cantor % CHAR_BITS)) 51 | throw std::logic_error("Attempting to remove a nonexisting element."); 52 | set_bit_false(&bitset[edge_cantor / CHAR_BITS], edge_cantor % CHAR_BITS); 53 | } 54 | 55 | void UncompressedBitSet::free_array() { 56 | free(bitset); 57 | } 58 | 59 | // num_elements corresponds to the minimum number of bits that are needed 60 | void UncompressedBitSet::create_bitset(size_t num_elements, 61 | unsigned long long int max_malloc) { 62 | // Minimum sufficient number of bytes for the array "ceil(num_elements / CHAR_BITS)" 63 | size_t bytes_needed = (num_elements + 
/* Gets the bit from byte `word` at position `bit_position`. In the array, bits
correspond to cantor pair values 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, etc. To access
the bit corresponding to cantor pair value 9, call `get_bit` with `word` equal
to the second byte (index 9 / 8 = 1) and `bit_position` equal to 1 (9 % 8,
i.e. the second bit of that byte). Bit positions count from the most
significant bit, so `word >> (7 - bit_position)` puts the selected bit in the
least significant position before it is masked with 0x1. */
char UncompressedBitSet::get_bit(char word, char bit_position) {
    return (word >> (7 - bit_position)) & 0x1;
}
def compute_xswap_occurrence_matrix(edge_list: List[Tuple[int, int]],
                                    n_permutations: int,
                                    shape: Tuple[int, int],
                                    allow_self_loops: bool = False,
                                    allow_antiparallel: bool = False,
                                    sparse: bool = True,
                                    swap_multiplier: float = 10,
                                    initial_seed: int = 0,
                                    max_malloc: int = 4000000000):
    """
    Count, for every node pair, the number of XSwap permutations of a network
    in which that pair is connected by an edge. Dividing the counts by
    `n_permutations` gives the empirical XSwap prior: the probability of an
    edge existing between two nodes given only the network's degree sequence.

    Parameters
    ----------
    edge_list : List[Tuple[int, int]]
        Edge list representing the graph whose XSwap edge priors are to be
        computed. Tuples contain integer values representing nodes. No value
        should be greater than C++'s `INT_MAX`, in this case 2_147_483_647.
        An adjacency matrix will be created assuming that a node's value is its
        index in the matrix. If not, map edges (identifiers can be string or
        otherwise) using `xswap.preprocessing.map_str_edges`.
    n_permutations : int
        The number of permuted networks used to compute the empirical XSwap prior
    shape : Tuple[int, int]
        The shape of the matrix to be returned. In other words, a tuple of the
        number of source and target nodes.
    allow_self_loops : bool
        Whether to allow edges like (0, 0). In the case of bipartite graphs,
        such an edge represents a connection between two distinct nodes, while
        in other graphs it may represent an edge from a node to itself, in which
        case an edge may or may not be meaningful depending on context.
    allow_antiparallel : bool
        Whether to allow simultaneous edges like (0, 1) and (1, 0). In the case
        of bipartite graphs, these edges represent two connections between four
        distinct nodes, while for other graphs, these may be connections between
        the same two nodes.
    sparse : bool
        Whether to use a sparse matrix when adding up edge occurrences across
        permutations. If large changes in sparsity are expected, a dense
        array may be preferable.
    swap_multiplier : float
        The number of edge swap attempts is determined by the product of the
        number of existing edges and multiplier. For example, if five edges are
        passed and multiplier is set to 10, 50 swaps will be attempted. Non-integer
        products will be rounded down to the nearest integer.
    initial_seed : int
        Random seed that will be passed to the C++ Mersenne Twister 19937 random
        number generator. `initial_seed` will be used for the first permutation,
        and the seed used for each subsequent permutation will be incremented by
        one. For example, if `initial_seed` is 0 and `n_permutations` is 2, then
        the two permutations will pass seeds 0 and 1, respectively.
    max_malloc : int (`unsigned long long int` in C)
        The maximum amount of memory to be allocated using `malloc` when making
        a bitset to hold edges. An uncompressed bitset is implemented for
        holding edges that is significantly faster than alternatives. However,
        it is memory-inefficient and will not be used if more memory is required
        than `max_malloc`. Above the threshold, a Roaring bitset will be used.

    Returns
    -------
    edge_counter : scipy.sparse.csc_matrix or numpy.ndarray
        Adjacency matrix with entries equal to the number of permutations in
        which a given edge appeared. Sparse when `sparse=True`, otherwise a
        dense array.

    Raises
    ------
    ValueError
        If `edge_list` contains the same edge more than once.
    """
    # Validate before touching the compiled backend so bad input fails fast.
    if len(edge_list) != len(set(edge_list)):
        raise ValueError("Edge list contained duplicate edges. "
                         "XSwap does not support multigraphs.")

    if sparse:
        edge_counter = scipy.sparse.csc_matrix(shape, dtype=int)
    else:
        edge_counter = numpy.zeros(shape, dtype=int)

    # A network without edges has no edge occurrences in any permutation.
    # Returning here avoids "max() arg is an empty sequence" below.
    if not edge_list:
        return edge_counter

    # Imported lazily: the compiled extension is only needed to do real work.
    import xswap._xswap_backend

    num_swaps = int(swap_multiplier * len(edge_list))

    # Largest node id, used by the backend to size its edge bitset
    max_id = max(map(max, edge_list))

    for i in range(n_permutations):
        # Each permutation starts from the original edges with its own seed
        permuted_edges, stats = xswap._xswap_backend._xswap(
            edge_list, [], max_id, allow_self_loops, allow_antiparallel,
            num_swaps, initial_seed + i, max_malloc)
        permuted_matrix = xswap.network_formats.edges_to_matrix(
            permuted_edges, add_reverse_edges=(not allow_antiparallel),
            shape=shape, dtype=int, sparse=sparse)
        edge_counter += permuted_matrix

    return edge_counter
127 | An adjacency matrix will be created assuming that a node's value is its 128 | index in the matrix. If not, map edges (identifiers can be string or 129 | otherwise) using `xswap.preprocessing.map_str_edges`. 130 | n_permutations : int 131 | The number of permuted networks used to compute the empirical XSwap prior 132 | shape : Tuple[int, int] 133 | The shape of the matrix to be returned. In other words, a tuple of the 134 | number of source and target nodes. 135 | allow_self_loops : bool 136 | Whether to allow edges like (0, 0). In the case of bipartite graphs, 137 | such an edge represents a connection between two distinct nodes, while 138 | in other graphs it may represent an edge from a node to itself, in which 139 | case an edge may or may not be meaningful depending on context. 140 | allow_antiparallel : bool 141 | Whether to allow simultaneous edges like (0, 1) and (1, 0). In the case 142 | of bipartite graphs, these edges represent two connections between four 143 | distinct nodes, while for other graphs, these may be connections between 144 | the same two nodes. 145 | sparse : bool 146 | Whether to use a sparse matrix when adding up edge occurrences across 147 | permutations. If large changes in sparsity are expected, a dense 148 | array may be preferable. 149 | swap_multiplier : float 150 | The number of edge swap attempts is determined by the product of the 151 | number of existing edges and multiplier. For example, if five edges are 152 | passed and multiplier is set to 10, 50 swaps will be attempted. Non-integer 153 | products will be rounded down to the nearest integer. 154 | initial_seed : int 155 | Random seed that will be passed to the C++ Mersenne Twister 19937 random 156 | number generator. `initial_seed` will be used for the first permutation, 157 | and the seed used for each subsequent permutation will be incremented by 158 | one. 
For example, if `initial_seed` is 0 and `n_permutations` is 2, then 159 | the two permutations will pass seeds 0 and 1, respectively. 160 | max_malloc : int (`unsigned long long int` in C) 161 | The maximum amount of memory to be allocated using `malloc` when making 162 | a bitset to hold edges. An uncompressed bitset is implemented for 163 | holding edges that is significantly faster than alternatives. However, 164 | it is memory-inefficient and will not be used if more memory is required 165 | than `max_malloc`. Above the threshold, a Roaring bitset will be used. 166 | dtypes : dict 167 | Dictionary mapping returned column types to dtypes. Keys should be 168 | `'id'`, `'degree'`, `'edge'`, and `'xswap_prior'`. `dtype` need only 169 | be changed from its defaults if the values of `id` or `degree` are 170 | greater than the maxima in the default dtypes, or in cases where greater 171 | precision is desired. (`numpy.uint16` has a maximum value of 65535.) 172 | 173 | Returns 174 | ------- 175 | prior_df : pandas.DataFrame 176 | Columns are the following: 177 | [source_id, target_id, edge, source_degree, target_degree, xswap_prior] 178 | """ 179 | # Compute the adjacency matrix of the original (unpermuted) network 180 | original_edges = xswap.network_formats.edges_to_matrix( 181 | edge_list, add_reverse_edges=(not allow_antiparallel), shape=shape, 182 | dtype=dtypes['edge'], sparse=True) 183 | 184 | # Setup DataFrame for recording prior data 185 | prior_df = pandas.DataFrame({ 186 | 'source_id': numpy.repeat(numpy.arange(shape[0], dtype=dtypes['id']), shape[1]), 187 | 'target_id': numpy.tile(numpy.arange(shape[1], dtype=dtypes['id']), shape[0]), 188 | 'edge': original_edges.toarray().flatten(), 189 | }) 190 | del original_edges 191 | 192 | prior_df['source_degree'] = (prior_df 193 | .groupby('source_id') 194 | .transform(sum)['edge'] 195 | .astype(dtypes['degree'])) 196 | del prior_df['source_id'] 197 | 198 | prior_df['target_degree'] = (prior_df 199 | 
.groupby('target_id') 200 | .transform(sum)['edge'] 201 | .astype(dtypes['degree'])) 202 | del prior_df['target_id'] 203 | 204 | # Compute the number of occurrences of each edge across permutations 205 | edge_counter = compute_xswap_occurrence_matrix( 206 | edge_list=edge_list, n_permutations=n_permutations, shape=shape, 207 | allow_self_loops=allow_self_loops, allow_antiparallel=allow_antiparallel, 208 | sparse=sparse, swap_multiplier=swap_multiplier, initial_seed=initial_seed, 209 | max_malloc=max_malloc) 210 | 211 | prior_df['num_permuted_edges'] = edge_counter.toarray().flatten() 212 | del edge_counter 213 | 214 | # The number of edges that occurred across all node pairs with the same 215 | # `source_degree` and `target_degree` 216 | dgp_edge_count = ( 217 | prior_df 218 | .groupby(['source_degree', 'target_degree']) 219 | .transform(sum)['num_permuted_edges'] 220 | .values 221 | .astype(dtypes['degree']) 222 | ) 223 | del prior_df['num_permuted_edges'] 224 | 225 | # The effective number of permutations for every node pair, incorporating 226 | # degree-grouping 227 | num_dgp = ( 228 | n_permutations * prior_df.groupby(['source_degree', 'target_degree']) 229 | .transform(len)['edge'] 230 | .values 231 | ) 232 | xswap_prior = (dgp_edge_count / num_dgp).astype(dtypes['xswap_prior']) 233 | del dgp_edge_count, num_dgp 234 | 235 | prior_df['xswap_prior'] = xswap_prior 236 | del xswap_prior 237 | 238 | prior_df = ( 239 | prior_df 240 | .assign( 241 | source_id=numpy.repeat(numpy.arange(shape[0], dtype=dtypes['id']), shape[1]), 242 | target_id=numpy.tile(numpy.arange(shape[1], dtype=dtypes['id']), shape[0]), 243 | ) 244 | .filter(items=['source_id', 'target_id', 'edge', 'source_degree', 245 | 'target_degree', 'xswap_prior']) 246 | ) 247 | return prior_df 248 | 249 | 250 | def approximate_xswap_prior(source_degree, target_degree, num_edges): 251 | """ 252 | Approximate the XSwap prior by assuming that the XSwap Markov Chain is stationary. 
def approximate_xswap_prior(source_degree, target_degree, num_edges):
    """
    Approximate the XSwap prior by assuming that the XSwap Markov Chain is stationary.
    While this is not the case in reality, some networks' priors can be estimated
    very well using this equation.

    The value returned is

        s * t / sqrt((s * t) ** 2 + (m - s - t + 1) ** 2)

    where `s` is `source_degree`, `t` is `target_degree` and `m` is `num_edges`.

    Parameters
    ----------
    source_degree : int, float, numpy.array, or pandas.Series
        The source degree for a single node pair or a number of source degrees.
        The type of object passed should match `target_degree`.
    target_degree : int, float, numpy.array, or pandas.Series
        The target degree for a single node pair or a number of target degrees.
        The type of object passed should match `source_degree`.
    num_edges : int or float
        The total number of edges in the network

    Returns
    -------
    approximate_prior : float, numpy.array, or pandas.Series
        Output type matches the types of `source_degree` and `target_degree`.
        Values are always floating point.
    """
    # Promote to floating point before doing any arithmetic. Degrees commonly
    # arrive as narrow unsigned integers (e.g. numpy.uint16), for which the
    # product below would overflow and the subtraction could wrap around.
    # Multiplying by 1.0 keeps the container type (scalar, array or Series).
    source_degree = source_degree * 1.0
    target_degree = target_degree * 1.0

    degree_product = source_degree * target_degree
    remaining_edges = num_edges - source_degree - target_degree + 1
    return degree_product / (degree_product ** 2 + remaining_edges ** 2) ** 0.5
31 | */ 32 | Roaring(size_t n, const uint32_t *data) : Roaring() { 33 | roaring_bitmap_add_many(&roaring, n, data); 34 | } 35 | 36 | /** 37 | * Copy constructor 38 | */ 39 | Roaring(const Roaring &r) { 40 | bool is_ok = 41 | ra_copy(&r.roaring.high_low_container, &roaring.high_low_container, 42 | r.roaring.copy_on_write); 43 | if (!is_ok) { 44 | throw std::runtime_error("failed memory alloc in constructor"); 45 | } 46 | roaring.copy_on_write = r.roaring.copy_on_write; 47 | } 48 | 49 | /** 50 | * Move constructor. The moved object remains valid, i.e. 51 | * all methods can still be called on it. 52 | */ 53 | Roaring(Roaring &&r) noexcept { 54 | roaring = std::move(r.roaring); 55 | r.roaring.copy_on_write = false; 56 | ra_init(&r.roaring.high_low_container); 57 | } 58 | 59 | /** 60 | * Construct a roaring object from the C struct. 61 | * 62 | * Passing a NULL point is unsafe. 63 | * the pointer to the C struct will be invalid after the call. 64 | */ 65 | Roaring(roaring_bitmap_t *s) noexcept { 66 | // steal the interior struct 67 | roaring.high_low_container = s->high_low_container; 68 | roaring.copy_on_write = s->copy_on_write; 69 | // deallocate the old container 70 | free(s); 71 | } 72 | 73 | /** 74 | * Construct a bitmap from a list of integer values. 75 | */ 76 | static Roaring bitmapOf(size_t n, ...) { 77 | Roaring ans; 78 | va_list vl; 79 | va_start(vl, n); 80 | for (size_t i = 0; i < n; i++) { 81 | ans.add(va_arg(vl, uint32_t)); 82 | } 83 | va_end(vl); 84 | return ans; 85 | } 86 | 87 | /** 88 | * Add value x 89 | * 90 | */ 91 | void add(uint32_t x) { roaring_bitmap_add(&roaring, x); } 92 | 93 | /** 94 | * Add value x 95 | * Returns true if a new value was added, false if the value was already existing. 
96 | */ 97 | bool addChecked(uint32_t x) { 98 | return roaring_bitmap_add_checked(&roaring, x); 99 | } 100 | 101 | /** 102 | * add if all values from x (included) to y (excluded) 103 | */ 104 | void addRange(const uint64_t x, const uint64_t y) { 105 | return roaring_bitmap_add_range(&roaring, x, y); 106 | } 107 | 108 | /** 109 | * Add value n_args from pointer vals 110 | * 111 | */ 112 | void addMany(size_t n_args, const uint32_t *vals) { 113 | roaring_bitmap_add_many(&roaring, n_args, vals); 114 | } 115 | 116 | /** 117 | * Remove value x 118 | * 119 | */ 120 | void remove(uint32_t x) { roaring_bitmap_remove(&roaring, x); } 121 | 122 | /** 123 | * Remove value x 124 | * Returns true if a new value was removed, false if the value was not existing. 125 | */ 126 | bool removeChecked(uint32_t x) { 127 | return roaring_bitmap_remove_checked(&roaring, x); 128 | } 129 | 130 | /** 131 | * Return the largest value (if not empty) 132 | * 133 | */ 134 | uint32_t maximum() const { return roaring_bitmap_maximum(&roaring); } 135 | 136 | /** 137 | * Return the smallest value (if not empty) 138 | * 139 | */ 140 | uint32_t minimum() const { return roaring_bitmap_minimum(&roaring); } 141 | 142 | /** 143 | * Check if value x is present 144 | */ 145 | bool contains(uint32_t x) const { 146 | return roaring_bitmap_contains(&roaring, x); 147 | } 148 | 149 | /** 150 | * Check if all values from x (included) to y (excluded) are present 151 | */ 152 | bool containsRange(const uint64_t x, const uint64_t y) const { 153 | return roaring_bitmap_contains_range(&roaring, x, y); 154 | } 155 | 156 | /** 157 | * Destructor 158 | */ 159 | ~Roaring() { ra_clear(&roaring.high_low_container); } 160 | 161 | /** 162 | * Copies the content of the provided bitmap, and 163 | * discard the current content. 
164 | */ 165 | Roaring &operator=(const Roaring &r) { 166 | ra_clear(&roaring.high_low_container); 167 | bool is_ok = 168 | ra_copy(&r.roaring.high_low_container, &roaring.high_low_container, 169 | r.roaring.copy_on_write); 170 | if (!is_ok) { 171 | throw std::runtime_error("failed memory alloc in assignment"); 172 | } 173 | roaring.copy_on_write = r.roaring.copy_on_write; 174 | return *this; 175 | } 176 | 177 | /** 178 | * Moves the content of the provided bitmap, and 179 | * discard the current content. 180 | */ 181 | Roaring &operator=(Roaring &&r) noexcept { 182 | ra_clear(&roaring.high_low_container); 183 | roaring = std::move(r.roaring); 184 | r.roaring.copy_on_write = false; 185 | ra_init(&r.roaring.high_low_container); 186 | return *this; 187 | } 188 | 189 | /** 190 | * Compute the intersection between the current bitmap and the provided 191 | * bitmap, 192 | * writing the result in the current bitmap. The provided bitmap is not 193 | * modified. 194 | */ 195 | Roaring &operator&=(const Roaring &r) { 196 | roaring_bitmap_and_inplace(&roaring, &r.roaring); 197 | return *this; 198 | } 199 | 200 | /** 201 | * Compute the difference between the current bitmap and the provided 202 | * bitmap, 203 | * writing the result in the current bitmap. The provided bitmap is not 204 | * modified. 205 | */ 206 | Roaring &operator-=(const Roaring &r) { 207 | roaring_bitmap_andnot_inplace(&roaring, &r.roaring); 208 | return *this; 209 | } 210 | 211 | /** 212 | * Compute the union between the current bitmap and the provided bitmap, 213 | * writing the result in the current bitmap. The provided bitmap is not 214 | * modified. 215 | * 216 | * See also the fastunion function to aggregate many bitmaps more quickly. 
217 | */ 218 | Roaring &operator|=(const Roaring &r) { 219 | roaring_bitmap_or_inplace(&roaring, &r.roaring); 220 | return *this; 221 | } 222 | 223 | /** 224 | * Compute the symmetric union between the current bitmap and the provided 225 | * bitmap, 226 | * writing the result in the current bitmap. The provided bitmap is not 227 | * modified. 228 | */ 229 | Roaring &operator^=(const Roaring &r) { 230 | roaring_bitmap_xor_inplace(&roaring, &r.roaring); 231 | return *this; 232 | } 233 | 234 | /** 235 | * Exchange the content of this bitmap with another. 236 | */ 237 | void swap(Roaring &r) { std::swap(r.roaring, roaring); } 238 | 239 | /** 240 | * Get the cardinality of the bitmap (number of elements). 241 | */ 242 | uint64_t cardinality() const { 243 | return roaring_bitmap_get_cardinality(&roaring); 244 | } 245 | 246 | /** 247 | * Returns true if the bitmap is empty (cardinality is zero). 248 | */ 249 | bool isEmpty() const { return roaring_bitmap_is_empty(&roaring); } 250 | 251 | /** 252 | * Returns true if the bitmap is subset of the other. 253 | */ 254 | bool isSubset(const Roaring &r) const { 255 | return roaring_bitmap_is_subset(&roaring, &r.roaring); 256 | } 257 | 258 | /** 259 | * Returns true if the bitmap is strict subset of the other. 260 | */ 261 | bool isStrictSubset(const Roaring &r) const { 262 | return roaring_bitmap_is_strict_subset(&roaring, &r.roaring); 263 | } 264 | 265 | /** 266 | * Convert the bitmap to an array. 
Write the output to "ans", 267 | * caller is responsible to ensure that there is enough memory 268 | * allocated 269 | * (e.g., ans = new uint32[mybitmap.cardinality()];) 270 | */ 271 | void toUint32Array(uint32_t *ans) const { 272 | roaring_bitmap_to_uint32_array(&roaring, ans); 273 | } 274 | /** 275 | * to int array with pagination 276 | * 277 | */ 278 | void rangeUint32Array(uint32_t *ans, size_t offset, size_t limit) const { 279 | roaring_bitmap_range_uint32_array(&roaring, offset, limit, ans); 280 | } 281 | 282 | /** 283 | * Return true if the two bitmaps contain the same elements. 284 | */ 285 | bool operator==(const Roaring &r) const { 286 | return roaring_bitmap_equals(&roaring, &r.roaring); 287 | } 288 | 289 | /** 290 | * compute the negation of the roaring bitmap within a specified interval. 291 | * areas outside the range are passed through unchanged. 292 | */ 293 | void flip(uint64_t range_start, uint64_t range_end) { 294 | roaring_bitmap_flip_inplace(&roaring, range_start, range_end); 295 | } 296 | 297 | /** 298 | * Remove run-length encoding even when it is more space efficient 299 | * return whether a change was applied 300 | */ 301 | bool removeRunCompression() { 302 | return roaring_bitmap_remove_run_compression(&roaring); 303 | } 304 | 305 | /** convert array and bitmap containers to run containers when it is more 306 | * efficient; 307 | * also convert from run containers when more space efficient. Returns 308 | * true if the result has at least one run container. 309 | * Additional savings might be possible by calling shrinkToFit(). 310 | */ 311 | bool runOptimize() { return roaring_bitmap_run_optimize(&roaring); } 312 | 313 | /** 314 | * If needed, reallocate memory to shrink the memory usage. Returns 315 | * the number of bytes saved. 316 | */ 317 | size_t shrinkToFit() { return roaring_bitmap_shrink_to_fit(&roaring); } 318 | 319 | /** 320 | * Iterate over the bitmap elements. 
The function iterator is called once for 321 | * all the values with ptr (can be NULL) as the second parameter of each call. 322 | * 323 | * roaring_iterator is simply a pointer to a function that returns bool 324 | * (true means that the iteration should continue while false means that it 325 | * should stop), and takes (uint32_t,void*) as inputs. 326 | */ 327 | void iterate(roaring_iterator iterator, void *ptr) const { 328 | roaring_iterate(&roaring, iterator, ptr); 329 | } 330 | 331 | /** 332 | * If the size of the roaring bitmap is strictly greater than rank, then 333 | * this function returns true and set element to the element of given rank. 334 | * Otherwise, it returns false. 335 | */ 336 | bool select(uint32_t rnk, uint32_t *element) const { 337 | return roaring_bitmap_select(&roaring, rnk, element); 338 | } 339 | 340 | /** 341 | * Computes the size of the intersection between two bitmaps. 342 | * 343 | */ 344 | uint64_t and_cardinality(const Roaring &r) const { 345 | return roaring_bitmap_and_cardinality(&roaring, &r.roaring); 346 | } 347 | 348 | /** 349 | * Check whether the two bitmaps intersect. 350 | * 351 | */ 352 | bool intersect(const Roaring &r) const { 353 | return roaring_bitmap_intersect(&roaring, &r.roaring); 354 | } 355 | 356 | /** 357 | * Computes the Jaccard index between two bitmaps. (Also known as the 358 | * Tanimoto distance, 359 | * or the Jaccard similarity coefficient) 360 | * 361 | * The Jaccard index is undefined if both bitmaps are empty. 362 | * 363 | */ 364 | double jaccard_index(const Roaring &r) const { 365 | return roaring_bitmap_jaccard_index(&roaring, &r.roaring); 366 | } 367 | 368 | /** 369 | * Computes the size of the union between two bitmaps. 370 | * 371 | */ 372 | uint64_t or_cardinality(const Roaring &r) const { 373 | return roaring_bitmap_or_cardinality(&roaring, &r.roaring); 374 | } 375 | 376 | /** 377 | * Computes the size of the difference (andnot) between two bitmaps. 
378 | * 379 | */ 380 | uint64_t andnot_cardinality(const Roaring &r) const { 381 | return roaring_bitmap_andnot_cardinality(&roaring, &r.roaring); 382 | } 383 | 384 | /** 385 | * Computes the size of the symmetric difference (andnot) between two 386 | * bitmaps. 387 | * 388 | */ 389 | uint64_t xor_cardinality(const Roaring &r) const { 390 | return roaring_bitmap_xor_cardinality(&roaring, &r.roaring); 391 | } 392 | 393 | /** 394 | * Returns the number of integers that are smaller or equal to x. 395 | */ 396 | uint64_t rank(uint32_t x) const { return roaring_bitmap_rank(&roaring, x); } 397 | 398 | /** 399 | * write a bitmap to a char buffer. This is meant to be compatible with 400 | * the 401 | * Java and Go versions. Returns how many bytes were written which should be 402 | * getSizeInBytes(). 403 | * 404 | * Setting the portable flag to false enable a custom format that 405 | * can save space compared to the portable format (e.g., for very 406 | * sparse bitmaps). 407 | * 408 | * Boost users can serialize bitmaps in this manner: 409 | * 410 | * BOOST_SERIALIZATION_SPLIT_FREE(Roaring) 411 | * namespace boost { 412 | * namespace serialization { 413 | * 414 | * template 415 | * void save(Archive& ar, const Roaring& bitmask, 416 | * const unsigned int version) { 417 | * std::size_t expected_size_in_bytes = bitmask.getSizeInBytes(); 418 | * std::vector buffer(expected_size_in_bytes); 419 | * std::size_t size_in_bytes = bitmask.write(buffer.data()); 420 | * 421 | * ar& size_in_bytes; 422 | * ar& boost::serialization::make_binary_object(buffer.data(), 423 | * size_in_bytes); 424 | * } 425 | * template 426 | * void load(Archive& ar, Roaring& bitmask, 427 | * const unsigned int version) { 428 | * std::size_t size_in_bytes = 0; 429 | * ar& size_in_bytes; 430 | * std::vector buffer(size_in_bytes); 431 | * ar& boost::serialization::make_binary_object(buffer.data(), 432 | * size_in_bytes); 433 | * bitmask = Roaring::readSafe(buffer.data(), size_in_bytes); 434 | *} 435 | *} // 
namespace serialization 436 | *} // namespace boost 437 | */ 438 | size_t write(char *buf, bool portable = true) const { 439 | if (portable) 440 | return roaring_bitmap_portable_serialize(&roaring, buf); 441 | else 442 | return roaring_bitmap_serialize(&roaring, buf); 443 | } 444 | 445 | /** 446 | * read a bitmap from a serialized version. This is meant to be compatible 447 | * with the Java and Go versions. 448 | * 449 | * Setting the portable flag to false enable a custom format that 450 | * can save space compared to the portable format (e.g., for very 451 | * sparse bitmaps). 452 | * 453 | * This function is unsafe in the sense that if you provide bad data, 454 | * many, many bytes could be read. See also readSafe. 455 | */ 456 | static Roaring read(const char *buf, bool portable = true) { 457 | roaring_bitmap_t * r = portable ? roaring_bitmap_portable_deserialize(buf) : roaring_bitmap_deserialize(buf); 458 | if (r == NULL) { 459 | throw std::runtime_error("failed alloc while reading"); 460 | } 461 | return Roaring(r); 462 | } 463 | /** 464 | * read a bitmap from a serialized version, reading no more than maxbytes bytes. 465 | * This is meant to be compatible with the Java and Go versions. 466 | * 467 | */ 468 | static Roaring readSafe(const char *buf, size_t maxbytes) { 469 | roaring_bitmap_t * r = roaring_bitmap_portable_deserialize_safe(buf,maxbytes); 470 | if (r == NULL) { 471 | throw std::runtime_error("failed alloc while reading"); 472 | } 473 | return Roaring(r); 474 | } 475 | /** 476 | * How many bytes are required to serialize this bitmap (meant to be 477 | * compatible 478 | * with Java and Go versions) 479 | * 480 | * Setting the portable flag to false enable a custom format that 481 | * can save space compared to the portable format (e.g., for very 482 | * sparse bitmaps). 
483 | */ 484 | size_t getSizeInBytes(bool portable = true) const { 485 | if (portable) 486 | return roaring_bitmap_portable_size_in_bytes(&roaring); 487 | else 488 | return roaring_bitmap_size_in_bytes(&roaring); 489 | } 490 | 491 | /** 492 | * Computes the intersection between two bitmaps and returns new bitmap. 493 | * The current bitmap and the provided bitmap are unchanged. 494 | */ 495 | Roaring operator&(const Roaring &o) const { 496 | roaring_bitmap_t *r = roaring_bitmap_and(&roaring, &o.roaring); 497 | if (r == NULL) { 498 | throw std::runtime_error("failed materalization in and"); 499 | } 500 | return Roaring(r); 501 | } 502 | 503 | /** 504 | * Computes the difference between two bitmaps and returns new bitmap. 505 | * The current bitmap and the provided bitmap are unchanged. 506 | */ 507 | Roaring operator-(const Roaring &o) const { 508 | roaring_bitmap_t *r = roaring_bitmap_andnot(&roaring, &o.roaring); 509 | if (r == NULL) { 510 | throw std::runtime_error("failed materalization in andnot"); 511 | } 512 | return Roaring(r); 513 | } 514 | 515 | /** 516 | * Computes the union between two bitmaps and returns new bitmap. 517 | * The current bitmap and the provided bitmap are unchanged. 518 | */ 519 | Roaring operator|(const Roaring &o) const { 520 | roaring_bitmap_t *r = roaring_bitmap_or(&roaring, &o.roaring); 521 | if (r == NULL) { 522 | throw std::runtime_error("failed materalization in or"); 523 | } 524 | return Roaring(r); 525 | } 526 | 527 | /** 528 | * Computes the symmetric union between two bitmaps and returns new bitmap. 529 | * The current bitmap and the provided bitmap are unchanged. 530 | */ 531 | Roaring operator^(const Roaring &o) const { 532 | roaring_bitmap_t *r = roaring_bitmap_xor(&roaring, &o.roaring); 533 | if (r == NULL) { 534 | throw std::runtime_error("failed materalization in xor"); 535 | } 536 | return Roaring(r); 537 | } 538 | 539 | /** 540 | * Whether or not we apply copy and write. 
541 | */ 542 | void setCopyOnWrite(bool val) { roaring.copy_on_write = val; } 543 | 544 | /** 545 | * Print the content of the bitmap 546 | */ 547 | void printf() const { roaring_bitmap_printf(&roaring); } 548 | 549 | /** 550 | * Print the content of the bitmap into a string 551 | */ 552 | std::string toString() const { 553 | struct iter_data { 554 | std::string str; 555 | char first_char = '{'; 556 | } outer_iter_data; 557 | if (!isEmpty()) { 558 | iterate( 559 | [](uint32_t value, void *inner_iter_data) -> bool { 560 | ((iter_data *)inner_iter_data)->str += 561 | ((iter_data *)inner_iter_data)->first_char; 562 | ((iter_data *)inner_iter_data)->str += 563 | std::to_string(value); 564 | ((iter_data *)inner_iter_data)->first_char = ','; 565 | return true; 566 | }, 567 | (void *)&outer_iter_data); 568 | } else 569 | outer_iter_data.str = '{'; 570 | outer_iter_data.str += '}'; 571 | return outer_iter_data.str; 572 | } 573 | 574 | /** 575 | * Whether or not copy and write is active. 576 | */ 577 | bool getCopyOnWrite() const { return roaring.copy_on_write; } 578 | 579 | /** 580 | * computes the logical or (union) between "n" bitmaps (referenced by a 581 | * pointer). 582 | */ 583 | static Roaring fastunion(size_t n, const Roaring **inputs) { 584 | const roaring_bitmap_t **x = 585 | (const roaring_bitmap_t **)malloc(n * sizeof(roaring_bitmap_t *)); 586 | if (x == NULL) { 587 | throw std::runtime_error("failed memory alloc in fastunion"); 588 | } 589 | for (size_t k = 0; k < n; ++k) x[k] = &inputs[k]->roaring; 590 | 591 | roaring_bitmap_t *c_ans = roaring_bitmap_or_many(n, x); 592 | if (c_ans == NULL) { 593 | free(x); 594 | throw std::runtime_error("failed memory alloc in fastunion"); 595 | } 596 | Roaring ans(c_ans); 597 | free(x); 598 | return ans; 599 | } 600 | 601 | typedef RoaringSetBitForwardIterator const_iterator; 602 | 603 | /** 604 | * Returns an iterator that can be used to access the position of the 605 | * set bits. 
The running time complexity of a full scan is proportional to 606 | * the 607 | * number 608 | * of set bits: be aware that if you have long strings of 1s, this can be 609 | * very inefficient. 610 | * 611 | * It can be much faster to use the toArray method if you want to 612 | * retrieve the set bits. 613 | */ 614 | const_iterator begin() const; 615 | 616 | /** 617 | * A bogus iterator that can be used together with begin() 618 | * for constructions such as for(auto i = b.begin(); 619 | * i!=b.end(); ++i) {} 620 | */ 621 | const_iterator &end() const; 622 | 623 | roaring_bitmap_t roaring; 624 | }; 625 | 626 | /** 627 | * Used to go through the set bits. Not optimally fast, but convenient. 628 | */ 629 | class RoaringSetBitForwardIterator final { 630 | public: 631 | typedef std::forward_iterator_tag iterator_category; 632 | typedef uint32_t *pointer; 633 | typedef uint32_t &reference_type; 634 | typedef uint32_t value_type; 635 | typedef int32_t difference_type; 636 | typedef RoaringSetBitForwardIterator type_of_iterator; 637 | 638 | /** 639 | * Provides the location of the set bit. 640 | */ 641 | value_type operator*() const { return i.current_value; } 642 | 643 | bool operator<(const type_of_iterator &o) { 644 | if (!i.has_value) return false; 645 | if (!o.i.has_value) return true; 646 | return i.current_value < *o; 647 | } 648 | 649 | bool operator<=(const type_of_iterator &o) { 650 | if (!o.i.has_value) return true; 651 | if (!i.has_value) return false; 652 | return i.current_value <= *o; 653 | } 654 | 655 | bool operator>(const type_of_iterator &o) { 656 | if (!o.i.has_value) return false; 657 | if (!i.has_value) return true; 658 | return i.current_value > *o; 659 | } 660 | 661 | bool operator>=(const type_of_iterator &o) { 662 | if (!i.has_value) return true; 663 | if (!o.i.has_value) return false; 664 | return i.current_value >= *o; 665 | } 666 | 667 | /** 668 | * Move the iterator to the first value >= val. 
669 | */ 670 | void equalorlarger(uint32_t val) { 671 | roaring_move_uint32_iterator_equalorlarger(&i,val); 672 | } 673 | 674 | type_of_iterator &operator++() { // ++i, must returned inc. value 675 | roaring_advance_uint32_iterator(&i); 676 | return *this; 677 | } 678 | 679 | type_of_iterator operator++(int) { // i++, must return orig. value 680 | RoaringSetBitForwardIterator orig(*this); 681 | roaring_advance_uint32_iterator(&i); 682 | return orig; 683 | } 684 | 685 | type_of_iterator& operator--() { // prefix -- 686 | roaring_previous_uint32_iterator(&i); 687 | return *this; 688 | } 689 | 690 | type_of_iterator operator--(int) { // postfix -- 691 | RoaringSetBitForwardIterator orig(*this); 692 | roaring_previous_uint32_iterator(&i); 693 | return orig; 694 | } 695 | 696 | bool operator==(const RoaringSetBitForwardIterator &o) const { 697 | return i.current_value == *o && i.has_value == o.i.has_value; 698 | } 699 | 700 | bool operator!=(const RoaringSetBitForwardIterator &o) const { 701 | return i.current_value != *o || i.has_value != o.i.has_value; 702 | } 703 | 704 | RoaringSetBitForwardIterator(const Roaring &parent, 705 | bool exhausted = false) { 706 | if (exhausted) { 707 | i.parent = &parent.roaring; 708 | i.container_index = INT32_MAX; 709 | i.has_value = false; 710 | i.current_value = UINT32_MAX; 711 | } else { 712 | roaring_init_iterator(&parent.roaring, &i); 713 | } 714 | } 715 | 716 | roaring_uint32_iterator_t i; 717 | }; 718 | 719 | inline RoaringSetBitForwardIterator Roaring::begin() const { 720 | return RoaringSetBitForwardIterator(*this); 721 | } 722 | 723 | inline RoaringSetBitForwardIterator &Roaring::end() const { 724 | static RoaringSetBitForwardIterator e(*this, true); 725 | return e; 726 | } 727 | 728 | #endif /* INCLUDE_ROARING_HH_ */ 729 | /* end file /Users/dlemire/CVS/github/CRoaring/cpp/roaring.hh */ 730 | /* begin file /Users/dlemire/CVS/github/CRoaring/cpp/roaring64map.hh */ 731 | /* 732 | A C++ header for 64-bit Roaring Bitmaps, 
implemented by way of a map of many 733 | 32-bit Roaring Bitmaps. 734 | */ 735 | #ifndef INCLUDE_ROARING_64_MAP_HH_ 736 | #define INCLUDE_ROARING_64_MAP_HH_ 737 | 738 | #include 739 | #include 740 | #include 741 | #include 742 | #include 743 | #include 744 | #include 745 | #include 746 | #include 747 | #include 748 | 749 | 750 | class Roaring64MapSetBitForwardIterator; 751 | 752 | class Roaring64Map { 753 | public: 754 | /** 755 | * Create an empty bitmap 756 | */ 757 | Roaring64Map() = default; 758 | 759 | /** 760 | * Construct a bitmap from a list of 32-bit integer values. 761 | */ 762 | Roaring64Map(size_t n, const uint32_t *data) { addMany(n, data); } 763 | 764 | /** 765 | * Construct a bitmap from a list of 64-bit integer values. 766 | */ 767 | Roaring64Map(size_t n, const uint64_t *data) { addMany(n, data); } 768 | 769 | /** 770 | * Construct a 64-bit map from a 32-bit one 771 | */ 772 | Roaring64Map(const Roaring &r) { emplaceOrInsert(0, r); } 773 | 774 | /** 775 | * Construct a roaring object from the C struct. 776 | * 777 | * Passing a NULL point is unsafe. 778 | */ 779 | Roaring64Map(roaring_bitmap_t *s) { emplaceOrInsert(0, s); } 780 | 781 | /** 782 | * Construct a bitmap from a list of integer values. 783 | */ 784 | static Roaring64Map bitmapOf(size_t n...) { 785 | Roaring64Map ans; 786 | va_list vl; 787 | va_start(vl, n); 788 | for (size_t i = 0; i < n; i++) { 789 | ans.add(va_arg(vl, uint64_t)); 790 | } 791 | va_end(vl); 792 | return ans; 793 | } 794 | 795 | /** 796 | * Add value x 797 | * 798 | */ 799 | void add(uint32_t x) { 800 | roarings[0].add(x); 801 | roarings[0].setCopyOnWrite(copyOnWrite); 802 | } 803 | void add(uint64_t x) { 804 | roarings[highBytes(x)].add(lowBytes(x)); 805 | roarings[highBytes(x)].setCopyOnWrite(copyOnWrite); 806 | } 807 | 808 | /** 809 | * Add value x 810 | * Returns true if a new value was added, false if the value was already existing. 
811 | */ 812 | bool addChecked(uint32_t x) { 813 | bool result = roarings[0].addChecked(x); 814 | roarings[0].setCopyOnWrite(copyOnWrite); 815 | return result; 816 | } 817 | bool addChecked(uint64_t x) { 818 | bool result = roarings[highBytes(x)].addChecked(lowBytes(x)); 819 | roarings[highBytes(x)].setCopyOnWrite(copyOnWrite); 820 | return result; 821 | } 822 | 823 | /** 824 | * Add value n_args from pointer vals 825 | * 826 | */ 827 | void addMany(size_t n_args, const uint32_t *vals) { 828 | for (size_t lcv = 0; lcv < n_args; lcv++) { 829 | roarings[0].add(vals[lcv]); 830 | roarings[0].setCopyOnWrite(copyOnWrite); 831 | } 832 | } 833 | void addMany(size_t n_args, const uint64_t *vals) { 834 | for (size_t lcv = 0; lcv < n_args; lcv++) { 835 | roarings[highBytes(vals[lcv])].add(lowBytes(vals[lcv])); 836 | roarings[highBytes(vals[lcv])].setCopyOnWrite(copyOnWrite); 837 | } 838 | } 839 | 840 | /** 841 | * Remove value x 842 | * 843 | */ 844 | void remove(uint32_t x) { roarings[0].remove(x); } 845 | void remove(uint64_t x) { 846 | auto roaring_iter = roarings.find(highBytes(x)); 847 | if (roaring_iter != roarings.cend()) 848 | roaring_iter->second.remove(lowBytes(x)); 849 | } 850 | 851 | /** 852 | * Remove value x 853 | * Returns true if a new value was removed, false if the value was not existing. 
854 | */ 855 | bool removeChecked(uint32_t x) { 856 | return roarings[0].removeChecked(x); 857 | } 858 | bool removeChecked(uint64_t x) { 859 | auto roaring_iter = roarings.find(highBytes(x)); 860 | if (roaring_iter != roarings.cend()) 861 | return roaring_iter->second.removeChecked(lowBytes(x)); 862 | return false; 863 | } 864 | 865 | /** 866 | * Return the largest value (if not empty) 867 | * 868 | */ 869 | uint64_t maximum() const { 870 | for (auto roaring_iter = roarings.crbegin(); 871 | roaring_iter != roarings.crend(); ++roaring_iter) { 872 | if (!roaring_iter->second.isEmpty()) { 873 | return uniteBytes(roaring_iter->first, 874 | roaring_iter->second.maximum()); 875 | } 876 | } 877 | // we put std::numeric_limits<>::max/min in parenthesis 878 | // to avoid a clash with the Windows.h header under Windows 879 | return (std::numeric_limits::min)(); 880 | } 881 | 882 | /** 883 | * Return the smallest value (if not empty) 884 | * 885 | */ 886 | uint64_t minimum() const { 887 | for (auto roaring_iter = roarings.cbegin(); 888 | roaring_iter != roarings.cend(); ++roaring_iter) { 889 | if (!roaring_iter->second.isEmpty()) { 890 | return uniteBytes(roaring_iter->first, 891 | roaring_iter->second.minimum()); 892 | } 893 | } 894 | // we put std::numeric_limits<>::max/min in parenthesis 895 | // to avoid a clash with the Windows.h header under Windows 896 | return (std::numeric_limits::max)(); 897 | } 898 | 899 | /** 900 | * Check if value x is present 901 | */ 902 | bool contains(uint32_t x) const { 903 | return roarings.count(0) == 0 ? false : roarings.at(0).contains(x); 904 | } 905 | bool contains(uint64_t x) const { 906 | return roarings.count(highBytes(x)) == 0 907 | ? false 908 | : roarings.at(highBytes(x)).contains(lowBytes(x)); 909 | } 910 | 911 | /** 912 | * Compute the intersection between the current bitmap and the provided 913 | * bitmap, 914 | * writing the result in the current bitmap. The provided bitmap is not 915 | * modified. 
916 | */ 917 | Roaring64Map &operator&=(const Roaring64Map &r) { 918 | for (auto &map_entry : roarings) { 919 | if (r.roarings.count(map_entry.first) == 1) 920 | map_entry.second &= r.roarings.at(map_entry.first); 921 | else 922 | map_entry.second = Roaring(); 923 | } 924 | return *this; 925 | } 926 | 927 | /** 928 | * Compute the difference between the current bitmap and the provided 929 | * bitmap, 930 | * writing the result in the current bitmap. The provided bitmap is not 931 | * modified. 932 | */ 933 | Roaring64Map &operator-=(const Roaring64Map &r) { 934 | for (auto &map_entry : roarings) { 935 | if (r.roarings.count(map_entry.first) == 1) 936 | map_entry.second -= r.roarings.at(map_entry.first); 937 | } 938 | return *this; 939 | } 940 | 941 | /** 942 | * Compute the union between the current bitmap and the provided bitmap, 943 | * writing the result in the current bitmap. The provided bitmap is not 944 | * modified. 945 | * 946 | * See also the fastunion function to aggregate many bitmaps more quickly. 947 | */ 948 | Roaring64Map &operator|=(const Roaring64Map &r) { 949 | for (const auto &map_entry : r.roarings) { 950 | if (roarings.count(map_entry.first) == 0) { 951 | roarings[map_entry.first] = map_entry.second; 952 | roarings[map_entry.first].setCopyOnWrite(copyOnWrite); 953 | } else 954 | roarings[map_entry.first] |= map_entry.second; 955 | } 956 | return *this; 957 | } 958 | 959 | /** 960 | * Compute the symmetric union between the current bitmap and the provided 961 | * bitmap, 962 | * writing the result in the current bitmap. The provided bitmap is not 963 | * modified. 
964 | */ 965 | Roaring64Map &operator^=(const Roaring64Map &r) { 966 | for (const auto &map_entry : r.roarings) { 967 | if (roarings.count(map_entry.first) == 0) { 968 | roarings[map_entry.first] = map_entry.second; 969 | roarings[map_entry.first].setCopyOnWrite(copyOnWrite); 970 | } else 971 | roarings[map_entry.first] ^= map_entry.second; 972 | } 973 | return *this; 974 | } 975 | 976 | /** 977 | * Exchange the content of this bitmap with another. 978 | */ 979 | void swap(Roaring64Map &r) { roarings.swap(r.roarings); } 980 | 981 | /** 982 | * Get the cardinality of the bitmap (number of elements). 983 | * Throws std::length_error in the special case where the bitmap is full 984 | * (cardinality() == 2^64). Check isFull() before calling to avoid 985 | * exception. 986 | */ 987 | uint64_t cardinality() const { 988 | if (isFull()) { 989 | throw std::length_error( 990 | "bitmap is full, cardinality is 2^64, " 991 | "unable to represent in a 64-bit integer"); 992 | } 993 | return std::accumulate( 994 | roarings.cbegin(), roarings.cend(), (uint64_t)0, 995 | [](uint64_t previous, 996 | const std::pair &map_entry) { 997 | return previous + map_entry.second.cardinality(); 998 | }); 999 | } 1000 | 1001 | /** 1002 | * Returns true if the bitmap is empty (cardinality is zero). 1003 | */ 1004 | bool isEmpty() const { 1005 | return std::all_of(roarings.cbegin(), roarings.cend(), 1006 | [](const std::pair &map_entry) { 1007 | return map_entry.second.isEmpty(); 1008 | }); 1009 | } 1010 | 1011 | /** 1012 | * Returns true if the bitmap is full (cardinality is max uint64_t + 1). 1013 | */ 1014 | bool isFull() const { 1015 | // only bother to check if map is fully saturated 1016 | // 1017 | // we put std::numeric_limits<>::max/min in parenthesis 1018 | // to avoid a clash with the Windows.h header under Windows 1019 | return roarings.size() == 1020 | ((size_t)(std::numeric_limits::max)()) + 1 1021 | ? 
std::all_of( 1022 | roarings.cbegin(), roarings.cend(), 1023 | [](const std::pair &roaring_map_entry) { 1024 | // roarings within map are saturated if cardinality 1025 | // is uint32_t max + 1 1026 | return roaring_map_entry.second.cardinality() == 1027 | ((uint64_t) 1028 | (std::numeric_limits::max)()) + 1029 | 1; 1030 | }) 1031 | : false; 1032 | } 1033 | 1034 | /** 1035 | * Returns true if the bitmap is subset of the other. 1036 | */ 1037 | bool isSubset(const Roaring64Map &r) const { 1038 | for (const auto &map_entry : roarings) { 1039 | auto roaring_iter = r.roarings.find(map_entry.first); 1040 | if (roaring_iter == roarings.cend()) 1041 | return false; 1042 | else if (!map_entry.second.isSubset(roaring_iter->second)) 1043 | return false; 1044 | } 1045 | return true; 1046 | } 1047 | 1048 | /** 1049 | * Returns true if the bitmap is strict subset of the other. 1050 | * Throws std::length_error in the special case where the bitmap is full 1051 | * (cardinality() == 2^64). Check isFull() before calling to avoid exception. 1052 | */ 1053 | bool isStrictSubset(const Roaring64Map &r) const { 1054 | return isSubset(r) && cardinality() != r.cardinality(); 1055 | } 1056 | 1057 | /** 1058 | * Convert the bitmap to an array. Write the output to "ans", 1059 | * caller is responsible to ensure that there is enough memory 1060 | * allocated 1061 | * (e.g., ans = new uint32[mybitmap.cardinality()];) 1062 | */ 1063 | void toUint64Array(uint64_t *ans) const { 1064 | // Annoyingly, VS 2017 marks std::accumulate() as [[nodiscard]] 1065 | (void)std::accumulate(roarings.cbegin(), roarings.cend(), ans, 1066 | [](uint64_t *previous, 1067 | const std::pair &map_entry) { 1068 | for (uint32_t low_bits : map_entry.second) 1069 | *previous++ = 1070 | uniteBytes(map_entry.first, low_bits); 1071 | return previous; 1072 | }); 1073 | } 1074 | 1075 | /** 1076 | * Return true if the two bitmaps contain the same elements. 
1077 | */ 1078 | bool operator==(const Roaring64Map &r) const { 1079 | // we cannot use operator == on the map because either side may contain 1080 | // empty Roaring Bitmaps 1081 | auto lhs_iter = roarings.cbegin(); 1082 | auto rhs_iter = r.roarings.cbegin(); 1083 | do { 1084 | // if the left map has reached its end, ensure that the right map 1085 | // contains only empty Bitmaps 1086 | if (lhs_iter == roarings.cend()) { 1087 | while (rhs_iter != r.roarings.cend()) { 1088 | if (rhs_iter->second.isEmpty()) { 1089 | ++rhs_iter; 1090 | continue; 1091 | } 1092 | return false; 1093 | } 1094 | return true; 1095 | } 1096 | // if the left map has an empty bitmap, skip it 1097 | if (lhs_iter->second.isEmpty()) { 1098 | ++lhs_iter; 1099 | continue; 1100 | } 1101 | 1102 | do { 1103 | // if the right map has reached its end, ensure that the right 1104 | // map contains only empty Bitmaps 1105 | if (rhs_iter == r.roarings.cend()) { 1106 | while (lhs_iter != roarings.cend()) { 1107 | if (lhs_iter->second.isEmpty()) { 1108 | ++lhs_iter; 1109 | continue; 1110 | } 1111 | return false; 1112 | } 1113 | return true; 1114 | } 1115 | // if the right map has an empty bitmap, skip it 1116 | if (rhs_iter->second.isEmpty()) { 1117 | ++rhs_iter; 1118 | continue; 1119 | } 1120 | } while (false); 1121 | // if neither map has reached its end ensure elements are equal and 1122 | // move to the next element in both 1123 | } while (lhs_iter++->second == rhs_iter++->second); 1124 | return false; 1125 | } 1126 | 1127 | /** 1128 | * compute the negation of the roaring bitmap within a specified interval. 1129 | * areas outside the range are passed through unchanged. 
1130 | */ 1131 | void flip(uint64_t range_start, uint64_t range_end) { 1132 | uint32_t start_high = highBytes(range_start); 1133 | uint32_t start_low = lowBytes(range_start); 1134 | uint32_t end_high = highBytes(range_end); 1135 | uint32_t end_low = lowBytes(range_end); 1136 | 1137 | if (start_high == end_high) { 1138 | roarings[start_high].flip(start_low, end_low); 1139 | return; 1140 | } 1141 | // we put std::numeric_limits<>::max/min in parenthesis 1142 | // to avoid a clash with the Windows.h header under Windows 1143 | roarings[start_high].flip(start_low, 1144 | (std::numeric_limits::max)()); 1145 | roarings[start_high++].setCopyOnWrite(copyOnWrite); 1146 | 1147 | for (; start_high <= highBytes(range_end) - 1; ++start_high) { 1148 | roarings[start_high].flip((std::numeric_limits::min)(), 1149 | (std::numeric_limits::max)()); 1150 | roarings[start_high].setCopyOnWrite(copyOnWrite); 1151 | } 1152 | 1153 | roarings[start_high].flip((std::numeric_limits::min)(), 1154 | end_low); 1155 | roarings[start_high].setCopyOnWrite(copyOnWrite); 1156 | } 1157 | 1158 | /** 1159 | * Remove run-length encoding even when it is more space efficient 1160 | * return whether a change was applied 1161 | */ 1162 | bool removeRunCompression() { 1163 | return std::accumulate( 1164 | roarings.begin(), roarings.end(), false, 1165 | [](bool previous, std::pair &map_entry) { 1166 | return map_entry.second.removeRunCompression() && previous; 1167 | }); 1168 | } 1169 | 1170 | /** convert array and bitmap containers to run containers when it is more 1171 | * efficient; 1172 | * also convert from run containers when more space efficient. Returns 1173 | * true if the result has at least one run container. 1174 | * Additional savings might be possible by calling shrinkToFit(). 
1175 | */ 1176 | bool runOptimize() { 1177 | return std::accumulate( 1178 | roarings.begin(), roarings.end(), false, 1179 | [](bool previous, std::pair &map_entry) { 1180 | return map_entry.second.runOptimize() && previous; 1181 | }); 1182 | } 1183 | 1184 | /** 1185 | * If needed, reallocate memory to shrink the memory usage. Returns 1186 | * the number of bytes saved. 1187 | */ 1188 | size_t shrinkToFit() { 1189 | size_t savedBytes = 0; 1190 | auto iter = roarings.begin(); 1191 | while (iter != roarings.cend()) { 1192 | if (iter->second.isEmpty()) { 1193 | // empty Roarings are 84 bytes 1194 | savedBytes += 88; 1195 | roarings.erase(iter++); 1196 | } else { 1197 | savedBytes += iter->second.shrinkToFit(); 1198 | iter++; 1199 | } 1200 | } 1201 | return savedBytes; 1202 | } 1203 | 1204 | /** 1205 | * Iterate over the bitmap elements. The function iterator is called once 1206 | * for all the values with ptr (can be NULL) as the second parameter of each 1207 | * call. 1208 | * 1209 | * roaring_iterator is simply a pointer to a function that returns bool 1210 | * (true means that the iteration should continue while false means that it 1211 | * should stop), and takes (uint32_t,void*) as inputs. 1212 | */ 1213 | void iterate(roaring_iterator64 iterator, void *ptr) const { 1214 | std::for_each(roarings.begin(), roarings.cend(), 1215 | [=](const std::pair &map_entry) { 1216 | roaring_iterate64(&map_entry.second.roaring, iterator, 1217 | uint64_t(map_entry.first) << 32, 1218 | ptr); 1219 | }); 1220 | } 1221 | 1222 | /** 1223 | * If the size of the roaring bitmap is strictly greater than rank, then 1224 | this 1225 | function returns true and set element to the element of given rank. 1226 | Otherwise, it returns false. 
1227 | */ 1228 | bool select(uint64_t rnk, uint64_t *element) const { 1229 | for (const auto &map_entry : roarings) { 1230 | uint64_t sub_cardinality = (uint64_t)map_entry.second.cardinality(); 1231 | if (rnk < sub_cardinality) { 1232 | *element = ((uint64_t)map_entry.first) << 32; 1233 | // assuming little endian 1234 | return map_entry.second.select((uint32_t)rnk, 1235 | ((uint32_t *)element)); 1236 | } 1237 | rnk -= sub_cardinality; 1238 | } 1239 | return false; 1240 | } 1241 | 1242 | /** 1243 | * Returns the number of integers that are smaller or equal to x. 1244 | */ 1245 | uint64_t rank(uint64_t x) const { 1246 | uint64_t result = 0; 1247 | auto roaring_destination = roarings.find(highBytes(x)); 1248 | if (roaring_destination != roarings.cend()) { 1249 | for (auto roaring_iter = roarings.cbegin(); 1250 | roaring_iter != roaring_destination; ++roaring_iter) { 1251 | result += roaring_iter->second.cardinality(); 1252 | } 1253 | result += roaring_destination->second.rank(lowBytes(x)); 1254 | return result; 1255 | } 1256 | roaring_destination = roarings.lower_bound(highBytes(x)); 1257 | for (auto roaring_iter = roarings.cbegin(); 1258 | roaring_iter != roaring_destination; ++roaring_iter) { 1259 | result += roaring_iter->second.cardinality(); 1260 | } 1261 | return result; 1262 | } 1263 | 1264 | /** 1265 | * write a bitmap to a char buffer. This is meant to be compatible with 1266 | * the 1267 | * Java and Go versions. Returns how many bytes were written which should be 1268 | * getSizeInBytes(). 1269 | * 1270 | * Setting the portable flag to false enable a custom format that 1271 | * can save space compared to the portable format (e.g., for very 1272 | * sparse bitmaps). 
1273 | */ 1274 | size_t write(char *buf, bool portable = true) const { 1275 | const char *orig = buf; 1276 | // push map size 1277 | *((uint64_t *)buf) = roarings.size(); 1278 | buf += sizeof(uint64_t); 1279 | std::for_each( 1280 | roarings.cbegin(), roarings.cend(), 1281 | [&buf, portable](const std::pair &map_entry) { 1282 | // push map key 1283 | memcpy(buf, &map_entry.first, 1284 | sizeof(uint32_t)); // this is undefined: 1285 | // *((uint32_t*)buf) = 1286 | // map_entry.first; 1287 | buf += sizeof(uint32_t); 1288 | // push map value Roaring 1289 | buf += map_entry.second.write(buf, portable); 1290 | }); 1291 | return buf - orig; 1292 | } 1293 | 1294 | /** 1295 | * read a bitmap from a serialized version. This is meant to be compatible 1296 | * with 1297 | * the 1298 | * Java and Go versions. 1299 | * 1300 | * Setting the portable flag to false enable a custom format that 1301 | * can save space compared to the portable format (e.g., for very 1302 | * sparse bitmaps). 1303 | * 1304 | * This function is unsafe in the sense that if you provide bad data, 1305 | * many bytes could be read, possibly causing a buffer overflow. See also readSafe. 1306 | */ 1307 | static Roaring64Map read(const char *buf, bool portable = true) { 1308 | Roaring64Map result; 1309 | // get map size 1310 | uint64_t map_size = *((uint64_t *)buf); 1311 | buf += sizeof(uint64_t); 1312 | for (uint64_t lcv = 0; lcv < map_size; lcv++) { 1313 | // get map key 1314 | uint32_t key; 1315 | memcpy(&key, buf, sizeof(uint32_t)); // this is undefined: uint32_t 1316 | // key = *((uint32_t*)buf); 1317 | buf += sizeof(uint32_t); 1318 | // read map value Roaring 1319 | Roaring read = Roaring::read(buf, portable); 1320 | result.emplaceOrInsert(key, read); 1321 | // forward buffer past the last Roaring Bitmap 1322 | buf += read.getSizeInBytes(portable); 1323 | } 1324 | return result; 1325 | } 1326 | 1327 | /** 1328 | * read a bitmap from a serialized version, reading no more than maxbytes bytes. 
1329 | * This is meant to be compatible with the Java and Go versions. 1330 | * 1331 | * Setting the portable flag to false enable a custom format that 1332 | * can save space compared to the portable format (e.g., for very 1333 | * sparse bitmaps). 1334 | */ 1335 | static Roaring64Map readSafe(const char *buf, size_t maxbytes) { 1336 | Roaring64Map result; 1337 | // get map size 1338 | uint64_t map_size = *((uint64_t *)buf); 1339 | buf += sizeof(uint64_t); 1340 | for (uint64_t lcv = 0; lcv < map_size; lcv++) { 1341 | // get map key 1342 | if(maxbytes < sizeof(uint32_t)) { 1343 | throw std::runtime_error("ran out of bytes"); 1344 | } 1345 | uint32_t key; 1346 | memcpy(&key, buf, sizeof(uint32_t)); // this is undefined: uint32_t 1347 | // key = *((uint32_t*)buf); 1348 | buf += sizeof(uint32_t); 1349 | maxbytes -= sizeof(uint32_t); 1350 | // read map value Roaring 1351 | Roaring read = Roaring::readSafe(buf, maxbytes); 1352 | result.emplaceOrInsert(key, read); 1353 | // forward buffer past the last Roaring Bitmap 1354 | size_t tz = read.getSizeInBytes(true); 1355 | buf += tz; 1356 | maxbytes -= tz; 1357 | } 1358 | return result; 1359 | } 1360 | 1361 | /** 1362 | * How many bytes are required to serialize this bitmap (meant to be 1363 | * compatible 1364 | * with Java and Go versions) 1365 | * 1366 | * Setting the portable flag to false enable a custom format that 1367 | * can save space compared to the portable format (e.g., for very 1368 | * sparse bitmaps). 
1369 | */ 1370 | size_t getSizeInBytes(bool portable = true) const { 1371 | // start with, respectively, map size and size of keys for each map 1372 | // entry 1373 | return std::accumulate( 1374 | roarings.cbegin(), roarings.cend(), 1375 | sizeof(uint64_t) + roarings.size() * sizeof(uint32_t), 1376 | [=](size_t previous, 1377 | const std::pair &map_entry) { 1378 | // add in bytes used by each Roaring 1379 | return previous + map_entry.second.getSizeInBytes(portable); 1380 | }); 1381 | } 1382 | 1383 | /** 1384 | * Computes the intersection between two bitmaps and returns new bitmap. 1385 | * The current bitmap and the provided bitmap are unchanged. 1386 | */ 1387 | Roaring64Map operator&(const Roaring64Map &o) const { 1388 | return Roaring64Map(*this) &= o; 1389 | } 1390 | 1391 | /** 1392 | * Computes the difference between two bitmaps and returns new bitmap. 1393 | * The current bitmap and the provided bitmap are unchanged. 1394 | */ 1395 | Roaring64Map operator-(const Roaring64Map &o) const { 1396 | return Roaring64Map(*this) -= o; 1397 | } 1398 | 1399 | /** 1400 | * Computes the union between two bitmaps and returns new bitmap. 1401 | * The current bitmap and the provided bitmap are unchanged. 1402 | */ 1403 | Roaring64Map operator|(const Roaring64Map &o) const { 1404 | return Roaring64Map(*this) |= o; 1405 | } 1406 | 1407 | /** 1408 | * Computes the symmetric union between two bitmaps and returns new bitmap. 1409 | * The current bitmap and the provided bitmap are unchanged. 1410 | */ 1411 | Roaring64Map operator^(const Roaring64Map &o) const { 1412 | return Roaring64Map(*this) ^= o; 1413 | } 1414 | 1415 | /** 1416 | * Whether or not we apply copy and write. 
1417 | */ 1418 | void setCopyOnWrite(bool val) { 1419 | if (copyOnWrite == val) return; 1420 | copyOnWrite = val; 1421 | std::for_each(roarings.begin(), roarings.end(), 1422 | [=](std::pair &map_entry) { 1423 | map_entry.second.setCopyOnWrite(val); 1424 | }); 1425 | } 1426 | 1427 | /** 1428 | * Print the content of the bitmap 1429 | */ 1430 | void printf() const { 1431 | if (!isEmpty()) { 1432 | auto map_iter = roarings.cbegin(); 1433 | while (map_iter->second.isEmpty()) ++map_iter; 1434 | struct iter_data { 1435 | uint32_t high_bits; 1436 | char first_char = '{'; 1437 | } outer_iter_data; 1438 | outer_iter_data.high_bits = roarings.begin()->first; 1439 | map_iter->second.iterate( 1440 | [](uint32_t low_bits, void *inner_iter_data) -> bool { 1441 | std::printf("%c%llu", 1442 | ((iter_data *)inner_iter_data)->first_char, 1443 | (long long unsigned)uniteBytes( 1444 | ((iter_data *)inner_iter_data)->high_bits, 1445 | low_bits)); 1446 | ((iter_data *)inner_iter_data)->first_char = ','; 1447 | return true; 1448 | }, 1449 | (void *)&outer_iter_data); 1450 | std::for_each( 1451 | ++map_iter, roarings.cend(), 1452 | [](const std::pair &map_entry) { 1453 | map_entry.second.iterate( 1454 | [](uint32_t low_bits, void *high_bits) -> bool { 1455 | std::printf(",%llu", 1456 | (long long unsigned)uniteBytes( 1457 | *(uint32_t *)high_bits, low_bits)); 1458 | return true; 1459 | }, 1460 | (void *)&map_entry.first); 1461 | }); 1462 | } else 1463 | std::printf("{"); 1464 | std::printf("}\n"); 1465 | } 1466 | 1467 | /** 1468 | * Print the content of the bitmap into a string 1469 | */ 1470 | std::string toString() const { 1471 | struct iter_data { 1472 | std::string str; 1473 | uint32_t high_bits; 1474 | char first_char = '{'; 1475 | } outer_iter_data; 1476 | if (!isEmpty()) { 1477 | auto map_iter = roarings.cbegin(); 1478 | while (map_iter->second.isEmpty()) ++map_iter; 1479 | outer_iter_data.high_bits = roarings.begin()->first; 1480 | map_iter->second.iterate( 1481 | [](uint32_t 
low_bits, void *inner_iter_data) -> bool { 1482 | ((iter_data *)inner_iter_data)->str += 1483 | ((iter_data *)inner_iter_data)->first_char; 1484 | ((iter_data *)inner_iter_data)->str += std::to_string( 1485 | uniteBytes(((iter_data *)inner_iter_data)->high_bits, 1486 | low_bits)); 1487 | ((iter_data *)inner_iter_data)->first_char = ','; 1488 | return true; 1489 | }, 1490 | (void *)&outer_iter_data); 1491 | std::for_each( 1492 | ++map_iter, roarings.cend(), 1493 | [&outer_iter_data]( 1494 | const std::pair &map_entry) { 1495 | outer_iter_data.high_bits = map_entry.first; 1496 | map_entry.second.iterate( 1497 | [](uint32_t low_bits, void *inner_iter_data) -> bool { 1498 | ((iter_data *)inner_iter_data)->str += 1499 | ((iter_data *)inner_iter_data)->first_char; 1500 | ((iter_data *)inner_iter_data)->str += 1501 | std::to_string(uniteBytes( 1502 | ((iter_data *)inner_iter_data)->high_bits, 1503 | low_bits)); 1504 | return true; 1505 | }, 1506 | (void *)&outer_iter_data); 1507 | }); 1508 | } else 1509 | outer_iter_data.str = '{'; 1510 | outer_iter_data.str += '}'; 1511 | return outer_iter_data.str; 1512 | } 1513 | 1514 | /** 1515 | * Whether or not copy and write is active. 1516 | */ 1517 | bool getCopyOnWrite() const { return copyOnWrite; } 1518 | 1519 | /** 1520 | * computes the logical or (union) between "n" bitmaps (referenced by a 1521 | * pointer). 1522 | */ 1523 | static Roaring64Map fastunion(size_t n, const Roaring64Map **inputs) { 1524 | Roaring64Map ans; 1525 | // not particularly fast 1526 | for (size_t lcv = 0; lcv < n; ++lcv) { 1527 | ans |= *(inputs[lcv]); 1528 | } 1529 | return ans; 1530 | } 1531 | 1532 | friend class Roaring64MapSetBitForwardIterator; 1533 | typedef Roaring64MapSetBitForwardIterator const_iterator; 1534 | 1535 | /** 1536 | * Returns an iterator that can be used to access the position of the 1537 | * set bits. 
The running time complexity of a full scan is proportional to 1538 | * the 1539 | * number 1540 | * of set bits: be aware that if you have long strings of 1s, this can be 1541 | * very inefficient. 1542 | * 1543 | * It can be much faster to use the toArray method if you want to 1544 | * retrieve the set bits. 1545 | */ 1546 | const_iterator begin() const; 1547 | 1548 | /** 1549 | * A bogus iterator that can be used together with begin() 1550 | * for constructions such as for(auto i = b.begin(); 1551 | * i!=b.end(); ++i) {} 1552 | */ 1553 | const_iterator end() const; 1554 | 1555 | private: 1556 | std::map roarings; 1557 | bool copyOnWrite = false; 1558 | static uint32_t highBytes(const uint64_t in) { return uint32_t(in >> 32); } 1559 | static uint32_t lowBytes(const uint64_t in) { return uint32_t(in); } 1560 | static uint64_t uniteBytes(const uint32_t highBytes, 1561 | const uint32_t lowBytes) { 1562 | return (uint64_t(highBytes) << 32) | uint64_t(lowBytes); 1563 | } 1564 | // this is needed to tolerate gcc's C++11 libstdc++ lacking emplace 1565 | // prior to version 4.8 1566 | void emplaceOrInsert(const uint32_t key, const Roaring &value) { 1567 | #if defined(__GLIBCXX__) && __GLIBCXX__ < 20130322 1568 | roarings.insert(std::make_pair(key, value)); 1569 | #else 1570 | roarings.emplace(std::make_pair(key, value)); 1571 | #endif 1572 | } 1573 | }; 1574 | 1575 | /** 1576 | * Used to go through the set bits. Not optimally fast, but convenient. 1577 | */ 1578 | class Roaring64MapSetBitForwardIterator final { 1579 | public: 1580 | typedef std::forward_iterator_tag iterator_category; 1581 | typedef uint64_t *pointer; 1582 | typedef uint64_t &reference_type; 1583 | typedef uint64_t value_type; 1584 | typedef int64_t difference_type; 1585 | typedef Roaring64MapSetBitForwardIterator type_of_iterator; 1586 | 1587 | /** 1588 | * Provides the location of the set bit. 
1589 | */ 1590 | value_type operator*() const { 1591 | return Roaring64Map::uniteBytes(map_iter->first, i.current_value); 1592 | } 1593 | 1594 | bool operator<(const type_of_iterator &o) { 1595 | if (map_iter == map_end) return false; 1596 | if (o.map_iter == o.map_end) return true; 1597 | return **this < *o; 1598 | } 1599 | 1600 | bool operator<=(const type_of_iterator &o) { 1601 | if (o.map_iter == o.map_end) return true; 1602 | if (map_iter == map_end) return false; 1603 | return **this <= *o; 1604 | } 1605 | 1606 | bool operator>(const type_of_iterator &o) { 1607 | if (o.map_iter == o.map_end) return false; 1608 | if (map_iter == map_end) return true; 1609 | return **this > *o; 1610 | } 1611 | 1612 | bool operator>=(const type_of_iterator &o) { 1613 | if (map_iter == map_end) return true; 1614 | if (o.map_iter == o.map_end) return false; 1615 | return **this >= *o; 1616 | } 1617 | 1618 | type_of_iterator &operator++() { // ++i, must returned inc. value 1619 | if (i.has_value == true) roaring_advance_uint32_iterator(&i); 1620 | while (!i.has_value) { 1621 | map_iter++; 1622 | if (map_iter == map_end) return *this; 1623 | roaring_init_iterator(&map_iter->second.roaring, &i); 1624 | } 1625 | return *this; 1626 | } 1627 | 1628 | type_of_iterator operator++(int) { // i++, must return orig. 
value 1629 | Roaring64MapSetBitForwardIterator orig(*this); 1630 | roaring_advance_uint32_iterator(&i); 1631 | while (!i.has_value) { 1632 | map_iter++; 1633 | if (map_iter == map_end) return orig; 1634 | roaring_init_iterator(&map_iter->second.roaring, &i); 1635 | } 1636 | return orig; 1637 | } 1638 | 1639 | bool operator==(const Roaring64MapSetBitForwardIterator &o) { 1640 | if (map_iter == map_end && o.map_iter == o.map_end) return true; 1641 | if (o.map_iter == o.map_end) return false; 1642 | return **this == *o; 1643 | } 1644 | 1645 | bool operator!=(const Roaring64MapSetBitForwardIterator &o) { 1646 | if (map_iter == map_end && o.map_iter == o.map_end) return false; 1647 | if (o.map_iter == o.map_end) return true; 1648 | return **this != *o; 1649 | } 1650 | 1651 | Roaring64MapSetBitForwardIterator(const Roaring64Map &parent, 1652 | bool exhausted = false) 1653 | : map_end(parent.roarings.cend()) { 1654 | if (exhausted || parent.roarings.empty()) { 1655 | map_iter = parent.roarings.cend(); 1656 | } else { 1657 | map_iter = parent.roarings.cbegin(); 1658 | roaring_init_iterator(&map_iter->second.roaring, &i); 1659 | while (!i.has_value) { 1660 | map_iter++; 1661 | if (map_iter == map_end) return; 1662 | roaring_init_iterator(&map_iter->second.roaring, &i); 1663 | } 1664 | } 1665 | } 1666 | 1667 | private: 1668 | std::map::const_iterator map_iter; 1669 | std::map::const_iterator map_end; 1670 | roaring_uint32_iterator_t i; 1671 | }; 1672 | 1673 | inline Roaring64MapSetBitForwardIterator Roaring64Map::begin() const { 1674 | return Roaring64MapSetBitForwardIterator(*this); 1675 | } 1676 | 1677 | inline Roaring64MapSetBitForwardIterator Roaring64Map::end() const { 1678 | return Roaring64MapSetBitForwardIterator(*this, true); 1679 | } 1680 | 1681 | #endif /* INCLUDE_ROARING_64_MAP_HH_ */ 1682 | /* end file /Users/dlemire/CVS/github/CRoaring/cpp/roaring64map.hh */ --------------------------------------------------------------------------------