├── outrank ├── algorithms │ ├── __init__.py │ ├── sketches │ │ ├── __init__.py │ │ ├── counting_counters_ordinary.py │ │ ├── counting_cms.py │ │ └── counting_ultiloglog.py │ ├── feature_ranking │ │ ├── __init__.py │ │ ├── ranking_cov_alignment.py │ │ ├── ranking_mi_numba.py │ │ └── ranking_mi_numba_opt.py │ └── synthetic_data_generators │ │ ├── __init__.py │ │ └── generator_naive.py ├── visualizations │ ├── __init__.py │ └── ranking_visualization.py ├── feature_transformations │ ├── __init__.py │ ├── feature_transformer_vault │ │ ├── __init__.py │ │ └── fw_transformers.py │ └── ranking_transformers.py ├── __init__.py ├── core_selftest.py ├── task_visualization.py ├── task_generators.py ├── task_selftest.py ├── task_instance_ranking.py ├── task_summary.py ├── __main__.py └── task_ranking.py ├── MANIFEST.in ├── scripts ├── run_unit_tests.sh ├── run_minimal.sh └── run_benchmarks.sh ├── tests ├── __init__.py ├── tests_files │ ├── data.csv │ └── vw_namespace_map.csv ├── test_ref_model.json ├── fw_transformers_test.py ├── hll_test.py ├── data_io_test.py ├── ranking_module_test.py ├── cms_test.py ├── cov_heu_test.py ├── multivalue_mi_test.py └── mi_numba_test.py ├── pyproject.toml ├── benchmarks ├── comparison.png ├── README.md ├── generator_second_order.py ├── generator_third_order.py ├── analyse_rankings.py ├── data_regression_experiment.sh └── generator_naive.py ├── examples ├── data.csv ├── simple_transformers.json ├── custom_transformers.json ├── run_ranking_3MR.sh ├── run_multivalue_example.sh ├── run_ranking_singles.sh ├── run_ranking_pairwise.sh ├── run_ranking_opt.sh ├── run_ranking_prior.sh ├── multivalue_data.csv ├── run_ranking_transformations.sh ├── run_ranking_combinations.sh ├── README.md ├── recursive_ranking.py └── multirank.py ├── .flake8 ├── docs ├── index.html ├── build_docs.sh └── DOCSMAIN.md ├── requirements.txt ├── TODO.md ├── clean_repo.sh ├── .github └── workflows │ ├── selftest.yml │ ├── python-unit.yml │ ├── benchmarks.yml │ └── python-package.yml ├── setup.py ├── .pre-commit-config.yaml ├── LICENSE.md ├── .gitignore ├── test_coverage_summary.py └── README.md /outrank/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /outrank/visualizations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /outrank/algorithms/sketches/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /outrank/feature_transformations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /outrank/algorithms/feature_ranking/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/run_unit_tests.sh: -------------------------------------------------------------------------------- 1 | python -m pytest . 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Tests module initialization -------------------------------------------------------------------------------- /outrank/algorithms/synthetic_data_generators/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.autopep8] 2 | in-place = true 3 | list-fixes = true 4 | ignore = "W690" 5 | -------------------------------------------------------------------------------- /tests/tests_files/data.csv: -------------------------------------------------------------------------------- 1 | f1,f2,f3,f4 2 | 1.0,TS,23,12 3 | 1.2,TA,222,15 4 | 1.4,TC,252,15 5 | -------------------------------------------------------------------------------- /tests/tests_files/vw_namespace_map.csv: -------------------------------------------------------------------------------- 1 | AE,f1,f32 2 | AK,f2,f32 3 | As,f3,f32 4 | AR,f4, 5 | Ae,f5, 6 | -------------------------------------------------------------------------------- /benchmarks/comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/outbrain-inc/outrank/HEAD/benchmarks/comparison.png -------------------------------------------------------------------------------- /tests/test_ref_model.json: -------------------------------------------------------------------------------- 1 | { 2 | "desc": { 3 | "features": ["f0","f1","f0,f1"] 4 | } 5 | } -------------------------------------------------------------------------------- /outrank/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. include:: ../docs/DOCSMAIN.md 3 | """ 4 | from __future__ import annotations 5 | -------------------------------------------------------------------------------- /examples/data.csv: -------------------------------------------------------------------------------- 1 | feature1,feature2,target 2 | 1.0,0.5,1 3 | 4.0,1.0,0 4 | 9.0,1.5,1 5 | 16.0,2.0,0 6 | 25.0,2.5,1 7 | 36.0,3.0,0 -------------------------------------------------------------------------------- /outrank/core_selftest.py: -------------------------------------------------------------------------------- 1 | # helper set of methods that enable anywhere verification of core functions 2 | from __future__ import annotations 3 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = ANN001,ANN201,ANN202,ANN203,ANN205 3 | extend-ignore = ANN101,F824 4 | exclude = .git,__pycache__,build,dist,tests,gunicorn.py,swagger.py,main.py 5 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /docs/build_docs.sh: -------------------------------------------------------------------------------- 1 | # Note: this requires pdoc>=14.1.0 to run (pip install pdoc>=14.1.0) 2 | rm -rvf index.html outrank outrank.html search.js; 3 | cd ..; 4 | pdoc ./outrank -o docs; 5 | -------------------------------------------------------------------------------- /examples/simple_transformers.json: -------------------------------------------------------------------------------- 1 | { 2 | "_tr_sqrt": "np.sqrt(X)", 3 | "_tr_log": "np.log(X + 1)", 4 | "_tr_square": "np.square(X)", 5 | "_tr_abs": "np.abs(X)", 6 | "_tr_exp": "np.exp(X)" 7 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flake8>=6.1.0 2 | matplotlib>=3.7.2 3 | numba>=0.55.1 4 | numpy>=1.21.6 5 | pandas>=1.3.1 6 | pathos>=0.2.9 7 | pre-commit>=3.4.0 8 | scikit-learn>=0.24.1 9 | scipy>=1.8.1 10 | seaborn>=0.12 11 | tqdm>=4.63.0 12 | xxhash>=3.0.0 13 | zstandard==0.22.0 14 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # a suite of a bit longer (regression) tests 2 | 3 | By running `data_regression_experiment.sh`, you can conduct a stand-alone experiment that demonstrates the rankings' capability of approximating the scores obtained by using the full data set. 4 | 5 | ![comparison](./comparison.png) 6 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # Some TODOs 2 | 3 | 1. Documentation of the derived heuristics and what hyperparameters mean (3mr). 4 | 2. Logging unified 5 | 3. Benchmark CI for core components as a regression test 6 | 4. Gradual speedups/rewrites of main algorithms 7 | 5. Documenting new/extra features (subfeatures etc.) 8 | 6. Get rid of Pandas, let's use something more efficient instead 9 | 7. Tree-based explanation for stream-like data 10 | -------------------------------------------------------------------------------- /examples/custom_transformers.json: -------------------------------------------------------------------------------- 1 | { 2 | "_tr_custom_sigmoid": "1 / (1 + np.exp(-X))", 3 | "_tr_custom_tanh": "np.tanh(X)", 4 | "_tr_custom_relu": "np.maximum(0, X)", 5 | "_tr_custom_normalize": "(X - np.min(X)) / (np.max(X) - np.min(X))", 6 | "_tr_custom_zscore": "(X - np.mean(X)) / np.std(X)", 7 | "_tr_custom_log_sigmoid": "np.log(1 / (1 + np.exp(-X)))", 8 | "_tr_custom_softplus": "np.log(1 + np.exp(X))" 9 | } -------------------------------------------------------------------------------- /clean_repo.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/bin/bash 3 | 4 | # isort 5 | isort . 6 | 7 | ## emacs noise ;) 8 | find . -name '*~' -type f -delete 9 | 10 | ## other noise - more robust cleanup 11 | find . -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true 12 | find . -name "*.pyc" -delete 13 | find . -name "*.pyo" -delete 14 | 15 | ## import cleanup 16 | find . -name '*.py' | xargs autoflake --in-place --remove-unused-variables --expand-star-imports 17 | 18 | ## formatting 19 | find . -name '*.py' -print0 | xargs -0 yapf -i 20 | 21 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 22 | 23 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 24 | flake8 . --count --exit-zero --max-complexity=10 --statistics 25 | -------------------------------------------------------------------------------- /outrank/task_visualization.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import os 5 | 6 | import pandas as pd 7 | 8 | from outrank.visualizations.ranking_visualization import visualize_all 9 | 10 | logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) 11 | 12 | 13 | def outrank_task_visualize_results(args): 14 | logging.info(f'Beginning visualization based on: {args.output_folder}.') 15 | 16 | triplets = pd.read_csv( 17 | os.path.join(args.output_folder, 'pairwise_ranks.tsv'), sep='\t', 18 | ) 19 | visualize_all( 20 | triplets, 21 | args.output_folder, 22 | args.label_column, 23 | args.reference_model_JSON, 24 | image_format=args.image_format, 25 | heuristic=args.heuristic, 26 | ) 27 | -------------------------------------------------------------------------------- /examples/run_ranking_3MR.sh: -------------------------------------------------------------------------------- 1 | ########################################################################################################## 2 | # Performing non-myopic ranking with 3MR 3 | ########################################################################################################## 4 | 5 | # This run computes 3MR-based rankings, see repo's papers for more details. 6 | # hint - if unsure what parameters do, you can always run "outrank --help" 7 | 8 | outrank \ 9 | --task all \ 10 | --data_path $PATH_TO_YOUR_DATA \ 11 | --data_source csv-raw \ 12 | --heuristic MI-numba-3mr \ 13 | --target_ranking_only True \ 14 | --combination_number_upper_bound 2048 \ 15 | --num_threads 12 \ 16 | --interaction_order 1 \ 17 | --transformers fw-transformers \ 18 | --output_folder ./some_output_folder \ 19 | --subsampling 30 20 | -------------------------------------------------------------------------------- /examples/run_multivalue_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ########################################################################################################## 4 | # Multivalue MI ranking 5 | ########################################################################################################## 6 | 7 | # This run demonstrates multivalue MI computation with cardinality correction. 8 | # Use '_' as delimiter in CSV for multivalue features (e.g., "sports_music"). 9 | # hint - if unsure what parameters do, you can always run "outrank --help" 10 | 11 | outrank \ 12 | --task all \ 13 | --data_path examples/multivalue_data.csv \ 14 | --data_source csv-raw \ 15 | --heuristic MI-multivalue-set-randomized \ 16 | --target_ranking_only True \ 17 | --combination_number_upper_bound 2048 \ 18 | --num_threads 8 \ 19 | --output_folder ./ranking_outputs_multivalue \ 20 | --subsampling 100 21 | -------------------------------------------------------------------------------- /examples/run_ranking_singles.sh: -------------------------------------------------------------------------------- 1 | ########################################################################################################## 2 | # A very generic OutRank invocation (default). It includes visualizations and other relevant statistics. # 3 | ########################################################################################################## 4 | 5 | # This run compares features "one-at-a-time" and summarizes, visualizes the outputs. 6 | # hint - if unsure what parameters do, you can always run "outrank --help" 7 | 8 | outrank \ 9 | --task all \ 10 | --data_path $PATH_TO_YOUR_DATA \ 11 | --data_source csv-raw \ 12 | --heuristic MI-numba-randomized \ 13 | --subfeature_mapping f12->f32;f1<->f41 \ 14 | --target_ranking_only True \ 15 | --combination_number_upper_bound 2048 \ 16 | --num_threads 12 \ 17 | --output_folder ./some_output_folder \ 18 | --subsampling 10 19 | -------------------------------------------------------------------------------- /outrank/algorithms/feature_ranking/ranking_cov_alignment.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import numpy.typing as npt 5 | 6 | np.random.seed(123) 7 | max_size = 10**6 8 | 9 | 10 | def max_pair_coverage(array1: npt.NDArray[np.int32], array2: npt.NDArray[np.int32]) -> float: 11 | def hash_pair(el1: np.int32, el2: np.int32): 12 | return (el1 * 1471343 - el2) % max_size 13 | 14 | counts = np.zeros(max_size, dtype=np.int32) 15 | tot_len = len(array1) 16 | for i in range(tot_len): 17 | identifier = hash_pair(array1[i], array2[i]) 18 | counts[identifier] += 1 19 | 20 | return np.max(counts) / tot_len 21 | 22 | 23 | if __name__ == '__main__': 24 | 25 | array1 = np.array([1,1,2,3,1,1,1,5] * 100000) 26 | array2 = np.array([0,0,5,5,3,0,0,0] * 100000) 27 | coverage = max_pair_coverage(array1, array2) 28 | assert coverage == 0.5 29 | -------------------------------------------------------------------------------- /examples/run_ranking_pairwise.sh: -------------------------------------------------------------------------------- 1 | ########################################################################################################## 2 | # Pairwise feature ranking (feature redundancy calculation) 3 | ########################################################################################################## 4 | 5 | # This run demonstrates how to obtain "feature heatmaps" - pairwise summaries of mutual redundancy 6 | # Note that pairwise calculations take more time - increasing thread count is a possible mitigation 7 | 8 | # hint - if unsure what parameters do, you can always run "outrank --help" 9 | outrank \ 10 | --task all \ 11 | --data_path $PATH_TO_YOUR_DATA \ 12 | --data_source csv-raw \ 13 | --heuristic MI-numba-randomized \ 14 | --target_ranking_only False \ 15 | --combination_number_upper_bound 2048 \ 16 | --num_threads 50 \ 17 | --output_folder ./some_output_folder \ 18 | --subsampling 100 19 | -------------------------------------------------------------------------------- /examples/run_ranking_opt.sh: -------------------------------------------------------------------------------- 1 | ########################################################################################################## 2 | # An optimization of generic OutRank invocation. It includes visualizations and other relevant statistics. # 3 | ########################################################################################################## 4 | 5 | # This run compares features "one-at-a-time" and summarizes, visualizes the outputs. 6 | # Heuristic is the same as MI-numba-randomized, but the code is optimized for faster executuion. 7 | # hint - if unsure what parameters do, you can always run "outrank --help" 8 | 9 | outrank \ 10 | --task all \ 11 | --data_path $PATH_TO_YOUR_DATA \ 12 | --data_source csv-raw \ 13 | --heuristic MI-numba-randomized-opt \ 14 | --target_ranking_only True \ 15 | --combination_number_upper_bound 2048 \ 16 | --num_threads 12 \ 17 | --interaction_order 1 \ 18 | --output_folder ./some_output_folder \ 19 | --subsampling 30 20 | -------------------------------------------------------------------------------- /examples/run_ranking_prior.sh: -------------------------------------------------------------------------------- 1 | ########################################################################################################## 2 | # A very generic OutRank invocation (default). It includes visualizations and other relevant statistics. # 3 | ########################################################################################################## 4 | 5 | # This run compares features "one-at-a-time" and summarizes, visualizes the outputs. 6 | # hint - if unsure what parameters do, you can always run "outrank --help" 7 | 8 | outrank \ 9 | --task all \ 10 | --data_path $PATH_TO_YOUR_DATA \ 11 | --data_source ob-csv \ 12 | --heuristic surrogate-SGD-prior \ 13 | --target_ranking_only True \ 14 | --interaction_order 2 \ 15 | --combination_number_upper_bound 2048 \ 16 | --num_threads 12 \ 17 | --output_folder ./some_output_folder \ 18 | --subsampling 100 \ 19 | --minibatch_size 10000 \ 20 | --label_column info_click_valid \ 21 | --reference_model_JSON $PATH_TO_YOUR_REFERENCE_MODEL 22 | -------------------------------------------------------------------------------- /examples/multivalue_data.csv: -------------------------------------------------------------------------------- 1 | user_id,interests,skills,purchased,satisfaction 2 | 1,sports_music,python_sql,laptop_phone,high 3 | 2,music_tech,java_sql,phone_tablet,high 4 | 3,sports_tech,python_java,laptop_tablet,medium 5 | 4,music_art,r_python,phone_headphones,high 6 | 5,sports_art,sql_r,laptop_headphones,medium 7 | 6,tech_art,java_r,tablet_headphones,low 8 | 7,sports_music_tech,python_sql_java,laptop_phone_tablet,high 9 | 8,music_art_tech,r_python_sql,phone_headphones_tablet,medium 10 | 9,sports_music,python_java,laptop_phone,high 11 | 10,tech_art,sql_java,tablet_headphones,low 12 | 11,music_tech,python_r,phone_laptop,high 13 | 12,sports_art,java_sql,tablet_phone,medium 14 | 13,music_art,r_sql,headphones_laptop,medium 15 | 14,sports_tech,python_sql,laptop_tablet,high 16 | 15,tech_art,java_r,tablet_phone,low 17 | 16,sports_music,python_java_sql,laptop_phone_tablet,high 18 | 17,music_tech_art,r_python_java,phone_headphones_tablet,high 19 | 18,sports_tech,sql_java,laptop_tablet,medium 20 | 19,music_art,python_r,phone_headphones,medium 21 | 20,sports_music_tech,python_sql_r,laptop_phone,high 22 | -------------------------------------------------------------------------------- /outrank/feature_transformations/feature_transformer_vault/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from outrank.feature_transformations.feature_transformer_vault.default_transformers import DEFAULT_TRANSFORMERS 4 | from outrank.feature_transformations.feature_transformer_vault.default_transformers import EXTENDED_ROUNDED_TRANSFORMERS 5 | from outrank.feature_transformations.feature_transformer_vault.default_transformers import EXTENDED_TRANSFORMERS 6 | from outrank.feature_transformations.feature_transformer_vault.default_transformers import MINIMAL_TRANSFORMERS 7 | from outrank.feature_transformations.feature_transformer_vault.default_transformers import VERBOSE_TRANSFORMERS 8 | from outrank.feature_transformations.feature_transformer_vault.fw_transformers import \ 9 | FW_TRANSFORMERS 10 | 11 | _tr_global_namespace = { 12 | 'default': DEFAULT_TRANSFORMERS, 13 | 'minimal': MINIMAL_TRANSFORMERS, 14 | 'fw-transformers': FW_TRANSFORMERS, 15 | 'extended': EXTENDED_TRANSFORMERS, 16 | 'verbose': VERBOSE_TRANSFORMERS, 17 | 'extended_rounded': EXTENDED_ROUNDED_TRANSFORMERS, 18 | } 19 | -------------------------------------------------------------------------------- /scripts/run_minimal.sh: -------------------------------------------------------------------------------- 1 | 2 | # An example benchmark data set 3 | DATA_ENDPOINT="https://raw.githubusercontent.com/shenweichen/DeepCTR/master/examples/avazu_sample.txt" 4 | pip install . --upgrade; 5 | cd benchmarks; 6 | 7 | ################################################################### 8 | #.................................................................. 9 | ################################################################### 10 | # Can we find a needle 11 | 12 | rm -r ranking_outputs; rm -r dataset_naive; 13 | wget $DATA_ENDPOINT 14 | cat avazu_sample.txt avazu_sample.txt avazu_sample.txt > tmp.txt; mv tmp.txt avazu_sample.txt; rm -rf tmp.txt; 15 | mkdir avazu; mv avazu_sample.txt avazu/data.csv;rm -rf avazu_sample.csv; 16 | 17 | # Run the feature ranking by using 3MR heuristic 18 | outrank --data_path avazu --data_source csv-raw --subsampling 1 --task all --heuristic MI-numba-3mr --target_ranking_only False --interaction_order 1 --output_folder ./ranking_outputs --minibatch_size 100 --label click; 19 | 20 | echo "Ranking outputs are present in benchmarks/ranking_outputs .." 21 | ls ranking_outputs; 22 | 23 | cd ..; 24 | -------------------------------------------------------------------------------- /.github/workflows/selftest.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Selftest 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | pip install . --upgrade 33 | 34 | - name: Run selftest 35 | run: | 36 | outrank --task selftest 37 | -------------------------------------------------------------------------------- /.github/workflows/python-unit.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Unit tests 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.9", "3.10", "3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | pip install . --upgrade 33 | - name: Unit tests 34 | run: | 35 | python -m pytest ./tests/*.py 36 | -------------------------------------------------------------------------------- /.github/workflows/benchmarks.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Benchmark 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | pip install . --upgrade 33 | 34 | - name: Run benchmark tests 35 | run: | 36 | bash scripts/run_benchmarks.sh CI 37 | -------------------------------------------------------------------------------- /outrank/algorithms/sketches/counting_counters_ordinary.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections import Counter 4 | 5 | 6 | class PrimitiveConstrainedCounter: 7 | """ 8 | A memory-efficient implementation of the count min sketch algorithm with optimized hashing using Numba JIT. 9 | """ 10 | 11 | def __init__(self, bound: int=(10**4) * 3): 12 | self.max_bound_thr = bound 13 | self.default_counter: Counter = Counter() 14 | 15 | def batch_add(self, lst): 16 | if len(self.default_counter) < self.max_bound_thr: 17 | self.default_counter = self.default_counter + Counter(lst) 18 | 19 | def add(self, val): 20 | if len(self.default_counter) < self.max_bound_thr: 21 | self.default_counter[val] += 1 22 | 23 | 24 | if __name__ == '__main__': 25 | from collections import Counter 26 | 27 | depth = 8 28 | width = 2**22 29 | cms = PrimitiveConstrainedCounter() 30 | 31 | items = [1, 1, 2, 3, 3, 3, 4, 5, 2] * 10000 32 | cms.batch_add(items) # Use the batch_add function 33 | 34 | print(Counter(items)) # Print the exact counts for comparison 35 | -------------------------------------------------------------------------------- /examples/run_ranking_transformations.sh: -------------------------------------------------------------------------------- 1 | ########################################################################################################## 2 | # Ranking of feature transformations 3 | ########################################################################################################## 4 | 5 | # A common and very important task is figuring out which transformations of a feature are promising. 6 | 7 | # hint - if unsure what parameters do, you can always run "outrank --help" 8 | # Example considering some generic transformations of features. Note that OutRank is type aware, if using formats such as ob-vw or ob-csv, 9 | # type-aware transformations can be produced. See e.g., https://outbrain.github.io/outrank/outrank/algorithms/importance_estimator.html?search=ob-vw for more details on the format. 10 | outrank \ 11 | --task all \ 12 | --data_path $PATH_TO_YOUR_DATA \ 13 | --data_source csv-raw \ 14 | --heuristic MI-numba-randomized \ 15 | --target_ranking_only True \ 16 | --combination_number_upper_bound 2048 \ 17 | --num_threads 50 \ 18 | --transformers default \ 19 | --output_folder ./some_output_folder \ 20 | --subsampling 100 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | import setuptools 6 | 7 | 8 | def _parse_requirements(file): 9 | with open( 10 | os.path.join(os.path.dirname(__file__), file), encoding='utf-8', 11 | ) as req_file: 12 | return [line.strip() for line in req_file] 13 | 14 | 15 | def _read_description(): 16 | with open('README.md', encoding='utf-8') as description: 17 | return description.read() 18 | 19 | 20 | packages = [x for x in setuptools.find_packages() if x != 'test'] 21 | setuptools.setup( 22 | name='outrank', 23 | version='0.97.6', 24 | description='OutRank: Feature ranking for massive sparse data sets.', 25 | long_description=_read_description(), 26 | long_description_content_type='text/markdown', 27 | url='https://github.com/outbrain/outrank', 28 | author='Research Infra (Outbrain); Blaz Skrlj led the development of this project', 29 | license='BSD', 30 | entry_points={'console_scripts': ['outrank = outrank.__main__:main']}, 31 | packages=packages, 32 | zip_safe=True, 33 | include_package_data=True, 34 | install_requires=_parse_requirements('requirements.txt'), 35 | ) 36 | -------------------------------------------------------------------------------- /examples/run_ranking_combinations.sh: -------------------------------------------------------------------------------- 1 | ########################################################################################################## 2 | # Ranking of feature combinations 3 | ########################################################################################################## 4 | 5 | # This run demonstrates how to perform "supervised combination ranking" - the process of figuring out 6 | # which feature combinations are potentially promising. 7 | # Note that this process' time is directly correlated with interaction order (higher=longer runs) 8 | 9 | # hint - if unsure what parameters do, you can always run "outrank --help" 10 | # Example for feature pairs 11 | outrank \ 12 | --task all \ 13 | --data_path $PATH_TO_YOUR_DATA \ 14 | --data_source csv-raw \ 15 | --heuristic MI-numba-randomized \ 16 | --target_ranking_only True \ 17 | --interaction_order 2 \ 18 | --combination_number_upper_bound 2048 \ 19 | --num_threads 50 \ 20 | --output_folder ./some_output_folder \ 21 | --subsampling 100 22 | 23 | 24 | # And feature triplets. The combination_number_upper_bound bounds the number of sampled combinations (RAM controller) 25 | outrank \ 26 | --task all \ 27 | --data_path $PATH_TO_YOUR_DATA \ 28 | --data_source csv-raw \ 29 | --heuristic MI-numba-randomized \ 30 | --target_ranking_only True \ 31 | --interaction_order 3 \ 32 | --combination_number_upper_bound 2048 \ 33 | --num_threads 50 \ 34 | --output_folder ./some_output_folder_triplets \ 35 | --subsampling 100 36 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.9", "3.10", "3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | pip install . --upgrade 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 39 | - name: Test with pytest 40 | run: | 41 | bash scripts/run_unit_tests.sh 42 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: debug-statements 9 | - id: double-quote-string-fixer 10 | - id: name-tests-test 11 | - id: requirements-txt-fixer 12 | - repo: https://github.com/asottile/setup-cfg-fmt 13 | rev: v2.4.0 14 | hooks: 15 | - id: setup-cfg-fmt 16 | - repo: https://github.com/asottile/reorder-python-imports 17 | rev: v3.10.0 18 | hooks: 19 | - id: reorder-python-imports 20 | exclude: ^(pre_commit/resources/|testing/resources/python3_hooks_repo/) 21 | args: [--py38-plus, --add-import, 'from __future__ import annotations'] 22 | - repo: https://github.com/asottile/add-trailing-comma 23 | rev: v3.1.0 24 | hooks: 25 | - id: add-trailing-comma 26 | - repo: https://github.com/asottile/pyupgrade 27 | rev: v3.10.1 28 | hooks: 29 | - id: pyupgrade 30 | args: [--py38-plus] 31 | - repo: https://github.com/hhatto/autopep8 32 | rev: v2.0.4 33 | hooks: 34 | - id: autopep8 35 | args: ["--global-config pyproject.toml"] 36 | - repo: https://github.com/PyCQA/flake8 37 | rev: 6.1.0 38 | hooks: 39 | - id: flake8 40 | # - repo: https://github.com/pre-commit/mirrors-mypy 41 | # rev: v1.5.1 42 | # hooks: 43 | # - id: mypy 44 | # additional_dependencies: [types-all] 45 | # exclude: ^testing/resources/ 46 | -------------------------------------------------------------------------------- /outrank/task_generators.py: -------------------------------------------------------------------------------- 1 | # OutRank is also capable of generating data sets. 2 | from __future__ import annotations 3 | 4 | import logging 5 | import os 6 | import shutil 7 | 8 | import pandas as pd 9 | 10 | from outrank.algorithms.synthetic_data_generators import generator_naive 11 | 12 | logging.basicConfig( 13 | format='%(asctime)s - %(message)s', 14 | datefmt='%d-%b-%y %H:%M:%S', 15 | ) 16 | logger = logging.getLogger('syn-logger') 17 | logger.setLevel(logging.DEBUG) 18 | 19 | 20 | def outrank_task_generate_data_set(args): 21 | """Core method for generating data sets""" 22 | 23 | if args.generator_type == 'naive': 24 | sample, target = generator_naive.generate_random_matrix( 25 | args.num_synthetic_features, args.num_synthetic_rows, 26 | ) 27 | else: 28 | raise ValueError(f'Generator {args.generator_type} not implemented.') 29 | 30 | dfx = pd.DataFrame(sample) 31 | dfx.columns = [f'f{x}' for x in range(dfx.shape[1])] 32 | dfx['label'] = target 33 | if os.path.exists(args.output_synthetic_df_name) and os.path.isdir( 34 | args.output_synthetic_df_name, 35 | ): 36 | logger.info( 37 | f'Found existing: {args.output_synthetic_df_name}, removing first ..', 38 | ) 39 | shutil.rmtree(args.output_synthetic_df_name) 40 | os.mkdir(args.output_synthetic_df_name) 41 | dfx.to_csv(f'./{args.output_synthetic_df_name}/data.csv', index=False) 42 | 43 | logger.info( 44 | f'Generated data set of shape {dfx.shape} in {args.output_synthetic_df_name}', 45 | ) 46 | -------------------------------------------------------------------------------- /tests/fw_transformers_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | import unittest 5 | 6 | import numpy as np 7 | 8 | from outrank.feature_transformations.feature_transformer_vault.fw_transformers import \ 9 | FW_TRANSFORMERS 10 | 11 | sys.path.append('./outrank') 12 | 13 | 14 | class FWTransformersTest(unittest.TestCase): 15 | def test_log_probs(self): 16 | X = np.asarray([0.68294952, 0.7, 0.91263375]) 17 | some_transformer = FW_TRANSFORMERS.get('_tr_fw_prob_log_res_1_gt_0.01') 18 | assert X is not None 19 | assert some_transformer is not None 20 | output = eval(some_transformer) 21 | self.assertListEqual(list(output), [-0.0, -0.0, -0.0]) 22 | 23 | def test_sqrt_int_gt_1(self): 24 | X = np.asarray([1.0, 2.0, 5.0]) 25 | some_transformer = FW_TRANSFORMERS.get('_tr_fw_sqrt_res_1_gt_1') 26 | assert X is not None 27 | assert some_transformer is not None 28 | output = eval(some_transformer) 29 | self.assertListEqual(list(output), [0.0, 1.0, 2.0]) 30 | 31 | def test_sqrt_probs(self): 32 | X = np.asarray([0.68294952, 0.72944264, 0.91263375]) 33 | some_transformer = FW_TRANSFORMERS.get( 34 | '_tr_fw_prob_sqrt_res_1_gt_0.01', 35 | ) 36 | assert some_transformer is not None 37 | assert X is not None 38 | output = eval(some_transformer) 39 | self.assertListEqual(list(output), [1.0, 1.0, 1.0]) 40 | 41 | def test_overall_transf_count(self): 42 | self.assertEqual(len(FW_TRANSFORMERS), 138) 43 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, the respective contributors, as shown by the AUTHORS file. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /outrank/feature_transformations/feature_transformer_vault/fw_transformers.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | 5 | from outrank.feature_transformations.feature_transformer_vault.default_transformers import \ 6 | DEFAULT_TRANSFORMERS 7 | 8 | FW_TRANSFORMERS = DEFAULT_TRANSFORMERS.copy() 9 | resolution_range = [1, 10, 50, 100] 10 | greater_than_range = [1, 2, 4, 8, 16, 32, 64, 96] 11 | 12 | for resolution in resolution_range: 13 | for greater_than in greater_than_range: 14 | FW_TRANSFORMERS[f'_tr_fw_sqrt_res_{resolution}_gt_{greater_than}'] = ( 15 | f'np.where(X < {greater_than}, ' 16 | f'X, ' 17 | f'np.where(X>{greater_than} ,' 18 | f'np.round(np.sqrt(X-{greater_than})*{resolution},0), 0))' 19 | ) 20 | 21 | FW_TRANSFORMERS[ 22 | f'_tr_fw_log_res_{resolution}_gt_{greater_than}' 23 | ] = f'np.where(X <{greater_than}, X, np.where(X >{greater_than}, np.round(np.log(X-{greater_than})*{resolution},0), 0))' 24 | 25 | for resolution in resolution_range: 26 | for greater_than in [np.divide(x, 100) for x in greater_than_range]: 27 | FW_TRANSFORMERS[ 28 | f'_tr_fw_prob_sqrt_res_{resolution}_gt_{greater_than}' 29 | ] = f'np.where(X < {greater_than}, X, np.where(X>{greater_than}, np.round(np.sqrt(X-{greater_than})*{resolution},0), 0))' 30 | 31 | FW_TRANSFORMERS[ 32 | f'_tr_fw_prob_log_res_{resolution}_gt_{greater_than}' 33 | ] = f'np.where(X <{greater_than},X, np.where(X>{greater_than}, np.round(np.log(X-{greater_than})*{resolution},0), 0))' 34 | 35 | if __name__ == '__main__': 36 | print(len(FW_TRANSFORMERS)) 37 | -------------------------------------------------------------------------------- /outrank/task_selftest.py: -------------------------------------------------------------------------------- 1 | # helper set of methods that enable anywhere verification of core functions 2 | from __future__ import annotations 3 | 4 | import logging 5 | import os 6 | import shutil 7 | import subprocess 8 | 9 | import pandas as pd 10 | 11 | logging.basicConfig( 12 | format='%(asctime)s - %(message)s', 13 | datefmt='%d-%b-%y %H:%M:%S', 14 | ) 15 | logger = logging.getLogger('syn-logger') 16 | logger.setLevel(logging.DEBUG) 17 | 18 | 19 | def conduct_self_test(heuristic='MI-numba-randomized'): 20 | # Simulate full flow, ranking only 21 | subprocess.run( 22 | 'outrank --task data_generator --num_synthetic_rows 100000', shell=True, 23 | ) 24 | subprocess.run( 25 | f'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --heuristic {heuristic};', 26 | shell=True, 27 | ) 28 | 29 | dfx = pd.read_csv('ranking_outputs/pairwise_ranks.tsv', sep='\t') 30 | 31 | logger.info("Verifying output's properties ..") 32 | assert dfx.shape[0] == 201 33 | assert dfx.shape[1] == 3 34 | assert dfx['FeatureA'].values.tolist().pop() == 'label-(2; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(2; 100)' 35 | 36 | to_remove = ['ranking_outputs', 'test_data_synthetic'] 37 | for path in to_remove: 38 | if os.path.exists(path) and os.path.isdir(path): 39 | logger.info(f'Removing {path} as part of cleanup ..') 40 | shutil.rmtree(path) 41 | 42 | logger.info(f'All tests passed for heuristic: {heuristic} \N{rocket}') 43 | 44 | 45 | if __name__ == '__main__': 46 | conduct_self_test('MI-numba-randomized') 47 | conduct_self_test('max-value-coverage') 48 | logger.info('OutRank seems in shape \N{winking face}') 49 | -------------------------------------------------------------------------------- /tests/hll_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | import unittest 5 | 6 | from outrank.algorithms.sketches.counting_ultiloglog import \ 7 | HyperLogLogWCache as HyperLogLog 8 | 9 | sys.path.append('./outrank') 10 | 11 | 12 | class CompareStrategiesTest(unittest.TestCase): 13 | def test_hll_update(self): 14 | GLOBAL_CARDINALITY_STORAGE = dict() 15 | GLOBAL_CARDINALITY_STORAGE[1] = HyperLogLog(0.01) 16 | GLOBAL_CARDINALITY_STORAGE[1].add(123) 17 | GLOBAL_CARDINALITY_STORAGE[1].add(123) 18 | self.assertEqual(len(GLOBAL_CARDINALITY_STORAGE[1]), 1) 19 | 20 | GLOBAL_CARDINALITY_STORAGE[1].add(1232) 21 | self.assertEqual(len(GLOBAL_CARDINALITY_STORAGE[1]), 2) 22 | 23 | for j in range(100): 24 | GLOBAL_CARDINALITY_STORAGE[1].add(1232 + j) 25 | 26 | self.assertEqual(len(GLOBAL_CARDINALITY_STORAGE[1]), 101) 27 | 28 | def test_stress_multi_feature(self): 29 | GLOBAL_CARDINALITY_STORAGE = dict() 30 | for j in range(10): 31 | GLOBAL_CARDINALITY_STORAGE[j] = HyperLogLog(0.01) 32 | for j in range(1000): 33 | for k in range(len(GLOBAL_CARDINALITY_STORAGE)): 34 | GLOBAL_CARDINALITY_STORAGE[k].add(1232 + j) 35 | 36 | for j in range(10): 37 | self.assertEqual(len(GLOBAL_CARDINALITY_STORAGE[j]), 1000) 38 | 39 | def test_stress_high_card(self): 40 | GLOBAL_CARDINALITY_STORAGE = dict() 41 | for j in range(10): 42 | GLOBAL_CARDINALITY_STORAGE[j] = HyperLogLog(0.01) 43 | 44 | for j in range(10000): 45 | for k in range(len(GLOBAL_CARDINALITY_STORAGE)): 46 | GLOBAL_CARDINALITY_STORAGE[k].add(1232 + j) 47 | 48 | # 1% err is toleratable above certain card range 49 | for j in range(10): 50 | self.assertLess( 51 | abs(len(GLOBAL_CARDINALITY_STORAGE[j]) - 10000), 100, 52 | ) 53 | 54 | 55 | if __name__ == '__main__': 56 | unittest.main() 57 | -------------------------------------------------------------------------------- /outrank/algorithms/sketches/counting_cms.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections import Counter 4 | 5 | import numpy as np 6 | from numba import njit 7 | from numba import prange 8 | 9 | 10 | @njit 11 | def cms_hash(x, seed, width): 12 | x_hash = np.uint32(hash(x)) 13 | return (x_hash + seed) % width 14 | 15 | class CountMinSketch: 16 | """ 17 | A memory-efficient implementation of the count min sketch algorithm with optimized hashing using Numba JIT. 18 | """ 19 | 20 | def __init__(self, depth=6, width=2**15, M=None): 21 | self.depth = depth 22 | self.width = width 23 | self.hash_seeds = np.array(np.random.randint(low=0, high=2**31 - 1, size=depth), dtype=np.uint32) 24 | self.M = np.zeros((depth, width), dtype=np.int32) if M is None else M 25 | 26 | @staticmethod 27 | @njit 28 | def _add(M, x, depth, width, hash_seeds, delta=1): 29 | for i in prange(depth): 30 | location = cms_hash(x, hash_seeds[i], width) 31 | M[i, location] += delta 32 | 33 | def add(self, x, delta=1): 34 | CountMinSketch._add(self.M, x, self.depth, self.width, self.hash_seeds, delta) 35 | 36 | def batch_add(self, lst, delta=1): 37 | for x in lst: 38 | self.add(x, delta) 39 | 40 | def query(self, x): 41 | return min(self.M[i][cms_hash(x, self.hash_seeds[i], self.width)] for i in range(self.depth)) 42 | 43 | def get_matrix(self): 44 | return self.M 45 | 46 | 47 | if __name__ == '__main__': 48 | from collections import Counter 49 | 50 | depth = 8 51 | width = 2**22 52 | cms = CountMinSketch(depth, width) 53 | 54 | items = [1, 1, 2, 3, 3, 3, 4, 5, 2] * 1000 55 | cms.batch_add(items) # Use the batch_add function 56 | 57 | print(cms.query(3)) # Query for frequency estimates 58 | print(cms.query(1)) 59 | print(cms.query(2)) 60 | print(cms.query(4)) 61 | print(cms.query(5)) 62 | 63 | print(Counter(items)) # Print the exact counts for comparison 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Python cache files 3 | __pycache__/ 4 | *.pyc 5 | *.pyo 6 | *.pyd 7 | *.py[cod] 8 | *$py.class 9 | .Python 10 | *.so 11 | 12 | # Virtual environments 13 | venv/ 14 | env/ 15 | ENV/ 16 | 17 | # IDE files 18 | .vscode/ 19 | .idea/ 20 | *.swp 21 | *.swo 22 | 23 | # OS files 24 | .DS_Store 25 | Thumbs.db 26 | 27 | # Distribution / packaging 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | pip-wheel-metadata/ 41 | share/python-wheels/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | MANIFEST 46 | 47 | # PyInstaller 48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .nox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | *.py,cover 66 | .hypothesis/ 67 | .pytest_cache/ 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | local_settings.py 76 | db.sqlite3 77 | db.sqlite3-journal 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | 89 | # PyBuilder 90 | target/ 91 | 92 | # Jupyter Notebook 93 | .ipynb_checkpoints 94 | 95 | # IPython 96 | profile_default/ 97 | ipython_config.py 98 | 99 | # pyenv 100 | .python-version 101 | 102 | # pipenv 103 | Pipfile.lock 104 | 105 | # PEP 582 106 | __pypackages__/ 107 | 108 | # Celery stuff 109 | celerybeat-schedule 110 | celerybeat.pid 111 | 112 | # SageMath parsed files 113 | *.sage.py 114 | 115 | # Environments 116 | .env 117 | .venv 118 | env/ 119 | venv/ 120 | ENV/ 121 | env.bak/ 122 | venv.bak/ 123 | 124 | # Spyder project settings 125 | .spyderproject 126 | .spyproject 127 | 128 | # Rope project settings 129 | .ropeproject 130 | 131 | # mkdocs documentation 132 | /site 133 | 134 | # mypy 135 | .mypy_cache/ 136 | .dmypy.json 137 | dmypy.json 138 | 139 | # Pyre type checker 140 | .pyre 141 | -------------------------------------------------------------------------------- /tests/data_io_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import sys 5 | import tempfile 6 | import unittest 7 | from dataclasses import dataclass 8 | 9 | import numpy as np 10 | 11 | from outrank.core_utils import parse_csv_raw 12 | from outrank.core_utils import parse_namespace 13 | 14 | sys.path.append('./outrank') 15 | 16 | 17 | np.random.seed(123) 18 | test_files_path = 'tests/tests_files' 19 | 20 | 21 | @dataclass 22 | class args: 23 | label_column: str = 'label' 24 | heuristic: str = 'surrogate-LR' 25 | target_ranking_only: bool = True 26 | interaction_order: int = 3 27 | combination_number_upper_bound: int = 1024 28 | 29 | 30 | class CoreIOTest(unittest.TestCase): 31 | def test_parser_vw_namespace(self): 32 | float_set, _ = parse_namespace( 33 | os.path.join(test_files_path, 'vw_namespace_map.csv'), 34 | ) 35 | expected_output = {f'f{x}' for x in [1, 2, 3]} 36 | 37 | self.assertEqual(float_set, expected_output) 38 | 39 | def test_parse_raw_csv(self): 40 | dataset_info = parse_csv_raw(test_files_path) 41 | self.assertEqual(dataset_info.column_names, ['f1', 'f2', 'f3', 'f4']) 42 | self.assertEqual(dataset_info.col_delimiter, ',') 43 | self.assertEqual(dataset_info.column_types, set()) 44 | 45 | def test_parse_csv_with_quoted_fields(self): 46 | """Test proper CSV parsing with quoted fields containing commas""" 47 | with tempfile.TemporaryDirectory() as temp_dir: 48 | csv_file_path = os.path.join(temp_dir, 'data.csv') 49 | 50 | # Create CSV with quoted fields containing commas and quotes 51 | csv_content = 'f1,"f2,quoted",f3,"f4 ""with"" quotes"\n1.0,TS,23,12\n' 52 | 53 | with open(csv_file_path, 'w') as f: 54 | f.write(csv_content) 55 | 56 | dataset_info = parse_csv_raw(temp_dir) 57 | 58 | # Verify proper CSV parsing handles quoted fields correctly 59 | expected_columns = ['f1', 'f2,quoted', 'f3', 'f4 "with" quotes'] 60 | self.assertEqual(dataset_info.column_names, expected_columns) 61 | self.assertEqual(dataset_info.col_delimiter, ',') 62 | self.assertEqual(dataset_info.column_types, set()) 63 | 64 | 65 | if __name__ == '__main__': 66 | unittest.main() 67 | -------------------------------------------------------------------------------- /benchmarks/generator_second_order.py: -------------------------------------------------------------------------------- 1 | # This simplest thing we can do for now. 2 | from __future__ import annotations 3 | 4 | import numpy as np 5 | 6 | np.random.seed(123) 7 | 8 | 9 | def generate_random_matrix(num_features, size=2000): 10 | # random int matrix (categorical) 11 | sample = np.random.randint(10, 100, size=(size, num_features)) 12 | 13 | target = sample[:, 30] + sample[:, 50] 14 | 15 | target[target < 20] = 0 16 | return sample, target 17 | 18 | 19 | if __name__ == '__main__': 20 | import argparse 21 | import logging 22 | import os 23 | import shutil 24 | 25 | import pandas as pd 26 | 27 | logging.basicConfig( 28 | format='%(asctime)s - %(message)s', 29 | datefmt='%d-%b-%y %H:%M:%S', 30 | ) 31 | logger = logging.getLogger('syn-logger') 32 | logger.setLevel(logging.DEBUG) 33 | 34 | parser = argparse.ArgumentParser( 35 | description='Fast feature screening for sparse data sets.', 36 | formatter_class=argparse.RawTextHelpFormatter, 37 | ) 38 | 39 | parser.add_argument('--output_df_name', type=str, default=None) 40 | 41 | parser.add_argument('--verify_outputs', type=str, default=None) 42 | 43 | parser.add_argument('--num_features', type=int, default=300) 44 | 45 | parser.add_argument('--size', type=int, default=1000) 46 | 47 | args = parser.parse_args() 48 | 49 | if args.output_df_name is not None: 50 | sample, target = generate_random_matrix(args.num_features, args.size) 51 | dfx = pd.DataFrame(sample) 52 | dfx.columns = [f'f{x}' for x in range(dfx.shape[1])] 53 | dfx['label'] = target 54 | if os.path.exists(args.output_df_name) and os.path.isdir(args.output_df_name): 55 | shutil.rmtree(args.output_df_name) 56 | os.mkdir(args.output_df_name) 57 | dfx.to_csv(f'./{args.output_df_name}/data.csv', index=False) 58 | 59 | logging.info(f'Generated dataset {dfx.shape} in {args.output_df_name}') 60 | elif args.verify_outputs is not None: 61 | rankings = pd.read_csv( 62 | os.path.join(args.verify_outputs, 'feature_singles.tsv'), sep='\t', 63 | ) 64 | if rankings.iloc[1]['Feature'] != 'f30 AND f50-(5749; 100)': 65 | raise Exception( 66 | f'Could not retrieve the appropriate feature needle in the haystack {rankings.iloc[1].Feature}, exiting', 67 | ) 68 | else: 69 | logger.info( 70 | f'Identified the appropriate feature in the haystack ({rankings.iloc[1].Feature})', 71 | ) 72 | -------------------------------------------------------------------------------- /outrank/algorithms/synthetic_data_generators/generator_naive.py: -------------------------------------------------------------------------------- 1 | # This simplest thing we can do for now. 2 | from __future__ import annotations 3 | 4 | import numpy as np 5 | 6 | np.random.seed(123) 7 | 8 | 9 | def generate_random_matrix(num_features=100, size=20000): 10 | # random int matrix (categorical) 11 | sample = np.random.randint(10, 100, size=(size, num_features)) 12 | 13 | target = sample[:, 30] 14 | # Some noise 15 | 16 | target[target < 40] = 0 17 | target[target > 39] = 1 18 | return sample, target 19 | 20 | 21 | if __name__ == '__main__': 22 | import argparse 23 | import logging 24 | import os 25 | import shutil 26 | 27 | import pandas as pd 28 | 29 | logging.basicConfig( 30 | format='%(asctime)s - %(message)s', 31 | datefmt='%d-%b-%y %H:%M:%S', 32 | ) 33 | logger = logging.getLogger('syn-logger') 34 | logger.setLevel(logging.DEBUG) 35 | 36 | parser = argparse.ArgumentParser( 37 | description='Fast feature screening for sparse data sets.', 38 | formatter_class=argparse.RawTextHelpFormatter, 39 | ) 40 | 41 | parser.add_argument('--output_df_name', type=str, default=None) 42 | 43 | parser.add_argument('--verify_outputs', type=str, default=None) 44 | 45 | parser.add_argument('--num_features', type=int, default=300) 46 | 47 | parser.add_argument('--size', type=int, default=1000) 48 | 49 | args = parser.parse_args() 50 | 51 | if args.output_df_name is not None: 52 | sample, target = generate_random_matrix(args.num_features, args.size) 53 | dfx = pd.DataFrame(sample) 54 | dfx.columns = [f'f{x}' for x in range(dfx.shape[1])] 55 | dfx['label'] = target 56 | if os.path.exists(args.output_df_name) and os.path.isdir(args.output_df_name): 57 | shutil.rmtree(args.output_df_name) 58 | os.mkdir(args.output_df_name) 59 | dfx.to_csv(f'./{args.output_df_name}/data.csv', index=False) 60 | 61 | logging.info(f'Generated dataset {dfx.shape} in {args.output_df_name}') 62 | elif args.verify_outputs is not None: 63 | rankings = pd.read_csv( 64 | os.path.join(args.verify_outputs, 'feature_singles.tsv'), sep='\t', 65 | ) 66 | if rankings.iloc[1]['Feature'] != 'f30-(81; 100)': 67 | raise Exception( 68 | f'Could not retrieve the appropriate feature needle in the haystack {rankings.iloc[1].Feature}, exiting', 69 | ) 70 | else: 71 | logger.info( 72 | f'Identified the appropriate feature in the haystack ({rankings.iloc[1].Feature})', 73 | ) 74 | -------------------------------------------------------------------------------- /benchmarks/generator_third_order.py: -------------------------------------------------------------------------------- 1 | # This simplest thing we can do for now. 2 | from __future__ import annotations 3 | 4 | import numpy as np 5 | 6 | np.random.seed(123) 7 | 8 | 9 | def generate_random_matrix(num_features, size=2000): 10 | # random int matrix (categorical) 11 | sample = np.random.randint(10, 100, size=(size, num_features)) 12 | 13 | target = sample[:, 30] + sample[:, 50] + sample[:, 20] 14 | 15 | target[target < 20] = 0 16 | return sample, target 17 | 18 | 19 | if __name__ == '__main__': 20 | import argparse 21 | import logging 22 | import os 23 | import shutil 24 | 25 | import pandas as pd 26 | 27 | logging.basicConfig( 28 | format='%(asctime)s - %(message)s', 29 | datefmt='%d-%b-%y %H:%M:%S', 30 | ) 31 | logger = logging.getLogger('syn-logger') 32 | logger.setLevel(logging.DEBUG) 33 | 34 | parser = argparse.ArgumentParser( 35 | description='Fast feature screening for sparse data sets.', 36 | formatter_class=argparse.RawTextHelpFormatter, 37 | ) 38 | 39 | parser.add_argument('--output_df_name', type=str, default=None) 40 | 41 | parser.add_argument('--verify_outputs', type=str, default=None) 42 | 43 | parser.add_argument('--num_features', type=int, default=300) 44 | 45 | parser.add_argument('--size', type=int, default=1000) 46 | 47 | args = parser.parse_args() 48 | 49 | if args.output_df_name is not None: 50 | sample, target = generate_random_matrix(args.num_features, args.size) 51 | dfx = pd.DataFrame(sample) 52 | dfx.columns = [f'f{x}' for x in range(dfx.shape[1])] 53 | dfx['label'] = target 54 | if os.path.exists(args.output_df_name) and os.path.isdir(args.output_df_name): 55 | shutil.rmtree(args.output_df_name) 56 | os.mkdir(args.output_df_name) 57 | dfx.to_csv(f'./{args.output_df_name}/data.csv', index=False) 58 | 59 | logging.info(f'Generated dataset {dfx.shape} in {args.output_df_name}') 60 | elif args.verify_outputs is not None: 61 | rankings = pd.read_csv( 62 | os.path.join(args.verify_outputs, 'feature_singles.tsv'), sep='\t', 63 | ) 64 | if ( 65 | rankings.iloc[1]['Feature'] != 'f20-(90; 100)' 66 | and rankings.iloc[2]['Feature'] != 'f50-(90; 100)' 67 | and rankings.iloc[3]['Feature'] != 'f30-(90; 100)' 68 | ): 69 | raise Exception( 70 | f'Could not retrieve the appropriate feature needle in the haystack {rankings.iloc[1].Feature}, exiting', 71 | ) 72 | else: 73 | logger.info( 74 | f'Identified the appropriate feature in the haystack ({rankings.iloc[1:4].Feature})', 75 | ) 76 | -------------------------------------------------------------------------------- /benchmarks/analyse_rankings.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import glob 4 | import os 5 | import sys 6 | 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | def extract_just_ranking(dfile): 11 | """Extract ranking from an output file.""" 12 | ranks = [] 13 | with open(dfile) as df: 14 | next(df) # Skip header line 15 | for line in df: 16 | parts = line.strip().split('\t') 17 | ranks.append(parts[1]) 18 | return ranks 19 | 20 | def calculate_mismatch_scores(all_folders, mismatches): 21 | """Calculate mismatch scores based on ranking files.""" 22 | all_counts = [int(folder.split('_').pop()) for folder in all_folders if 'ranking' in folder] 23 | 24 | ranking_out_struct = {} 25 | for count in all_counts: 26 | rpath = os.path.join(dfolder, f'ranking_{count}', 'feature_singles.tsv') 27 | ranking_out_struct[count] = extract_just_ranking(rpath) 28 | 29 | pivot_score_key = max(all_counts) 30 | reference_ranking = ranking_out_struct[pivot_score_key] 31 | 32 | out_results = {} 33 | for ranking_id, ranking in ranking_out_struct.items(): 34 | mismatches_counter = 0 35 | for el in ranking[:mismatches]: 36 | if el not in reference_ranking[:mismatches]: 37 | mismatches_counter += 1 38 | out_results[ranking_id] = 100 * (1 - mismatches_counter / mismatches) 39 | 40 | return dict(sorted(out_results.items(), key=lambda x: x[0])) 41 | 42 | def plot_precision_curve(results, pivot_score_key, mismatches, axs, c1, c2): 43 | """Plot the precision curve based on mismatch results.""" 44 | instances = [100 * (k / pivot_score_key) for k in results.keys()] 45 | values = list(results.values()) 46 | 47 | axs[c1,c2].plot(instances, values, marker='o', linestyle='-', color='black') 48 | axs[c1,c2].invert_xaxis() 49 | axs[c1,c2].set(xlabel='Proportion of data used (%)', ylabel=f'hits@{mismatches} (%)', title=f'Approximation, top {mismatches} Features') 50 | axs[c1,c2].grid(True) 51 | 52 | if __name__ == '__main__': 53 | if len(sys.argv) < 2: 54 | print('Usage: python script.py ') 55 | sys.exit(1) 56 | 57 | dfolder = sys.argv[1] 58 | mismatch_range = [1, 5, 10, 20] 59 | fig, axs = plt.subplots(2, 2) 60 | fig.set_figheight(10) 61 | fig.set_figwidth(10) 62 | row = -1 63 | for enx, mismatches in enumerate(mismatch_range): 64 | if enx % 2 == 0: 65 | row += 1 66 | col = enx % 2 67 | all_folders = list(glob.glob(os.path.join(dfolder, '*'))) 68 | out_results = calculate_mismatch_scores(all_folders, mismatches) 69 | pivot_score_key = max(out_results) 70 | plot_precision_curve(out_results, pivot_score_key, mismatches, axs, row, col) 71 | plt.tight_layout() 72 | plt.savefig('comparison.png', dpi=300) 73 | -------------------------------------------------------------------------------- /benchmarks/data_regression_experiment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail # Enable strict mode for safety 4 | 5 | # Configurable variables 6 | NUM_ROWS=1000000 7 | NUM_FEATURES=100 8 | INPUT_FILE="test_data_synthetic/data.csv" 9 | SIZES=('50000' '100000' '200000' '500000' '600000' '700000' '800000' '900000' '1000000') 10 | 11 | # Function to remove a directory safely 12 | remove_directory_safely() { 13 | directory_to_remove=$1 14 | if [ -d "$directory_to_remove" ]; then 15 | echo "Removing directory: $directory_to_remove" 16 | rm -rvf "$directory_to_remove" 17 | else 18 | echo "Directory does not exist, skipping: $directory_to_remove" 19 | fi 20 | } 21 | 22 | # Function to generate random data 23 | generate_data() { 24 | echo "Generating random data files with $NUM_ROWS rows and $NUM_FEATURES features..." 25 | outrank --task data_generator --num_synthetic_rows $NUM_ROWS --num_synthetic_features $NUM_FEATURES 26 | echo "Random data generation complete." 27 | } 28 | 29 | # Function to create subspaces from the data 30 | sample_subspaces() { 31 | for i in "${SIZES[@]}" 32 | do 33 | dataset="test_data_synthetic/dataset_$i" 34 | outfile="$dataset/data.csv" 35 | mkdir -p "$dataset" 36 | 37 | if [ -f "$INPUT_FILE" ]; then 38 | echo "Sampling $i rows into $outfile..." 39 | head -n $i "$INPUT_FILE" > "$outfile" 40 | echo "Sampling for $outfile done." 41 | else 42 | echo "Input file $INPUT_FILE not found. Skipping sampling for $i rows." 43 | fi 44 | done 45 | } 46 | 47 | # Function to perform feature ranking 48 | feature_ranking() { 49 | for i in "${SIZES[@]}" 50 | do 51 | dataset="test_data_synthetic/dataset_$i" 52 | output_folder="./test_data_synthetic/ranking_$i" 53 | 54 | if [ ! -d "$dataset" ]; then 55 | echo "Dataset directory $dataset does not exist. Skipping ranking for $i rows." 56 | continue 57 | fi 58 | 59 | echo "Proceeding with feature ranking for $i rows..." 60 | outrank --task ranking --data_path "$dataset" --data_source csv-raw \ 61 | --combination_number_upper_bound 60 --output_folder "$output_folder" \ 62 | --disable_tqdm True 63 | 64 | echo "Feature ranking summary for $i rows." 65 | outrank --task ranking_summary --output_folder "$output_folder" --data_path "$dataset" 66 | echo "Ranking for $i done." 67 | done 68 | } 69 | 70 | # Function to analyze the rankings 71 | analyse_rankings() { 72 | echo "Analyzing the rankings..." 73 | python analyse_rankings.py test_data_synthetic 74 | echo "Analysis complete." 75 | } 76 | 77 | # Main script execution 78 | remove_directory_safely test_data_synthetic/ 79 | generate_data 80 | sample_subspaces 81 | feature_ranking 82 | analyse_rankings 83 | 84 | echo "Script execution finished." 85 | -------------------------------------------------------------------------------- /docs/DOCSMAIN.md: -------------------------------------------------------------------------------- 1 | # Welcome to OutRank's documentation! 2 | 3 | All functions/methods can be searched-for (search bar on the left). 4 | 5 | This tool enables fast screening of feature-feature interactions. Its purpose is to give the user fast insight into potential redundancies/anomalies in the data. 6 | It is implemented to operate in _mini batches_, it traverses the `raw data` incrementally, refining the rankings as it goes along. The core operation, interaction ranking, outputs triplets which look as follows: 7 | 8 | ``` 9 | featureA featureB 0.512 10 | featureA featureC 0.125 11 | ``` 12 | 13 | 14 | # Setup 15 | ```bash 16 | pip install outrank 17 | ``` 18 | 19 | and test a minimal cycle with 20 | 21 | ```bash 22 | outrank --task selftest 23 | ``` 24 | 25 | if this passes, you can be pretty certain OutRank will perform as intended. OutRank's primary use case is as a CLI tool, begin exploring with 26 | 27 | ```bash 28 | outrank --help 29 | ``` 30 | 31 | 32 | # Example use cases 33 | * A minimal showcase of performing feature ranking on a generic CSV is demonstrated with [this example](https://github.com/outbrain/outrank/tree/main/scripts/run_minimal.sh). 34 | 35 | * [More examples](https://github.com/outbrain/outrank/tree/main/examples) demonstrating OutRank's capabilities are also available. 36 | 37 | 38 | # OutRank as a Python library 39 | Once installed, _OutRank_ can be used as any other Python library. For example, generic feature ranking algorithms can be accessed as 40 | 41 | ```python 42 | from outrank.algorithms.feature_ranking.ranking_mi_numba import ( 43 | mutual_info_estimator_numba, 44 | ) 45 | 46 | # Some synthetic minimal data (Numpy vectors) 47 | a = np.array([1, 0, 0, 0, 1, 1, 1, 0], dtype=np.int32) 48 | 49 | lowest = np.array(np.random.permutation(a), dtype=np.int32) 50 | medium = np.array([1, 1, 0, 0, 1, 1, 1, 1], dtype=np.int32) 51 | high = np.array([1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int32) 52 | 53 | lowest_score = mutual_info_estimator_numba( 54 | a, lowest, np.float32(1.0), False, 55 | ) 56 | medium_score = mutual_info_estimator_numba( 57 | a, medium, np.float32(1.0), False, 58 | ) 59 | high_score = mutual_info_estimator_numba( 60 | a, high, np.float32(1.0), False, 61 | ) 62 | 63 | scores = [lowest_score, medium_score, high_score] 64 | sorted_score_indices = np.argsort(scores) 65 | assert np.sum(np.array([0, 1, 2]) - sorted_score_indices) == 0 66 | ``` 67 | --- 68 | ## Creating a simple dataset 69 | ```python 70 | from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification 71 | 72 | cc = CategoricalClassification() 73 | 74 | # Creates a simple dataset of 10 features, 10k samples, with feature cardinality of all features being 35 75 | X = cc.generate_data(9, 76 | 10000, 77 | cardinality=35, 78 | ensure_rep=True, 79 | random_values=True, 80 | low=0, 81 | high=40) 82 | 83 | # Creates target labels via clustering 84 | y = cc.generate_labels(X, n=2, class_relation='cluster') 85 | 86 | ``` -------------------------------------------------------------------------------- /scripts/run_benchmarks.sh: -------------------------------------------------------------------------------- 1 | 2 | pip install . --upgrade; 3 | cd benchmarks; 4 | 5 | ################################################################### 6 | #.................................................................. 7 | ################################################################### 8 | # Can we find a needle 9 | 10 | if [[ $1 == "CI" ]] 11 | then 12 | echo "CI Run experiments initialized" 13 | # Generate relevant synthetic data sets 14 | python generator_naive.py --output_df_name dataset_naive --num_features 100 --size 10000; 15 | 16 | # Substantial subsampling must retrieve the needle. 17 | outrank --data_path dataset_naive --data_source csv-raw --subsampling 1 --task all --heuristic MI-numba-randomized --target_ranking_only False --interaction_order 1 --output_folder ./ranking_outputs --minibatch_size 20000; 18 | 19 | python generator_naive.py --verify_outputs ranking_outputs; 20 | 21 | rm -r ranking_outputs dataset_naive; 22 | 23 | python generator_naive.py --output_df_name dataset_naive --num_features 100 --size 10000; 24 | 25 | exit 26 | fi 27 | ################################################################### 28 | #.................................................................. 29 | ################################################################### 30 | # Can we find a needle - bigger data set 31 | 32 | # Generate relevant synthetic data sets 33 | python generator_naive.py --output_df_name dataset_naive --num_features 300 --size 2000000; 34 | 35 | # Substantial subsampling must retrieve the needle. 36 | outrank --data_path dataset_naive --data_source csv-raw --subsampling 100 --task all --heuristic MI-numba-randomized --target_ranking_only True --interaction_order 1 --output_folder ./ranking_outputs --minibatch_size 20000; 37 | 38 | python generator_naive.py --verify_outputs ranking_outputs; 39 | 40 | rm -r ranking_outputs dataset_naive; 41 | 42 | ################################################################### 43 | #.................................................................. 44 | ################################################################### 45 | # Can we find an interaction needle? 46 | 47 | # Generate relevant synthetic data sets 48 | python generator_second_order.py --output_df_name dataset_naive --num_features 100 --size 10000; 49 | 50 | # Substantial subsampling must retrieve the needle. 51 | outrank --data_path dataset_naive --data_source csv-raw --subsampling 1 --task all --heuristic MI-numba-randomized --target_ranking_only True --interaction_order 2 --output_folder ./ranking_outputs; 52 | 53 | python generator_second_order.py --verify_outputs ranking_outputs; 54 | 55 | rm -r ranking_outputs dataset_naive; 56 | 57 | ################################################################### 58 | #.................................................................. 59 | ################################################################### 60 | # Can we find an interaction needle - order 3 with samplied stream 61 | 62 | # Generate relevant synthetic data sets 63 | python generator_third_order.py --output_df_name dataset_naive --num_features 100 --size 100000; 64 | 65 | # Substantial subsampling must retrieve the needle. 66 | outrank --data_path dataset_naive --data_source csv-raw --subsampling 10 --task all --heuristic MI-numba-randomized --target_ranking_only True --interaction_order 3 --output_folder ./ranking_outputs; 67 | 68 | python generator_third_order.py --verify_outputs ranking_outputs; 69 | 70 | rm -r ranking_outputs dataset_naive; 71 | 72 | cd ..; 73 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Feature Evolution via Ranking 2 | 3 | This script facilitates the process of feature evolution through iterative ranking using the `outrank` tool. It automates the process of running multiple iterations of feature ranking, extracting the best features, and updating the model specifications accordingly. 4 | 5 | ## Overview 6 | 7 | The script performs the following steps: 8 | 1. **Initialization**: Sets up the initial model specification directory and creates the initial model JSON file. 9 | 2. **Iteration**: Runs the `outrank` task for a specified number of iterations. 10 | 3. **Feature Extraction**: Processes the results of each iteration to extract the best feature. 11 | 4. **Model Update**: Updates the model specification JSON with the newly identified best feature. 12 | 13 | ## Prerequisites 14 | 15 | - Ensure that the `outrank` tool is installed and accessible from the command line. 16 | - Python 3.6 or higher. 17 | - Required Python packages: `pandas`, `argparse`, `json`, `shutil`, and `logging`. 18 | 19 | ## Installation 20 | 21 | Install the required Python packages using pip (`pip install outrank --upgrade`) 22 | 23 | --- 24 | 25 | # JSON-Based Feature Transformers 26 | 27 | This directory also contains example JSON files for specifying custom feature transformations in OutRank. 28 | 29 | ## JSON Transformer Overview 30 | 31 | OutRank now supports loading feature transformers from JSON specification files in addition to the built-in presets. This allows users to define custom numpy-based transformations without modifying the source code. 32 | 33 | ## JSON Format 34 | 35 | The JSON format is simple: a dictionary where keys are transformer names and values are numpy expressions: 36 | 37 | ```json 38 | { 39 | "_tr_sqrt": "np.sqrt(X)", 40 | "_tr_log": "np.log(X + 1)", 41 | "_tr_custom": "np.tanh(X)" 42 | } 43 | ``` 44 | 45 | ## JSON Transformer Examples 46 | 47 | ### `simple_transformers.json` 48 | Basic mathematical transformations including square root, logarithm, square, absolute value, and exponential. 49 | 50 | ### `custom_transformers.json` 51 | Advanced transformations including sigmoid, tanh, ReLU, normalization, z-score standardization, and other custom functions. 52 | 53 | ## Usage 54 | 55 | ### Command Line Interface 56 | 57 | ```bash 58 | # Use JSON transformers only 59 | outrank --transformers examples/simple_transformers.json --data_path mydata/ --data_source csv-raw 60 | 61 | # Combine preset with JSON transformers 62 | outrank --transformers default,examples/custom_transformers.json --data_path mydata/ --data_source csv-raw 63 | ``` 64 | 65 | ### Python API 66 | 67 | ```python 68 | from outrank.feature_transformations.ranking_transformers import FeatureTransformerGeneric 69 | 70 | # JSON transformers only 71 | transformer = FeatureTransformerGeneric( 72 | numeric_columns={'feature1', 'feature2'}, 73 | preset='examples/simple_transformers.json' 74 | ) 75 | 76 | # Combine with presets 77 | transformer = FeatureTransformerGeneric( 78 | numeric_columns={'feature1', 'feature2'}, 79 | preset='minimal,examples/custom_transformers.json' 80 | ) 81 | ``` 82 | 83 | ## Creating Custom Transformers 84 | 85 | 1. Create a JSON file with your transformer specifications 86 | 2. Use valid numpy expressions where `X` represents the input feature array 87 | 3. Follow the naming convention `_tr_*` for transformer names 88 | 4. Ensure all expressions are strings in the JSON 89 | 90 | ### Example Custom Transformer 91 | 92 | ```json 93 | { 94 | "_tr_my_custom": "np.log(np.abs(X) + 1) * np.sqrt(X)", 95 | "_tr_sigmoid_scaled": "1 / (1 + np.exp(-X * 0.1))", 96 | "_tr_percentile_rank": "np.searchsorted(np.sort(X), X) / len(X)" 97 | } 98 | ``` 99 | -------------------------------------------------------------------------------- /outrank/task_instance_ranking.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import gzip 4 | import os 5 | from collections import Counter 6 | from collections import defaultdict 7 | from typing import Any 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import tqdm 12 | 13 | from outrank.core_utils import generic_line_parser 14 | from outrank.core_utils import get_dataset_info 15 | from outrank.core_utils import get_num_of_instances 16 | 17 | try: 18 | import matplotlib.pyplot as plt 19 | except ImportError: 20 | plt = None 21 | 22 | def shannon_entropy(string: str) -> float: 23 | counts = Counter(string) 24 | frequencies = (i / len(string) for i in counts.values()) 25 | return -sum(f * np.log2(f) for f in frequencies) 26 | 27 | def compute_average_entropy(line: list[str]) -> float: 28 | return sum(shannon_entropy(field) for field in line) 29 | 30 | def score_line(line: list[str]) -> dict[str, float]: 31 | total_fields = len(line) 32 | nan_prop = line.count('') / total_fields 33 | empty_dict_prop = line.count('{}') / total_fields 34 | all_empty_prop = (line.count('{}') + line.count('')) / total_fields 35 | all_zero_prop = line.count('0') / total_fields 36 | 37 | out_struct = { 38 | 'empty_string_prop': nan_prop, 39 | 'empty_dict': empty_dict_prop, 40 | 'all_empty': all_empty_prop, 41 | 'all_zero': all_zero_prop, 42 | 'row_entropy': compute_average_entropy(line), 43 | } 44 | 45 | for j in [30, 60, 100, 200, 300]: 46 | out_struct[f'all_more_{j}_chars'] = sum(len(x) > j for x in line) / total_fields 47 | 48 | return out_struct 49 | 50 | def outrank_task_rank_instances(args: Any) -> None: 51 | dataset_info = get_dataset_info(args) 52 | data_path = dataset_info.data_path 53 | data_encoding = 'utf-8' 54 | delimiter = '\t' 55 | 56 | total_lines = get_num_of_instances(data_path) - 1 57 | local_pbar = tqdm.tqdm(total=total_lines, position=0, disable=args.disable_tqdm == 'True') 58 | local_pbar.set_description('Starting ranking computation') 59 | 60 | _, file_extension = os.path.splitext(data_path) 61 | file_stream = gzip.open(data_path, 'rt', encoding=data_encoding) if file_extension == '.gz' else open(data_path, encoding=data_encoding) 62 | 63 | line_counter = 0 64 | out_scores_lab = defaultdict(list) 65 | 66 | for line in file_stream: 67 | line_counter += 1 68 | local_pbar.update(1) 69 | 70 | parsed_line = generic_line_parser( 71 | line, 72 | delimiter, 73 | args, 74 | dataset_info.fw_map, 75 | dataset_info.column_names, 76 | ) 77 | 78 | if line_counter > 100_000: 79 | break 80 | out_scores_lab[line[0]].append(score_line(parsed_line)) 81 | 82 | file_stream.close() 83 | 84 | os.makedirs(args.output_folder, exist_ok=True) 85 | for label, out_scores in out_scores_lab.items(): 86 | out_df = pd.DataFrame(out_scores) 87 | for col in out_df.columns: 88 | sorted_vals = out_df[col].sort_values() 89 | plt.figure(figsize=(5, 5), dpi=300) 90 | plt.title(f'{col} label: {label}') 91 | plt.hist( 92 | x=sorted_vals * 100, 93 | color='black', 94 | density=True, 95 | bins=100, 96 | ) 97 | plt.xlabel('Proportion of namespaces (%)' if 'entropy' not in col else 'Row entropy') 98 | plt.ylabel('Density') 99 | plt.tight_layout() 100 | fname = f'distPlot{col}_{label}.pdf' 101 | plt.savefig(os.path.join(args.output_folder, fname), dpi=300) 102 | plt.cla() 103 | plt.clf() 104 | 105 | local_pbar.close() 106 | -------------------------------------------------------------------------------- /benchmarks/generator_naive.py: -------------------------------------------------------------------------------- 1 | # This simplest thing we can do for now. 2 | from __future__ import annotations 3 | 4 | import numpy as np 5 | 6 | np.random.seed(123) 7 | 8 | 9 | def generate_random_matrix(num_features, size=2000000): 10 | # random int matrix (categorical) 11 | sample = np.random.randint(10, 100, size=(size, num_features)) 12 | 13 | target = sample[:, 30] 14 | # Some noise 15 | 16 | sample[:, 31] = target * 19 17 | target[target < 20] = 0 18 | return sample, target 19 | 20 | 21 | if __name__ == '__main__': 22 | import argparse 23 | import logging 24 | import os 25 | import shutil 26 | 27 | import pandas as pd 28 | 29 | logging.basicConfig( 30 | format='%(asctime)s - %(message)s', 31 | datefmt='%d-%b-%y %H:%M:%S', 32 | ) 33 | logger = logging.getLogger('syn-logger') 34 | logger.setLevel(logging.DEBUG) 35 | 36 | parser = argparse.ArgumentParser( 37 | description='Fast feature screening for sparse data sets.', 38 | formatter_class=argparse.RawTextHelpFormatter, 39 | ) 40 | 41 | parser.add_argument('--output_df_name', type=str, default=None) 42 | 43 | parser.add_argument('--verify_outputs', type=str, default=None) 44 | 45 | parser.add_argument('--num_features', type=int, default=300) 46 | 47 | parser.add_argument('--size', type=int, default=1000) 48 | 49 | args = parser.parse_args() 50 | 51 | if args.output_df_name is not None: 52 | sample, target = generate_random_matrix(args.num_features, args.size) 53 | dfx = pd.DataFrame(sample) 54 | dfx.columns = [f'f{x}' for x in range(dfx.shape[1])] 55 | dfx['label'] = target 56 | if os.path.exists(args.output_df_name) and os.path.isdir(args.output_df_name): 57 | shutil.rmtree(args.output_df_name) 58 | os.mkdir(args.output_df_name) 59 | dfx.to_csv(f'./{args.output_df_name}/data.csv', index=False) 60 | 61 | logging.info(f'Generated dataset {dfx.shape} in {args.output_df_name}') 62 | elif args.verify_outputs is not None: 63 | rankings = pd.read_csv( 64 | os.path.join(args.verify_outputs, 'feature_singles.tsv'), sep='\t', 65 | ) 66 | 67 | rankings_pairwise = pd.read_csv( 68 | os.path.join(args.verify_outputs, 'pairwise_ranks.tsv'), sep='\t', 69 | ) 70 | 71 | # Partial match test 72 | if rankings.iloc[2]['Feature'] != 'f31-(90; 100)' and rankings.iloc[2]['Score MI-numba-randomized'] > 0.9: 73 | raise Exception( 74 | f'Could not retrieve the appropriate second-ranked feature needle in the haystack {rankings.iloc[2].Feature}, exiting', 75 | ) 76 | else: 77 | logger.info( 78 | f'Identified the appropriate second-ranked feature in the haystack ({rankings.iloc[1].Feature})', 79 | ) 80 | 81 | # Test of direct retrievals 82 | if rankings.iloc[1]['Feature'] != 'f30-(81; 100)' and rankings.iloc[2]['Score MI-numba-randomized'] > 0.99: 83 | raise Exception( 84 | f'Could not retrieve the appropriate feature needle in the haystack {rankings.iloc[1].Feature}, exiting', 85 | ) 86 | else: 87 | logger.info( 88 | f'Identified the appropriate feature in the haystack ({rankings.iloc[1].Feature})', 89 | ) 90 | 91 | 92 | # Tests related to pairwise rankings 93 | sorted_by_scores = rankings_pairwise.sort_values(by=['Score', 'FeatureA']) 94 | 95 | if len(sorted_by_scores) < 10000: 96 | raise Exception('Number of pairwise comparisons insufficient!') 97 | else: 98 | logger.info('Found enough pairwise comparisons ..') 99 | 100 | if sorted_by_scores.iloc[-1]['FeatureA'] == 'f45-(90; 100)' and sorted_by_scores.iloc[-1]['FeatureB'] == 'f45-(90; 100)' and sorted_by_scores.iloc[-1]['Score'] > 1.0: 101 | logger.info('Similarity check passed for f45 ..') 102 | else: 103 | raise Exception('Most similar features not identified ..') 104 | -------------------------------------------------------------------------------- /outrank/task_summary.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import os 5 | from collections import defaultdict 6 | from typing import Any 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) 12 | 13 | 14 | def read_and_sort_triplets(triplets_path: str) -> pd.DataFrame: 15 | """Read triplets from a file and sort by the 'Score' column.""" 16 | triplets = pd.read_csv(triplets_path, sep='\t') 17 | return triplets.sort_values(by='Score', ascending=False) 18 | 19 | 20 | def generate_final_ranking(triplets: pd.DataFrame, label_column: str) -> list[list[Any]]: 21 | """Generate final ranking based on the label column.""" 22 | final_ranking = [] 23 | for _, row in triplets.iterrows(): 24 | feature_a, feature_b = row['FeatureA'], row['FeatureB'] 25 | score = row['Score'] 26 | if label_column == feature_a.split('-')[0]: 27 | final_ranking.append([feature_b, score]) 28 | elif label_column == feature_b.split('-')[0]: 29 | final_ranking.append([feature_a, score]) 30 | return final_ranking 31 | 32 | 33 | def create_final_dataframe(final_ranking: list[list[Any]], heuristic: str) -> pd.DataFrame: 34 | """Create a final DataFrame and normalize if necessary.""" 35 | final_df = pd.DataFrame(final_ranking, columns=['Feature', f'Score {heuristic}']) 36 | final_df = ( 37 | final_df.groupby('Feature') 38 | .median() 39 | .reset_index() 40 | .sort_values(by=f'Score {heuristic}', ascending=False) 41 | ) 42 | 43 | if 'MI' in heuristic: 44 | min_score = final_df[f'Score {heuristic}'].min() 45 | max_score = final_df[f'Score {heuristic}'].max() 46 | final_df[f'Score {heuristic}'] = (final_df[f'Score {heuristic}'] - min_score) / (max_score - min_score) 47 | 48 | return final_df 49 | 50 | 51 | def store_summary_files(final_df: pd.DataFrame, output_folder: str, heuristic: str, tldr: bool) -> None: 52 | """Store the summary files and optionally print the head of the DataFrame.""" 53 | logging.info(f'Storing summary files to {output_folder}') 54 | pd.set_option('display.max_rows', None, 'display.max_columns', None) 55 | 56 | singles_path = os.path.join(output_folder, 'feature_singles.tsv') 57 | final_df.to_csv(singles_path, sep='\t', index=False) 58 | 59 | if tldr: 60 | print(final_df.head(20)) 61 | 62 | 63 | def handle_interaction_order(final_df: pd.DataFrame, output_folder: str, heuristic: str, interaction_order: int) -> None: 64 | """Handle the interaction order if it is greater than 1.""" 65 | if interaction_order > 1: 66 | feature_store = defaultdict(list) 67 | for _, row in final_df.iterrows(): 68 | fname = row['Feature'] 69 | score = row[f'Score {heuristic}'] 70 | if 'AND' in fname: 71 | for el in fname.split('-')[0].split(' AND '): 72 | feature_store[el].append(score) 73 | 74 | final_aggregate_df = pd.DataFrame([ 75 | { 76 | 'Feature': k, 77 | f'Combined score (order: {interaction_order}, {heuristic})': np.median(v), 78 | } 79 | for k, v in feature_store.items() 80 | ]) 81 | final_aggregate_df.to_csv( 82 | os.path.join(output_folder, 'feature_singles_aggregated.tsv'), sep='\t', index=False, 83 | ) 84 | 85 | 86 | def filter_transformers_only(final_df: pd.DataFrame, output_folder: str) -> None: 87 | """Filter the DataFrame to include only transformer features and store the result.""" 88 | transformers_only_path = os.path.join(output_folder, 'feature_singles_transformers_only_imp.tsv') 89 | final_df[final_df['Feature'].str.contains('_tr_')].to_csv(transformers_only_path, sep='\t', index=False) 90 | 91 | 92 | def outrank_task_result_summary(args) -> None: 93 | """Main function to generate a summary of outrank task results.""" 94 | triplets_path = os.path.join(args.output_folder, 'pairwise_ranks.tsv') 95 | triplets = read_and_sort_triplets(triplets_path) 96 | 97 | final_ranking = generate_final_ranking(triplets, args.label_column) 98 | final_df = create_final_dataframe(final_ranking, args.heuristic) 99 | 100 | store_summary_files(final_df, args.output_folder, args.heuristic, args.tldr) 101 | handle_interaction_order(final_df, args.output_folder, args.heuristic, args.interaction_order) 102 | filter_transformers_only(final_df, args.output_folder) 103 | -------------------------------------------------------------------------------- /examples/recursive_ranking.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | import json 5 | import logging 6 | import os 7 | import shutil 8 | import subprocess 9 | 10 | import pandas as pd 11 | 12 | # Configure logging 13 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s') 14 | logger = logging.getLogger('syn-logger') 15 | 16 | # Configuration constants 17 | DATA_PATH = os.path.expanduser('~/datasets/toy') 18 | MODEL_SPEC_DIR = 'model_spec_dir' 19 | LABEL_COLUMN_NAME = 'label' 20 | HEURISTIC = 'surrogate-SGD' 21 | DATA_FORMAT = 'ob-vw' 22 | NUM_THREADS = 6 23 | INTERACTION_ORDER = 2 24 | COMBINATION_NUMBER_BOUND = 1_000 25 | MINIBATCH_SIZE = 10_000 26 | SUBSAMPLING = 10 27 | 28 | def run_outrank_task(reference_model_json: str, output_folder: str) -> None: 29 | """Run the outrank task with the specified parameters.""" 30 | outrank_command = ( 31 | f'outrank --task all --data_path {DATA_PATH} --data_source {DATA_FORMAT} ' 32 | f'--target_ranking_only True --combination_number_upper_bound {COMBINATION_NUMBER_BOUND} ' 33 | f'--num_threads {NUM_THREADS} --interaction_order {INTERACTION_ORDER} ' 34 | f'--output_folder {output_folder} --reference_model_JSON {reference_model_json} ' 35 | f'--heuristic {HEURISTIC} --label_column {LABEL_COLUMN_NAME} ' 36 | f'--subsampling {SUBSAMPLING} --minibatch_size {MINIBATCH_SIZE} --disable_tqdm False;' 37 | ) 38 | logger.info(f'Running outrank command: {outrank_command}') 39 | subprocess.run(outrank_command, shell=True, check=True) 40 | logger.info(f'Outrank task completed for {reference_model_json}') 41 | 42 | def process_results(output_folder: str) -> str: 43 | """Read the results and extract the best feature.""" 44 | results = pd.read_csv(os.path.join(output_folder, 'feature_singles.tsv'), delimiter='\t') 45 | best_feature = '-'.join(results.Feature.iloc[1].split('-')[:-1]) 46 | best_feature = ','.join(best_feature.split(' AND ')) 47 | logger.info(f'Best feature: {best_feature}') 48 | return best_feature 49 | 50 | def update_model_spec(model_index: int, best_feature: str) -> None: 51 | """Update the model specification JSON with the new best feature.""" 52 | current_model_path = os.path.join(MODEL_SPEC_DIR, f'model_{model_index}.json') 53 | next_model_path = os.path.join(MODEL_SPEC_DIR, f'model_{model_index + 1}.json') 54 | 55 | with open(current_model_path) as file: 56 | model_spec = json.load(file) 57 | 58 | current_features = model_spec['desc']['features'] 59 | current_features.append(best_feature) 60 | logger.info(f'Updated features: {current_features}') 61 | 62 | with open(next_model_path, 'w') as file: 63 | new_model_spec = {'desc': {'features': current_features}} 64 | json.dump(new_model_spec, file) 65 | 66 | def initialize_model_spec_dir() -> None: 67 | """Initialize the model specification directory with the initial JSON file.""" 68 | command = ( 69 | 'mkdir -p model_spec_dir && ' 70 | 'rm -rv model_spec_dir/* && ' 71 | 'echo \'{"desc": {"features": []}}\' > ./model_spec_dir/model_0.json' 72 | ) 73 | subprocess.run(command, shell=True, check=True) 74 | logger.info('Initialized model specification directory with model_0.json') 75 | 76 | def run_evolution(iterations: int) -> None: 77 | """Main function to run the test for multiple iterations.""" 78 | for i in range(iterations): 79 | reference_model_json = os.path.join(MODEL_SPEC_DIR, f'model_{i}.json') 80 | output_folder = f'output_dir_{i}' 81 | 82 | if os.path.isdir(output_folder): 83 | shutil.rmtree(output_folder) 84 | os.mkdir(output_folder) 85 | 86 | try: 87 | run_outrank_task(reference_model_json, output_folder) 88 | best_feature = process_results(output_folder) 89 | update_model_spec(i, best_feature) 90 | except Exception as e: 91 | logger.error(f'An error occurred during iteration {i}: {e}') 92 | continue 93 | 94 | def parse_arguments() -> argparse.Namespace: 95 | """Parse command-line arguments.""" 96 | parser = argparse.ArgumentParser(description='Run the outrank evolution process.') 97 | parser.add_argument( 98 | '--iterations', 99 | type=int, 100 | default=80, 101 | help='Number of iterations to run (default: 10)', 102 | ) 103 | return parser.parse_args() 104 | 105 | if __name__ == '__main__': 106 | args = parse_arguments() 107 | initialize_model_spec_dir() 108 | run_evolution(args.iterations) 109 | -------------------------------------------------------------------------------- /test_coverage_summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Test Coverage Summary for OutRank 4 | 5 | This script provides a summary of the comprehensive test coverage improvements 6 | made to the OutRank codebase. 7 | """ 8 | 9 | from __future__ import annotations 10 | 11 | import subprocess 12 | import sys 13 | import time 14 | 15 | 16 | def run_test_module(module_name): 17 | """Run tests for a specific module and return results""" 18 | try: 19 | result = subprocess.run( 20 | [sys.executable, '-m', 'unittest', f'tests.{module_name}', '-v'], 21 | capture_output=True, 22 | text=True, 23 | timeout=120 24 | ) 25 | 26 | lines = result.stderr.split('\n') 27 | test_lines = [line for line in lines if 'ok' in line or 'FAIL' in line or 'ERROR' in line] 28 | 29 | return { 30 | 'module': module_name, 31 | 'returncode': result.returncode, 32 | 'test_count': len([line for line in test_lines if 'ok' in line]), 33 | 'passed': result.returncode == 0, 34 | 'output': result.stderr 35 | } 36 | except subprocess.TimeoutExpired: 37 | return { 38 | 'module': module_name, 39 | 'returncode': -1, 40 | 'test_count': 0, 41 | 'passed': False, 42 | 'output': 'TIMEOUT' 43 | } 44 | 45 | 46 | def main(): 47 | print("=" * 70) 48 | print("OutRank Test Coverage Improvement Summary") 49 | print("=" * 70) 50 | 51 | # Enhanced test modules 52 | enhanced_modules = [ 53 | ('cms_test', 'CountMinSketch Algorithm'), 54 | ('cov_heu_test', 'Max Pair Coverage Algorithm'), 55 | ('mi_numba_test', 'Mutual Information Estimator'), 56 | ('json_transformers_test', 'Feature Transformers'), 57 | ('integration_tests', 'Integration & Property-Based Tests') 58 | ] 59 | 60 | print("\nRunning enhanced test suites...") 61 | print("-" * 50) 62 | 63 | total_tests = 0 64 | total_passed = 0 65 | 66 | for module, description in enhanced_modules: 67 | print(f"\n📊 {description}") 68 | print(f" Module: tests.{module}") 69 | 70 | start_time = time.time() 71 | result = run_test_module(module) 72 | duration = time.time() - start_time 73 | 74 | if result['passed']: 75 | status = "✅ PASSED" 76 | total_passed += 1 77 | else: 78 | status = "❌ FAILED" 79 | 80 | print(f" Status: {status}") 81 | print(f" Tests: {result['test_count']} test cases") 82 | print(f" Time: {duration:.2f}s") 83 | 84 | total_tests += result['test_count'] 85 | 86 | print("\n" + "=" * 70) 87 | print("SUMMARY") 88 | print("=" * 70) 89 | print(f"📈 Total test cases added/enhanced: {total_tests}") 90 | print(f"✅ Test modules enhanced: {len(enhanced_modules)}") 91 | print(f"🎯 Success rate: {total_passed}/{len(enhanced_modules)} modules passing") 92 | 93 | print("\n🔍 Coverage Improvements Made:") 94 | improvements = [ 95 | "• CountMinSketch: +13 new tests (260% increase)", 96 | "• Max Pair Coverage: +15 new tests (214% increase)", 97 | "• Mutual Information: +15 new tests (214% increase)", 98 | "• JSON Transformers: +12 new tests (300% increase)", 99 | "• Integration Tests: +9 new cross-component tests" 100 | ] 101 | 102 | for improvement in improvements: 103 | print(improvement) 104 | 105 | print("\n🎯 Test Categories Added:") 106 | categories = [ 107 | "• Comprehensive edge case testing (empty arrays, single elements)", 108 | "• Boundary value testing (min/max integers, extreme values)", 109 | "• Error handling validation (invalid inputs, malformed data)", 110 | "• Mathematical property verification (deterministic behavior)", 111 | "• Performance and scalability testing (large datasets)", 112 | "• Integration testing (cross-component interaction)", 113 | "• Property-based testing (mathematical invariants)", 114 | "• Stress testing (extreme conditions, memory efficiency)" 115 | ] 116 | 117 | for category in categories: 118 | print(category) 119 | 120 | print("\n✨ Key Benefits:") 121 | benefits = [ 122 | "• Enhanced code reliability through comprehensive edge case coverage", 123 | "• Improved mathematical correctness validation", 124 | "• Better error handling and graceful failure modes", 125 | "• Increased confidence in algorithm implementations", 126 | "• Regression testing for future code changes", 127 | "• Documentation of expected behavior through tests" 128 | ] 129 | 130 | for benefit in benefits: 131 | print(benefit) 132 | 133 | print("\n" + "=" * 70) 134 | 135 | if total_passed == len(enhanced_modules): 136 | print("🎉 All enhanced test suites are passing!") 137 | return 0 138 | else: 139 | print("⚠️ Some test suites have failures - please review.") 140 | return 1 141 | 142 | 143 | if __name__ == '__main__': 144 | sys.exit(main()) -------------------------------------------------------------------------------- /tests/ranking_module_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | import unittest 5 | from dataclasses import dataclass 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import tqdm 10 | from pathos.multiprocessing import ProcessingPool as Pool 11 | 12 | from outrank.core_ranking import compute_combined_features 13 | from outrank.core_ranking import get_combinations_from_columns 14 | from outrank.core_ranking import mixed_rank_graph 15 | from outrank.feature_transformations.feature_transformer_vault import \ 16 | default_transformers 17 | from outrank.feature_transformations.ranking_transformers import \ 18 | FeatureTransformerGeneric 19 | 20 | sys.path.append('./outrank') 21 | 22 | 23 | np.random.seed(123) 24 | test_files_path = 'tests/tests_files' 25 | 26 | 27 | @dataclass 28 | class args: 29 | label_column: str = 'label' 30 | heuristic: str = 'surrogate-LR' 31 | target_ranking_only: str = 'True' 32 | interaction_order: int = 3 33 | combination_number_upper_bound: int = 1024 34 | disable_tqdm: bool = False 35 | mi_stratified_sampling_ratio: float = 1.0 36 | reference_model_JSON: str = '' 37 | 38 | 39 | class CompareStrategiesTest(unittest.TestCase): 40 | def test_mixed_rank_graph_MI(self): 41 | initial_matrix = np.random.randint(0, 2, (1000, 5)) 42 | dfx = pd.DataFrame(initial_matrix) 43 | dfx.columns = ['c' + str(x) for x in range(4)] + ['label'] 44 | dfx['label'] = dfx['label'].astype(int) 45 | GLOBAL_CPU_POOL = Pool(processes=1) 46 | local_pbar = tqdm.tqdm(total=100, position=0) 47 | for heuristic in ['MI']: 48 | args.heuristic = heuristic 49 | ranking_triplets = mixed_rank_graph( 50 | dfx, args, GLOBAL_CPU_POOL, local_pbar, 51 | ) 52 | unique_nodes = len({x[0] for x in ranking_triplets.triplet_scores}) 53 | self.assertEqual(unique_nodes, dfx.shape[1]) 54 | triplet_df = pd.DataFrame(ranking_triplets.triplet_scores) 55 | triplet_df.columns = ['f1', 'f2', 'score'] 56 | self.assertEqual(int(np.std(triplet_df.score)), 0) 57 | 58 | GLOBAL_CPU_POOL.close() 59 | GLOBAL_CPU_POOL.join() 60 | 61 | def test_feature_transformer_generic(self): 62 | random_array = np.random.rand(100, 5) 63 | dfx = pd.DataFrame(random_array) 64 | numeric_column_names = dfx.columns 65 | transformer = FeatureTransformerGeneric(numeric_column_names) 66 | features_before = dfx.shape[1] 67 | transformed_df = transformer.construct_new_features(dfx) 68 | features_after = transformed_df.shape[1] 69 | self.assertEqual(features_after - features_before, 45) 70 | 71 | def test_transformer_generation(self): 72 | # Generic transformations commonly used 73 | default_ob_transformations = default_transformers.DEFAULT_TRANSFORMERS 74 | self.assertEqual(len(default_ob_transformations), 10) 75 | 76 | def test_compute_combinations(self): 77 | # Some random data - order=3 by default 78 | random_matrix = [[1, 2, 3], [3, 2, 1], [1, 1, 1], [2, 3, 4]] 79 | random_df = pd.DataFrame(random_matrix) 80 | random_df.columns = ['F1', 'F2', 'F3'] 81 | local_pbar = tqdm.tqdm(total=100, position=0) 82 | transformed_df = compute_combined_features( 83 | random_df, args, local_pbar, 84 | ) 85 | self.assertEqual(transformed_df.shape[1], 4) 86 | 87 | args.interaction_order = 2 88 | random_matrix = [[1, 2, 3], [3, 2, 1], [1, 1, 1], [2, 3, 4]] 89 | random_df = pd.DataFrame(random_matrix) 90 | random_df.columns = ['F1', 'F2', 'F3'] 91 | transformed_df = compute_combined_features( 92 | random_df, args, local_pbar, 93 | ) 94 | self.assertEqual(transformed_df.shape[1], 6) 95 | 96 | def test_get_combinations_from_columns_target_ranking_only(self): 97 | all_columns = pd.Index(['a', 'b', 'label']) 98 | args.heuristic = 'MI-numba-randomized' 99 | args.target_ranking_only = 'True' 100 | combinations = get_combinations_from_columns(all_columns, args) 101 | 102 | self.assertSetEqual( 103 | set(combinations), 104 | {('a', 'label'), ('b', 'label'), ('label', 'label')}, 105 | ) 106 | 107 | def test_get_combinations_from_columns(self): 108 | all_columns = pd.Index(['a', 'b', 'label']) 109 | args.heuristic = 'MI-numba-randomized' 110 | args.target_ranking_only = 'False' 111 | combinations = get_combinations_from_columns(all_columns, args) 112 | 113 | self.assertSetEqual( 114 | set(combinations), 115 | {('a', 'a'), ('b', 'b'), ('label', 'label'), ('a', 'b'), ('a', 'label'), ('b', 'label')}, 116 | ) 117 | 118 | def test_get_combinations_from_columns_3mr(self): 119 | all_columns = pd.Index(['a', 'b', 'label']) 120 | args.heuristic = 'MI-numba-3mr' 121 | combinations = get_combinations_from_columns(all_columns, args) 122 | 123 | self.assertSetEqual( 124 | set(combinations), 125 | {('a', 'a'), ('b', 'b'), ('label', 'label'), ('a', 'b'), ('a', 'label'), ('b', 'label')}, 126 | ) 127 | 128 | 129 | if __name__ == '__main__': 130 | unittest.main() 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *///////////////. 2 | //////////////////////* 3 | */////////////////////////. 4 | ////////////// *///////////// 5 | /////////* ///////// 6 | ////// ///// ////, ///// 7 | //////// /// ///////// 8 | ///// ///// .///// ////* 9 | ,//// //// 10 | *//// ////. 11 | ///////*/////// 12 | 13 | ░█████╗░██╗░░░██╗████████╗██████╗░░█████╗░███╗░░██╗██╗░░██╗ 14 | ██╔══██╗██║░░░██║╚══██╔══╝██╔══██╗██╔══██╗████╗░██║██║░██╔╝ 15 | ██║░░██║██║░░░██║░░░██║░░░██████╔╝███████║██╔██╗██║█████═╝░ 16 | ██║░░██║██║░░░██║░░░██║░░░██╔══██╗██╔══██║██║╚████║██╔═██╗░ 17 | ╚█████╔╝╚██████╔╝░░░██║░░░██║░░██║██║░░██║██║░╚███║██║░╚██╗ 18 | ░╚════╝░░╚═════╝░░░░╚═╝░░░╚═╝░░╚═╝╚═╝░░╚═╝╚═╝░░╚══╝╚═╝░░╚═╝ 19 | 20 | [![CI - package](https://github.com/outbrain/outrank/actions/workflows/python-package.yml/badge.svg)](https://github.com/outbrain/outrank/actions/workflows/python-package.yml) [![CI - benchmark](https://github.com/outbrain/outrank/actions/workflows/benchmarks.yml/badge.svg)](https://github.com/outbrain/outrank/actions/workflows/benchmarks.yml) [![CI - selftest](https://github.com/outbrain/outrank/actions/workflows/selftest.yml/badge.svg)](https://github.com/outbrain/outrank/actions/workflows/selftest.yml) [![Unit tests](https://github.com/outbrain/outrank/actions/workflows/python-unit.yml/badge.svg)](https://github.com/outbrain/outrank/actions/workflows/python-unit.yml) 21 | 22 | # TLDR 23 | > The design of modern recommender systems relies on understanding which parts of the feature space are relevant for solving a given recommendation task. However, real-world data sets in this domain are often characterized by their large size, sparsity, and noise, making it challenging to identify meaningful signals. Feature ranking represents an efficient branch of algorithms that can help address these challenges by identifying the most informative features and facilitating the automated search for more compact and better-performing models (AutoML). We introduce OutRank, a system for versatile feature ranking and data quality-related anomaly detection. OutRank was built with categorical data in mind, utilizing a variant of mutual information that is normalized with regard to the noise produced by features of the same cardinality. We further extend the similarity measure by incorporating information on feature similarity and combined relevance. 24 | 25 | # Getting started 26 | Minimal examples and an interface to explore OutRank's functionality are available as [the docs](https://outbrain-inc.github.io/outrank/outrank.html). 27 | 28 | # Contributing 29 | 1. Make sure the functionality is not already implemented! 30 | 2. Decide where the functionality would fit best (is it an algorithm? A parser?) 31 | 3. Open a PR with the implementation 32 | 33 | # Bugs and other reports 34 | Feel free to open a PR that contains: 35 | 1. Issue overview 36 | 2. Minimal example useful for replicating the issue on our end 37 | 3. Possible solution 38 | 39 | # Citing this work 40 | If you use or build on top of OutRank, feel free to cite: 41 | 42 | ``` 43 | @inproceedings{10.1145/3604915.3610636, 44 | author = {Skrlj, Blaz and Mramor, Bla\v{z}}, 45 | title = {OutRank: Speeding up AutoML-Based Model Search for Large Sparse Data Sets with Cardinality-Aware Feature Ranking}, 46 | year = {2023}, 47 | isbn = {9798400702419}, 48 | publisher = {Association for Computing Machinery}, 49 | address = {New York, NY, USA}, 50 | url = {https://doi.org/10.1145/3604915.3610636}, 51 | doi = {10.1145/3604915.3610636}, 52 | abstract = {The design of modern recommender systems relies on understanding which parts of the feature space are relevant for solving a given recommendation task. However, real-world data sets in this domain are often characterized by their large size, sparsity, and noise, making it challenging to identify meaningful signals. Feature ranking represents an efficient branch of algorithms that can help address these challenges by identifying the most informative features and facilitating the automated search for more compact and better-performing models (AutoML). We introduce OutRank, a system for versatile feature ranking and data quality-related anomaly detection. OutRank was built with categorical data in mind, utilizing a variant of mutual information that is normalized with regard to the noise produced by features of the same cardinality. We further extend the similarity measure by incorporating information on feature similarity and combined relevance. The proposed approach’s feasibility is demonstrated by speeding up the state-of-the-art AutoML system on a synthetic data set with no performance loss. Furthermore, we considered a real-life click-through-rate prediction data set where it outperformed strong baselines such as random forest-based approaches. The proposed approach enables exploration of up to 300\% larger feature spaces compared to AutoML-only approaches, enabling faster search for better models on off-the-shelf hardware.}, 53 | booktitle = {Proceedings of the 17th ACM Conference on Recommender Systems}, 54 | pages = {1078–1083}, 55 | numpages = {6}, 56 | keywords = {Feature ranking, massive data sets, AutoML, recommender systems}, 57 | location = {Singapore, Singapore}, 58 | series = {RecSys '23} 59 | } 60 | 61 | @article{skrlj2023DrifterEO, 62 | title={Drifter: Efficient Online Feature Monitoring for Improved Data Integrity in Large-Scale Recommendation Systems}, 63 | author={Bla{\vz} {\vS}krlj and Nir Ki-Tov and Lee Edelist and Natalia Silberstein and Hila Weisman-Zohar and Bla{\vz} Mramor and Davorin Kopic and Naama Ziporin}, 64 | journal={ArXiv}, 65 | year={2023}, 66 | volume={abs/2309.08617}, 67 | url={https://api.semanticscholar.org/CorpusID:262045065} 68 | } 69 | ``` 70 | -------------------------------------------------------------------------------- /outrank/algorithms/sketches/counting_ultiloglog.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements probabilistic data structure which is able to calculate the cardinality of large multisets in a single pass using little auxiliary memory 3 | """ 4 | from __future__ import annotations 5 | 6 | import numpy as np 7 | import xxhash 8 | 9 | 10 | class HyperLogLogWCache: 11 | def __init__(self, error_rate=0.005): 12 | # int(np.ceil(np.log2((1.04 / error_rate) ** 2))) 13 | self.p = 19 14 | self.m = 1 << self.p 15 | self.warmup_set = set() 16 | self.warmup_size = int(self.m / 2) 17 | self.width = 64 - self.p 18 | self.hll_flag = False 19 | 20 | def _hasher_update(self, value): 21 | self.hasher = xxhash.xxh32(seed=self.p) 22 | if isinstance(value, str): 23 | value = value.encode('utf-8') 24 | self.hasher.update(bytes(value)) 25 | else: 26 | self.hasher.update(bytes(value)) 27 | 28 | x = self.hasher.intdigest() 29 | j = x & (self.m - 1) 30 | w = x >> self.p 31 | 32 | rho = self.width - w.bit_length() 33 | self.M[j] = max(self.M[j], rho) 34 | 35 | def add(self, value): 36 | if len(self.warmup_set) < self.warmup_size and not self.hll_flag: 37 | self.warmup_set.add(value) 38 | elif not self.hll_flag: 39 | if not self.hll_flag: 40 | self.M = np.zeros(self.m) 41 | for element in self.warmup_set: 42 | self._hasher_update(element) 43 | self.warmup_set = {} 44 | self.hll_flag = True 45 | else: 46 | self._hasher_update(value) 47 | 48 | def __len__(self): 49 | if self.hll_flag: 50 | basis = np.ceil( 51 | self.m * 52 | np.log(np.divide(self.m, len(np.where(self.M == 0)[0]))), 53 | ) 54 | if basis != np.inf: 55 | return int(basis) - 1 56 | else: 57 | return 2**self.p 58 | else: 59 | return len(self.warmup_set) 60 | 61 | 62 | if __name__ == '__main__': 63 | import random 64 | import string 65 | 66 | 67 | def get_random_string(length): 68 | # choose from all lowercase letter 69 | letters = string.ascii_lowercase 70 | result_str = ''.join(random.choice(letters) for i in range(length)) 71 | return result_str 72 | 73 | # results_df = [] 74 | # num_vals = 100000 75 | # nbits = 16 76 | # for _ in range(3): 77 | # for j in tqdm.tqdm(range(1000000, 10000000, 1000)): 78 | # ground = list(set(np.random.randint(0, j, num_vals).tolist())) 79 | # ground = ground + [ 80 | # get_random_string(random.randint(1, 15)) for k in range(j) 81 | # ] 82 | 83 | # start_time = time.time() 84 | # GLOBAL_CARDINALITY_STORAGE = {} 85 | # GLOBAL_CARDINALITY_STORAGE[1] = HyperLogLogWCache(0.005) 86 | 87 | # for j in ground: 88 | # GLOBAL_CARDINALITY_STORAGE[1].add(j) 89 | 90 | # size1 = asizeof.asizeof(GLOBAL_CARDINALITY_STORAGE) 91 | # error1 = 100 * \ 92 | # (1 - len(GLOBAL_CARDINALITY_STORAGE[1]) / len(set(ground))) 93 | # end_time = time.time() 94 | # tp1 = end_time - start_time 95 | 96 | # import hyperloglog 97 | 98 | # start_time = time.time() 99 | # GLOBAL_CARDINALITY_STORAGE = {} 100 | # GLOBAL_CARDINALITY_STORAGE[1] = hyperloglog.HyperLogLog(0.005) 101 | 102 | # for j in ground: 103 | # GLOBAL_CARDINALITY_STORAGE[1].add(j) 104 | # size2 = asizeof.asizeof(GLOBAL_CARDINALITY_STORAGE) 105 | # error2 = 100 * \ 106 | # (1 - len(GLOBAL_CARDINALITY_STORAGE[1]) / len(set(ground))) 107 | # end_time = time.time() 108 | # tp2 = end_time - start_time 109 | 110 | # start_time = time.time() 111 | # GLOBAL_CARDINALITY_STORAGE = set() 112 | 113 | # for j in ground: 114 | # GLOBAL_CARDINALITY_STORAGE.add(j) 115 | 116 | # size3 = asizeof.asizeof(GLOBAL_CARDINALITY_STORAGE) 117 | # error3 = 100 * \ 118 | # (1 - len(GLOBAL_CARDINALITY_STORAGE) / len(set(ground))) 119 | # end_time = time.time() 120 | # tp3 = end_time - start_time 121 | 122 | # results_df.append( 123 | # { 124 | # 'num_samples': len(ground), 125 | # 'time': tp3, 126 | # 'algo': 'set', 127 | # 'error': error3, 128 | # }, 129 | # ) 130 | # results_df.append( 131 | # { 132 | # 'num_samples': len(ground), 133 | # 'time': tp2, 134 | # 'algo': 'default', 135 | # 'error': error2, 136 | # }, 137 | # ) 138 | # results_df.append( 139 | # { 140 | # 'num_samples': len(ground), 141 | # 'time': tp1, 142 | # 'algo': f'hllc ({nbits}, mixed)', 143 | # 'error': error1, 144 | # }, 145 | # ) 146 | 147 | # out_df = pd.DataFrame(results_df) 148 | # out_df.to_csv('backup.csv') 149 | # print(out_df) 150 | # print(out_df.groupby('algo').mean()) 151 | # sns.lineplot( 152 | # x=out_df.num_samples, y=out_df.error, 153 | # hue=out_df.algo, alpha=0.5, 154 | # ) 155 | # plt.tight_layout() 156 | # plt.ylabel('Num. of unique values in data') 157 | # plt.ylabel('Abs error') 158 | # plt.savefig('linep.pdf') 159 | # plt.clf() 160 | # plt.cla() 161 | 162 | # sns.lineplot( 163 | # x=out_df.num_samples.astype( 164 | # float, 165 | # ), y=out_df.time, hue=out_df.algo, 166 | # ) 167 | # plt.tight_layout() 168 | # plt.ylabel('Time (s)') 169 | # plt.savefig('barp.pdf') 170 | # plt.clf() 171 | # plt.cla() 172 | -------------------------------------------------------------------------------- /examples/multirank.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import shutil 5 | 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import pandas as pd 9 | import seaborn as sns 10 | from scipy.cluster.hierarchy import dendrogram 11 | from scipy.cluster.hierarchy import linkage 12 | 13 | 14 | def rbo_score(l1, l2, p=0.9): 15 | """ 16 | Calculate the Rank-Biased Overlap (RBO) score. 17 | 18 | Args: 19 | l1 (list): Ranked list of elements. 20 | l2 (list): Ranked list of elements. 21 | p (float): Persistence probability (0 <= p < 1), default is 0.9 22 | 23 | Returns: 24 | float: RBO score, a value between 0 and 1. 25 | """ 26 | if l1 == l2: 27 | return 1.0 28 | 29 | len1, len2 = len(l1), len(l2) 30 | if len1 == 0 or len2 == 0: 31 | return 0.0 32 | 33 | overlap, rbo, depth = 0, 0, 0 34 | seen = set() 35 | 36 | for i in range(max(len1, len2)): 37 | if i < len1 and l1[i] not in seen: 38 | overlap += 1 39 | seen.add(l1[i]) 40 | if i < len2 and l2[i] not in seen: 41 | overlap += 1 42 | seen.add(l2[i]) 43 | 44 | depth += 1 45 | weight = (p ** (depth - 1)) / depth 46 | rbo += (overlap / depth) * weight 47 | 48 | return rbo * (1 - p) 49 | 50 | if __name__ == '__main__': 51 | 52 | # Define the number of top features to consider 53 | top_n = 10 54 | 55 | # Define different sizes and corresponding folder names 56 | sizes = [100000, 15000, 20000, 30000, 50000, 70000, 230000, 25000, 35000, 15000] 57 | input_folders = [f'../examples/df{i+1}' for i in range(10)] 58 | output_folders = [f'./output_df{i+1}' for i in range(10)] 59 | 60 | # Initialize a DataFrame to accumulate results 61 | accumulated_results = pd.DataFrame() 62 | 63 | # Loop over the sizes and folders 64 | for i, (size, input_folder, output_folder) in enumerate(zip(sizes, input_folders, output_folders), start=1): 65 | # Generate data set 66 | dataset_id = f'dataset_{i}' # Identifier for each data set 67 | print(f'Generating data set for size {size} with id {dataset_id}') 68 | os.system(f'python ../benchmarks/generator_third_order.py --size {size} --output_df_name {input_folder}') 69 | 70 | # Run ranking 71 | print(f'Running ranking for data set {input_folder}') 72 | os.system(f""" 73 | outrank \ 74 | --task all \ 75 | --data_path {input_folder} \ 76 | --data_source csv-raw \ 77 | --heuristic MI-numba-randomized \ 78 | --target_ranking_only True \ 79 | --combination_number_upper_bound 2048 \ 80 | --num_threads 12 \ 81 | --output_folder {output_folder} \ 82 | --subsampling 1 83 | """) 84 | 85 | # Read and accumulate the results from 'feature_singles.tsv' 86 | feature_singles_path = os.path.join(output_folder, 'feature_singles.tsv') 87 | if os.path.exists(feature_singles_path): 88 | print(f'Reading results from {feature_singles_path}') 89 | df_singles = pd.read_csv(feature_singles_path, sep='\t') 90 | df_singles['size'] = size # Include the size information in the results 91 | df_singles['dataset_id'] = dataset_id # Include the dataset identifier 92 | 93 | # Ensure 'Score' column naming correctness 94 | score_column = 'Score' if 'Score' in df_singles.columns else 'Score MI-numba-randomized' 95 | 96 | # Include rank based on Score 97 | df_singles['rank'] = df_singles[score_column].rank(ascending=False) 98 | 99 | # Clean the Feature names by taking only the part before the "-" 100 | df_singles['Feature-clean'] = df_singles['Feature'].apply(lambda x: x.split('-')[0]) 101 | 102 | # Accumulate the results 103 | accumulated_results = pd.concat([accumulated_results, df_singles], ignore_index=True) 104 | else: 105 | print(f'Warning: {feature_singles_path} does not exist!') 106 | 107 | # Data cleanup 108 | print(f'Cleaning up data set {input_folder} and output {output_folder}') 109 | if os.path.exists(input_folder): 110 | shutil.rmtree(input_folder) 111 | 112 | if os.path.exists(output_folder): 113 | shutil.rmtree(output_folder) 114 | 115 | # Compute average and standard deviation of ranks for each feature 116 | rank_stats = accumulated_results.groupby('Feature-clean').agg( 117 | avg_rank=('rank', 'mean'), 118 | std_rank=('rank', 'std'), 119 | ).reset_index() 120 | 121 | # Save accumulated results and rank statistics 122 | output_csv_path = './accumulated_feature_singles_results.csv' 123 | rank_stats_csv_path = './feature_rank_stats.csv' 124 | 125 | print(f'Saving accumulated results to {output_csv_path}') 126 | accumulated_results.to_csv(output_csv_path, sep='\t', index=False) 127 | 128 | print(f'Saving rank statistics to {rank_stats_csv_path}') 129 | rank_stats.to_csv(rank_stats_csv_path, sep='\t', index=False) 130 | 131 | # Compute pairwise similarity using RBO for top n features 132 | datasets = accumulated_results['dataset_id'].unique() 133 | similarity_matrix = np.zeros((len(datasets), len(datasets))) 134 | 135 | for i, dataset_i in enumerate(datasets): 136 | for j, dataset_j in enumerate(datasets): 137 | if i <= j: # Compute only for upper triangle and diagonal 138 | ranks_i = accumulated_results[accumulated_results['dataset_id'] == dataset_i].nlargest(top_n, 'rank').set_index('Feature-clean')['rank'] 139 | ranks_j = accumulated_results[accumulated_results['dataset_id'] == dataset_j].nlargest(top_n, 'rank').set_index('Feature-clean')['rank'] 140 | 141 | # Align the series 142 | common_features = ranks_i.index.intersection(ranks_j.index) 143 | if len(common_features) > 0: 144 | ranks_i = ranks_i[common_features] 145 | ranks_j = ranks_j[common_features] 146 | rbo_similarity = round(rbo_score(ranks_i.tolist(), ranks_j.tolist()), 3) 147 | similarity_matrix[i, j] = rbo_similarity 148 | similarity_matrix[j, i] = rbo_similarity 149 | 150 | # Convert the similarity matrix to DataFrame for saving 151 | similarity_df = pd.DataFrame(similarity_matrix, index=datasets, columns=datasets) 152 | similarity_matrix_path = './dataset_similarity_matrix.tsv' 153 | 154 | print(f'Saving similarity matrix to {similarity_matrix_path}') 155 | similarity_df.to_csv(similarity_matrix_path, sep='\t') 156 | 157 | # Visualization via dendrogram 158 | def plot_dendrogram(similarity_matrix, datasets): 159 | # Convert similarity matrix to distance matrix 160 | distance_matrix = 1 - similarity_matrix 161 | 162 | # Perform hierarchical/agglomerative clustering 163 | linkage_matrix = linkage(distance_matrix, 'ward') 164 | 165 | # Plot the dendrogram 166 | plt.figure(figsize=(10, 7)) 167 | dendrogram(linkage_matrix, labels=datasets, leaf_rotation=90) 168 | plt.title('Dendrogram of Dataset Similarities') 169 | plt.xlabel('Dataset') 170 | plt.ylabel('Distance') 171 | plt.tight_layout() 172 | plt.savefig('Dendrogram_all.pdf', dpi=300) 173 | 174 | print('Plotting dendrogram...') 175 | plot_dendrogram(similarity_matrix, datasets) 176 | 177 | print('Loop completed successfully, data has been cleaned up, rank statistics, and similarity matrix have been computed.') 178 | -------------------------------------------------------------------------------- /outrank/algorithms/feature_ranking/ranking_mi_numba.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | from numba import njit 5 | from numba import prange 6 | 7 | np.random.seed(123) 8 | # Fast Numba-based approximative mutual information 9 | 10 | 11 | @njit( 12 | 'Tuple((int32[:], int32[:]))(int32[:])', 13 | cache=True, 14 | fastmath=True, 15 | error_model='numpy', 16 | boundscheck=True, 17 | ) 18 | def numba_unique(a): 19 | """Identify unique elements in an array, fast""" 20 | 21 | container = np.zeros(np.max(a) + 1, dtype=np.int32) 22 | for val in a: 23 | container[val] += 1 24 | 25 | unique_values = np.nonzero(container)[0] 26 | unique_counts = container[unique_values] 27 | return unique_values.astype(np.int32), unique_counts.astype(np.int32) 28 | 29 | 30 | @njit( 31 | 'float32(uint32[:], int32[:], int32, float32, uint32[:])', 32 | cache=True, 33 | fastmath=True, 34 | error_model='numpy', 35 | boundscheck=True, 36 | ) 37 | def compute_conditional_entropy(Y_classes, class_values, class_var_shape, initial_prob, nonzero_counts): 38 | conditional_entropy = 0.0 39 | index = 0 40 | for c in class_values: 41 | conditional_prob = nonzero_counts[index] / class_var_shape 42 | if conditional_prob != 0: 43 | conditional_entropy -= ( 44 | initial_prob * conditional_prob * np.log(conditional_prob) 45 | ) 46 | index += 1 47 | 48 | return conditional_entropy 49 | 50 | 51 | @njit( 52 | 'float32(int32[:], int32[:], int32, int32[:], int32[:], b1)', 53 | cache=True, 54 | parallel=False, 55 | fastmath=True, 56 | error_model='numpy', 57 | boundscheck=True, 58 | ) 59 | def compute_entropies( 60 | X, Y, all_events, f_values, f_value_counts, cardinality_correction, 61 | ): 62 | """Core entropy computation function""" 63 | 64 | conditional_entropy = 0.0 65 | background_cond_entropy = 0.0 66 | full_entropy = 0.0 67 | class_values, class_counts = numba_unique(Y) 68 | 69 | if not cardinality_correction: 70 | for k in prange(len(class_counts)): 71 | class_probability = class_counts[k] / all_events 72 | full_entropy += -class_probability * np.log(class_probability) 73 | 74 | for f_index in prange(len(f_values)): 75 | _f_value_counts = f_value_counts[f_index] 76 | 77 | if _f_value_counts == 1: 78 | continue 79 | 80 | initial_prob = _f_value_counts / all_events 81 | x_value_subspace = np.where(X == f_values[f_index]) 82 | 83 | Y_classes = Y[x_value_subspace].astype(np.uint32) 84 | subspace_size = x_value_subspace[0].size 85 | 86 | # Right-shift to simulate noise 87 | Y_classes_spoofed = np.zeros(subspace_size, dtype=np.uint32) 88 | for enx, el in enumerate(x_value_subspace[0]): 89 | index = (el + _f_value_counts) % len(Y) 90 | Y_classes_spoofed[enx] = Y[index] 91 | 92 | nonzero_class_counts = np.zeros(len(class_values), dtype=np.uint32) 93 | nonzero_class_counts_spoofed = np.zeros(len(class_values), dtype=np.uint32) 94 | 95 | # Cache nonzero counts 96 | for index, c in enumerate(class_values): 97 | nonzero_class_counts[index] = np.count_nonzero(Y_classes == c) 98 | nonzero_class_counts_spoofed[index] = np.count_nonzero(Y_classes_spoofed == c) 99 | 100 | conditional_entropy += compute_conditional_entropy( 101 | Y_classes, class_values, _f_value_counts, initial_prob, nonzero_class_counts, 102 | ) 103 | 104 | if cardinality_correction: 105 | background_cond_entropy += compute_conditional_entropy( 106 | Y_classes_spoofed, class_values, _f_value_counts, initial_prob, nonzero_class_counts_spoofed, 107 | ) 108 | 109 | if not cardinality_correction: 110 | return full_entropy - conditional_entropy 111 | 112 | else: 113 | # note: full entropy falls out during derivation of final term 114 | core_joint_entropy = -conditional_entropy + background_cond_entropy 115 | return core_joint_entropy 116 | 117 | 118 | @njit( 119 | 'Tuple((int32[:], int32[:]))(int32[:], int32[:], float32, int32[:])', 120 | ) 121 | def stratified_subsampling(Y, X, approximation_factor, _f_values_X): 122 | 123 | all_events = len(X) 124 | final_space_size = int(approximation_factor * all_events) 125 | 126 | unique_samples_per_val = int(final_space_size / len(_f_values_X)) 127 | 128 | if unique_samples_per_val == 0: 129 | return Y, X 130 | 131 | final_index_array = np.empty(final_space_size) 132 | 133 | index_offset = 0 134 | for fval in _f_values_X: 135 | 136 | # note: this is not randomized due to batch effects, could be an improvement 137 | x_indices = np.where(X == fval)[0][:unique_samples_per_val] 138 | x_indices_len = len(x_indices) 139 | second_offset = (index_offset + x_indices_len) 140 | final_index_array[index_offset:second_offset] = x_indices 141 | index_offset += x_indices_len 142 | 143 | final_index_array = final_index_array.astype(np.int32) 144 | 145 | X = X[final_index_array] 146 | Y = Y[final_index_array] 147 | 148 | return Y, X 149 | 150 | 151 | @njit( 152 | 'float32(int32[:], int32[:], float32, b1)', 153 | cache=True, 154 | fastmath=True, 155 | error_model='numpy', 156 | boundscheck=True, 157 | ) 158 | def mutual_info_estimator_numba( 159 | Y, X, approximation_factor=1.0, cardinality_correction=False, 160 | ): 161 | """Core estimator logic. Compute unique elements, subset if required""" 162 | 163 | all_events = len(X) 164 | f_values, f_value_counts = numba_unique(X) 165 | 166 | # Diagonal entries 167 | if np.sum(X - Y) == 0: 168 | cardinality_correction = False 169 | 170 | if approximation_factor < 1.0: 171 | Y, X = stratified_subsampling(Y, X, approximation_factor, f_values) 172 | 173 | joint_entropy_core = compute_entropies( 174 | X, Y, all_events, f_values, f_value_counts, cardinality_correction, 175 | ) 176 | 177 | return approximation_factor * joint_entropy_core 178 | 179 | 180 | if __name__ == '__main__': 181 | import pandas as pd 182 | from sklearn.feature_selection import mutual_info_classif 183 | 184 | np.random.seed(123) 185 | import time 186 | 187 | final_times = [] 188 | for algo in ['MI-numba-randomized']: 189 | for order in range(12): 190 | for j in range(1): 191 | start = time.time() 192 | a = np.random.randint(1000, size=2**order).astype(np.int32) 193 | b = np.random.randint(1000, size=2**order).astype(np.int32) 194 | if algo == 'MI': 195 | final_score = mutual_info_classif( 196 | a.reshape(-1, 1), b.reshape(-1), discrete_features=True, 197 | ) 198 | elif algo == 'MI-numba-randomized': 199 | final_score = mutual_info_estimator_numba( 200 | a, b, np.float32(0.1), True, 201 | ) 202 | elif algo == 'MI-numba': 203 | final_score = mutual_info_estimator_numba( 204 | a, b, np.float32(1.0), False, 205 | ) 206 | elif algo == 'MI-numba-randomized-ap': 207 | final_score = mutual_info_estimator_numba( 208 | a, b, np.float32(0.3), True, 209 | ) 210 | elif algo == 'MI-numba-ap': 211 | final_score = mutual_info_estimator_numba( 212 | a, b, np.float32(0.3), False, 213 | ) 214 | 215 | end = time.time() 216 | tdiff = end - start 217 | instance = { 218 | 'time': tdiff, 219 | 'samples 2e': order, 'algorithm': algo, 220 | } 221 | final_times.append(instance) 222 | print(instance) 223 | print(final_score) 224 | dfx = pd.DataFrame(final_times) 225 | dfx = dfx.sort_values(by=['samples 2e']) 226 | print(dfx) 227 | -------------------------------------------------------------------------------- /outrank/algorithms/feature_ranking/ranking_mi_numba_opt.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import numpy as np 3 | from numba import njit, prange 4 | 5 | np.random.seed(123) 6 | 7 | 8 | @njit('Tuple((int32[:], int32[:]))(int32[:])', cache=True, fastmath=True) 9 | def numba_unique(a): 10 | """ 11 | Identify unique elements and their counts in a non-negative integer array. 12 | This version finds the max value in one pass to size the container. 13 | """ 14 | maxv = 0 15 | if a.size > 0: 16 | for i in range(a.size): 17 | if a[i] > maxv: 18 | maxv = a[i] 19 | container = np.zeros(maxv + 1, dtype=np.int32) 20 | for i in range(a.size): 21 | container[a[i]] += 1 22 | unique_values = np.nonzero(container)[0].astype(np.int32) 23 | unique_counts = container[unique_values].astype(np.int32) 24 | return unique_values, unique_counts 25 | 26 | 27 | @njit('float32(float32, int32, uint32[:])', cache=True, fastmath=True) 28 | def compute_conditional_entropy(initial_prob, group_size, class_counts): 29 | """ 30 | Calculates the contribution to conditional entropy for a single group. 31 | - initial_prob: P(X=v) 32 | - group_size: Number of elements in this group. 33 | - class_counts: Histogram of Y classes within this group. 34 | """ 35 | ce = 0.0 36 | inv_group_size = 1.0 / group_size 37 | for count in class_counts: 38 | if count > 0: 39 | conditional_prob = count * inv_group_size 40 | ce -= initial_prob * conditional_prob * np.log(conditional_prob) 41 | return ce 42 | 43 | 44 | @njit('Tuple((int32[:], int32[:], int32[:], int32[:]))(int32[:])', cache=True, fastmath=True) 45 | def build_groups(X): 46 | """ 47 | Pre-processes X to create an efficient grouping structure. 48 | This avoids repeated np.where scans. 49 | Returns: 50 | - f_values: Unique values in X. 51 | - f_counts: Counts of each unique value. 52 | - group_starts: Start indices for each group in the `positions` array. 53 | - positions: A single array of indices [0..N-1], sorted by the value of X at that index. 54 | """ 55 | f_values, f_counts = numba_unique(X) 56 | V = f_values.size 57 | 58 | vmax = 0 59 | if V > 0: 60 | for i in range(V): 61 | if f_values[i] > vmax: 62 | vmax = f_values[i] 63 | value_to_group_idx = np.full(vmax + 1, -1, dtype=np.int32) 64 | for i in range(V): 65 | value_to_group_idx[f_values[i]] = i 66 | 67 | group_starts = np.zeros(V, dtype=np.int32) 68 | run = 0 69 | for i in range(V): 70 | group_starts[i] = run 71 | run += f_counts[i] 72 | 73 | positions = np.empty(X.size, dtype=np.int32) 74 | cursors = group_starts.copy() 75 | for i in range(X.size): 76 | xi = X[i] 77 | gi = value_to_group_idx[xi] 78 | pos = cursors[gi] 79 | positions[pos] = i 80 | cursors[gi] = pos + 1 81 | 82 | return f_values, f_counts, group_starts, positions 83 | 84 | 85 | @njit( 86 | 'float32(int32[:], int32, int32[:], int32[:], int32[:], int32[:], b1)', 87 | cache=True, 88 | fastmath=True, 89 | ) 90 | def compute_entropies_grouped( 91 | Y, all_events, 92 | f_values, f_counts, group_starts, positions, 93 | cardinality_correction, 94 | ): 95 | """ 96 | Core entropy computation using the pre-built grouping structure. 97 | This is much faster as it avoids scans and temporary arrays in the loop. 98 | """ 99 | class_values, class_counts = numba_unique(Y) 100 | C = class_values.size 101 | 102 | full_entropy = 0.0 103 | if not cardinality_correction: 104 | invN = 1.0 / all_events 105 | for k in range(class_counts.size): 106 | p = class_counts[k] * invN 107 | if p > 0.0: 108 | full_entropy -= p * np.log(p) 109 | 110 | cmax = 0 111 | if C > 0: 112 | for i in range(C): 113 | if class_values[i] > cmax: 114 | cmax = class_values[i] 115 | class_to_idx = np.full(cmax + 1, -1, dtype=np.int32) 116 | for i in range(C): 117 | class_to_idx[class_values[i]] = i 118 | 119 | conditional_entropy = 0.0 120 | background_cond_entropy = 0.0 121 | n = Y.size 122 | 123 | hist = np.zeros(C, dtype=np.uint32) 124 | hist_spoofed = np.zeros(C, dtype=np.uint32) 125 | 126 | for gi in prange(f_values.size): 127 | group_size = f_counts[gi] 128 | if group_size <= 1: 129 | continue 130 | 131 | start = group_starts[gi] 132 | end = start + group_size 133 | 134 | for c in range(C): 135 | hist[c] = 0 136 | if cardinality_correction: 137 | hist_spoofed[c] = 0 138 | 139 | for pidx in range(start, end): 140 | original_idx = positions[pidx] 141 | y_val = Y[original_idx] 142 | class_idx = class_to_idx[y_val] 143 | hist[class_idx] += 1 144 | 145 | if cardinality_correction: 146 | shift = group_size 147 | for pidx in range(start, end): 148 | original_idx = positions[pidx] 149 | spoofed_idx = (original_idx + shift) % n 150 | y_val_spoofed = Y[spoofed_idx] 151 | class_idx_spoofed = class_to_idx[y_val_spoofed] 152 | hist_spoofed[class_idx_spoofed] += 1 153 | 154 | initial_prob = group_size / all_events 155 | conditional_entropy += compute_conditional_entropy(initial_prob, group_size, hist) 156 | if cardinality_correction: 157 | background_cond_entropy += compute_conditional_entropy(initial_prob, group_size, hist_spoofed) 158 | 159 | if not cardinality_correction: 160 | return full_entropy - conditional_entropy 161 | else: 162 | return -conditional_entropy + background_cond_entropy 163 | 164 | 165 | @njit( 166 | 'Tuple((int32[:], int32[:]))(int32[:], int32[:], float32, int32[:])', 167 | cache=True, 168 | fastmath=True 169 | ) 170 | def stratified_subsampling(Y, X, approximation_factor, _f_values_X): 171 | """ 172 | More efficient subsampling that avoids repeated np.where scans. 173 | """ 174 | all_events = X.size 175 | final_space_size = int(approximation_factor * all_events) 176 | if _f_values_X.size == 0: 177 | return Y, X 178 | unique_samples_per_val = int(final_space_size / _f_values_X.size) 179 | if unique_samples_per_val == 0: 180 | return Y, X 181 | 182 | final_index_array = np.empty(final_space_size, dtype=np.int32) 183 | index_offset = 0 184 | 185 | for fval in _f_values_X: 186 | count_collected = 0 187 | for j in range(X.size): 188 | if X[j] == fval: 189 | if count_collected < unique_samples_per_val: 190 | if index_offset < final_space_size: 191 | final_index_array[index_offset] = j 192 | index_offset += 1 193 | count_collected += 1 194 | else: 195 | break 196 | 197 | final_index_array = final_index_array[:index_offset] 198 | X_sub = X[final_index_array] 199 | Y_sub = Y[final_index_array] 200 | return Y_sub, X_sub 201 | 202 | 203 | @njit( 204 | 'float32(int32[:], int32[:], float32, b1)', 205 | cache=True, 206 | fastmath=True, 207 | ) 208 | def mutual_info_estimator_numba_opt( 209 | Y, X, approximation_factor=1.0, cardinality_correction=False, 210 | ): 211 | """ 212 | The heuristic is MI-numba-randomized, but the code for numba is structured so the execution is faster. 213 | Core estimator logic. This version uses the efficient grouped approach. 214 | """ 215 | 216 | if X.size != Y.size: 217 | raise ValueError("Input arrays X and Y must have the same length.") 218 | if X.size == 0: 219 | raise ValueError("Input arrays cannot be empty.") 220 | 221 | all_events = X.size 222 | 223 | is_diagonal = True 224 | if X.size == Y.size: 225 | for i in range(X.size): 226 | if X[i] != Y[i]: 227 | is_diagonal = False 228 | break 229 | else: 230 | is_diagonal = False 231 | 232 | if is_diagonal: 233 | cardinality_correction = False 234 | 235 | if approximation_factor < 1.0: 236 | f_values_full, _ = numba_unique(X) 237 | Y, X = stratified_subsampling(Y, X, approximation_factor, f_values_full) 238 | all_events = X.size 239 | 240 | f_values, f_counts, group_starts, positions = build_groups(X) 241 | 242 | joint_entropy_core = compute_entropies_grouped( 243 | Y, all_events, f_values, f_counts, group_starts, positions, cardinality_correction 244 | ) 245 | 246 | return approximation_factor * joint_entropy_core -------------------------------------------------------------------------------- /outrank/feature_transformations/ranking_transformers.py: -------------------------------------------------------------------------------- 1 | # A collection of feature transformers that can be considered 2 | from __future__ import annotations 3 | 4 | import json 5 | import logging 6 | import os 7 | from typing import Any 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | import outrank.feature_transformations.feature_transformer_vault as transformer_vault 13 | from outrank.core_utils import internal_hash 14 | 15 | logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) 16 | 17 | 18 | class FeatureTransformerNoise: 19 | def __init__(self): 20 | self.noise_preset = 'default' 21 | 22 | def construct_new_features(self, dataframe: pd.DataFrame, label_column=None): 23 | """Generate a few standard noise distributions""" 24 | 25 | new_columns = dict() 26 | if self.noise_preset == 'default': 27 | new_columns['CONTROL-constant0'] = np.array([0] * dataframe.shape[0]) 28 | new_columns['CONTROL-gaussian'] = np.random.normal( 29 | size=dataframe.shape[0], 30 | ) 31 | new_columns['CONTROL-uniform'] = np.random.random( 32 | dataframe.shape[0], 33 | ) 34 | new_columns['CONTROL-random-binary'] = np.random.randint( 35 | 0, 2, dataframe.shape[0], 36 | ) 37 | new_columns['CONTROL-random-card100'] = np.random.randint( 38 | 0, 1 + 1 * 10**2, dataframe.shape[0], 39 | ) 40 | new_columns['CONTROL-random-card2k'] = np.random.randint( 41 | 0, 1 + 2 * 10**3, dataframe.shape[0], 42 | ) 43 | new_columns['CONTROL-random-card10k'] = np.random.randint( 44 | 0, 1 + 10 * 10**3, dataframe.shape[0], 45 | ) 46 | new_columns['CONTROL-random-card50k'] = np.random.randint( 47 | 0, 1 + 50 * 10**3, dataframe.shape[0], 48 | ) 49 | new_columns['CONTROL-int-sequence'] = np.arange( 50 | 0, dataframe.shape[0], 1.0, 51 | ) 52 | 53 | if label_column not in dataframe.columns: 54 | logging.warn( 55 | 'Could not find target feature in your data set - please inspect the columns if doing targeted ranking!', 56 | ) 57 | else: 58 | new_columns['CONTROL-target'] = dataframe[label_column] 59 | 60 | new_columns['CONTROL-volume'] = np.array([ 61 | internal_hash(str(x)) for _, x in dataframe.iterrows() 62 | ]) 63 | else: 64 | # Not relevant yet; will be if this is useful. 65 | pass 66 | 67 | if len(new_columns) > 0: 68 | tmp_df = pd.DataFrame(new_columns) 69 | dataframe = pd.concat([dataframe, tmp_df], axis=1) 70 | del tmp_df 71 | 72 | return dataframe 73 | 74 | 75 | class FeatureTransformerGeneric: 76 | def __init__(self, numeric_column_names: set[str], preset: str = 'default'): 77 | self.transformer_collection: dict[str, str] = dict() 78 | 79 | for transformer_namespace in preset.split(','): 80 | transformer_namespace = transformer_namespace.strip() 81 | 82 | # Check if it's a JSON file path 83 | if transformer_namespace.endswith('.json'): 84 | if os.path.isfile(transformer_namespace): 85 | json_transformers = self._load_transformers_from_json(transformer_namespace) 86 | self.transformer_collection = { 87 | **self.transformer_collection, 88 | **json_transformers, 89 | } 90 | else: 91 | raise FileNotFoundError(f"Transformer JSON file not found: {transformer_namespace}") 92 | else: 93 | # Handle existing preset names 94 | transformer_subspace = transformer_vault._tr_global_namespace.get( 95 | transformer_namespace, None, 96 | ) 97 | if transformer_subspace: 98 | self.transformer_collection = { 99 | **self.transformer_collection, 100 | **transformer_subspace, 101 | } 102 | 103 | if len(self.transformer_collection) == 0: 104 | raise NotImplementedError( 105 | 'Please, specify valid transformer namespaces (e.g., default, minimal etc.) or provide a valid JSON file path.', 106 | ) 107 | 108 | self.numeric_column_names = set(numeric_column_names) 109 | self.constructed_feature_names: set[str] = set() 110 | 111 | # If 80% of values are the same, don't consider a transformation 112 | self.max_maj_support = 0.80 113 | 114 | # If more than 75% of vals are missing, don't consider a transformation 115 | self.nan_prop_support = 0.75 116 | 117 | def _load_transformers_from_json(self, json_file_path: str) -> dict[str, str]: 118 | """Load transformer specifications from a JSON file.""" 119 | try: 120 | with open(json_file_path, 'r') as f: 121 | transformers = json.load(f) 122 | 123 | if not isinstance(transformers, dict): 124 | raise ValueError(f"JSON file {json_file_path} must contain a dictionary of transformer specifications") 125 | 126 | # Validate that all values are strings (transformer expressions) 127 | for key, value in transformers.items(): 128 | if not isinstance(value, str): 129 | raise ValueError(f"Transformer '{key}' in {json_file_path} must have a string expression, got {type(value)}") 130 | 131 | logging.info(f"Loaded {len(transformers)} transformers from {json_file_path}") 132 | return transformers 133 | 134 | except json.JSONDecodeError as e: 135 | raise ValueError(f"Invalid JSON in transformer file {json_file_path}: {e}") 136 | except Exception as e: 137 | raise 138 | 139 | def get_vals(self, tmp_df: pd.DataFrame, col_name: str) -> Any: 140 | cvals = tmp_df[col_name].values.tolist() 141 | cvals = [str(x).replace('"', '') for x in cvals] 142 | cvals = [0.0 if len(x) == 0 else float(x) for x in cvals] 143 | 144 | return np.array(cvals) 145 | 146 | def construct_baseline_features(self, dataframe: Any) -> pd.DataFrame: 147 | fvals = [] 148 | for enx, row in dataframe.iterrows(): 149 | missing_prop = np.round( 150 | row.values.tolist().count('') / dataframe.shape[1], 1, 151 | ) 152 | fvals.append(missing_prop) 153 | 154 | dataframe['BASELINE-MISSING-PROPORTION'] = fvals 155 | dataframe['BASELINE-DUMMY'] = 0 156 | 157 | return dataframe 158 | 159 | def construct_new_features(self, dataframe: Any) -> pd.DataFrame: 160 | new_numeric = set() 161 | logging.info( 162 | f'Considering {len(self.transformer_collection)} transformations for {len(self.numeric_column_names)} features ({len(self.transformer_collection) * len(self.numeric_column_names)} new features will be considered).', 163 | ) 164 | 165 | invalid_transforms = 0 166 | new_columns = dict() 167 | for numeric_column in self.numeric_column_names: 168 | X = self.get_vals(dataframe, numeric_column) 169 | 170 | if len(X) == 0: 171 | raise AssertionError( 172 | f"Could not retrieve the colomn {numeric_column}'s values. Please check the data.", 173 | ) 174 | 175 | for k, v in self.transformer_collection.items(): 176 | feature_name = f'{numeric_column}{k}' 177 | transformed_array = eval(v).astype(str) 178 | u, c = np.unique(transformed_array, return_counts=True) 179 | nan_prop = np.count_nonzero(transformed_array == 'nan') / len( 180 | transformed_array, 181 | ) 182 | cfreq = np.divide(np.max(c), np.sum(c)) 183 | if ( 184 | len(u) > 1 185 | and cfreq < self.max_maj_support 186 | and nan_prop < self.nan_prop_support 187 | ): 188 | new_columns[feature_name] = transformed_array 189 | new_numeric.add(feature_name) 190 | 191 | else: 192 | invalid_transforms += 1 193 | 194 | if len(new_columns) > 0: 195 | tmp_df = pd.DataFrame(new_columns) 196 | dataframe = pd.concat([dataframe, tmp_df], axis=1) 197 | del tmp_df 198 | 199 | logging.info( 200 | f'{invalid_transforms} invalid transformations were skipped.', 201 | ) 202 | self.numeric_column_names = self.numeric_column_names 203 | self.constructed_feature_names = new_numeric 204 | return dataframe 205 | -------------------------------------------------------------------------------- /tests/cms_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import unittest 4 | 5 | import numpy as np 6 | 7 | from outrank.algorithms.sketches.counting_cms import cms_hash 8 | from outrank.algorithms.sketches.counting_cms import CountMinSketch 9 | 10 | 11 | class TestCountMinSketch(unittest.TestCase): 12 | 13 | def setUp(self): 14 | # Set up a CountMinSketch instance with known parameters for testing 15 | self.depth = 6 16 | self.width = 2**10 # smaller width for testing purposes 17 | self.cms = CountMinSketch(self.depth, self.width) 18 | 19 | def test_init(self): 20 | self.assertEqual(self.cms.depth, self.depth) 21 | self.assertEqual(self.cms.width, self.width) 22 | self.assertEqual(self.cms.M.shape, (self.depth, self.width)) 23 | self.assertEqual(len(self.cms.hash_seeds), self.depth) 24 | 25 | def test_add_and_query_single_element(self): 26 | # Test adding a single element and querying it 27 | element = 'test_element' 28 | self.cms.add(element) 29 | # The queried count should be at least 1 (could be higher due to hash collisions) 30 | self.assertGreaterEqual(self.cms.query(element), 1) 31 | 32 | def test_add_and_query_multiple_elements(self): 33 | elements = ['foo', 'bar', 'baz', 'qux', 'quux'] 34 | for elem in elements: 35 | self.cms.add(elem) 36 | 37 | for elem in elements: 38 | self.assertGreaterEqual(self.cms.query(elem), 1) 39 | 40 | def test_batch_add_and_query(self): 41 | elements = ['foo', 'bar', 'baz'] * 10 42 | self.cms.batch_add(elements) 43 | 44 | for elem in set(elements): 45 | self.assertGreaterEqual(self.cms.query(elem), 10) 46 | 47 | def test_hash_uniformity(self): 48 | # Basic check for hash function's distribution 49 | seeds = np.array(np.random.randint(low=0, high=2**31 - 1, size=self.depth), dtype=np.uint32) 50 | hashes = [cms_hash(i, seeds[0], self.width) for i in range(1000)] 51 | # Expect fewer collisions over a small sample with a large width 52 | unique_hashes = len(set(hashes)) 53 | self.assertGreater(unique_hashes, 900) 54 | 55 | # === NEW COMPREHENSIVE TESTS === 56 | 57 | def test_init_boundary_values(self): 58 | """Test CountMinSketch initialization with boundary values""" 59 | # Test minimum valid dimensions 60 | cms_min = CountMinSketch(depth=1, width=1) 61 | self.assertEqual(cms_min.depth, 1) 62 | self.assertEqual(cms_min.width, 1) 63 | self.assertEqual(cms_min.M.shape, (1, 1)) 64 | 65 | # Test large dimensions 66 | cms_large = CountMinSketch(depth=100, width=2**16) 67 | self.assertEqual(cms_large.depth, 100) 68 | self.assertEqual(cms_large.width, 2**16) 69 | 70 | def test_init_with_custom_matrix(self): 71 | """Test initialization with pre-existing matrix""" 72 | custom_matrix = np.ones((3, 5), dtype=np.int32) 73 | cms = CountMinSketch(depth=3, width=5, M=custom_matrix) 74 | self.assertTrue(np.array_equal(cms.M, custom_matrix)) 75 | self.assertEqual(cms.depth, 3) 76 | self.assertEqual(cms.width, 5) 77 | 78 | def test_add_with_different_deltas(self): 79 | """Test adding elements with different delta values""" 80 | element = 'test' 81 | 82 | # Add with positive delta 83 | self.cms.add(element, delta=5) 84 | self.assertGreaterEqual(self.cms.query(element), 5) 85 | 86 | # Add with zero delta (should not change count) 87 | initial_count = self.cms.query(element) 88 | self.cms.add(element, delta=0) 89 | self.assertEqual(self.cms.query(element), initial_count) 90 | 91 | # Add with negative delta 92 | self.cms.add(element, delta=-2) 93 | self.assertGreaterEqual(self.cms.query(element), initial_count - 2) 94 | 95 | def test_add_various_data_types(self): 96 | """Test adding different data types""" 97 | test_cases = [ 98 | ('string', str), 99 | (42, int), 100 | (3.14, float), 101 | (True, bool), 102 | ((1, 2, 3), tuple), 103 | ] 104 | 105 | for element, data_type in test_cases: 106 | with self.subTest(element=element, data_type=data_type): 107 | self.cms.add(element) 108 | count = self.cms.query(element) 109 | self.assertGreaterEqual(count, 1, 110 | f"Failed to add/query element of type {data_type}") 111 | 112 | def test_query_nonexistent_elements(self): 113 | """Test querying elements that were never added""" 114 | nonexistent_elements = ['never_added', 999, 'ghost_element'] 115 | 116 | for element in nonexistent_elements: 117 | count = self.cms.query(element) 118 | self.assertEqual(count, 0, 119 | f"Non-existent element {element} should have count 0") 120 | 121 | def test_batch_add_empty_list(self): 122 | """Test batch adding an empty list""" 123 | initial_matrix = self.cms.M.copy() 124 | self.cms.batch_add([]) 125 | 126 | # Matrix should remain unchanged 127 | self.assertTrue(np.array_equal(self.cms.M, initial_matrix)) 128 | 129 | def test_batch_add_large_list(self): 130 | """Test batch adding a very large list""" 131 | large_list = ['item'] * 10000 132 | self.cms.batch_add(large_list) 133 | 134 | count = self.cms.query('item') 135 | self.assertGreaterEqual(count, 10000) 136 | 137 | def test_hash_function_properties(self): 138 | """Test hash function mathematical properties""" 139 | seed = np.uint32(42) 140 | width = 1000 141 | 142 | # Test hash function returns values in range [0, width) 143 | for i in range(100): 144 | hash_val = cms_hash(i, seed, width) 145 | self.assertGreaterEqual(hash_val, 0) 146 | self.assertLess(hash_val, width) 147 | self.assertIsInstance(hash_val, (int, np.integer)) 148 | 149 | # Test different seeds produce different distributions 150 | hashes1 = [cms_hash(i, np.uint32(1), width) for i in range(1000)] 151 | hashes2 = [cms_hash(i, np.uint32(2), width) for i in range(1000)] 152 | 153 | # Should have different distributions (not identical) 154 | self.assertNotEqual(hashes1, hashes2) 155 | 156 | def test_hash_collision_frequency(self): 157 | """Test hash collision rates are reasonable""" 158 | seed = np.uint32(123) 159 | width = 100 160 | num_items = 200 # More items than width to guarantee some collisions 161 | 162 | hashes = [cms_hash(i, seed, width) for i in range(num_items)] 163 | unique_hashes = len(set(hashes)) 164 | 165 | # Should have some collisions but not too many 166 | self.assertLess(unique_hashes, num_items) # Some collisions expected 167 | self.assertGreater(unique_hashes, width // 2) # Not too many collisions 168 | 169 | def test_multiple_hash_seeds_independence(self): 170 | """Test that different hash seeds produce independent results""" 171 | cms = CountMinSketch(depth=4, width=1000) 172 | test_element = 'test_independence' 173 | 174 | # Get hash values for same element with different seeds 175 | hash_values = [] 176 | for i in range(cms.depth): 177 | hash_val = cms_hash(test_element, cms.hash_seeds[i], cms.width) 178 | hash_values.append(hash_val) 179 | 180 | # All hash values should be different (very high probability) 181 | unique_hashes = len(set(hash_values)) 182 | self.assertEqual(unique_hashes, cms.depth, 183 | "Hash seeds should produce independent hash values") 184 | 185 | def test_accuracy_with_known_frequencies(self): 186 | """Test accuracy of count estimates with known ground truth""" 187 | # Create data with known frequencies 188 | elements = ['a'] * 100 + ['b'] * 50 + ['c'] * 25 + ['d'] * 10 189 | 190 | self.cms.batch_add(elements) 191 | 192 | # Verify estimates are at least as large as true counts 193 | self.assertGreaterEqual(self.cms.query('a'), 100) 194 | self.assertGreaterEqual(self.cms.query('b'), 50) 195 | self.assertGreaterEqual(self.cms.query('c'), 25) 196 | self.assertGreaterEqual(self.cms.query('d'), 10) 197 | 198 | # Verify estimates are reasonably close (within 2x for this small test) 199 | self.assertLessEqual(self.cms.query('a'), 200) 200 | self.assertLessEqual(self.cms.query('b'), 100) 201 | 202 | def test_get_matrix_returns_copy_safety(self): 203 | """Test that modifying returned matrix doesn't affect internal state""" 204 | original_matrix = self.cms.M.copy() 205 | returned_matrix = self.cms.get_matrix() 206 | 207 | # Modify the returned matrix 208 | returned_matrix[0, 0] = 999 209 | 210 | # Original should be unchanged if it's a proper copy 211 | # Note: Current implementation returns reference, this tests documents the behavior 212 | # In a production system, we might want get_matrix() to return a copy 213 | self.assertTrue(np.array_equal(self.cms.M, returned_matrix), 214 | "get_matrix() returns reference to internal matrix") 215 | 216 | def test_consistent_query_results(self): 217 | """Test that multiple queries of same element return consistent results""" 218 | element = 'consistent_test' 219 | self.cms.add(element, delta=5) 220 | 221 | # Multiple queries should return the same result 222 | first_query = self.cms.query(element) 223 | second_query = self.cms.query(element) 224 | third_query = self.cms.query(element) 225 | 226 | self.assertEqual(first_query, second_query) 227 | self.assertEqual(second_query, third_query) 228 | -------------------------------------------------------------------------------- /tests/cov_heu_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | import unittest 5 | 6 | import numpy as np 7 | 8 | from outrank.algorithms.feature_ranking.ranking_cov_alignment import \ 9 | max_pair_coverage 10 | 11 | np.random.seed(123) 12 | sys.path.append('./outrank') 13 | 14 | 15 | class TestMaxPairCoverage(unittest.TestCase): 16 | def test_basic_functionality(self): 17 | array1 = np.array([1, 2, 3, 1, 2]) 18 | array2 = np.array([4, 5, 6, 4, 5]) 19 | result = max_pair_coverage(array1, array2) 20 | self.assertAlmostEqual(result, 2/5, places=5) 21 | 22 | def test_identical_elements(self): 23 | array1 = np.array([1, 1, 1, 1]) 24 | array2 = np.array([1, 1, 1, 1]) 25 | result = max_pair_coverage(array1, array2) 26 | self.assertEqual(result, 1.0) 27 | 28 | def test_large_arrays(self): 29 | array1 = np.random.randint(0, 100, size=10000) 30 | array2 = np.random.randint(0, 100, size=10000) 31 | result = max_pair_coverage(array1, array2) 32 | self.assertTrue(0 <= result <= 1) 33 | 34 | def test_all_unique_pairs(self): 35 | array1 = np.array([1, 2, 3, 4, 5]) 36 | array2 = np.array([6, 7, 8, 9, 10]) 37 | result = max_pair_coverage(array1, array2) 38 | self.assertEqual(result, 1/5) 39 | 40 | def test_all_same_pairs(self): 41 | array1 = np.array([1, 1, 1, 1, 1]) 42 | array2 = np.array([2, 2, 2, 2, 2]) 43 | result = max_pair_coverage(array1, array2) 44 | self.assertEqual(result, 1.0) 45 | 46 | def test_high_collision_potential(self): 47 | array1 = np.array([1] * 1000) 48 | array2 = np.array([2] * 1000) 49 | result = max_pair_coverage(array1, array2) 50 | self.assertEqual(result, 1.0) 51 | 52 | def test_very_large_arrays(self): 53 | array1 = np.random.randint(0, 1000, size=1000000) 54 | array2 = np.random.randint(0, 1000, size=1000000) 55 | result = max_pair_coverage(array1, array2) 56 | self.assertTrue(0 <= result <= 1) 57 | 58 | # === NEW COMPREHENSIVE TESTS === 59 | 60 | def test_empty_arrays(self): 61 | """Test behavior with empty arrays""" 62 | array1 = np.array([], dtype=np.int32) 63 | array2 = np.array([], dtype=np.int32) 64 | 65 | # Empty arrays result in NaN due to 0/0 division 66 | result = max_pair_coverage(array1, array2) 67 | self.assertTrue(np.isnan(result)) 68 | 69 | def test_single_element_arrays(self): 70 | """Test arrays with single elements""" 71 | array1 = np.array([42], dtype=np.int32) 72 | array2 = np.array([73], dtype=np.int32) 73 | result = max_pair_coverage(array1, array2) 74 | self.assertEqual(result, 1.0) # Single pair gets 100% coverage 75 | 76 | def test_two_element_arrays(self): 77 | """Test arrays with two elements""" 78 | # Different pairs 79 | array1 = np.array([1, 2], dtype=np.int32) 80 | array2 = np.array([3, 4], dtype=np.int32) 81 | result = max_pair_coverage(array1, array2) 82 | self.assertEqual(result, 0.5) # Each pair appears once, max coverage is 1/2 83 | 84 | # Same pairs 85 | array1 = np.array([1, 1], dtype=np.int32) 86 | array2 = np.array([3, 3], dtype=np.int32) 87 | result = max_pair_coverage(array1, array2) 88 | self.assertEqual(result, 1.0) # Same pair appears twice 89 | 90 | def test_mismatched_array_lengths(self): 91 | """Test error handling for arrays of different lengths""" 92 | array1 = np.array([1, 2, 3], dtype=np.int32) 93 | array2 = np.array([4, 5], dtype=np.int32) # Different length 94 | 95 | with self.assertRaises(IndexError): 96 | max_pair_coverage(array1, array2) 97 | 98 | def test_wrong_data_types(self): 99 | """Test behavior with non-int32 arrays""" 100 | # Test with float arrays - should work due to numpy casting 101 | array1 = np.array([1.0, 2.0, 3.0]) 102 | array2 = np.array([4.0, 5.0, 6.0]) 103 | 104 | # Convert to int32 as expected by function signature 105 | array1_int32 = array1.astype(np.int32) 106 | array2_int32 = array2.astype(np.int32) 107 | result = max_pair_coverage(array1_int32, array2_int32) 108 | self.assertIsInstance(result, float) 109 | self.assertTrue(0 <= result <= 1) 110 | 111 | def test_negative_values(self): 112 | """Test arrays containing negative values""" 113 | array1 = np.array([-1, -2, -3, -1, -2], dtype=np.int32) 114 | array2 = np.array([4, 5, 6, 4, 5], dtype=np.int32) 115 | result = max_pair_coverage(array1, array2) 116 | 117 | # Should work with negative values 118 | self.assertIsInstance(result, float) 119 | self.assertTrue(0 <= result <= 1) 120 | self.assertAlmostEqual(result, 2/5, places=5) 121 | 122 | def test_zero_values(self): 123 | """Test arrays containing zero values""" 124 | array1 = np.array([0, 0, 1, 1], dtype=np.int32) 125 | array2 = np.array([0, 0, 2, 2], dtype=np.int32) 126 | result = max_pair_coverage(array1, array2) 127 | 128 | # Two (0,0) pairs and two (1,2) pairs, max coverage should be 0.5 129 | self.assertEqual(result, 0.5) 130 | 131 | def test_large_integer_values(self): 132 | """Test with very large integer values""" 133 | max_int32 = np.iinfo(np.int32).max 134 | min_int32 = np.iinfo(np.int32).min 135 | 136 | array1 = np.array([max_int32, min_int32, 0], dtype=np.int32) 137 | array2 = np.array([max_int32, min_int32, 0], dtype=np.int32) 138 | result = max_pair_coverage(array1, array2) 139 | 140 | # Due to hash function behavior and potential overflow, result should be valid float 141 | self.assertIsInstance(result, float) 142 | self.assertTrue(0 <= result <= 1 or np.isnan(result)) # Allow NaN due to overflow 143 | 144 | def test_hash_collision_simulation(self): 145 | """Test behavior when hash collisions might occur""" 146 | # Create values that might cause hash collisions 147 | # Using large numbers that could wrap around in hash function 148 | large_vals = np.array([1471343, 2942686, 4414029], dtype=np.int32) 149 | array1 = np.tile(large_vals, 100) 150 | array2 = np.tile([1, 2, 3], 100) 151 | 152 | result = max_pair_coverage(array1, array2) 153 | 154 | # Should handle potential hash collisions gracefully 155 | self.assertIsInstance(result, float) 156 | self.assertTrue(0 <= result <= 1) 157 | 158 | def test_mathematical_properties(self): 159 | """Test mathematical properties of the coverage function""" 160 | array1 = np.array([1, 2, 3, 1, 2, 1], dtype=np.int32) 161 | array2 = np.array([4, 5, 6, 4, 5, 4], dtype=np.int32) 162 | 163 | result = max_pair_coverage(array1, array2) 164 | 165 | # Coverage should be fraction of most common pair 166 | # (1,4) appears 3 times out of 6 total, so coverage = 3/6 = 0.5 167 | self.assertEqual(result, 0.5) 168 | 169 | # Test symmetry property isn't expected (function uses el1 * constant - el2) 170 | result_swapped = max_pair_coverage(array2, array1) 171 | # Results may be different due to hash function asymmetry 172 | self.assertIsInstance(result_swapped, float) 173 | self.assertTrue(0 <= result_swapped <= 1) 174 | 175 | def test_coverage_bounds_verification(self): 176 | """Verify coverage is always between 0 and 1""" 177 | # Test with various random configurations 178 | np.random.seed(456) # Different seed for this test 179 | 180 | for size in [10, 100, 1000]: 181 | for num_unique in [1, size//4, size//2, size]: 182 | array1 = np.random.randint(0, num_unique, size=size, dtype=np.int32) 183 | array2 = np.random.randint(0, num_unique, size=size, dtype=np.int32) 184 | 185 | result = max_pair_coverage(array1, array2) 186 | 187 | with self.subTest(size=size, num_unique=num_unique): 188 | self.assertGreaterEqual(result, 0.0, 189 | f"Coverage should be >= 0, got {result}") 190 | self.assertLessEqual(result, 1.0, 191 | f"Coverage should be <= 1, got {result}") 192 | self.assertIsInstance(result, float) 193 | 194 | def test_hash_function_properties(self): 195 | """Test properties of the internal hash function indirectly""" 196 | # Create array where we can predict hash behavior 197 | array1 = np.array([0, 1, 2], dtype=np.int32) 198 | array2 = np.array([0, 0, 0], dtype=np.int32) 199 | 200 | result = max_pair_coverage(array1, array2) 201 | 202 | # Each pair (0,0), (1,0), (2,0) should hash to different values 203 | # unless there are collisions, so max coverage should be 1/3 204 | self.assertAlmostEqual(result, 1/3, places=5) 205 | 206 | def test_deterministic_behavior(self): 207 | """Test that function returns consistent results for same input""" 208 | array1 = np.array([1, 2, 3, 1, 2], dtype=np.int32) 209 | array2 = np.array([4, 5, 6, 4, 5], dtype=np.int32) 210 | 211 | # Multiple calls should return identical results 212 | result1 = max_pair_coverage(array1, array2) 213 | result2 = max_pair_coverage(array1, array2) 214 | result3 = max_pair_coverage(array1, array2) 215 | 216 | self.assertEqual(result1, result2) 217 | self.assertEqual(result2, result3) 218 | 219 | def test_coverage_with_all_different_pairs(self): 220 | """Test coverage when all pairs are unique""" 221 | n = 100 222 | array1 = np.arange(n, dtype=np.int32) 223 | array2 = np.arange(n, n*2, dtype=np.int32) 224 | 225 | result = max_pair_coverage(array1, array2) 226 | 227 | # All pairs are unique, so max coverage is 1/n 228 | expected = 1.0 / n 229 | self.assertAlmostEqual(result, expected, places=5) 230 | 231 | def test_maximum_coverage_scenario(self): 232 | """Test scenario that should give maximum coverage (1.0)""" 233 | # All pairs are identical 234 | array1 = np.array([42] * 100, dtype=np.int32) 235 | array2 = np.array([73] * 100, dtype=np.int32) 236 | 237 | result = max_pair_coverage(array1, array2) 238 | self.assertEqual(result, 1.0) 239 | -------------------------------------------------------------------------------- /outrank/visualizations/ranking_visualization.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import os 5 | import warnings 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import pandas as pd 10 | import seaborn as sns 11 | from scipy.cluster import hierarchy 12 | from sklearn.manifold import TSNE 13 | from sklearn.metrics import silhouette_score 14 | 15 | from outrank.core_utils import read_reference_json 16 | 17 | logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) 18 | plt.rcParams['figure.figsize'] = (50, 30) 19 | 20 | 21 | def visualize_hierarchical_clusters( 22 | triplet_dataframe: pd.DataFrame, 23 | output_folder: str, 24 | image_format: str = 'png', 25 | max_num_clusters: int = 100, 26 | ) -> None: 27 | plt.rcParams['figure.figsize'] = (10, 5) 28 | unique_features = triplet_dataframe.FeatureA.unique() 29 | 30 | if len(unique_features) > 1000: 31 | logging.info('Trying to visualize too many features, exiting ..') 32 | exit() 33 | 34 | dmat = np.zeros((len(unique_features), len(unique_features))) 35 | logging.info('Preparing the data for clustering ..') 36 | 37 | if triplet_dataframe.shape[0] > 10**5: 38 | logging.info('Trying to visualize more than 10 ** 5 triplets, exiting ..') 39 | exit() 40 | 41 | pivot_table = pd.pivot_table( 42 | triplet_dataframe, 43 | values='Score', 44 | index='FeatureA', 45 | columns='FeatureB', 46 | aggfunc='mean', # Updated from np.mean to 'mean' 47 | ) 48 | 49 | pivot_table.fillna(0, inplace=True) 50 | dmat = 1 - pivot_table.values 51 | 52 | logging.info('Clustering ..') 53 | 54 | for linkage_heuristic in ['complete']: 55 | Z = hierarchy.linkage(dmat, linkage_heuristic) 56 | 57 | hierarchy.dendrogram( 58 | Z, above_threshold_color='y', orientation='top', labels=unique_features, 59 | ) 60 | plt.title(f'Linkage function: {linkage_heuristic}') 61 | with warnings.catch_warnings(): 62 | warnings.simplefilter('ignore', UserWarning) 63 | plt.tight_layout() 64 | out_path = f'{output_folder}/dendrogram_{linkage_heuristic}.{image_format}' 65 | plt.savefig(out_path, dpi=300) 66 | plt.clf() 67 | plt.cla() 68 | logging.info(f'Visualized hierarchical clustering with linkage {linkage_heuristic} to {out_path}') 69 | 70 | range_min, range_max = np.min(pivot_table.values), np.max(pivot_table.values) 71 | spectrum = np.arange(range_min, range_max, (range_max - range_min) / 1000) 72 | max_silhouette = 0 73 | top_clustering = [] 74 | full_silhouette_space = [] 75 | 76 | for possible_threshold in spectrum: 77 | cluster_assignments = hierarchy.fcluster(Z, possible_threshold) 78 | num_clusters = len(np.unique(cluster_assignments)) 79 | if num_clusters > 2 and num_clusters < max_num_clusters: 80 | try: 81 | sil_score = silhouette_score(pivot_table, cluster_assignments) 82 | except Exception: 83 | continue 84 | 85 | full_silhouette_space.append([sil_score, possible_threshold, num_clusters]) 86 | if sil_score >= max_silhouette: 87 | top_clustering = cluster_assignments 88 | max_silhouette = sil_score 89 | 90 | dfx = pd.DataFrame(full_silhouette_space) 91 | if len(dfx) == 0: 92 | logging.info('Silhouette space empty, exiting') 93 | exit() 94 | 95 | dfx.columns = ['Silhouette', 'threshold', 'numClusters'] 96 | sns.lineplot(x='numClusters', y='Silhouette', data=dfx, color='black') 97 | with warnings.catch_warnings(): 98 | warnings.simplefilter('ignore', UserWarning) 99 | plt.tight_layout() 100 | out_path = f'{output_folder}/SilhouetteProfile.{image_format}' 101 | plt.savefig(out_path, dpi=300) 102 | plt.clf() 103 | plt.cla() 104 | logging.info('Stored the Silhouette profile.') 105 | 106 | final_feature_cluster_df = pd.DataFrame(list(zip(top_clustering, pivot_table.index))) 107 | final_feature_cluster_df.columns = ['ClusterID', 'Feature'] 108 | final_feature_cluster_df.to_csv(f'{output_folder}/TopClustering.tsv', sep='\t') 109 | 110 | try: 111 | projected_data = TSNE().fit_transform(pivot_table.values) 112 | projected_data = pd.DataFrame(projected_data, columns=['Dim1', 'Dim2']) 113 | projected_data['ClusterID'] = top_clustering.astype(str) 114 | sns.scatterplot(x='Dim1', y='Dim2', hue='ClusterID', data=projected_data, palette='Set2') 115 | with warnings.catch_warnings(): 116 | warnings.simplefilter('ignore', UserWarning) 117 | plt.tight_layout() 118 | plt.savefig(f'{output_folder}/clustersEmbeddingVisualization.pdf', dpi=300) 119 | plt.clf() 120 | plt.cla() 121 | except: 122 | pass 123 | 124 | plt.rcParams['figure.figsize'] = (50, 30) 125 | 126 | 127 | def visualize_heatmap( 128 | triplets: pd.DataFrame, output_folder: str, image_format: str, 129 | ) -> None: 130 | sns.set(font_scale=2) 131 | fig, ax = plt.subplots() 132 | pivot_table = pd.pivot_table( 133 | triplets, values='Score', index='FeatureA', columns='FeatureB', aggfunc='mean', # Updated from np.mean to 'mean' 134 | ) 135 | mask = np.zeros_like(pivot_table.values) 136 | mask[np.triu_indices_from(mask)] = True 137 | fsize_heatmap = 20 138 | if pivot_table.shape[0] > 100: 139 | sns.set(font_scale=1) 140 | fsize_heatmap = 3 141 | 142 | logging.info('Visualizing the heatmap ..') 143 | 144 | if pivot_table.shape[0] > 500: 145 | logging.info('Skipping heatmap visualization due to too many elements ..') 146 | return 147 | 148 | plt.figure(figsize=(50, 50)) 149 | plt.rcParams.update({'font.size': 1}) 150 | sns.heatmap( 151 | pivot_table, 152 | annot=True, 153 | mask=mask, 154 | annot_kws={'size': fsize_heatmap}, 155 | square=False, 156 | cmap='coolwarm', 157 | linecolor='black', 158 | linewidths=0.05, 159 | ) 160 | plt.xlabel('') 161 | plt.ylabel('') 162 | with warnings.catch_warnings(): 163 | warnings.simplefilter('ignore', UserWarning) 164 | plt.tight_layout() 165 | plt.savefig(f'{output_folder}/heatmap.{image_format}', dpi=500) 166 | plt.clf() 167 | plt.cla() 168 | logging.info(f'Stored heatmap to: {output_folder}/heatmap.{image_format}') 169 | 170 | 171 | def visualize_barplots( 172 | triplets: pd.DataFrame, 173 | output_folder: str, 174 | reference_json: str, 175 | image_format: str, 176 | label: str, 177 | heuristic: str, 178 | ) -> None: 179 | sns.set(font_scale=8) 180 | feature_ranks_rows = [] 181 | for _, row in triplets.iterrows(): 182 | feature_A = row['FeatureA'] 183 | feature_B = row['FeatureB'] 184 | if label in feature_A: 185 | feature_ranks_rows.append([feature_B, row.Score]) 186 | elif label in feature_B: 187 | feature_ranks_rows.append([feature_A, row.Score]) 188 | 189 | feature_ranks: pd.DataFrame = pd.DataFrame(feature_ranks_rows, columns=['Feature', 'Value']) 190 | feature_ranks = feature_ranks[~feature_ranks['Feature'].str.contains(label)] 191 | if not os.path.exists(reference_json): 192 | reference_json = '' 193 | 194 | used_features = [] 195 | if reference_json: 196 | ref_json = read_reference_json(reference_json) 197 | if 'features' in ref_json['desc']: 198 | used_features.extend(ref_json['desc']['features']) 199 | if 'fields' in ref_json['desc']: 200 | used_features.extend(ref_json['desc']['fields']) 201 | else: 202 | used_features = feature_ranks['Feature'].tolist() 203 | 204 | feature_ranks['Feature'] = feature_ranks['Feature'].astype(str) 205 | feature_ranks['Value'] = feature_ranks['Value'].astype(float) 206 | feature_ranks = feature_ranks.groupby('Feature').median().reset_index() 207 | feature_ranks = feature_ranks.sort_values(by='Value', ascending=False) 208 | 209 | subset_ranges = [10, 25, 50, 100, feature_ranks.shape[0]] 210 | sns.set_style('whitegrid') 211 | 212 | for subset_range in subset_ranges: 213 | feature_ranks_reduced = feature_ranks.iloc[:subset_range] 214 | plt.figure(figsize=(18, 12)) 215 | fig, ax = plt.subplots() 216 | 217 | if 45 < feature_ranks_reduced.shape[0] <= 100: 218 | ax.yaxis.set_tick_params(labelsize=8) 219 | elif feature_ranks_reduced.shape[0] > 100: 220 | ax.yaxis.set_tick_params(labelsize=2) 221 | else: 222 | ax.yaxis.set_tick_params(labelsize=25) 223 | 224 | plt.title(f'Ranking w.r.t "{label}"\n') 225 | sns.barplot( 226 | x='Value', 227 | y='Feature', 228 | hue='Feature', 229 | data=feature_ranks_reduced, 230 | palette='coolwarm_r', 231 | err_kws={'linewidth': 0.7}, 232 | dodge=False, 233 | ) 234 | 235 | if ax.legend_ is not None: 236 | ax.legend_.remove() # Remove the legend if it exists 237 | 238 | for item in ax.get_yticklabels(): 239 | for prod_feature in used_features: 240 | if item.get_text() in prod_feature: 241 | item.set_fontweight('bold') 242 | item.set_color('red') 243 | break 244 | 245 | plt.xlabel(f'Feature importance (based on heuristic {heuristic})') 246 | plt.ylabel('') 247 | with warnings.catch_warnings(): 248 | warnings.simplefilter('ignore', UserWarning) 249 | plt.tight_layout() 250 | plt.savefig(f'{output_folder}/barplot_top_{subset_range}.{image_format}', dpi=300) 251 | plt.clf() 252 | plt.cla() 253 | 254 | logging.info(f'Stored barplot to: {output_folder}/barplot_top_{subset_range}_.{image_format}') 255 | 256 | 257 | def visualize_all( 258 | triplets: pd.DataFrame, 259 | output_folder: str, 260 | label: str = '', 261 | reference_json: str = '', 262 | image_format: str = 'png', 263 | heuristic: str = 'MI', 264 | ) -> None: 265 | if not os.path.exists(output_folder): 266 | os.makedirs(output_folder) 267 | 268 | visualize_hierarchical_clusters(triplets, output_folder, image_format) 269 | visualize_heatmap(triplets, output_folder, image_format) 270 | visualize_barplots(triplets, output_folder, reference_json, image_format, label, heuristic) 271 | -------------------------------------------------------------------------------- /outrank/__main__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | import logging 5 | 6 | from outrank.task_generators import outrank_task_generate_data_set 7 | from outrank.task_instance_ranking import outrank_task_rank_instances 8 | from outrank.task_ranking import outrank_task_conduct_ranking 9 | from outrank.task_selftest import conduct_self_test 10 | from outrank.task_summary import outrank_task_result_summary 11 | from outrank.task_visualization import outrank_task_visualize_results 12 | 13 | logging.basicConfig( 14 | format='%(asctime)s - %(message)s', 15 | datefmt='%d-%b-%y %H:%M:%S', 16 | ) 17 | logging.getLogger(__name__).setLevel(logging.INFO) 18 | 19 | usage_examples = """ 20 | Usage examples: 21 | 22 | # perform ranking, summary and visualize the results 23 | outrank --task all --data_path pathToSomeData --data_source ob-vw --heuristic MI-numba-randomized --include_cardinality_in_feature_names True --target_ranking_only True --combination_number_upper_bound 2048 --num_threads 8 --interaction_order 1 --transformers fw-transformers --output_folder ./ranking_outputs --subsampling 100 24 | 25 | # pairwise ranking only 26 | outrank --task ranking --data_path pathToSomeData --data_source ob-vw --heuristic MI-numba-randomized --target_ranking_only False --combination_number_upper_bound 10000 --num_threads 30 --output_folder ./ranking_outputs --subsampling 10 27 | 28 | # Higher order interactions 29 | outrank --task all --data_path pathToSomeData --data_source csv-raw --heuristic MI-numba-randomized --target_ranking_only True --combination_number_upper_bound 2048 --num_threads 8 --interaction_order 3 --output_folder ./ranking_outputs --subsampling 20 30 | 31 | # Using custom JSON transformers 32 | outrank --task ranking --data_path pathToSomeData --data_source csv-raw --heuristic MI-numba-randomized --transformers examples/custom_transformers.json --output_folder ./ranking_outputs 33 | 34 | # More docs and use cases at https://outbrain.github.io/outrank/outrank.html 35 | """ 36 | 37 | 38 | def main(): 39 | parser = argparse.ArgumentParser( 40 | description='Fast feature screening for sparse data sets.', 41 | epilog=usage_examples, 42 | formatter_class=argparse.RawTextHelpFormatter, 43 | ) 44 | 45 | parser.add_argument( 46 | '--task', 47 | type=str, 48 | default='all', 49 | help='Type of task to consider. Can be either "ranking", "ranking_summary", "feature_summary_transformers", or "visualization"', 50 | ) 51 | 52 | parser.add_argument( 53 | '--minibatch_size', 54 | type=int, 55 | default=2**14, 56 | help='Suitable for data, not pre-split to batches, this parameter determines batch size - note that too large batch size can slow down the multithreaded score computation due to many thread allocations etc. This works ok for <300 features and up to 48 threads.', 57 | ) 58 | 59 | parser.add_argument( 60 | '--output_folder', 61 | type=str, 62 | default='ranking_outputs', 63 | help='Output folder containing ranking results.', 64 | ) 65 | 66 | parser.add_argument( 67 | '--data_source', 68 | type=str, 69 | default='ob-vw', 70 | help='Which database is used to obtain learning instances? this determines the inferred folder structure (csv-raw, ob-vw, ob-csv).', 71 | ) 72 | 73 | parser.add_argument( 74 | '--data_path', 75 | type=str, 76 | default=None, 77 | help='Path to the folder containing the main data used for subsequent learning.', 78 | ) 79 | 80 | parser.add_argument( 81 | '--subsampling', 82 | type=int, 83 | default=10, 84 | help='Subsampling ratio - every n-th instance will be considered (suggested value: 10 to 100)', 85 | ) 86 | 87 | parser.add_argument( 88 | '--combination_number_upper_bound', 89 | type=int, 90 | default=2**15, 91 | help='Cap the number of columns during feature ranking, per batch. This means that if you were to evaluate e.g., 100k combinations, this parameter results in behavior where only 2 ** 15 are taken into account (randomly) each bach, resulting in a monte-carlo like sampling scheme that yields estimates of the final ranks when all data is seen.', 92 | ) 93 | 94 | parser.add_argument( 95 | '--missing_value_symbols', 96 | type=str, 97 | default=',{}', 98 | help='What symbols denote missing values? Comma-separate them - if comma is a missing symbol itself please open an issue.', 99 | ) 100 | 101 | parser.add_argument( 102 | '--heuristic', 103 | type=str, 104 | default='MI-numba-randomized', 105 | help='Selected heuristic (that performs feature scoring). For full list please see the docs: https://outbrain.github.io/outrank/outrank/algorithms/importance_estimator.html', 106 | ) 107 | 108 | parser.add_argument( 109 | '--include_noise_baseline_features', 110 | type=str, 111 | default='False', 112 | help='If enabled, it computes five control variables (random noises)', 113 | ) 114 | 115 | parser.add_argument( 116 | '--include_cardinality_in_feature_names', 117 | type=str, 118 | default='True', 119 | help='If enabled, feature names appear as feature-(cardinality) for easier inspection/debugging.', 120 | ) 121 | 122 | parser.add_argument( 123 | '--image_format', 124 | type=str, 125 | default='pdf', 126 | help='The format of the output images (task: visualization)', 127 | ) 128 | 129 | parser.add_argument( 130 | '--num_threads', type=int, default=8, help='Number of threads to consider. More threads implies faster ranking, however, there will be some memory overhead. Should be as large as the machine can handle memory-wise.', 131 | ) 132 | 133 | parser.add_argument( 134 | '--label_column', 135 | type=str, 136 | default='label', 137 | help='Name of the target attribute for ranking. Note that this can be any other feature for most implemented heuristics.', 138 | ) 139 | 140 | parser.add_argument( 141 | '--max_unique_hist_constraint', 142 | type=int, 143 | default=30_000, 144 | help='Max number of unique values for which counts are recalled.', 145 | ) 146 | 147 | parser.add_argument( 148 | '--transformers', 149 | type=str, 150 | default='none', 151 | help='Collection of which feature transformations to consider. Examples are: fw-transformers, default, minimal. Also supports JSON file paths (e.g., custom_transformers.json) and combinations (e.g., default,custom.json)', 152 | ) 153 | 154 | parser.add_argument( 155 | '--rare_value_count_upper_bound', 156 | type=int, 157 | default=1, 158 | help="When identifying rare attr-val pairs, what's the upper frequency bound?", 159 | ) 160 | 161 | parser.add_argument( 162 | '--feature_set_focus', 163 | type=str, 164 | default=None, 165 | help='Collection of which feature transformations to consider', 166 | ) 167 | 168 | parser.add_argument( 169 | '--interaction_order', 170 | type=int, 171 | default=1, 172 | help='The order of feature interactions to consider during ranking (complex features comprised of n elementary ones)', 173 | ) 174 | 175 | parser.add_argument( 176 | '--reference_model_JSON', 177 | type=str, 178 | default='', 179 | help='Reference model JSON', 180 | ) 181 | 182 | parser.add_argument( 183 | '--target_ranking_only', 184 | type=str, 185 | default='True', 186 | help='Compute only the feature-label scores? This is substantially faster (O(n)).', 187 | ) 188 | 189 | parser.add_argument( 190 | '--explode_multivalue_features', 191 | type=str, 192 | default='False', 193 | help="Which ';'-separated features should be one-hot encoded into n new features (coverage analysis)", 194 | ) 195 | 196 | parser.add_argument( 197 | '--subfeature_mapping', 198 | type=str, 199 | default='False', 200 | help='Compute sub-features on-the fly. Example: featureA->featureB implies features based on each value of featureA will be considered. So, feature names will correspond to values of the first feature, with actual values being constructed based on the second feature (two or more possible values).', 201 | ) 202 | 203 | parser.add_argument( 204 | '--num_synthetic_features', 205 | type=int, 206 | default=100, 207 | help='Relevant for task data_generator -- how many features.', 208 | ) 209 | 210 | parser.add_argument( 211 | '--tldr', 212 | type=str, 213 | default='True', 214 | help='If enabled, it will output some of the main results on the screen after finishing.', 215 | ) 216 | 217 | parser.add_argument( 218 | '--num_synthetic_rows', 219 | type=int, 220 | default=1000000, 221 | help='Relevant for task data_generator -- how many rows.', 222 | ) 223 | 224 | parser.add_argument( 225 | '--generator_type', 226 | type=str, 227 | default='naive', 228 | help='Relevant for task data_generator -- which generator to consider', 229 | ) 230 | 231 | parser.add_argument( 232 | '--output_synthetic_df_name', 233 | type=str, 234 | default='test_data_synthetic', 235 | help='Relevant for task data_generator -- name of the folder that contains generated data.', 236 | ) 237 | 238 | parser.add_argument( 239 | '--disable_tqdm', 240 | default='False', 241 | choices=['False', 'True'], 242 | help='Either True or False.', 243 | ) 244 | 245 | parser.add_argument( 246 | '--mi_stratified_sampling_ratio', 247 | type=float, 248 | default=1.0, 249 | help='If < 1.0, MI algorithm will further subsample data in stratified manner (equal distributions per value if possible).', 250 | ) 251 | 252 | 253 | args = parser.parse_args() 254 | 255 | if args.task == 'selftest': 256 | conduct_self_test('MI-numba-randomized') 257 | exit() 258 | 259 | if args.data_path is None and args.task != 'data_generator': 260 | logging.error('Please specify data set name (--data_path).') 261 | exit() 262 | 263 | all_tasks_to_consider = [] 264 | if args.task != 'all': 265 | all_tasks_to_consider = [args.task] 266 | 267 | else: 268 | all_tasks_to_consider = ['ranking', 'ranking_summary', 'visualization'] 269 | 270 | for task in all_tasks_to_consider: 271 | logging.info(f'Proceeding with task: {task} ..') 272 | 273 | if ( 274 | task == 'ranking' 275 | or task == 'feature_summary_transformers' 276 | or task == 'identify_rare_values' 277 | ): 278 | outrank_task_conduct_ranking(args) 279 | 280 | elif task == 'visualization': 281 | outrank_task_visualize_results(args) 282 | 283 | elif task == 'ranking_summary': 284 | outrank_task_result_summary(args) 285 | 286 | elif task == 'data_generator': 287 | outrank_task_generate_data_set(args) 288 | 289 | elif task == 'instance_ranking': 290 | outrank_task_rank_instances(args) 291 | 292 | else: 293 | logging.info(f'Warning, the selected task: {task} does not exist.') 294 | 295 | 296 | if __name__ == '__main__': 297 | main() 298 | -------------------------------------------------------------------------------- /tests/multivalue_mi_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import unittest 4 | import numpy as np 5 | from outrank.algorithms.feature_ranking.ranking_mi_multivalue import ( 6 | multivalue_mutual_info_estimator, 7 | parse_multivalue_feature, 8 | jaccard_based_mutual_info, 9 | multivalue_mi_with_overlap, 10 | set_based_mutual_info, 11 | ) 12 | 13 | class MultivalueMITest(unittest.TestCase): 14 | """Test cases for multivalue mutual information algorithms""" 15 | 16 | def test_parse_multivalue_feature(self): 17 | """Test parsing multivalue features into sets""" 18 | # Using default delimiter '_' 19 | feature_vector = np.array(['a_b_c', 'b_c', '', 'a']) 20 | result = parse_multivalue_feature(feature_vector) 21 | 22 | expected = [ 23 | {'a', 'b', 'c'}, 24 | {'b', 'c'}, 25 | set(), 26 | {'a'} 27 | ] 28 | 29 | self.assertEqual(result, expected) 30 | 31 | def test_parse_multivalue_feature_with_custom_delimiter(self): 32 | """Test parsing with custom delimiter""" 33 | # Test with comma delimiter 34 | feature_vector = np.array(['a,b,c', 'b,c', '', 'a']) 35 | result = parse_multivalue_feature(feature_vector, delimiter=',') 36 | 37 | expected = [ 38 | {'a', 'b', 'c'}, 39 | {'b', 'c'}, 40 | set(), 41 | {'a'} 42 | ] 43 | 44 | self.assertEqual(result, expected) 45 | 46 | expected = [ 47 | {'a', 'b', 'c'}, 48 | {'b', 'c'}, 49 | set(), 50 | {'a'} 51 | ] 52 | 53 | self.assertEqual(result, expected) 54 | 55 | def test_set_based_mutual_info_identical_sets(self): 56 | """Test set-based MI with identical multivalue features""" 57 | X_sets = [{'a', 'b'}, {'b', 'c'}, {'a', 'c'}] 58 | Y_sets = [{'a', 'b'}, {'b', 'c'}, {'a', 'c'}] 59 | 60 | result = set_based_mutual_info(X_sets, Y_sets) 61 | 62 | # Should be high since features are identical 63 | self.assertGreater(result, 1.0) 64 | 65 | def test_set_based_mutual_info_independent_sets(self): 66 | """Test set-based MI with independent multivalue features""" 67 | X_sets = [{'a'}, {'b'}, {'c'}, {'d'}] 68 | Y_sets = [{'x'}, {'y'}, {'z'}, {'w'}] 69 | 70 | result = set_based_mutual_info(X_sets, Y_sets) 71 | 72 | # Should be high due to perfect correspondence (each X maps to unique Y) 73 | self.assertGreater(result, 1.0) 74 | 75 | def test_set_based_mutual_info_empty_sets(self): 76 | """Test set-based MI with empty sets""" 77 | X_sets = [set(), set(), set()] 78 | Y_sets = [set(), set(), set()] 79 | 80 | result = set_based_mutual_info(X_sets, Y_sets) 81 | 82 | # Should be 0 since all sets are identical (empty) 83 | self.assertEqual(result, 0.0) 84 | 85 | def test_jaccard_based_mutual_info_basic(self): 86 | """Test Jaccard-based MI with basic multivalue features""" 87 | X_sets = [{'a', 'b'}, {'b', 'c'}, {'a', 'c'}] 88 | Y_sets = [{'x', 'y'}, {'y', 'z'}, {'x', 'z'}] 89 | 90 | result = jaccard_based_mutual_info(X_sets, Y_sets) 91 | 92 | # Should return a valid MI score 93 | self.assertIsInstance(result, float) 94 | self.assertGreaterEqual(result, 0.0) 95 | 96 | def test_multivalue_mi_with_overlap_basic(self): 97 | """Test overlap-based MI with basic multivalue features""" 98 | X_sets = [{'a', 'b'}, {'b', 'c'}, {'a', 'c'}] 99 | Y_sets = [{'x', 'y'}, {'y', 'z'}, {'x', 'z'}] 100 | 101 | result = multivalue_mi_with_overlap(X_sets, Y_sets) 102 | 103 | # Should return a valid MI score 104 | self.assertIsInstance(result, float) 105 | self.assertGreaterEqual(result, 0.0) 106 | 107 | def test_multivalue_mutual_info_estimator_jaccard(self): 108 | """Test main estimator with Jaccard algorithm""" 109 | X = np.array(['a_b', 'b_c', 'a_c']) 110 | Y = np.array(['x_y', 'y_z', 'x_z']) 111 | 112 | result = multivalue_mutual_info_estimator(X, Y, algorithm='jaccard') 113 | 114 | self.assertIsInstance(result, float) 115 | self.assertGreaterEqual(result, 0.0) 116 | 117 | def test_multivalue_mutual_info_estimator_overlap(self): 118 | """Test main estimator with overlap algorithm""" 119 | X = np.array(['a_b', 'b_c', 'a_c']) 120 | Y = np.array(['x_y', 'y_z', 'x_z']) 121 | 122 | result = multivalue_mutual_info_estimator(X, Y, algorithm='overlap') 123 | 124 | self.assertIsInstance(result, float) 125 | self.assertGreaterEqual(result, 0.0) 126 | 127 | def test_multivalue_mutual_info_estimator_set_based(self): 128 | """Test main estimator with set-based algorithm""" 129 | X = np.array(['a_b', 'b_c', 'a_c']) 130 | Y = np.array(['x_y', 'y_z', 'x_z']) 131 | 132 | result = multivalue_mutual_info_estimator(X, Y, algorithm='set_based') 133 | 134 | self.assertIsInstance(result, float) 135 | self.assertGreaterEqual(result, 0.0) 136 | 137 | def test_multivalue_mutual_info_estimator_invalid_algorithm(self): 138 | """Test main estimator with invalid algorithm""" 139 | X = np.array(['a_b', 'b_c', 'a_c']) 140 | Y = np.array(['x_y', 'y_z', 'x_z']) 141 | 142 | with self.assertRaises(ValueError): 143 | multivalue_mutual_info_estimator(X, Y, algorithm='invalid') 144 | 145 | def test_multivalue_mutual_info_estimator_empty_input(self): 146 | """Test main estimator with empty input""" 147 | X = np.array([]) 148 | Y = np.array([]) 149 | 150 | result = multivalue_mutual_info_estimator(X, Y, algorithm='jaccard') 151 | self.assertEqual(result, 0.0) 152 | 153 | def test_multivalue_mutual_info_estimator_mismatched_lengths(self): 154 | """Test main estimator with mismatched input lengths""" 155 | X = np.array(['a_b']) 156 | Y = np.array(['x_y', 'y_z']) 157 | 158 | result = multivalue_mutual_info_estimator(X, Y, algorithm='jaccard') 159 | self.assertEqual(result, 0.0) 160 | 161 | def test_functional_relationship_detection(self): 162 | """Test detection of functional relationships in multivalue features""" 163 | # Create data with functional relationship: Y values determined by X values 164 | X = np.array(['a_b', 'b_c', 'c_d', 'a_b', 'b_c', 'c_d']) 165 | Y = np.array(['x_y', 'y_z', 'z_w', 'x_y', 'y_z', 'z_w']) 166 | 167 | result = multivalue_mutual_info_estimator(X, Y, algorithm='set_based') 168 | 169 | # Should detect the functional relationship 170 | self.assertGreater(result, 1.0) 171 | 172 | def test_no_relationship_detection(self): 173 | """Test detection when there's no relationship between features""" 174 | # Create completely random multivalue features 175 | np.random.seed(42) 176 | X = np.array([f'{i}_{i+1}' for i in range(100)]) 177 | Y = np.array([f'{100-i}_{100-i-1}' for i in range(100)]) 178 | 179 | result = multivalue_mutual_info_estimator(X, Y, algorithm='set_based') 180 | 181 | # Should detect high MI due to deterministic pattern (each X maps to unique Y) 182 | self.assertGreater(result, 0.0) 183 | 184 | def test_sequential_pattern_without_intersections(self): 185 | """Test detection of sequential patterns when row-wise intersections are empty. 186 | 187 | This addresses the issue raised in GitHub where Jaccard and overlap methods 188 | returned 0 for data like: 189 | Col1: a,b b,c c,d (with comma delimiter) 190 | Col2: i,j,k j,k,l k,l,m (with comma delimiter) 191 | 192 | Here intersections are empty in all cases, but there is information shared 193 | through the sequential patterns. 194 | 195 | NOTE: Using comma delimiter here to test the specific reported case. 196 | """ 197 | # Test case from GitHub comment - using comma delimiter 198 | Col1 = np.array(['a,b', 'b,c', 'c,d', 'd,e', 'e,f']) 199 | Col2 = np.array(['i,j,k', 'j,k,l', 'k,l,m', 'l,m,n', 'm,n,o']) 200 | 201 | # All algorithms should now detect information despite empty intersections 202 | jaccard_score = multivalue_mutual_info_estimator(Col1, Col2, algorithm='jaccard', delimiter=',') 203 | overlap_score = multivalue_mutual_info_estimator(Col1, Col2, algorithm='overlap', delimiter=',') 204 | set_based_score = multivalue_mutual_info_estimator(Col1, Col2, algorithm='set_based', delimiter=',') 205 | 206 | # All should detect meaningful information 207 | self.assertGreater(jaccard_score, 0.0, 208 | "Jaccard should detect information in sequential patterns") 209 | self.assertGreater(overlap_score, 0.0, 210 | "Overlap should detect information in sequential patterns") 211 | self.assertGreater(set_based_score, 0.0, 212 | "Set-based should detect information in sequential patterns") 213 | 214 | # Set-based typically gives highest scores 215 | self.assertGreater(set_based_score, overlap_score * 0.5) 216 | 217 | def test_multivalue_with_compound_values(self): 218 | """Test multivalue features with compound values like 'yellow_sun', 'green_grass', etc. 219 | 220 | This test addresses the request to handle realistic feature values that themselves 221 | contain underscores (e.g., colors with objects). The algorithm should treat 222 | 'yellow_sun' as a single atomic value, not split it further. 223 | """ 224 | # Multivalue features where each value is a compound word 225 | # Using '|' as delimiter to separate different multivalue items 226 | # since the values themselves contain underscores 227 | colors1 = np.array(['yellow_sun|green_grass', 'blue_sea|red_flower', 228 | 'yellow_sun|blue_sea', 'green_grass|red_flower']) 229 | colors2 = np.array(['yellow_sun|blue_sea', 'green_grass|red_flower', 230 | 'yellow_sun|red_flower', 'blue_sea|green_grass']) 231 | 232 | # Test with pipe delimiter for the multivalue separation 233 | for algo in ['jaccard', 'overlap', 'set_based']: 234 | with self.subTest(algorithm=algo): 235 | score = multivalue_mutual_info_estimator( 236 | colors1, colors2, algorithm=algo, delimiter='|' 237 | ) 238 | # Should compute valid MI scores 239 | self.assertIsInstance(score, float) 240 | self.assertGreaterEqual(score, 0.0) 241 | 242 | # Verify parsing treats compound values as atomic units 243 | parsed = parse_multivalue_feature(colors1, delimiter='|') 244 | expected_first = {'yellow_sun', 'green_grass'} 245 | expected_second = {'blue_sea', 'red_flower'} 246 | 247 | self.assertEqual(parsed[0], expected_first, 248 | "Compound values should be treated as atomic units") 249 | self.assertEqual(parsed[1], expected_second, 250 | "Compound values should be treated as atomic units") 251 | 252 | # Test that there's meaningful information between the features 253 | set_based_score = multivalue_mutual_info_estimator( 254 | colors1, colors2, algorithm='set_based', delimiter='|' 255 | ) 256 | self.assertGreater(set_based_score, 0.0, 257 | "Should detect information between correlated multivalue features") 258 | 259 | 260 | if __name__ == '__main__': 261 | unittest.main() -------------------------------------------------------------------------------- /tests/mi_numba_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | import unittest 5 | 6 | import numpy as np 7 | 8 | from outrank.algorithms.feature_ranking.ranking_mi_numba import \ 9 | mutual_info_estimator_numba 10 | 11 | np.random.seed(123) 12 | sys.path.append('./outrank') 13 | 14 | 15 | class CompareStrategiesTest(unittest.TestCase): 16 | def test_mi_numba(self): 17 | a = np.random.random(10**6).reshape(-1).astype(np.int32) 18 | b = np.random.random(10**6).reshape(-1).astype(np.int32) 19 | final_score = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 20 | self.assertEqual(final_score, 0.0) 21 | 22 | def test_mi_numba_random(self): 23 | a = np.array([1, 0, 0, 0, 1, 1, 1, 0], dtype=np.int32) 24 | b = np.random.random(8).reshape(-1).astype(np.int32) 25 | 26 | final_score = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 27 | self.assertLess(final_score, 0.0) 28 | 29 | def test_mi_numba_mirror(self): 30 | a = np.array([1, 0, 0, 0, 1, 1, 1, 0], dtype=np.int32) 31 | b = np.array([1, 0, 0, 0, 1, 1, 1, 0], dtype=np.int32) 32 | final_score = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 33 | self.assertGreater(final_score, 0.60) 34 | 35 | def test_mi_numba_longer_inputs(self): 36 | b = np.array([1, 0, 0, 0, 1, 1, 1, 0] * 10**5, dtype=np.int32) 37 | final_score = mutual_info_estimator_numba(b, b, np.float32(1.0), False) 38 | self.assertGreater(final_score, 0.60) 39 | 40 | def test_mi_numba_permutation(self): 41 | a = np.array([1, 0, 0, 0, 1, 1, 1, 0] * 10**3, dtype=np.int32) 42 | b = np.array(np.random.permutation(a), dtype=np.int32) 43 | final_score = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 44 | self.assertLess(final_score, 0.05) 45 | 46 | def test_mi_numba_interaction(self): 47 | # Let's create incrementally more noisy features and compare 48 | a = np.array([1, 0, 0, 0, 1, 1, 1, 0], dtype=np.int32) 49 | lowest = np.array(np.random.permutation(a), dtype=np.int32) 50 | medium = np.array([1, 1, 0, 0, 1, 1, 1, 1], dtype=np.int32) 51 | high = np.array([1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int32) 52 | 53 | lowest_score = mutual_info_estimator_numba( 54 | a, lowest, np.float32(1.0), False, 55 | ) 56 | medium_score = mutual_info_estimator_numba( 57 | a, medium, np.float32(1.0), False, 58 | ) 59 | high_score = mutual_info_estimator_numba( 60 | a, high, np.float32(1.0), False, 61 | ) 62 | 63 | scores = [lowest_score, medium_score, high_score] 64 | sorted_score_indices = np.argsort(scores) 65 | self.assertEqual(np.sum(np.array([0, 1, 2]) - sorted_score_indices), 0) 66 | 67 | def test_mi_numba_higher_order(self): 68 | # The famous xor test 69 | vector_first = np.round(np.random.random(1000)).astype(np.int32) 70 | vector_second = np.round(np.random.random(1000)).astype(np.int32) 71 | vector_third = np.logical_xor( 72 | vector_first, vector_second, 73 | ).astype(np.int32) 74 | 75 | score_independent_first = mutual_info_estimator_numba( 76 | vector_first, vector_third, np.float32(1.0), False, 77 | ) 78 | 79 | score_independent_second = mutual_info_estimator_numba( 80 | vector_second, vector_third, np.float32(1.0), False, 81 | ) 82 | 83 | # This must be very close to zero/negative 84 | self.assertLess(score_independent_first, 0.01) 85 | self.assertLess(score_independent_second, 0.01) 86 | 87 | # --interaction_order 2 simulation 88 | combined_feature = np.array( 89 | list(hash(x) for x in zip(vector_first, vector_second)), 90 | ).astype(np.int32) 91 | 92 | score_combined = mutual_info_estimator_numba( 93 | combined_feature, vector_third, np.float32(1.0), False, 94 | ) 95 | 96 | # This must be in the range of identity 97 | self.assertGreater(score_combined, 0.60) 98 | 99 | # === NEW COMPREHENSIVE TESTS === 100 | 101 | def test_empty_arrays(self): 102 | """Test behavior with empty arrays""" 103 | a = np.array([], dtype=np.int32) 104 | b = np.array([], dtype=np.int32) 105 | 106 | # Should handle empty arrays gracefully 107 | with self.assertRaises((IndexError, ValueError)): 108 | mutual_info_estimator_numba(a, b, np.float32(1.0), False) 109 | 110 | def test_single_element_arrays(self): 111 | """Test arrays with single elements""" 112 | a = np.array([1], dtype=np.int32) 113 | b = np.array([0], dtype=np.int32) 114 | 115 | # Single element arrays should work 116 | result = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 117 | self.assertIsInstance(result, (float, np.float32)) 118 | 119 | def test_identical_arrays(self): 120 | """Test perfectly correlated arrays""" 121 | a = np.array([1, 2, 3, 1, 2, 3] * 100, dtype=np.int32) 122 | b = a.copy() 123 | 124 | result = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 125 | # Identical arrays should have high mutual information 126 | self.assertGreater(result, 0.5) 127 | 128 | def test_approximation_factors(self): 129 | """Test different approximation factors""" 130 | a = np.array([1, 0, 1, 0, 1, 0] * 1000, dtype=np.int32) 131 | b = np.array([0, 1, 0, 1, 0, 1] * 1000, dtype=np.int32) 132 | 133 | # Test various approximation factors 134 | for factor in [0.1, 0.5, 1.0]: 135 | result = mutual_info_estimator_numba(a, b, np.float32(factor), False) 136 | self.assertIsInstance(result, (float, np.float32)) 137 | 138 | def test_approximation_factor_edge_cases(self): 139 | """Test edge cases for approximation factor""" 140 | a = np.array([1, 0, 1, 0] * 100, dtype=np.int32) 141 | b = np.array([0, 1, 0, 1] * 100, dtype=np.int32) 142 | 143 | # Very small approximation factor 144 | result = mutual_info_estimator_numba(a, b, np.float32(0.01), False) 145 | self.assertIsInstance(result, (float, np.float32)) 146 | 147 | # Approximation factor > 1 (should still work) 148 | result = mutual_info_estimator_numba(a, b, np.float32(1.5), False) 149 | self.assertIsInstance(result, (float, np.float32)) 150 | 151 | def test_cardinality_correction(self): 152 | """Test cardinality correction flag""" 153 | a = np.array([1, 0, 1, 0, 1, 0] * 500, dtype=np.int32) 154 | b = np.array([1, 0, 1, 0, 1, 0] * 500, dtype=np.int32) 155 | 156 | # Without cardinality correction 157 | result_no_corr = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 158 | 159 | # With cardinality correction 160 | result_with_corr = mutual_info_estimator_numba(a, b, np.float32(1.0), True) 161 | 162 | # Both should be valid but may differ 163 | self.assertIsInstance(result_no_corr, (float, np.float32)) 164 | self.assertIsInstance(result_with_corr, (float, np.float32)) 165 | 166 | def test_different_array_lengths(self): 167 | """Test arrays of different lengths (should fail)""" 168 | a = np.array([1, 0, 1], dtype=np.int32) 169 | b = np.array([0, 1], dtype=np.int32) 170 | 171 | with self.assertRaises((IndexError, ValueError)): 172 | mutual_info_estimator_numba(a, b, np.float32(1.0), False) 173 | 174 | def test_binary_vs_multiclass(self): 175 | """Test binary vs multiclass scenarios""" 176 | # Binary case 177 | a_binary = np.array([0, 1] * 500, dtype=np.int32) 178 | b_binary = np.array([1, 0] * 500, dtype=np.int32) 179 | 180 | result_binary = mutual_info_estimator_numba(a_binary, b_binary, np.float32(1.0), False) 181 | 182 | # Multiclass case 183 | a_multi = np.array([0, 1, 2] * 333 + [0], dtype=np.int32) 184 | b_multi = np.array([2, 0, 1] * 333 + [1], dtype=np.int32) 185 | 186 | result_multi = mutual_info_estimator_numba(a_multi, b_multi, np.float32(1.0), False) 187 | 188 | # Both should be valid 189 | self.assertIsInstance(result_binary, (float, np.float32)) 190 | self.assertIsInstance(result_multi, (float, np.float32)) 191 | 192 | def test_extreme_values(self): 193 | """Test with extreme integer values""" 194 | max_val = np.iinfo(np.int32).max 195 | a = np.array([0, max_val] * 100, dtype=np.int32) 196 | b = np.array([max_val, 0] * 100, dtype=np.int32) 197 | 198 | result = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 199 | self.assertIsInstance(result, (float, np.float32)) 200 | 201 | def test_all_same_values(self): 202 | """Test arrays where all values are the same""" 203 | a = np.array([5] * 1000, dtype=np.int32) 204 | b = np.array([5] * 1000, dtype=np.int32) 205 | 206 | result = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 207 | # Should handle constant arrays 208 | self.assertIsInstance(result, (float, np.float32)) 209 | 210 | def test_large_arrays_performance(self): 211 | """Test with large arrays for performance validation""" 212 | size = 50000 213 | a = np.random.randint(0, 10, size=size, dtype=np.int32) 214 | b = np.random.randint(0, 10, size=size, dtype=np.int32) 215 | 216 | result = mutual_info_estimator_numba(a, b, np.float32(0.1), True) 217 | self.assertIsInstance(result, (float, np.float32)) 218 | 219 | def test_deterministic_behavior(self): 220 | """Test that results are deterministic for same inputs""" 221 | a = np.array([1, 0, 1, 0, 1] * 200, dtype=np.int32) 222 | b = np.array([0, 1, 0, 1, 0] * 200, dtype=np.int32) 223 | 224 | # Multiple runs should give same result 225 | result1 = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 226 | result2 = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 227 | result3 = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 228 | 229 | self.assertEqual(result1, result2) 230 | self.assertEqual(result2, result3) 231 | 232 | def test_independence_detection(self): 233 | """Test detection of statistical independence""" 234 | np.random.seed(42) # For reproducible randomness 235 | 236 | # Create independent variables 237 | a = np.random.randint(0, 3, size=5000, dtype=np.int32) 238 | b = np.random.randint(0, 3, size=5000, dtype=np.int32) 239 | 240 | result = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 241 | 242 | # Independent variables should have low mutual information 243 | # Note: Due to finite sample effects, may not be exactly 0 244 | self.assertLess(abs(result), 0.2) 245 | 246 | def test_functional_relationship(self): 247 | """Test detection of functional relationships""" 248 | # Y = f(X) relationship 249 | a = np.array([0, 1, 2] * 1000, dtype=np.int32) 250 | b = np.array([0, 2, 4] * 1000, dtype=np.int32) # b = 2*a 251 | 252 | result = mutual_info_estimator_numba(a, b, np.float32(1.0), False) 253 | 254 | # Functional relationship should have high mutual information 255 | self.assertGreater(result, 0.5) 256 | 257 | def test_noise_robustness(self): 258 | """Test robustness to noise in relationship""" 259 | np.random.seed(999) 260 | 261 | # Base relationship 262 | a = np.array([0, 1] * 2500, dtype=np.int32) 263 | b_clean = a.copy() 264 | 265 | # Add noise (flip 10% of values) 266 | noise_indices = np.random.choice(len(b_clean), size=len(b_clean)//10, replace=False) 267 | b_noisy = b_clean.copy() 268 | b_noisy[noise_indices] = 1 - b_noisy[noise_indices] 269 | 270 | result_clean = mutual_info_estimator_numba(a, b_clean, np.float32(1.0), False) 271 | result_noisy = mutual_info_estimator_numba(a, b_noisy, np.float32(1.0), False) 272 | 273 | # Noisy version should have lower MI than clean version 274 | self.assertLess(result_noisy, result_clean) 275 | 276 | # But both should be positive 277 | self.assertGreater(result_clean, 0.4) 278 | self.assertGreater(result_noisy, 0.0) 279 | -------------------------------------------------------------------------------- /outrank/task_ranking.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import glob 4 | import json 5 | import logging 6 | import os 7 | import signal 8 | from typing import Any 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import gzip 13 | import zstandard as zstd 14 | 15 | from outrank.algorithms.importance_estimator import rank_features_3MR 16 | from outrank.core_ranking import estimate_importances_minibatches 17 | from outrank.core_utils import display_random_tip 18 | from outrank.core_utils import display_tool_name 19 | from outrank.core_utils import get_dataset_info 20 | from outrank.core_utils import summarize_feature_bounds_for_transformers 21 | from outrank.core_utils import summarize_rare_counts 22 | from outrank.core_utils import write_json_dump_to_file 23 | 24 | logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) 25 | signal.signal(signal.SIGINT, signal.default_int_handler) 26 | 27 | try: 28 | # pathos enables proper pickling during parallelization (multiprocessing does not) 29 | from pathos.multiprocessing import ProcessingPool as Pool 30 | 31 | except Exception as es: 32 | logging.info( 33 | f'\U0001F631 Please install the "pathos" library (pip install pathos) for required multithreading capabilities. {es}', 34 | ) 35 | 36 | 37 | def outrank_task_conduct_ranking(args: Any) -> None: 38 | # Data source = folder structure + relevant file specifications 39 | if args.task in ['identify_rare_values', 'feature_summary_transformers']: 40 | args.heuristic = 'Constant' 41 | 42 | if args.disable_tqdm == 'False': 43 | display_tool_name() 44 | display_random_tip() 45 | 46 | dataset_info = get_dataset_info(args) 47 | 48 | for arg in vars(args): 49 | logging.info(f'{arg} set to: {getattr(args, arg)}') 50 | 51 | # Generate output folders (if not present) 52 | output_dir = os.path.dirname( 53 | os.path.join( 54 | args.output_folder, 'pairwise_ranks.tsv', 55 | ), 56 | ) 57 | if not os.path.exists(output_dir): 58 | os.makedirs(output_dir) 59 | 60 | # Initialize the global pool 61 | GLOBAL_CPU_POOL = Pool(args.num_threads) 62 | global_mutual_information_estimates = [] 63 | global_bounds_storage = [] 64 | global_memory_storage = [] 65 | all_timings = [] 66 | # Traverse the batches 67 | for raw_dump in glob.glob(dataset_info.data_path): 68 | 69 | if ( 70 | args.data_source == 'ob-vw' 71 | or args.data_source == 'ob-csv' 72 | or args.data_source == 'csv-raw' 73 | or args.data_source == 'ob-raw-dump' 74 | ): 75 | all_subfiles = [raw_dump] 76 | 77 | for partial_data in all_subfiles: 78 | cmd_arguments = { 79 | 'input_file': partial_data, 80 | 'fw_col_mapping': dataset_info.fw_map, 81 | 'column_descriptions': dataset_info.column_names, 82 | 'numeric_column_types': dataset_info.column_types, 83 | 'args': args, 84 | 'data_encoding': dataset_info.encoding, 85 | 'cpu_pool': GLOBAL_CPU_POOL, 86 | 'delimiter': dataset_info.col_delimiter, 87 | 'logger': logging, 88 | } 89 | 90 | if ( 91 | args.data_source == 'ob-csv' 92 | or args.data_source == 'ob-vw' 93 | or args.data_source == 'csv-raw' 94 | or args.data_source == 'ob-raw-dump' 95 | ): 96 | ( 97 | checkpoint_timings, 98 | mutual_information_estimates, 99 | cardinality_object, 100 | bounds_object_storage, 101 | memory_object_storage, 102 | coverage_object, 103 | RARE_VALUE_STORAGE, 104 | GLOBAL_PRIOR_COMB_COUNTS, 105 | GLOBAL_ITEM_COUNTS, 106 | ) = estimate_importances_minibatches(**cmd_arguments) 107 | 108 | global_bounds_storage += bounds_object_storage 109 | global_memory_storage += memory_object_storage 110 | all_timings += checkpoint_timings 111 | 112 | if cardinality_object is None: 113 | continue 114 | 115 | if coverage_object is None: 116 | continue 117 | 118 | if mutual_information_estimates is not None: 119 | global_mutual_information_estimates.append( 120 | mutual_information_estimates, 121 | ) 122 | 123 | if args.task == 'identify_rare_values': 124 | logging.info('Summarizing rare values ..') 125 | summarize_rare_counts( 126 | RARE_VALUE_STORAGE, args, cardinality_object, dataset_info, 127 | ) 128 | exit() 129 | 130 | if args.task == 'feature_summary_transformers': 131 | summarize_feature_bounds_for_transformers( 132 | bounds_object_storage, 133 | dataset_info.column_types, 134 | args.task, 135 | args.label_column, 136 | ) 137 | exit() 138 | else: 139 | summary_of_numeric_features = summarize_feature_bounds_for_transformers( 140 | bounds_object_storage, 141 | dataset_info.column_types, 142 | args.task, 143 | args.label_column, 144 | output_summary_table_only=True, 145 | ) 146 | if summary_of_numeric_features is not None: 147 | num_out = os.path.join( 148 | args.output_folder, 'numeric_feature_statistics.tsv', 149 | ) 150 | summary_of_numeric_features.to_csv(num_out, sep='\t', index=False) 151 | logging.info( 152 | f'Stored statistics of numeric features to {num_out} ..', 153 | ) 154 | 155 | # Just in case. 156 | GLOBAL_CPU_POOL.close() 157 | GLOBAL_CPU_POOL.join() 158 | 159 | if len(global_mutual_information_estimates) == 0: 160 | logging.info('No rankings were obtained, exiting ..') 161 | exit() 162 | 163 | # Compute median imps across batches 164 | triplets = pd.concat(global_mutual_information_estimates, axis=0) 165 | triplets.columns = ['FeatureA', 'FeatureB', 'Score'] 166 | 167 | if '3mr' in args.heuristic: 168 | # relevance include MI-scores of features w.r.t. labels 169 | relevance_df = triplets[triplets.FeatureB == args.label_column].copy() 170 | relevance_df = relevance_df[ 171 | relevance_df.FeatureA.map(lambda x: ' AND_REL ' not in x) 172 | ][['FeatureA', 'Score']] 173 | relevance_df = relevance_df[relevance_df.FeatureA != args.label_column] 174 | 175 | # relations include MI-scores of combinations w.r.t. label 176 | relations_df = triplets[triplets.FeatureB == args.label_column][ 177 | ['FeatureA', 'Score'] 178 | ].copy() 179 | relations_df = relations_df[ 180 | relations_df.FeatureA.map(lambda x: ' AND_REL ' in x) 181 | ] 182 | relations_df['FeatureB'] = relations_df.FeatureA.map( 183 | lambda x: x.split(' AND_REL ')[1], 184 | ) 185 | relations_df['FeatureA'] = relations_df.FeatureA.map( 186 | lambda x: x.split(' AND_REL ')[0], 187 | ) 188 | 189 | # redundancies include MI-scores of features w.r.t. non-label features 190 | redundancies_df = triplets[( 191 | triplets.FeatureB != args.label_column 192 | )].copy() 193 | redundancies_df = redundancies_df[ 194 | redundancies_df.FeatureA != 195 | args.label_column 196 | ] 197 | redundancies_df = redundancies_df[ 198 | redundancies_df.apply( 199 | lambda x: (' AND_REL ' not in x.FeatureA) 200 | and (' AND_REL ' not in x.FeatureB), 201 | axis=1, 202 | ) 203 | ] 204 | 205 | # normalize 206 | relevance_df['score'] = (relevance_df.Score - relevance_df.Score.min()) / ( 207 | relevance_df.Score.max() - relevance_df.Score.min() 208 | ) 209 | relations_df['score'] = (relations_df.Score - relations_df.Score.min()) / ( 210 | relations_df.Score.max() - relations_df.Score.min() 211 | ) 212 | redundancies_df['score'] = ( 213 | redundancies_df.Score - redundancies_df.Score.min() 214 | ) / (redundancies_df.Score.max() - redundancies_df.Score.min()) 215 | 216 | # create dicts 217 | relevance_dict = { 218 | row.FeatureA: row.score for _, 219 | row in relevance_df.iterrows() 220 | } 221 | relations_dict = { 222 | (row.FeatureA, row.FeatureB): row.score 223 | for _, row in relations_df.iterrows() 224 | } 225 | relations_dict.update( 226 | { 227 | (row.FeatureB, row.FeatureA): row.score 228 | for _, row in relations_df.iterrows() 229 | }, 230 | ) 231 | redundancy_dict = { 232 | (row.FeatureA, row.FeatureB): row.score 233 | for _, row in redundancies_df.iterrows() 234 | } 235 | 236 | # compute 3mr ranks 237 | mrmrmr_ranking = rank_features_3MR( 238 | relevance_dict, redundancy_dict, relations_dict, 239 | ) 240 | mrmrmr_ranking.to_csv( 241 | os.path.join(args.output_folder, '3mr_ranks.tsv'), sep='\t', index=False, 242 | ) 243 | 244 | feature_first_modified = [] 245 | feature_second_modified = [] 246 | 247 | if args.include_cardinality_in_feature_names == 'True': 248 | for enx in range(triplets.shape[0]): 249 | feature_first = triplets.iloc[enx]['FeatureA'] 250 | feature_second = triplets.iloc[enx]['FeatureB'] 251 | card_first = str(len(cardinality_object[feature_first])) 252 | card_second = str(len(cardinality_object[feature_second])) 253 | cov_first = int( 254 | round((np.mean(np.array(coverage_object[feature_first]))), 1), 255 | ) 256 | cov_second = int( 257 | round(np.mean(np.array(coverage_object[feature_second])), 1), 258 | ) 259 | 260 | feature_first_modified.append( 261 | feature_first + f'-({card_first}; {cov_first})', 262 | ) 263 | feature_second_modified.append( 264 | feature_second + f'-({card_second}; {cov_second})', 265 | ) 266 | 267 | triplets['FeatureA'] = feature_first_modified 268 | triplets['FeatureB'] = feature_second_modified 269 | 270 | feature_memory_df = pd.DataFrame(global_memory_storage).mean() 271 | feature_memory_df.columns = ['NormalizedSize'] 272 | feature_memory_df.to_csv( 273 | f'{args.output_folder}/memory.tsv', sep='\t', index=True, 274 | ) 275 | 276 | triplets = triplets.sort_values(by=['Score']) 277 | 278 | triplets.to_csv( 279 | os.path.join(args.output_folder, 'pairwise_ranks.tsv'), sep='\t', index=False, 280 | ) 281 | 282 | with open(f'{args.output_folder}/value_repetitions.json', 'w') as out_counts: 283 | out_dict = {} 284 | for k, v in GLOBAL_ITEM_COUNTS.items(): 285 | actual_hist = np.array(list(v.default_counter.values())) 286 | more_than = lambda n, ary: len(np.where(ary > n)[0]) 287 | out_dict[k] = {x: more_than(x, actual_hist) for x in [0] + [1 * 10 ** x for x in range(6)]} 288 | out_counts.write(json.dumps(out_dict)) 289 | 290 | with open(f'{args.output_folder}/combination_estimation_counts.json', 'w') as out_counts: 291 | out_dict = {str(k): v for k, v in GLOBAL_PRIOR_COMB_COUNTS.items()} 292 | out_counts.write(json.dumps(out_dict)) 293 | 294 | # Write timings and config for replicability 295 | dfx = pd.DataFrame(all_timings) 296 | dfx.to_json(f'{args.output_folder}/timings.json') 297 | write_json_dump_to_file(args, f'{args.output_folder}/arguments.json') 298 | 299 | logging.info( 300 | f'Finished with ranking! Result stored as: {args.output_folder}/pairwise_ranks.tsv. Cleaning up tmp files ..', 301 | ) 302 | 303 | os.remove('ranking_checkpoint_tmp.tsv') 304 | 305 | def identify_data_file_type(data_path): 306 | all_files = set(list(glob.glob(os.path.join(data_path, '*')))) 307 | gz_pname, zst_pname = 'data.vw.gz', 'data.vw.zst' 308 | if gz_pname in ''.join(all_files): 309 | return os.path.join(data_path, gz_pname) 310 | elif zst_pname in ''.join(all_files): 311 | return os.path.join(data_path, zst_pname) 312 | else: 313 | raise NotImplementedError('Please provide a valid data type .. (gz, zst)') 314 | --------------------------------------------------------------------------------