├── .azure-pipelines.old ├── pipeline-master.yml └── templates │ ├── create-env.yml │ ├── release.yml │ └── run-tests.yml ├── .bumpversion.cfg ├── .codecov.yml ├── .darglint ├── .deepsource.toml ├── .devcontainer ├── Dockerfile ├── devcontainer.json └── noop.txt ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation_fix.md │ ├── new_examples.md │ └── new_proposed_feature.md ├── config.yml ├── config.yml.save ├── pull_request_template.md ├── stale.yaml └── workflows │ ├── auto-release.yml │ ├── auto-update.yml │ ├── docs.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pyup.yml ├── .requirements ├── all.in ├── all.txt ├── base.in ├── base.txt ├── biology.in ├── biology.txt ├── chemistry.in ├── chemistry.txt ├── dev.in ├── dev.txt ├── docs.in ├── docs.txt ├── engineering.in ├── engineering.txt ├── spark.in ├── spark.txt ├── testing.in └── testing.txt ├── AUTHORS.md ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── environment-dev.yml ├── examples ├── add_column.md ├── convert_currency.md ├── data │ └── medium_franchise_raw_table.csv ├── filter_date.md ├── limit_column_characters.md ├── make_currency_column_numeric.md ├── notebooks │ ├── Pivot Data from Long to Wide Form.ipynb │ ├── Pivoting Data from Wide to Long.ipynb │ ├── README.rst │ ├── Row_to_Names.ipynb │ ├── anime.ipynb │ ├── bad_values.ipynb │ ├── bird_call.ipynb │ ├── board_games.ipynb │ ├── case_when.ipynb │ ├── coalesce.ipynb │ ├── complete.ipynb │ ├── dirty_data.ipynb │ ├── dirty_data.xlsx │ ├── encode_categorical.ipynb │ ├── expand_grid.ipynb │ ├── fill_direction.ipynb │ ├── french_trains.ipynb │ ├── groupby_agg.ipynb │ ├── inflating_converting_currency.ipynb │ ├── medium_franchise.ipynb │ ├── normalize.ipynb │ ├── process_text.ipynb │ ├── pyjanitor_intro.ipynb │ ├── select_columns.ipynb │ ├── sort_columns.ipynb │ ├── sort_naturally.ipynb │ ├── teacher_pupil.ipynb │ └── transform_column.ipynb ├── round_to_fraction.md ├── row_to_names.md └── then.md ├── janitor ├── __init__.py ├── accessors │ ├── __init__.py │ └── data_description.py ├── biology.py ├── chemistry.py ├── engineering.py ├── errors.py ├── finance.py ├── functions │ ├── __init__.py │ ├── _numba.py │ ├── add_columns.py │ ├── alias.py │ ├── also.py │ ├── bin_numeric.py │ ├── case_when.py │ ├── change_index_dtype.py │ ├── change_type.py │ ├── clean_names.py │ ├── coalesce.py │ ├── collapse_levels.py │ ├── complete.py │ ├── concatenate_columns.py │ ├── conditional_join.py │ ├── convert_date.py │ ├── count_cumulative_unique.py │ ├── currency_column_to_numeric.py │ ├── deconcatenate_column.py │ ├── drop_constant_columns.py │ ├── drop_duplicate_columns.py │ ├── dropnotnull.py │ ├── encode_categorical.py │ ├── expand_column.py │ ├── expand_grid.py │ ├── explode_index.py │ ├── factorize_columns.py │ ├── fill.py │ ├── filter.py │ ├── find_replace.py │ ├── flag_nulls.py │ ├── get_dupes.py │ ├── groupby_agg.py │ ├── groupby_topk.py │ ├── impute.py │ ├── jitter.py │ ├── join_apply.py │ ├── label_encode.py │ ├── limit_column_characters.py │ ├── min_max_scale.py │ ├── move.py │ ├── mutate.py │ ├── pivot.py │ ├── process_text.py │ ├── remove_columns.py │ ├── remove_empty.py │ ├── rename_columns.py │ ├── reorder_columns.py │ ├── round_to_fraction.py │ ├── row_to_names.py │ ├── select.py │ ├── shuffle.py │ ├── sort_column_value_order.py │ ├── sort_naturally.py │ ├── summarise.py │ ├── take_first.py │ ├── then.py │ ├── to_datetime.py │ ├── toset.py │ ├── transform_columns.py │ ├── 
truncate_datetime.py │ ├── update_where.py │ └── utils.py ├── io.py ├── math.py ├── ml.py ├── polars │ ├── __init__.py │ ├── clean_names.py │ ├── complete.py │ ├── dates_to_polars.py │ ├── pivot_longer.py │ ├── polars_flavor.py │ └── row_to_names.py ├── spark │ ├── __init__.py │ ├── backend.py │ └── functions.py ├── testing_utils │ ├── __init__.py │ ├── date_data.py │ └── strategies.py ├── timeseries.py ├── utils.py └── xarray │ ├── __init__.py │ └── functions.py ├── mkdocs.yml ├── mkdocs ├── AUTHORS.md ├── CHANGELOG.md ├── api │ ├── biology.md │ ├── chemistry.md │ ├── engineering.md │ ├── finance.md │ ├── functions.md │ ├── io.md │ ├── math.md │ ├── ml.md │ ├── polars.md │ ├── timeseries.md │ └── xarray.md ├── css │ └── apidocs.css ├── development │ └── lazy_imports.md ├── devguide.md ├── environment.yaml └── index.md ├── nbconvert_config.py ├── pyproject.toml ├── scripts ├── ci │ ├── build_environment.sh │ └── unpack_environment.sh ├── count_functions.py └── docker_deploy.sh ├── setup.py ├── talks └── scipy2019 │ ├── friends.png │ ├── readthedocs.png │ ├── slides.ipynb │ ├── slides.md │ ├── sprints.jpg │ └── twitter-wars.png └── tests ├── biology └── test_join_fasta.py ├── chemistry ├── test_maccs_keys_fingerprint.py ├── test_molecular_descriptors.py ├── test_morgan_fingerprint.py └── test_smiles2mol.py ├── conftest.py ├── engineering └── test_convert_units.py ├── finance ├── test_convert_currency.py ├── test_convert_stock.py ├── test_get_symbol.py └── test_inflate_currency.py ├── functions ├── test_add_column.py ├── test_add_columns.py ├── test_alias.py ├── test_also.py ├── test_bin_numeric.py ├── test_cartesian_product.py ├── test_case_when.py ├── test_change_index_dtype.py ├── test_change_type.py ├── test_clean_names.py ├── test_coalesce.py ├── test_collapse_levels.py ├── test_complete.py ├── test_concatenate_columns.py ├── test_conditional_join.py ├── test_convert_excel_date.py ├── test_convert_matlab_date.py ├── test_convert_unix_date.py ├── test_count_cumulative_unique.py ├── test_currency_column_to_numeric.py ├── test_data_description.py ├── test_deconcatenate_column.py ├── test_drop_constant_columns.py ├── test_drop_duplicate_columns.py ├── test_dropnotnull.py ├── test_encode_categorical.py ├── test_expand.py ├── test_expand_column.py ├── test_expand_grid.py ├── test_explode_index.py ├── test_factorize_columns.py ├── test_fill_direction.py ├── test_fill_empty.py ├── test_filter_column_isin.py ├── test_filter_date.py ├── test_filter_on.py ├── test_filter_string.py ├── test_find_replace.py ├── test_flag_nulls.py ├── test_get_dupes.py ├── test_groupby_agg.py ├── test_groupby_topk.py ├── test_impute.py ├── test_jitter.py ├── test_join_apply.py ├── test_label_encode.py ├── test_limit_column_characters.py ├── test_min_max_scale.py ├── test_move.py ├── test_mutate.py ├── test_pivot_longer.py ├── test_pivot_longer_spec.py ├── test_pivot_wider.py ├── test_pivot_wider_spec.py ├── test_process_text.py ├── test_remove_columns.py ├── test_remove_empty.py ├── test_rename_column.py ├── test_rename_columns.py ├── test_reorder_columns.py ├── test_round_to_fraction.py ├── test_row_to_names.py ├── test_select.py ├── test_select_columns.py ├── test_select_rows.py ├── test_shuffle.py ├── test_sort_column_value_order.py ├── test_sort_naturally.py ├── test_summarise.py ├── test_take_first.py ├── test_then.py ├── test_to_datetime.py ├── test_toset.py ├── test_transform_column.py ├── test_transform_columns.py ├── test_truncate_datetime.py ├── test_unionize_dataframe_categories.py └── test_update_where.py 
├── helpers.py ├── io ├── test_read_commandline.py ├── test_read_csvs.py ├── test_tidyxl.py └── test_xlsx_table.py ├── math ├── test_ecdf.py ├── test_exp.py ├── test_log.py ├── test_logit.py ├── test_normal_cdf.py ├── test_probit.py ├── test_sigmoid.py ├── test_softmax.py └── test_z_score.py ├── ml └── test_get_features_targets.py ├── polars └── functions │ ├── test_clean_names_polars.py │ ├── test_complete_polars.py │ ├── test_convert_excel_date_polars.py │ ├── test_convert_matlab_date_polars.py │ ├── test_expand_polars.py │ ├── test_pivot_longer_polars.py │ ├── test_pivot_longer_spec_polars.py │ └── test_row_to_names_polars.py ├── spark ├── conftest.py └── functions │ ├── test_clean_names_spark.py │ └── test_update_where_spark.py ├── test_data ├── 016-MSPTDA-Excel.xlsx ├── corrected_smiles.txt ├── excel_without_headers.xlsx ├── file_example_XLSX_10.xlsx ├── sequences.fasta ├── sequences.tsv └── worked-examples.xlsx ├── test_documentation_build.py ├── test_helpers.py ├── timeseries ├── test_fill_missing_timestamps.py ├── test_flag_jumps.py └── test_sort_timestamps_monotonically.py ├── utils ├── test_check_column.py ├── test_deprecated_alias.py ├── test_deprecated_kwargs.py ├── test_idempotent.py ├── test_import_message.py ├── test_is_connected.py ├── test_replace_empty_string_with_none.py ├── test_skiperror.py └── test_skipna.py └── xarray ├── conftest.py ├── test_clone_using.py └── test_convert_datetime_to_number.py /.azure-pipelines.old/pipeline-master.yml: -------------------------------------------------------------------------------- 1 | pr: 2 | - dev 3 | 4 | jobs: 5 | - job: linux 6 | variables: 7 | activate.command: "source activate" 8 | JANITOR_CI_MACHINE: 1 9 | strategy: 10 | matrix: 11 | py37: 12 | python.version: "3.7" 13 | 14 | pool: 15 | vmImage: ubuntu-16.04 16 | 17 | steps: 18 | - bash: echo "##vso[task.prependpath]$CONDA/bin" 19 | displayName: Add conda to PATH 20 | - template: templates/create-env.yml 21 | - template: templates/run-tests.yml 22 | # - template: templates/release.yml 23 | 24 | - job: macos 25 | variables: 26 | activate.command: "source activate" 27 | JANITOR_CI_MACHINE: 1 28 | strategy: 29 | matrix: 30 | py37: 31 | python.version: "3.7" 32 | 33 | pool: 34 | vmImage: macOS-10.14 35 | 36 | steps: 37 | - bash: echo "##vso[task.prependpath]$CONDA/bin" 38 | displayName: Add conda to PATH 39 | 40 | # On Hosted macOS, the agent user doesn't have ownership of Miniconda's installation directory/ 41 | # We need to take ownership if we want to update conda or install packages globally 42 | - bash: sudo chown -R $USER $CONDA 43 | displayName: Take ownership of conda installation 44 | - template: templates/create-env.yml 45 | - template: templates/run-tests.yml 46 | # - template: templates/release.yml 47 | 48 | # Commenting out Windows build because it never fails when it should... 
49 | # - job: windows 50 | # variables: 51 | # activate.command: "activate" 52 | # JANITOR_CI_MACHINE: 1 53 | # strategy: 54 | # matrix: 55 | # py37: 56 | # python.version: "3.7" 57 | 58 | # pool: 59 | # vmImage: vs2017-win2016 60 | 61 | # steps: 62 | # - powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" 63 | # displayName: Add conda to PATH 64 | # - template: templates/create-env.yml 65 | # - template: templates/run-tests.yml 66 | # # - template: templates/release.yml 67 | -------------------------------------------------------------------------------- /.azure-pipelines.old/templates/create-env.yml: -------------------------------------------------------------------------------- 1 | steps: 2 | - script: | 3 | conda env create -f environment-dev.yml 4 | conda install -y python=$(python.version) 5 | $(activate.command) pyjanitor-dev 6 | python -m ipykernel install --user --name pyjanitor-dev 7 | python setup.py develop 8 | displayName: 'Install kernel, package and dependencies' 9 | -------------------------------------------------------------------------------- /.azure-pipelines.old/templates/release.yml: -------------------------------------------------------------------------------- 1 | # Unconditionally release pyjanitor. 2 | # If no version bump has happened, then nothing happens. 3 | # If a version bump has happened, then the build will automatically deploy 4 | # a release to PyPI. 5 | 6 | steps: 7 | - script: | 8 | $(activate.command) pyjanitor-dev 9 | conda install twine setuptools 10 | displayName: 'Install Twine and setuptools' 11 | - script: | 12 | $(activate.command) pyjanitor-dev 13 | python setup.py sdist bdist_wheel 14 | displayName: 'Build artifacts to deploy.' 15 | - script: | 16 | $(activate.command) pyjanitor-dev 17 | twine upload dist/* --skip-existing --username $(pypi.username) --password $(pypi.password) 18 | displayName: 'Upload built artifacts.' 19 | -------------------------------------------------------------------------------- /.azure-pipelines.old/templates/run-tests.yml: -------------------------------------------------------------------------------- 1 | steps: 2 | - script: | 3 | $(activate.command) pyjanitor-dev 4 | flake8 . --exclude ./nbconvert_config.py 5 | displayName: 'Code style: flake8' 6 | - script: | 7 | $(activate.command) pyjanitor-dev 8 | black 9 | displayName: 'Code style: black' 10 | - script: | 11 | $(activate.command) pyjanitor-dev 12 | interrogate 13 | displayName: 'Docstring coverage: interrogate' 14 | - script: | 15 | $(activate.command) pyjanitor-dev 16 | darglint janitor -v 2 17 | displayName: 'Docstring linter: darglint' 18 | - script: | 19 | $(activate.command) pyjanitor-dev 20 | pytest 21 | displayName: 'Unit tests.' 22 | - script: | 23 | $(activate.command) pyjanitor-dev 24 | python scripts/check-autodoc.py 25 | displayName: 'Check that all general functions have been added to docs.' 26 | - script: | 27 | $(activate.command) pyjanitor-dev 28 | bash <(curl -s https://codecov.io/bash) -t c4aaeb6c-be8f-44b2-a529-6871f3537261 29 | displayName: 'Upload code coverage.' 30 | - script: | 31 | $(activate.command) pyjanitor-dev 32 | python -m ipykernel install --user --name pyjanitor-dev 33 | jupyter nbconvert --to notebook --config nbconvert_config.py --execute --template full 34 | displayName: 'Test that all notebooks execute correctly.' 35 | - script: | 36 | $(activate.command) pyjanitor-dev 37 | cd docs && make html 38 | displayName: 'Test that HTML docs all build correctly.' 
39 | 
--------------------------------------------------------------------------------
/.bumpversion.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | current_version = 0.31.0
3 | commit = True
4 | tag = True
5 | 
6 | [bumpversion:file:setup.py]
7 | 
8 | [bumpversion:file:janitor/__init__.py]
9 | 
--------------------------------------------------------------------------------
/.codecov.yml:
--------------------------------------------------------------------------------
1 | codecov:
2 |   notify:
3 |     require_ci_to_pass: yes
4 | 
5 | coverage:
6 |   precision: 2
7 |   round: down
8 |   range: "70...100"
9 | 
10 |   status:
11 |     project: yes
12 |     patch: yes
13 |     changes: no
14 | 
15 | parsers:
16 |   gcov:
17 |     branch_detection:
18 |       conditional: yes
19 |       loop: yes
20 |       method: no
21 |       macro: no
22 | 
23 | comment:
24 |   layout: "header, diff"
25 |   behavior: default
26 |   require_changes: no
27 | 
--------------------------------------------------------------------------------
/.darglint:
--------------------------------------------------------------------------------
1 | [darglint]
2 | docstring_style=google
3 | strictness=short
4 | ignore_regex=^(test_|_)(.*)
5 | 
--------------------------------------------------------------------------------
/.deepsource.toml:
--------------------------------------------------------------------------------
1 | version = 1
2 | 
3 | test_patterns = [
4 |   "tests/**",
5 |   "test_*.py",
6 |   "scripts/check-autodoc.py",
7 | ]
8 | 
9 | [[analyzers]]
10 | name = "python"
11 | enabled = true
12 | 
13 | [analyzers.meta]
14 | runtime_version = "3.x.x"
15 | 
--------------------------------------------------------------------------------
/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information.
4 | #-------------------------------------------------------------------------------------------------------------
5 | 
6 | FROM continuumio/miniconda3
7 | 
8 | # Avoid warnings by switching to noninteractive
9 | ENV DEBIAN_FRONTEND=noninteractive
10 | 
11 | # This Dockerfile adds a non-root user with sudo access. Use the "remoteUser"
12 | # property in devcontainer.json to use it. On Linux, the container user's GID/UID
13 | # will be updated to match your local UID/GID (when using the dockerFile property).
14 | # See https://aka.ms/vscode-remote/containers/non-root-user for details.
15 | ARG USERNAME=vscode
16 | ARG USER_UID=1000
17 | ARG USER_GID=$USER_UID
18 | 
19 | # Copy environment-dev.yml (if found) to a temp location so we can update the environment. Also
20 | # copy "noop.txt" so the COPY instruction does not fail if no environment-dev.yml exists.
21 | COPY environment-dev.yml* .devcontainer/noop.txt /tmp/conda-tmp/
22 | 
23 | # Configure apt and install packages
24 | RUN apt-get update \
25 |     && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \
26 |     #
27 |     # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed
28 |     && apt-get -y install git openssh-client less iproute2 procps lsb-release gcc build-essential \
29 |     #
30 |     # Install pylint
31 |     && /opt/conda/bin/pip install pylint \
32 |     && /opt/conda/bin/conda install mamba gh -c conda-forge \
33 |     #
34 |     # Update Python environment based on environment-dev.yml (if present)
35 |     && if [ -f "/tmp/conda-tmp/environment-dev.yml" ]; then /opt/conda/bin/mamba env update -n base -f /tmp/conda-tmp/environment-dev.yml; fi \
36 |     && rm -rf /tmp/conda-tmp \
37 |     #
38 |     # Create a non-root user to use if preferred - see https://aka.ms/vscode-remote/containers/non-root-user.
39 |     && groupadd --gid $USER_GID $USERNAME \
40 |     && useradd -s /bin/bash --uid $USER_UID --gid $USER_GID -m $USERNAME \
41 |     # [Optional] Add sudo support for the non-root user
42 |     && apt-get install -y sudo \
43 |     && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
44 |     && chmod 0440 /etc/sudoers.d/$USERNAME \
45 |     # [Additional Customization]
46 |     && apt-get install -y nano vim emacs \
47 |     # Clean up
48 |     && apt-get autoremove -y \
49 |     && apt-get clean -y \
50 |     && rm -rf /var/lib/apt/lists/*
51 | 
52 | # Switch back to dialog for any ad-hoc use of apt-get
53 | ENV DEBIAN_FRONTEND=dialog
54 | 
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | // For format details, see https://aka.ms/vscode-remote/devcontainer.json or this file's README at:
2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.117.1/containers/python-3-miniconda
3 | {
4 |     "name": "pyjanitor dev container",
5 |     // "context": "..",
6 |     // "image": "registry.hub.docker.com/ericmjl/pyjanitor:devcontainer",
7 |     "build": {
8 |         "dockerfile": "Dockerfile",
9 |         "context": ".."
10 |     },
11 |     // Set *default* container specific settings.json values on container create.
12 |     "settings": {
13 |         "terminal.integrated.defaultProfile.linux": "bash",
14 |         "python.defaultInterpreterPath": "/opt/conda/bin/python",
15 |         "python.linting.enabled": true,
16 |         "python.linting.pylintEnabled": true,
17 |         "python.linting.pylintPath": "/opt/conda/bin/pylint",
18 |         "python.formatting.provider": "black",
19 |         "python.formatting.blackArgs": [
20 |             "--config",
21 |             "pyproject.toml",
22 |         ],
23 |         "editor.formatOnSave": true,
24 |         "files.insertFinalNewline": true,
25 |         "files.trimFinalNewlines": true,
26 |         "files.trimTrailingWhitespace": true,
27 |         "[python]": {
28 |             "editor.formatOnSaveMode": "file",
29 |         },
30 |     },
31 |     // Add the IDs of extensions you want installed when the container is created.
32 |     "extensions": [
33 |         "ms-python.python",
34 |         "ms-python.vscode-pylance",
35 |         "ms-vsliveshare.vsliveshare-pack",
36 |         "arcticicestudio.nord-visual-studio-code",
37 |         "ms-vsliveshare.vsliveshare",
38 |         "ms-vsliveshare.vsliveshare-audio"
39 |     ],
40 |     // Use 'forwardPorts' to make a list of ports inside the container available locally.
41 |     "forwardPorts": [
42 |         8000
43 |     ],
44 |     // Use 'postCreateCommand' to run commands after the container is created.
45 |     "postCreateCommand": "pre-commit install --install-hooks && python setup.py develop"
46 |     // Uncomment to connect as a non-root user. See https://aka.ms/vscode-remote/containers/non-root.
47 |     // "remoteUser": "vscode"
48 | }
49 | 
--------------------------------------------------------------------------------
/.devcontainer/noop.txt:
--------------------------------------------------------------------------------
1 | This file is copied into the container along with environment-dev.yml* from the
2 | parent folder. This is done to prevent the Dockerfile COPY instruction from
3 | failing if no environment-dev.yml is found.
4 | 
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | per-file-ignores =
3 |     janitor/functions/__init__.py:F401
4 |     janitor/accessors/__init__.py:F401
5 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug Report
3 | about: Please use this issue template if you are filing a bug report.
4 | ---
5 | 
6 | # Brief Description
7 | 
8 | 
9 | 
10 | # System Information
11 | 
12 | 
14 | 
15 | - Operating system: macOS/Linux/Windows
16 | - OS details (optional):
17 | - Python version (required):
18 | 
19 | # Minimally Reproducible Code
20 | 
21 | 
25 | 
26 | # Error Messages
27 | 
28 | 
29 | 
30 | ```
31 | 
32 | ```
33 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation_fix.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Propose a Documentation Fix
3 | about: Use this issue tracker template if you'd like to propose a fix to the documentation.
4 | ---
5 | 
6 | # Brief Description of Fix
7 | 
8 | 
12 | 
13 | Currently, the docs...
14 | 
15 | I would like to propose a change, such that now the docs...
16 | 
17 | # Relevant Context
18 | 
19 | 
21 | 
22 | - [Link to documentation page](https://pyjanitor-devs.github.io/pyjanitor/)
23 | - [Link to exact file to be edited](https://github.com/pyjanitor-devs/pyjanitor/blob/dev/AUTHORS.md)
24 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/new_examples.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Add/Modify Notebooks
3 | about: Use this specific template if you'd like to contribute a notebook to the examples gallery or modify an existing one.
4 | ---
5 | 
6 | # Brief Description
7 | 
8 | 
9 | 
10 | I'd like to write a notebook that...
11 | 
12 | (optional but encouraged) This notebook would likely cover the following pyjanitor functions:
13 | 
14 | -
15 | -
16 | -
17 | 
18 | 
19 | 
20 | # Dataset
21 | 
22 | 
23 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/new_proposed_feature.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Propose New Feature
3 | about: If you'd like to propose a new feature, please use this template.
4 | ---
5 | 
6 | # Brief Description
7 | 
8 | 
9 | 
10 | I would like to propose...
11 | 
12 | # Example API
13 | 
14 | 
17 | 
18 | Please modify the example API below to illustrate your proposed API, and then delete this sentence.
19 | 
20 | ```python
21 | # transform only one column, while creating a new column name for it.
22 | df.transform_columns(column_names=['col1'], function=np.abs, new_column_names=['col1_abs'])
23 | 
24 | # transform multiple columns by the same function, without creating a new column name.
25 | df.transform_columns(column_names=['col1', 'col2'], function=np.abs)
26 | 
27 | # more examples below
28 | # ...
29 | ```
30 | 
--------------------------------------------------------------------------------
/.github/config.yml:
--------------------------------------------------------------------------------
1 | # Configuration for the-welcome-bot - https://github.com/bash-bot/the-welcome-bot
2 | 
3 | # Message for a new user who opens an issue
4 | issueOpen: >
5 |   Hello World! Welcome to the project!
6 |   Thanks and congrats for opening your very first issue in this project.
7 |   You may submit a PR on the same if you like!
8 |   Hope you have a great time here :)
9 | 
10 | # Message for a new user who comments on an issue
11 | issueComment: >
12 |   Hello World! Welcome to the project!
13 |   Thanks and congrats for your very first comment on this project.
14 |   Check out the README for more details on it.
15 |   Want to contribute? Make an issue or submit a PR.
16 |   Hope you have a great time here :)
17 | 
18 | # Message for a new user who opens a PR
19 | prOpen: >
20 |   Hello World! Welcome to the project!
21 |   Thank you and congrats for your first PR on this project.
22 |   We will review it soon!
23 |   Till then you can check out the README for more details on it.
24 |   Hope you have a great time here :)
25 | 
26 | # Default message for a new user
27 | welcomeMessage: >
28 |   Hello World! Welcome to the project! Feel free to explore it.
29 |   Check out the README for more details on it.
30 |   Want to contribute? Make an issue or submit a PR.
31 |   You can check the contributing guides and code of conduct for the same.
32 |   Hope you have a great time here :)
33 | 
--------------------------------------------------------------------------------
/.github/config.yml.save:
--------------------------------------------------------------------------------
1 | # Configuration for the-welcome-bot - https://github.com/bash-bot/the-welcome-bot
2 | 
3 | # Message for a new user who opens an issue
4 | issueOpen: >
5 |   Hello World! Welcome to the project!
6 |   Thanks and congrats for opening your very first issue in this project.
7 |   You may submit a PR on the same if you like!
8 |   Hope you have a great time here :)
9 | 
10 | # Message for a new user who comments on an issue
11 | issueComment: >
12 |   Hello World! Welcome to the project!
13 |   Thanks and congrats for your very first comment on this project.
14 |   Check out the README for more details on it.
15 |   Want to contribute? Make an issue or submit a PR.
16 |   Hope you have a great time here :)
17 | 
18 | # Message for a new user who opens a PR
19 | prOpen: >
20 |   Hello World! Welcome to the project!
21 |   Thank you and congrats for your first PR on this project.
22 |   We will review it soon!
23 |   Till then you can check out the README for more details on it.
24 |   Hope you have a great time here :)
25 | 
26 | # Default message for a new user
27 | welcomeMessage: >
28 |   Hello World! Welcome to the project! Feel free to explore it.
29 |   Check out the README for more details on it.
30 |   Want to contribute? Make an issue or submit a PR.
31 |   You can check the contributing guides and code of conduct for the same.
32 |   Hope you have a great time here :)
33 | 
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | 
15 | 
16 | # PR Description
17 | 
18 | Please describe the changes proposed in the pull request:
19 | 
20 | -
21 | -
22 | -
23 | 
24 | 
25 | 
26 | 
27 | 
28 | **This PR resolves #(put issue number here, and remove parentheses).**
29 | 
30 | 
31 | 
32 | # PR Checklist
33 | 
34 | 
36 | 
37 | Please ensure that you have done the following:
38 | 
39 | 1. [ ] PR in from a fork off your branch. Do not PR from `<username>`:`dev`, but rather from `<username>`:`<feature-branch_name>`.
40 | 
41 | 2. [ ] If you're not on the contributors list, add yourself to `AUTHORS.md`.
42 | 
43 | 3. [ ] Add a line to `CHANGELOG.md` under the latest version header (i.e. the one that is "on deck") describing the contribution.
44 |    - Do use some discretion here; if there are multiple PRs that are related, keep them in a single line.
45 | 
46 | # Automatic checks
47 | 
48 | There will be automatic checks run on the PR. These include:
49 | 
50 | - Building a preview of the docs on Netlify
51 | - Automatically linting the code
52 | - Making sure the code is documented
53 | - Making sure that all tests pass
54 | - Making sure that code coverage doesn't go down
55 | 
56 | # Relevant Reviewers
57 | 
58 | 
59 | 
60 | Please tag maintainers to review.
61 | 
62 | - @ericmjl
63 | 
--------------------------------------------------------------------------------
/.github/stale.yaml:
--------------------------------------------------------------------------------
1 | # Number of days of inactivity before an issue becomes stale
2 | daysUntilStale: 30
3 | # Number of days of inactivity before a stale issue is closed
4 | daysUntilClose: 7
5 | # Issues with these labels will never be considered stale
6 | exemptLabels:
7 |   - pinned
8 |   - security
9 | # Label to use when marking an issue as stale
10 | staleLabel: wontfix
11 | # Comment to post when marking an issue as stale. Set to `false` to disable
12 | markComment: >
13 |   This issue has been automatically marked as stale because it has not had
14 |   recent activity. It will be closed if no further activity occurs. Thank you
15 |   for your contributions.
16 | # Comment to post when closing a stale issue. Set to `false` to disable
17 | closeComment: false
18 | 
--------------------------------------------------------------------------------
/.github/workflows/auto-update.yml:
--------------------------------------------------------------------------------
1 | # This workflow automatically updates PR branches with the latest changes on the target branch.
2 | # See: https://github.com/marketplace/actions/auto-update
3 | name: autoupdate
4 | on:
5 |   # This will trigger on pushes to the dev branch.
6 | push: 7 | branches: [dev] 8 | jobs: 9 | autoupdate: 10 | name: autoupdate 11 | runs-on: ubuntu-20.04 12 | steps: 13 | - uses: docker://chinthakagodawita/autoupdate-action:v1 14 | env: 15 | GITHUB_TOKEN: "${{ secrets.GHPAGES_TOKEN }}" 16 | PR_READY_STATE: "ready_for_review" 17 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - dev 7 | pull_request: 8 | branches: 9 | - dev 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell 16 | defaults: 17 | run: 18 | shell: bash -l {0} 19 | 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v2 23 | 24 | # See: https://github.com/marketplace/actions/setup-miniconda 25 | - name: Setup miniconda 26 | uses: conda-incubator/setup-miniconda@v3 27 | with: 28 | auto-update-conda: true 29 | environment-file: environment-dev.yml 30 | miniforge-version: latest 31 | 32 | - name: Install pyjanitor 33 | # use editable mode to avoid _pytest.pathlib.ImportPathMismatchError 34 | run: pip install -e . 35 | 36 | - name: Build docs 37 | run: mkdocs build 38 | 39 | - uses: actions/upload-artifact@v4 40 | with: 41 | name: website 42 | path: site/ 43 | 44 | - name: Test docs 45 | run: pytest -m "documentation" 46 | 47 | - name: Docs preview 48 | if: ${{ github.event_name == 'pull_request' }} 49 | uses: nwtgck/actions-netlify@v1.1 50 | with: 51 | publish-dir: "./site" 52 | production-deploy: false 53 | github-token: ${{ secrets.GHPAGES_TOKEN }} 54 | deploy-message: "Deploy from GitHub Actions" 55 | enable-pull-request-comment: true 56 | enable-commit-comment: false 57 | overwrites-pull-request-comment: true 58 | alias: deploy-preview-${{ github.event.number }} 59 | env: 60 | NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} 61 | NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} 62 | timeout-minutes: 1 63 | 64 | - name: Deploy website 65 | if: ${{ github.event_name == 'push' }} 66 | uses: peaceiris/actions-gh-pages@v3 67 | with: 68 | # https://github.com/peaceiris/actions-gh-pages#%EF%B8%8F-set-personal-access-token-personal_token 69 | personal_token: ${{ secrets.GHPAGES_TOKEN }} 70 | publish_dir: ./site/ 71 | publish_branch: gh-pages 72 | # destination_dir: manuscript 73 | allow_empty_commit: false 74 | keep_files: false 75 | force_orphan: true 76 | enable_jekyll: false 77 | disable_nojekyll: false 78 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: pyjanitor tests 2 | 3 | on: 4 | # only run tests and coverage when src-code changes 5 | push: 6 | branches: 7 | - dev 8 | paths: 9 | - "janitor/**" 10 | - "tests/**" 11 | - ".codecov.yml" 12 | - ".github/workflows/tests.yml" 13 | pull_request: 14 | branches: 15 | - dev 16 | paths: 17 | - "janitor/**" 18 | - "tests/**" 19 | - ".codecov.yml" 20 | - ".github/workflows/tests.yml" 21 | 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | run-tests: 28 | strategy: 29 | fail-fast: false 30 | runs-on: ubuntu-latest 31 | name: Run pyjanitor test suite 32 | 33 | # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell 34 | defaults: 35 | run: 36 | shell: bash -l {0} 37 | 38 | steps: 
39 | - name: Checkout repository 40 | uses: actions/checkout@v4 41 | 42 | # See: https://github.com/marketplace/actions/setup-miniconda 43 | - name: Setup miniconda 44 | uses: conda-incubator/setup-miniconda@v3 45 | with: 46 | auto-update-conda: true 47 | environment-file: environment-dev.yml 48 | miniforge-version: latest 49 | 50 | - name: Install pyjanitor 51 | run: python -m pip install -e . 52 | 53 | - name: Run docstrings tests 54 | run: pytest -v -r a -n auto --color=yes --durations=0 --cov=janitor --cov-append --cov-report term-missing --cov-report xml --doctest-only janitor 55 | 56 | - name: Run unit tests 57 | run: pytest -v -r a -n auto --color=yes --durations=0 --cov=janitor --cov-append --cov-report term-missing --cov-report xml tests 58 | 59 | # https://github.com/codecov/codecov-action 60 | - name: Upload code coverage 61 | uses: codecov/codecov-action@v2 62 | with: 63 | # fail_ci_if_error: true # optional (default = false) 64 | verbose: true # optional (default = false) 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | .vscode/* 3 | test*.xml 4 | *.DS_Store 5 | docs/notebooks 6 | pip-wheel-metadata 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | function_test.ipynb 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest
38 | *.spec
39 | 
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 | 
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | .hypothesis/
54 | 
55 | # Translations
56 | *.mo
57 | *.pot
58 | 
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | 
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 | 
67 | # Scrapy stuff:
68 | .scrapy
69 | 
70 | # Sphinx documentation
71 | docs/_build/
72 | 
73 | # PyBuilder
74 | target/
75 | 
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 | 
79 | # pyenv
80 | .python-version
81 | 
82 | # celery beat schedule file
83 | celerybeat-schedule
84 | 
85 | # SageMath parsed files
86 | *.sage.py
87 | 
88 | # dotenv
89 | .env
90 | 
91 | # virtualenv
92 | .venv
93 | .venv*/
94 | venv/
95 | env/
96 | ENV/
97 | 
98 | # Spyder project settings
99 | .spyderproject
100 | .spyproject
101 | 
102 | # Rope project settings
103 | .ropeproject
104 | 
105 | # mkdocs documentation
106 | /site
107 | 
108 | # mypy
109 | .mypy_cache/
110 | 
111 | # pycharm
112 | .idea/
113 | 
114 | 
115 | # Custom
116 | .pytest_cache
117 | 
118 | # Ignore docs' symbolic link to notebooks
119 | docs/notebooks
120 | docs/*
121 | 
122 | 
123 | # Swap
124 | [._]*.s[a-v][a-z]
125 | [._]*.sw[a-p]
126 | [._]s[a-rt-v][a-z]
127 | [._]ss[a-gi-z]
128 | [._]sw[a-p]
129 | 
130 | # Session
131 | Session.vim
132 | Sessionx.vim
133 | 
134 | # Temporary
135 | .netrwhist
136 | *~
137 | # Auto-generated tag files
138 | tags
139 | # Persistent undo
140 | [._]*.un~
141 | 
142 | # Other stuff
143 | *.profraw
144 | /scratch.py
145 | midpoint.csv
146 | 
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 |   - repo: https://github.com/pre-commit/pre-commit-hooks
5 |     rev: v5.0.0
6 |     hooks:
7 |       - id: trailing-whitespace
8 |       - id: end-of-file-fixer
9 |       - id: check-yaml
10 |       - id: check-added-large-files
11 |   - repo: https://github.com/psf/black
12 |     rev: 25.1.0
13 |     hooks:
14 |       - id: black
15 |         args: [--config, pyproject.toml]
16 |   - repo: https://github.com/econchick/interrogate
17 |     rev: 1.7.0
18 |     hooks:
19 |       - id: interrogate
20 |         args: [-c, pyproject.toml]
21 |   # Taking out darglint because it takes too long to run.
22 |   # It may be superseded by ruff: https://github.com/astral-sh/ruff/issues/458
23 |   # - repo: https://github.com/terrencepreilly/darglint
24 |   #   rev: v1.8.1
25 |   #   hooks:
26 |   #     - id: darglint
27 |   #       args: [-v 2]  # this config makes the error messages a bit less cryptic.
28 | 
29 |   # The interim replacement for darglint is pydoclint.
30 |   - repo: https://github.com/jsh9/pydoclint
31 |     rev: 0.6.7
32 |     hooks:
33 |       - id: pydoclint
34 |         args:
35 |           - "--config=pyproject.toml"
36 |   - repo: https://github.com/astral-sh/ruff-pre-commit
37 |     # Ruff version.
38 | rev: v0.11.12 39 | hooks: 40 | - id: ruff 41 | args: [--fix] 42 | -------------------------------------------------------------------------------- /.pyup.yml: -------------------------------------------------------------------------------- 1 | # configure updates globally 2 | # default: all 3 | # allowed: all, insecure, False 4 | update: all 5 | 6 | # configure dependency pinning globally 7 | # default: True 8 | # allowed: True, False 9 | pin: True 10 | 11 | # update schedule 12 | # default: empty 13 | # allowed: "every day", "every week", .. 14 | schedule: "every week on Saturday" 15 | -------------------------------------------------------------------------------- /.requirements/all.in: -------------------------------------------------------------------------------- 1 | -r base.in 2 | -r biology.in 3 | -r chemistry.in 4 | -r dev.in 5 | -r docs.in 6 | -r engineering.in 7 | -r spark.in 8 | -r testing.in 9 | # -e . 10 | -------------------------------------------------------------------------------- /.requirements/base.in: -------------------------------------------------------------------------------- 1 | # ipykernel 2 | # jupyter_client 3 | # lxml 4 | natsort 5 | # seaborn 6 | pandas_flavor 7 | multipledispatch 8 | scipy 9 | -------------------------------------------------------------------------------- /.requirements/base.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/base.in 6 | # 7 | multipledispatch==0.6.0 8 | # via -r .requirements/base.in 9 | natsort==8.1.0 10 | # via -r .requirements/base.in 11 | numpy==1.22.3 12 | # via 13 | # pandas 14 | # scipy 15 | # xarray 16 | packaging==21.3 17 | # via xarray 18 | pandas==1.4.1 19 | # via 20 | # pandas-flavor 21 | # xarray 22 | pandas-flavor==0.2.0 23 | # via -r .requirements/base.in 24 | pyparsing==3.0.7 25 | # via packaging 26 | python-dateutil==2.8.2 27 | # via pandas 28 | pytz==2021.3 29 | # via pandas 30 | scipy==1.10.0 31 | # via -r .requirements/base.in 32 | six==1.16.0 33 | # via 34 | # multipledispatch 35 | # python-dateutil 36 | xarray==2022.3.0 37 | # via pandas-flavor 38 | -------------------------------------------------------------------------------- /.requirements/biology.in: -------------------------------------------------------------------------------- 1 | biopython 2 | -------------------------------------------------------------------------------- /.requirements/biology.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/biology.in 6 | # 7 | biopython==1.79 8 | # via -r .requirements/biology.in 9 | numpy==1.22.3 10 | # via biopython 11 | -------------------------------------------------------------------------------- /.requirements/chemistry.in: -------------------------------------------------------------------------------- 1 | # rdkit # needed fix https://github.com/rdkit/rdkit/issues/1812 2 | tqdm 3 | -------------------------------------------------------------------------------- /.requirements/chemistry.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/chemistry.in 6 | # 7 | tqdm==4.66.3 8 | # via -r chemistry.in 9 | 
-------------------------------------------------------------------------------- /.requirements/dev.in: -------------------------------------------------------------------------------- 1 | pip-tools 2 | pre-commit 3 | isort>=4.3.18 4 | black>=19.3b0 5 | darglint 6 | flake8 7 | -------------------------------------------------------------------------------- /.requirements/dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/dev.in 6 | # 7 | black==24.3.0 8 | # via -r dev.in 9 | cfgv==3.3.1 10 | # via pre-commit 11 | click==8.0.4 12 | # via 13 | # black 14 | # pip-tools 15 | darglint==1.8.1 16 | # via -r dev.in 17 | distlib==0.3.9 18 | # via virtualenv 19 | filelock==3.16.1 20 | # via virtualenv 21 | flake8==4.0.1 22 | # via -r dev.in 23 | identify==2.4.11 24 | # via pre-commit 25 | isort==5.10.1 26 | # via -r dev.in 27 | mccabe==0.6.1 28 | # via flake8 29 | mypy-extensions==0.4.3 30 | # via black 31 | nodeenv==1.6.0 32 | # via pre-commit 33 | packaging==24.0 34 | # via black 35 | pathspec==0.9.0 36 | # via black 37 | pep517==0.12.0 38 | # via pip-tools 39 | pip-tools==6.5.1 40 | # via -r dev.in 41 | platformdirs==4.3.6 42 | # via 43 | # black 44 | # virtualenv 45 | pre-commit==2.17.0 46 | # via -r dev.in 47 | pycodestyle==2.8.0 48 | # via flake8 49 | pyflakes==2.4.0 50 | # via flake8 51 | pyyaml==6.0 52 | # via pre-commit 53 | toml==0.10.2 54 | # via pre-commit 55 | tomli==2.0.1 56 | # via 57 | # black 58 | # pep517 59 | typing-extensions==4.10.0 60 | # via black 61 | virtualenv==20.26.6 62 | # via pre-commit 63 | wheel==0.38.1 64 | # via pip-tools 65 | 66 | # The following packages are considered to be unsafe in a requirements file: 67 | # pip 68 | # setuptools 69 | -------------------------------------------------------------------------------- /.requirements/docs.in: -------------------------------------------------------------------------------- 1 | mkdocs 2 | polars 3 | mkdocs-material 4 | mkdocstrings>=0.19.0 5 | mkdocstrings-python 6 | ipython>7.31.1 7 | -r biology.in 8 | -r chemistry.in 9 | -r engineering.in 10 | -r spark.in 11 | # -e . 
12 | -------------------------------------------------------------------------------- /.requirements/engineering.in: -------------------------------------------------------------------------------- 1 | unyt 2 | -------------------------------------------------------------------------------- /.requirements/engineering.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/engineering.in 6 | # 7 | mpmath==1.3.0 8 | # via sympy 9 | numpy==1.22.3 10 | # via unyt 11 | sympy==1.10 12 | # via unyt 13 | unyt==2.8.0 14 | # via -r engineering.in 15 | -------------------------------------------------------------------------------- /.requirements/spark.in: -------------------------------------------------------------------------------- 1 | pyspark 2 | -------------------------------------------------------------------------------- /.requirements/spark.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/spark.in 6 | # 7 | py4j==0.10.9.5 8 | # via pyspark 9 | pyspark==3.3.2 10 | # via -r spark.in 11 | -------------------------------------------------------------------------------- /.requirements/testing.in: -------------------------------------------------------------------------------- 1 | pytest-cov 2 | pytest-xdist 3 | pytest>=3.4.2 4 | hypothesis>=4.4.0 5 | interrogate 6 | pandas-vet 7 | polars 8 | py>=1.10.0 9 | -------------------------------------------------------------------------------- /.requirements/testing.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/testing.in 6 | # 7 | attrs==21.4.0 8 | # via 9 | # hypothesis 10 | # interrogate 11 | # pandas-vet 12 | # pytest 13 | click==8.0.4 14 | # via interrogate 15 | colorama==0.4.4 16 | # via interrogate 17 | coverage[toml]==6.3.2 18 | # via pytest-cov 19 | execnet==1.9.0 20 | # via pytest-xdist 21 | flake8==4.0.1 22 | # via pandas-vet 23 | hypothesis==6.39.3 24 | # via -r .requirements/testing.in 25 | iniconfig==1.1.1 26 | # via pytest 27 | interrogate==1.5.0 28 | # via -r .requirements/testing.in 29 | mccabe==0.6.1 30 | # via flake8 31 | packaging==21.3 32 | # via pytest 33 | pandas-vet==0.2.3 34 | # via -r .requirements/testing.in 35 | pluggy==1.0.0 36 | # via pytest 37 | py==1.11.0 38 | # via 39 | # -r .requirements/testing.in 40 | # interrogate 41 | # pytest 42 | # pytest-forked 43 | pycodestyle==2.8.0 44 | # via flake8 45 | pyflakes==2.4.0 46 | # via flake8 47 | pyparsing==3.0.7 48 | # via packaging 49 | pytest==7.0.1 50 | # via 51 | # -r .requirements/testing.in 52 | # pytest-cov 53 | # pytest-forked 54 | # pytest-xdist 55 | pytest-cov==3.0.0 56 | # via -r .requirements/testing.in 57 | pytest-forked==1.4.0 58 | # via pytest-xdist 59 | pytest-xdist==2.5.0 60 | # via -r .requirements/testing.in 61 | sortedcontainers==2.4.0 62 | # via hypothesis 63 | tabulate==0.8.9 64 | # via interrogate 65 | toml==0.10.2 66 | # via interrogate 67 | tomli==2.0.1 68 | # via 69 | # coverage 70 | # pytest 71 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: 
-------------------------------------------------------------------------------- 1 | mkdocs/devguide.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018-onwards pyjanitor devs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include .requirements/* 2 | include *.md 3 | include LICENSE 4 | include mkdocs/* 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/bash 2 | ACTIVATE=source activate pyjanitor-dev 3 | 4 | release: 5 | rm -f dist/* 6 | python setup.py sdist bdist_wheel 7 | twine upload dist/* 8 | 9 | # Note to self: 10 | # makefile has not been fully tested. 11 | # DO NOT COMMIT until testing is done. 12 | # 13 | # ALSO, remove this comment once it's tested!!!!!!!!!!! 14 | 15 | .PHONY: format test lint docs isort check style notebooks install 16 | 17 | format: 18 | @echo "Applying Black Python code formatting..." 19 | pre-commit run black --all-files 20 | 21 | test: 22 | @echo "Running test suite..." 23 | pytest -v -n auto --color=yes 24 | 25 | lint: 26 | @echo "Checking code formatting..." 27 | pre-commit run flake8 --all-files 28 | 29 | docs: 30 | @echo "Building documentation..." 31 | mkdocs build 32 | 33 | isort: 34 | @echo "Sorting imports..." 35 | isort --check-only --use-parentheses --trailing-comma --multi-line 3 --line-length 79 . 36 | 37 | check: test docs notebooks isort format lint 38 | @echo "checks complete" 39 | 40 | style: isort format 41 | @echo "styling complete" 42 | 43 | install: 44 | @echo "Creating Conda environment..." 45 | conda env create -f environment-dev.yml 46 | 47 | @echo "Installing PyJanitor in development mode..." 48 | $(ACTIVATE) && python setup.py develop 49 | 50 | @echo "Registering current virtual environment as a Jupyter Python kernel..." 51 | $(ACTIVATE) && python -m ipykernel install --user --name pyjanitor-dev --display-name "PyJanitor development" 52 | 53 | @echo "Installing pre-commit hooks" 54 | $(ACTIVATE) && pre-commit install 55 | 56 | compile-requirements: 57 | @echo "pip-compiling requirements files..." 
58 | 	find .requirements -type f -name '*.in' | xargs -I {} sh -c\
59 | 	'echo "compiling" {} && pip-compile {} --upgrade -q'
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | mkdocs/index.md
--------------------------------------------------------------------------------
/environment-dev.yml:
--------------------------------------------------------------------------------
1 | name: pyjanitor-dev
2 | channels:
3 |   - conda-forge
4 | dependencies:
5 |   - python=3.10
6 |   - biopython
7 |   - black=22.12.0  # keep this in sync with `.pre-commit-config.yaml`
8 |   - bump2version=1.0.1
9 |   - cairo
10 |   - conda
11 |   - hypothesis
12 |   - ipykernel
13 |   - ipython
14 |   - isort
15 |   - jinja2>=3.1.0
16 |   - jupyterlab
17 |   - lxml
18 |   - make
19 |   - mkdocs
20 |   - mkdocs-material
21 |   - mkdocstrings>=0.19.0
22 |   - mkdocstrings-python
23 |   - missingno
24 |   - multipledispatch
25 |   - mypy
26 |   - natsort
27 |   - numba
28 |   - numpy==1.24.4
29 |   - openpyxl
30 |   - pandas-flavor
31 |   - pandas-vet
32 |   - pandas>=2.0
33 |   - pip
34 |   - pipreqs
35 |   - pip-tools
36 |   - pre-commit
37 |   - polars
38 |   - pyspark>=3.2.0
39 |   - pytest
40 |   - pytest-cov
41 |   - pytest-xdist
42 |   - pytest-doctestplus
43 |   - python-language-server
44 |   - rdkit
45 |   - recommonmark
46 |   - seaborn
47 |   - twine
48 |   - unyt
49 |   - xarray
50 |   - xlrd
51 |   - xorg-libxrender
52 |   - pip:
53 |       - mknotebooks
54 |       # Temporarily pinned to fix CI
55 |       - setuptools==70.3.0
56 | 
--------------------------------------------------------------------------------
/examples/limit_column_characters.md:
--------------------------------------------------------------------------------
1 | # df.limit_column_characters()
2 | 
3 | ## Description
4 | This method truncates column names to a given character length. In the case of duplicated column names, numbers are appended to the columns with a character separator (default is "_").
5 | 
6 | ## Parameters
7 | ### df
8 | A pandas DataFrame.
9 | 
10 | ### column_length
11 | The character length to which all column names are truncated. The column
12 | separator and the number appended to duplicate column names do not count
13 | toward this length. Therefore, if all columns are truncated to 10
14 | characters, the first distinct column will be 10 characters and the
15 | remaining duplicates will be 12 characters (assuming a column separator of one character).
16 | 
17 | ### col_separator
18 | The separator placed between a truncated column name and the number used to distinguish duplicates. Default is "_". Supply an empty string (i.e. '') to remove the
19 | separator.
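For example, with an empty separator the duplicate counter attaches directly to the truncated name. A quick illustrative sketch (the column names below are invented for this example, and the resulting names follow from the behavior described above):

```python
import pandas as pd
import janitor  # registers the DataFrame method

df = pd.DataFrame(columns=["long_name_a", "long_name_b"], data=[[1, 2]])

# Truncating to 9 characters makes both names "long_name";
# with an empty separator the second duplicate should become "long_name1".
df.limit_column_characters(9, "")
```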
20 | 
21 | ## Setup
22 | ```python
23 | import pandas as pd
24 | import janitor
25 | 
26 | data_dict = {
27 |     "really_long_name_for_a_column": range(10),
28 |     "another_really_long_name_for_a_column": [2 * item for item in range(10)],
29 |     "another_really_longer_name_for_a_column": list("lllongname"),
30 |     "this_is_getting_out_of_hand": list("longername"),
31 | }
32 | ```
33 | 
34 | ## Example 1: Standard truncation
35 | ```python
36 | example_dataframe = pd.DataFrame(data_dict)
37 | 
38 | example_dataframe.limit_column_characters(7)
39 | ```
40 | 
41 | ### Output
42 | 
43 |        really_  another another_1 this_is
44 |     0        0        0         l       l
45 |     1        1        2         l       o
46 |     2        2        4         l       n
47 |     3        3        6         o       g
48 |     4        4        8         n       e
49 |     5        5       10         g       r
50 |     6        6       12         n       n
51 |     7        7       14         a       a
52 |     8        8       16         m       m
53 |     9        9       18         e       e
54 | 
55 | ## Example 2: Standard truncation with different separator character
56 | 
57 | ```python
58 | 
59 | example_dataframe2 = pd.DataFrame(data_dict)
60 | 
61 | example_dataframe2.limit_column_characters(7, ".")
62 | ```
63 | 
64 | ### Output
65 | 
66 |        really_  another another.1 this_is
67 |     0        0        0         l       l
68 |     1        1        2         l       o
69 |     2        2        4         l       n
70 |     3        3        6         o       g
71 |     4        4        8         n       e
72 |     5        5       10         g       r
73 |     6        6       12         n       n
74 |     7        7       14         a       a
75 |     8        8       16         m       m
76 |     9        9       18         e       e
77 | 
--------------------------------------------------------------------------------
/examples/notebooks/README.rst:
--------------------------------------------------------------------------------
1 | ========
2 | Examples
3 | ========
4 | 
5 | This folder contains Jupyter notebooks demonstrating different ways to
6 | implement pyjanitor in your workflow.
7 | 
8 | Guidelines
9 | ~~~~~~~~~~
10 | 
11 | When contributing example notebooks, please include a short explanation of
12 | where the data came from and what it contains. Then go through your
13 | demonstration of data cleaning with pyjanitor in a step-by-step manner, with
14 | clear documentation of what is being done. Please try to elaborate on what the
15 | benefits of pyjanitor are and why it should be implemented in your use case.
16 | Optionally, feel free to add examples of analysis for the cleaned data.
17 | 
--------------------------------------------------------------------------------
/examples/notebooks/dirty_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/examples/notebooks/dirty_data.xlsx
--------------------------------------------------------------------------------
/janitor/__init__.py:
--------------------------------------------------------------------------------
1 | """Top-level janitor API lives here."""
2 | 
3 | import pandas_flavor as pf  # noqa: F401
4 | 
5 | from .accessors import *  # noqa: F403, F401
6 | from .functions import *  # noqa: F403, F401
7 | from .io import *  # noqa: F403, F401
8 | from .math import *  # noqa: F403, F401
9 | from .ml import get_features_targets as _get_features_targets
10 | from .utils import refactored_function
11 | from .xarray import *  # noqa: F403, F401
12 | 
13 | 
14 | @refactored_function(
15 |     "get_features_targets() has moved. Please use ml.get_features_targets()."
16 | )
17 | def get_features_targets(*args, **kwargs):
18 |     """Wrapper for get_features_targets."""
19 |     return _get_features_targets(*args, **kwargs)
20 | 
21 | 
22 | __version__ = "0.31.0"
23 | 
--------------------------------------------------------------------------------
/janitor/accessors/__init__.py:
--------------------------------------------------------------------------------
1 | """Custom data accessors for pandas DataFrames."""
2 | 
3 | from janitor.accessors.data_description import DataDescription  # noqa: F401
--------------------------------------------------------------------------------
/janitor/accessors/data_description.py:
--------------------------------------------------------------------------------
1 | """DataDescription class for the DataDescription accessor."""
2 | 
3 | from typing import Dict, List, Union
4 | 
5 | import pandas as pd
6 | import pandas_flavor as pf
7 | 
8 | 
9 | @pf.register_dataframe_accessor("data_description")
10 | class DataDescription:
11 |     """High-level description of data present in this DataFrame.
12 | 
13 |     This is a custom data accessor.
14 |     """
15 | 
16 |     def __init__(self, data):
17 |         self._data = data
18 |         self._desc = {}
19 | 
20 |     def _get_data_df(self) -> pd.DataFrame:
21 |         """Get a table of descriptive information in a DataFrame format.
22 | 
23 |         Returns:
24 |             A DataFrame containing the descriptive information.
25 |         """
26 |         df = self._data
27 |         data_dict = {}
28 |         data_dict["column_name"] = df.columns.tolist()
29 |         data_dict["type"] = df.dtypes.tolist()
30 |         data_dict["count"] = df.count().tolist()
31 |         data_dict["pct_missing"] = (1 - (df.count() / len(df))).tolist()
32 |         data_dict["description"] = [self._desc.get(c, "") for c in df.columns]
33 | 
34 |         return pd.DataFrame(data_dict).set_index("column_name")
35 | 
36 |     @property
37 |     def df(self) -> pd.DataFrame:
38 |         """Get a table of descriptive information in a DataFrame format."""
39 |         return self._get_data_df()
40 | 
41 |     def __repr__(self):
42 |         """Human-readable representation of the `DataDescription` object."""
43 |         return str(self._get_data_df())
44 | 
45 |     def display(self):
46 |         """Print the table of descriptive information about this DataFrame."""
47 |         print(self)
48 | 
49 |     def set_description(self, desc: Union[List, Dict]):
50 |         """Update the description for each of the columns in the DataFrame.
51 | 
52 |         Args:
53 |             desc: The structure containing the descriptions to update.
54 | 
55 |         Raises:
56 |             ValueError: If length of description list does not match
57 |                 number of columns in DataFrame.
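        Example (the frame and descriptions below are purely illustrative):

            >>> import pandas as pd
            >>> import janitor
            >>> df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
            >>> df.data_description.set_description(
            ...     ["an integer column", "a float column"]
            ... )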
58 | """ 59 | if isinstance(desc, list): 60 | if len(desc) != len(self._data.columns): 61 | raise ValueError( 62 | "Length of description list " 63 | f"({len(desc)}) does not match number of columns in " 64 | f"DataFrame ({len(self._data.columns)})" 65 | ) 66 | 67 | self._desc = dict(zip(self._data.columns, desc)) 68 | 69 | elif isinstance(desc, dict): 70 | self._desc = desc 71 | -------------------------------------------------------------------------------- /janitor/biology.py: -------------------------------------------------------------------------------- 1 | """Biology and bioinformatics-oriented data cleaning functions.""" 2 | 3 | import pandas as pd 4 | import pandas_flavor as pf 5 | 6 | from .utils import deprecated_alias, import_message 7 | 8 | try: 9 | from Bio import SeqIO 10 | except ImportError: 11 | import_message( 12 | submodule="biology", 13 | package="biopython", 14 | conda_channel="conda-forge", 15 | pip_install=True, 16 | ) 17 | 18 | 19 | @pf.register_dataframe_method 20 | @deprecated_alias(col_name="column_name") 21 | def join_fasta( 22 | df: pd.DataFrame, filename: str, id_col: str, column_name: str 23 | ) -> pd.DataFrame: 24 | """Convenience method to join in a FASTA file as a column. 25 | 26 | This allows us to add the string sequence of a FASTA file as a new column 27 | of data in the dataframe. 28 | 29 | This method only attaches the string representation of the SeqRecord.Seq 30 | object from Biopython. Does not attach the full SeqRecord. Alphabet is 31 | also not stored, under the assumption that the data scientist has domain 32 | knowledge of what kind of sequence is being read in (nucleotide vs. amino 33 | acid.) 34 | 35 | This method mutates the original DataFrame. 36 | 37 | For more advanced functions, please use phylopandas. 38 | 39 | Examples: 40 | >>> import tempfile 41 | >>> import pandas as pd 42 | >>> import janitor.biology 43 | >>> tf = tempfile.NamedTemporaryFile() 44 | >>> tf.write('''>SEQUENCE_1 45 | ... MTEITAAMVKELRESTGAGMMDCK 46 | ... >SEQUENCE_2 47 | ... SATVSEINSETDFVAKN'''.encode('utf8')) 48 | 66 49 | >>> tf.seek(0) 50 | 0 51 | >>> df = pd.DataFrame({"sequence_accession": 52 | ... ["SEQUENCE_1", "SEQUENCE_2", ]}) 53 | >>> df = df.join_fasta( # doctest: +SKIP 54 | ... filename=tf.name, 55 | ... id_col='sequence_accession', 56 | ... column_name='sequence', 57 | ... ) 58 | >>> df.sequence # doctest: +SKIP 59 | 0 MTEITAAMVKELRESTGAGMMDCK 60 | 1 SATVSEINSETDFVAKN 61 | Name: sequence, dtype: object 62 | 63 | Args: 64 | df: A pandas DataFrame. 65 | filename: Path to the FASTA file. 66 | id_col: The column in the DataFrame that houses sequence IDs. 67 | column_name: The name of the new column. 68 | 69 | Returns: 70 | A pandas DataFrame with new FASTA string sequence column. 
71 | """ 72 | seqrecords = { 73 | x.id: x.seq.__str__() for x in SeqIO.parse(filename, "fasta") 74 | } 75 | seq_col = [seqrecords[i] for i in df[id_col]] 76 | df[column_name] = seq_col 77 | return df 78 | -------------------------------------------------------------------------------- /janitor/errors.py: -------------------------------------------------------------------------------- 1 | class JanitorError(Exception): 2 | pass 3 | -------------------------------------------------------------------------------- /janitor/functions/alias.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `toset` function.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any 6 | 7 | import pandas as pd 8 | import pandas_flavor as pf 9 | 10 | 11 | @pf.register_series_method 12 | def alias(series: pd.Series, alias: Any = None) -> pd.Series: 13 | """Return a Series with a new name. Accepts either a scalar or a callable. 14 | 15 | 16 | Examples: 17 | >>> import pandas as pd 18 | >>> import janitor 19 | >>> s = pd.Series([1, 2, 3], name='series') 20 | >>> s 21 | 0 1 22 | 1 2 23 | 2 3 24 | Name: series, dtype: int64 25 | >>> s.alias('series_new') 26 | 0 1 27 | 1 2 28 | 2 3 29 | Name: series_new, dtype: int64 30 | >>> s.alias(str.upper) 31 | 0 1 32 | 1 2 33 | 2 3 34 | Name: SERIES, dtype: int64 35 | 36 | Args: 37 | series: A pandas Series. 38 | alias: scalar or callable to create a new name for the pandas Series. 39 | 40 | Returns: 41 | A new pandas Series. 42 | """ 43 | series = series[:] 44 | if alias is None: 45 | return series 46 | if callable(alias): 47 | alias = alias(series.name) 48 | series.name = alias 49 | return series 50 | -------------------------------------------------------------------------------- /janitor/functions/also.py: -------------------------------------------------------------------------------- 1 | """Implementation source for chainable function `also`.""" 2 | 3 | from typing import Any, Callable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | 9 | @pf.register_dataframe_method 10 | def also( 11 | df: pd.DataFrame, func: Callable, *args: Any, **kwargs: Any 12 | ) -> pd.DataFrame: 13 | """Run a function with side effects. 14 | 15 | This function allows you to run an arbitrary function 16 | in the `pyjanitor` method chain. 17 | Doing so will let you do things like save the dataframe to disk midway 18 | while continuing to modify the dataframe afterwards. 19 | 20 | Examples: 21 | >>> import pandas as pd 22 | >>> import janitor 23 | >>> df = ( 24 | ... pd.DataFrame({"a": [1, 2, 3], "b": list("abc")}) 25 | ... .query("a > 1") 26 | ... .also(lambda df: print(f"DataFrame shape is: {df.shape}")) 27 | ... .rename_column(old_column_name="a", new_column_name="a_new") 28 | ... .also(lambda df: df.to_csv("midpoint.csv")) 29 | ... .also( 30 | ... lambda df: print(f"Columns: {df.columns}") 31 | ... ) 32 | ... ) 33 | DataFrame shape is: (2, 2) 34 | Columns: Index(['a_new', 'b'], dtype='object') 35 | 36 | Args: 37 | df: A pandas DataFrame. 38 | func: A function you would like to run in the method chain. 39 | It should take one DataFrame object as a parameter and have no return. 40 | If there is a return, it will be ignored. 41 | *args: Optional arguments for `func`. 42 | **kwargs: Optional keyword arguments for `func`. 43 | 44 | Returns: 45 | The input pandas DataFrame, unmodified. 
46 | """ # noqa: E501 47 | func(df.copy(), *args, **kwargs) 48 | return df 49 | -------------------------------------------------------------------------------- /janitor/functions/bin_numeric.py: -------------------------------------------------------------------------------- 1 | """Implementation source for `bin_numeric`.""" 2 | 3 | from typing import Any, Optional, Sequence, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import check, check_column, deprecated_alias 9 | 10 | ScalarSequence = Sequence[float] 11 | 12 | 13 | @pf.register_dataframe_method 14 | @deprecated_alias( 15 | from_column="from_column_name", 16 | to_column="to_column_name", 17 | num_bins="bins", 18 | ) 19 | def bin_numeric( 20 | df: pd.DataFrame, 21 | from_column_name: str, 22 | to_column_name: str, 23 | bins: Optional[Union[int, ScalarSequence, pd.IntervalIndex]] = 5, 24 | **kwargs: Any, 25 | ) -> pd.DataFrame: 26 | """Generate a new column that labels bins for a specified numeric column. 27 | 28 | This method does not mutate the original DataFrame. 29 | 30 | A wrapper around the pandas [`cut()`][pd_cut_docs] function to bin data of 31 | one column, generating a new column with the results. 32 | 33 | [pd_cut_docs]: https://pandas.pydata.org/docs/reference/api/pandas.cut.html 34 | 35 | Examples: 36 | Binning a numeric column with specific bin edges. 37 | 38 | >>> import pandas as pd 39 | >>> import janitor 40 | >>> df = pd.DataFrame({"a": [3, 6, 9, 12, 15]}) 41 | >>> df.bin_numeric( 42 | ... from_column_name="a", to_column_name="a_binned", 43 | ... bins=[0, 5, 11, 15], 44 | ... ) 45 | a a_binned 46 | 0 3 (0, 5] 47 | 1 6 (5, 11] 48 | 2 9 (5, 11] 49 | 3 12 (11, 15] 50 | 4 15 (11, 15] 51 | 52 | Args: 53 | df: A pandas DataFrame. 54 | from_column_name: The column whose data you want binned. 55 | to_column_name: The new column to be created with the binned data. 56 | bins: The binning strategy to be utilized. Read the `pd.cut` 57 | documentation for more details. 58 | **kwargs: Additional kwargs to pass to `pd.cut`, except `retbins`. 59 | 60 | Raises: 61 | ValueError: If `retbins` is passed in as a kwarg. 62 | 63 | Returns: 64 | A pandas DataFrame. 65 | """ 66 | if "retbins" in kwargs: 67 | raise ValueError("`retbins` is not an acceptable keyword argument.") 68 | 69 | check("from_column_name", from_column_name, [str]) 70 | check("to_column_name", to_column_name, [str]) 71 | check_column(df, from_column_name) 72 | 73 | df = df.assign( 74 | **{ 75 | to_column_name: pd.cut(df[from_column_name], bins=bins, **kwargs), 76 | } 77 | ) 78 | 79 | return df 80 | -------------------------------------------------------------------------------- /janitor/functions/concatenate_columns.py: -------------------------------------------------------------------------------- 1 | from typing import Hashable, List 2 | 3 | import pandas as pd 4 | import pandas_flavor as pf 5 | 6 | from janitor.errors import JanitorError 7 | from janitor.utils import deprecated_alias 8 | 9 | 10 | @pf.register_dataframe_method 11 | @deprecated_alias(columns="column_names") 12 | def concatenate_columns( 13 | df: pd.DataFrame, 14 | column_names: List[Hashable], 15 | new_column_name: Hashable, 16 | sep: str = "-", 17 | ignore_empty: bool = True, 18 | ) -> pd.DataFrame: 19 | """Concatenates the set of columns into a single column. 20 | 21 | Used to quickly generate an index based on a group of columns. 22 | 23 | This method mutates the original DataFrame. 24 | 25 | Examples: 26 | Concatenate two columns row-wise. 
27 | 28 | >>> import pandas as pd 29 | >>> import janitor 30 | >>> df = pd.DataFrame({"a": [1, 3, 5], "b": list("xyz")}) 31 | >>> df 32 | a b 33 | 0 1 x 34 | 1 3 y 35 | 2 5 z 36 | >>> df.concatenate_columns( 37 | ... column_names=["a", "b"], new_column_name="m", 38 | ... ) 39 | a b m 40 | 0 1 x 1-x 41 | 1 3 y 3-y 42 | 2 5 z 5-z 43 | 44 | Args: 45 | df: A pandas DataFrame. 46 | column_names: A list of columns to concatenate together. 47 | new_column_name: The name of the new column. 48 | sep: The separator between each column's data. 49 | ignore_empty: Whether to ignore null/empty values if they exist. 50 | 51 | Raises: 52 | JanitorError: If fewer than two columns are provided 53 | within `column_names`. 54 | 55 | Returns: 56 | A pandas DataFrame with concatenated columns. 57 | """ 58 | if len(column_names) < 2: 59 | raise JanitorError("At least two columns must be specified") 60 | 61 | df[new_column_name] = ( 62 | df[column_names].astype(str).fillna("").agg(sep.join, axis=1) 63 | ) 64 | 65 | if ignore_empty: 66 | 67 | def remove_empty_string(x): 68 | """Remove empty/null string values from the concatenated output.""" 69 | return sep.join(x for x in x.split(sep) if x) 70 | 71 | df[new_column_name] = df[new_column_name].transform( 72 | remove_empty_string 73 | ) 74 | 75 | return df 76 | -------------------------------------------------------------------------------- /janitor/functions/drop_constant_columns.py: -------------------------------------------------------------------------------- 1 | """Implementation of drop_constant_columns.""" 2 | 3 | import pandas as pd 4 | import pandas_flavor as pf 5 | 6 | 7 | @pf.register_dataframe_method 8 | def drop_constant_columns(df: pd.DataFrame) -> pd.DataFrame: 9 | """Find and drop the constant columns from a Pandas DataFrame. 10 | 11 | Examples: 12 | >>> import pandas as pd 13 | >>> import janitor 14 | >>> data_dict = { 15 | ... "a": [1, 1, 1], 16 | ... "b": [1, 2, 3], 17 | ... "c": [1, 1, 1], 18 | ... "d": ["rabbit", "leopard", "lion"], 19 | ... "e": ["Cambridge", "Shanghai", "Basel"] 20 | ... } 21 | >>> df = pd.DataFrame(data_dict) 22 | >>> df 23 | a b c d e 24 | 0 1 1 1 rabbit Cambridge 25 | 1 1 2 1 leopard Shanghai 26 | 2 1 3 1 lion Basel 27 | >>> df.drop_constant_columns() 28 | b d e 29 | 0 1 rabbit Cambridge 30 | 1 2 leopard Shanghai 31 | 2 3 lion Basel 32 | 33 | Args: 34 | df: Input Pandas DataFrame. 35 | 36 | Returns: 37 | The Pandas DataFrame with the constant columns dropped. 38 | """ 39 | return df.loc[:, df.nunique().ne(1)] 40 | -------------------------------------------------------------------------------- /janitor/functions/drop_duplicate_columns.py: -------------------------------------------------------------------------------- 1 | """Implementation for `drop_duplicate_columns`.""" 2 | 3 | from typing import Hashable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | 9 | @pf.register_dataframe_method 10 | def drop_duplicate_columns( 11 | df: pd.DataFrame, column_name: Hashable, nth_index: int = 0 12 | ) -> pd.DataFrame: 13 | """Remove a duplicated column specified by `column_name`. 14 | 15 | Specifying `nth_index=0` will remove the first of the duplicated columns, 16 | `nth_index=1` will remove the second, 17 | and so on. 18 | 19 | The corresponding operation in R's tidyverse is: 20 | `select(-_)` 21 | 22 | Examples: 23 | >>> import pandas as pd 24 | >>> import janitor 25 | >>> df = pd.DataFrame({ 26 | ... "a": range(2, 5), 27 | ... "b": range(3, 6), 28 | ... "A": range(4, 7), 29 | ... "a*": range(6, 9), 30 | ...
}).clean_names(remove_special=True) 31 | >>> df 32 | a b a a 33 | 0 2 3 4 6 34 | 1 3 4 5 7 35 | 2 4 5 6 8 36 | >>> df.drop_duplicate_columns(column_name="a", nth_index=1) 37 | a b a 38 | 0 2 3 6 39 | 1 3 4 7 40 | 2 4 5 8 41 | 42 | Args: 43 | df: A pandas DataFrame 44 | column_name: Name of duplicated columns. 45 | nth_index: Among the duplicated columns, 46 | select the nth column to drop. 47 | 48 | Returns: 49 | A pandas DataFrame 50 | """ 51 | col_indexes = [ 52 | col_idx 53 | for col_idx, col_name in enumerate(df.columns) 54 | if col_name == column_name 55 | ] 56 | 57 | # Select the column to remove based on nth_index. 58 | removed_col_idx = col_indexes[nth_index] 59 | # Filter out columns except for the one to be removed. 60 | filtered_cols = [ 61 | c_i for c_i, _ in enumerate(df.columns) if c_i != removed_col_idx 62 | ] 63 | 64 | return df.iloc[:, filtered_cols] 65 | -------------------------------------------------------------------------------- /janitor/functions/dropnotnull.py: -------------------------------------------------------------------------------- 1 | """Implementation source for `dropnotnull`.""" 2 | 3 | from typing import Hashable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import deprecated_alias 9 | 10 | 11 | @pf.register_dataframe_method 12 | @deprecated_alias(column="column_name") 13 | def dropnotnull(df: pd.DataFrame, column_name: Hashable) -> pd.DataFrame: 14 | """Drop rows that do *not* have null values in the given column. 15 | 16 | This method does not mutate the original DataFrame. 17 | 18 | Examples: 19 | >>> import numpy as np 20 | >>> import pandas as pd 21 | >>> import janitor 22 | >>> df = pd.DataFrame({"a": [1., np.NaN, 3.], "b": [None, "y", "z"]}) 23 | >>> df 24 | a b 25 | 0 1.0 None 26 | 1 NaN y 27 | 2 3.0 z 28 | >>> df.dropnotnull("a") 29 | a b 30 | 1 NaN y 31 | >>> df.dropnotnull("b") 32 | a b 33 | 0 1.0 None 34 | 35 | Args: 36 | df: A pandas DataFrame. 37 | column_name: The column name to drop rows from. 38 | 39 | Returns: 40 | A pandas DataFrame with dropped rows. 41 | """ 42 | return df[pd.isna(df[column_name])] 43 | -------------------------------------------------------------------------------- /janitor/functions/expand_column.py: -------------------------------------------------------------------------------- 1 | """Implementation for expand_column.""" 2 | 3 | from typing import Hashable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import deprecated_alias 9 | 10 | 11 | @pf.register_dataframe_method 12 | @deprecated_alias(column="column_name") 13 | def expand_column( 14 | df: pd.DataFrame, 15 | column_name: Hashable, 16 | sep: str = "|", 17 | concat: bool = True, 18 | ) -> pd.DataFrame: 19 | """Expand a categorical column with multiple labels into dummy-coded columns. 20 | 21 | Super sugary syntax that wraps `pandas.Series.str.get_dummies`. 22 | 23 | This method does not mutate the original DataFrame. 24 | 25 | Examples: 26 | Functional usage syntax: 27 | 28 | >>> import pandas as pd 29 | >>> df = pd.DataFrame( 30 | ... { 31 | ... "col1": ["A, B", "B, C, D", "E, F", "A, E, F"], 32 | ... "col2": [1, 2, 3, 4], 33 | ... } 34 | ... ) 35 | >>> df = expand_column( 36 | ... df, 37 | ... column_name="col1", 38 | ... sep=", " # note space in sep 39 | ... 
) 40 | >>> df 41 | col1 col2 A B C D E F 42 | 0 A, B 1 1 1 0 0 0 0 43 | 1 B, C, D 2 0 1 1 1 0 0 44 | 2 E, F 3 0 0 0 0 1 1 45 | 3 A, E, F 4 1 0 0 0 1 1 46 | 47 | Method chaining syntax: 48 | 49 | >>> import pandas as pd 50 | >>> import janitor 51 | >>> df = ( 52 | ... pd.DataFrame( 53 | ... { 54 | ... "col1": ["A, B", "B, C, D", "E, F", "A, E, F"], 55 | ... "col2": [1, 2, 3, 4], 56 | ... } 57 | ... ) 58 | ... .expand_column( 59 | ... column_name='col1', 60 | ... sep=', ' 61 | ... ) 62 | ... ) 63 | >>> df 64 | col1 col2 A B C D E F 65 | 0 A, B 1 1 1 0 0 0 0 66 | 1 B, C, D 2 0 1 1 1 0 0 67 | 2 E, F 3 0 0 0 0 1 1 68 | 3 A, E, F 4 1 0 0 0 1 1 69 | 70 | Args: 71 | df: A pandas DataFrame. 72 | column_name: Which column to expand. 73 | sep: The delimiter, same as 74 | `pandas.Series.str.get_dummies`'s `sep`. 75 | concat: Whether to return the expanded column concatenated to 76 | the original dataframe (`concat=True`), or to return it standalone 77 | (`concat=False`). 78 | 79 | Returns: 80 | A pandas DataFrame with an expanded column. 81 | """ # noqa: E501 82 | expanded_df = df[column_name].str.get_dummies(sep=sep) 83 | if concat: 84 | return df.join(expanded_df) 85 | return expanded_df 86 | -------------------------------------------------------------------------------- /janitor/functions/factorize_columns.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `factorize_columns` function""" 2 | 3 | from typing import Any, Hashable, Iterable, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.functions.utils import _factorize 9 | 10 | 11 | @pf.register_dataframe_method 12 | def factorize_columns( 13 | df: pd.DataFrame, 14 | column_names: Union[str, Iterable[str], Hashable], 15 | suffix: str = "_enc", 16 | **kwargs: Any, 17 | ) -> pd.DataFrame: 18 | """Convert labels into numerical data. 19 | 20 | This method will create a new column with the string `_enc` appended 21 | after the original column's name. 22 | This can be overridden with the `suffix` parameter. 23 | 24 | Internally, this method uses pandas' `factorize` method. 25 | It also accepts an optional suffix and keyword arguments. 26 | An empty string as the suffix will overwrite the existing column. 27 | 28 | This method does not mutate the original DataFrame. 29 | 30 | Examples: 31 | >>> import pandas as pd 32 | >>> import janitor 33 | >>> df = pd.DataFrame({ 34 | ... "foo": ["b", "b", "a", "c", "b"], 35 | ... "bar": range(4, 9), 36 | ... }) 37 | >>> df 38 | foo bar 39 | 0 b 4 40 | 1 b 5 41 | 2 a 6 42 | 3 c 7 43 | 4 b 8 44 | >>> df.factorize_columns(column_names="foo") 45 | foo bar foo_enc 46 | 0 b 4 0 47 | 1 b 5 0 48 | 2 a 6 1 49 | 3 c 7 2 50 | 4 b 8 0 51 | 52 | Args: 53 | df: The pandas DataFrame object. 54 | column_names: A column name or an iterable (list or tuple) of 55 | column names. 56 | suffix: Suffix to be used for the new column. 57 | An empty string suffix means the existing column will be overwritten. 58 | **kwargs: Keyword arguments. Accepts any of the keyword arguments 59 | that the pandas `factorize` method takes, such as `sort`, 60 | `na_sentinel`, or `size_hint`. 61 | 62 | Returns: 63 | A pandas DataFrame.
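A sketch of the suffix and keyword-argument behaviour described above. The `_factorize` helper is not shown here, so its exact internals are assumed; the `sort` keyword is a standard `pd.factorize` argument that is simply forwarded.

```python
import pandas as pd
import janitor  # noqa: F401

df = pd.DataFrame({"foo": ["b", "b", "a", "c", "b"]})

# An empty-string suffix overwrites the original column (on the returned copy).
print(df.factorize_columns(column_names="foo", suffix=""))

# Extra kwargs are forwarded to pd.factorize, e.g. to sort codes by label.
print(df.factorize_columns(column_names="foo", sort=True))
```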
64 | """ 65 | df = _factorize(df.copy(), column_names, suffix, **kwargs) 66 | return df 67 | -------------------------------------------------------------------------------- /janitor/functions/get_dupes.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `get_dupes` function""" 2 | 3 | from typing import Hashable, Iterable, Optional, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import deprecated_alias 9 | 10 | 11 | @pf.register_dataframe_method 12 | @deprecated_alias(columns="column_names") 13 | def get_dupes( 14 | df: pd.DataFrame, 15 | column_names: Optional[Union[str, Iterable[str], Hashable]] = None, 16 | ) -> pd.DataFrame: 17 | """ 18 | Return all duplicate rows. 19 | 20 | This method does not mutate the original DataFrame. 21 | 22 | Examples: 23 | Method chaining syntax: 24 | 25 | >>> import pandas as pd 26 | >>> import janitor 27 | >>> df = pd.DataFrame({ 28 | ... "item": ["shoe", "shoe", "bag", "shoe", "bag"], 29 | ... "quantity": [100, 100, 75, 200, 75], 30 | ... }) 31 | >>> df 32 | item quantity 33 | 0 shoe 100 34 | 1 shoe 100 35 | 2 bag 75 36 | 3 shoe 200 37 | 4 bag 75 38 | >>> df.get_dupes() 39 | item quantity 40 | 0 shoe 100 41 | 1 shoe 100 42 | 2 bag 75 43 | 4 bag 75 44 | 45 | Optional `column_names` usage: 46 | 47 | >>> import pandas as pd 48 | >>> import janitor 49 | >>> df = pd.DataFrame({ 50 | ... "item": ["shoe", "shoe", "bag", "shoe", "bag"], 51 | ... "quantity": [100, 100, 75, 200, 75], 52 | ... }) 53 | >>> df 54 | item quantity 55 | 0 shoe 100 56 | 1 shoe 100 57 | 2 bag 75 58 | 3 shoe 200 59 | 4 bag 75 60 | >>> df.get_dupes(column_names=["item"]) 61 | item quantity 62 | 0 shoe 100 63 | 1 shoe 100 64 | 2 bag 75 65 | 3 shoe 200 66 | 4 bag 75 67 | >>> df.get_dupes(column_names=["quantity"]) 68 | item quantity 69 | 0 shoe 100 70 | 1 shoe 100 71 | 2 bag 75 72 | 4 bag 75 73 | 74 | Args: 75 | df: The pandas DataFrame object. 76 | column_names: A column name or an iterable 77 | (list or tuple) of column names. Following pandas API, this only 78 | considers certain columns for identifying duplicates. Defaults 79 | to using all columns. 80 | 81 | Returns: 82 | The duplicate rows, as a pandas DataFrame. 83 | """ 84 | return df.loc[df.duplicated(subset=column_names, keep=False)] 85 | -------------------------------------------------------------------------------- /janitor/functions/join_apply.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `join_apply` function""" 2 | 3 | from typing import Callable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | 9 | @pf.register_dataframe_method 10 | def join_apply( 11 | df: pd.DataFrame, 12 | func: Callable, 13 | new_column_name: str, 14 | ) -> pd.DataFrame: 15 | """Join the result of applying a function across dataframe rows. 16 | 17 | This method does not mutate the original DataFrame. 18 | 19 | This is a convenience function that allows us to apply arbitrary functions 20 | that take any combination of information from any of the columns. The only 21 | requirement is that the function signature takes in a row from the 22 | DataFrame. 23 | 24 | Examples: 25 | Sum the result of two columns into a new column. 26 | 27 | >>> import pandas as pd 28 | >>> import janitor 29 | >>> df = pd.DataFrame({"a":[1, 2, 3], "b": [2, 3, 4]}) 30 | >>> df 31 | a b 32 | 0 1 2 33 | 1 2 3 34 | 2 3 4 35 | >>> df.join_apply( 36 | ... func=lambda x: 2 * x["a"] + x["b"], 37 | ... 
new_column_name="2a+b", 38 | ... ) 39 | a b 2a+b 40 | 0 1 2 4 41 | 1 2 3 7 42 | 2 3 4 10 43 | 44 | Incorporating conditionals in `func`. 45 | 46 | >>> import pandas as pd 47 | >>> import janitor 48 | >>> df = pd.DataFrame({"a": [1, 2, 3], "b": [20, 30, 40]}) 49 | >>> df 50 | a b 51 | 0 1 20 52 | 1 2 30 53 | 2 3 40 54 | >>> def take_a_if_even(x): 55 | ... if x["a"] % 2 == 0: 56 | ... return x["a"] 57 | ... else: 58 | ... return x["b"] 59 | >>> df.join_apply(take_a_if_even, "a_if_even") 60 | a b a_if_even 61 | 0 1 20 20 62 | 1 2 30 2 63 | 2 3 40 40 64 | 65 | Args: 66 | df: A pandas DataFrame. 67 | func: A function that is applied elementwise across all rows of the 68 | DataFrame. 69 | new_column_name: Name of the resulting column. 70 | 71 | Returns: 72 | A pandas DataFrame with new column appended. 73 | """ # noqa: E501 74 | df = df.copy().join(df.apply(func, axis=1).rename(new_column_name)) 75 | return df 76 | -------------------------------------------------------------------------------- /janitor/functions/label_encode.py: -------------------------------------------------------------------------------- 1 | """Implementation of `label_encode` function""" 2 | 3 | import warnings 4 | from typing import Hashable, Iterable, Union 5 | 6 | import pandas as pd 7 | import pandas_flavor as pf 8 | 9 | from janitor.functions.utils import _factorize 10 | from janitor.utils import deprecated_alias, refactored_function 11 | 12 | 13 | @pf.register_dataframe_method 14 | @refactored_function( 15 | message=( 16 | "This function will be deprecated in a 1.x release. " 17 | "Please use `janitor.factorize_columns` instead." 18 | ) 19 | ) 20 | @deprecated_alias(columns="column_names") 21 | def label_encode( 22 | df: pd.DataFrame, 23 | column_names: Union[str, Iterable[str], Hashable], 24 | ) -> pd.DataFrame: 25 | """Convert labels into numerical data. 26 | 27 | This method will create a new column with the string `_enc` appended 28 | after the original column's name. 29 | Consider this to be syntactic sugar. 30 | This function uses the `factorize` pandas function under the hood. 31 | 32 | This method behaves differently from 33 | [`encode_categorical`][janitor.functions.encode_categorical.encode_categorical]. 34 | This method creates a new column of numeric data. 35 | [`encode_categorical`][janitor.functions.encode_categorical.encode_categorical] 36 | replaces the dtype of the original column with a *categorical* dtype. 37 | 38 | This method mutates the original DataFrame. 39 | 40 | !!!note 41 | 42 | This function will be deprecated in a 1.x release. 43 | Please use [`factorize_columns`][janitor.functions.factorize_columns.factorize_columns] 44 | instead. 45 | 46 | Examples: 47 | >>> import pandas as pd 48 | >>> import janitor 49 | >>> df = pd.DataFrame({ 50 | ... "foo": ["b", "b", "a", "c", "b"], 51 | ... "bar": range(4, 9), 52 | ... }) 53 | >>> df 54 | foo bar 55 | 0 b 4 56 | 1 b 5 57 | 2 a 6 58 | 3 c 7 59 | 4 b 8 60 | >>> df.label_encode(column_names="foo") 61 | foo bar foo_enc 62 | 0 b 4 0 63 | 1 b 5 0 64 | 2 a 6 1 65 | 3 c 7 2 66 | 4 b 8 0 67 | 68 | Args: 69 | df: The pandas DataFrame object. 70 | column_names: A column name or an iterable (list 71 | or tuple) of column names. 72 | 73 | Returns: 74 | A pandas DataFrame. 75 | """ # noqa: E501 76 | warnings.warn( 77 | "`label_encode` will be deprecated in a 1.x release. " 78 | "Please use `factorize_columns` instead." 
79 | ) 80 | df = _factorize(df, column_names, "_enc") 81 | return df 82 | -------------------------------------------------------------------------------- /janitor/functions/remove_columns.py: -------------------------------------------------------------------------------- 1 | """Implementation of remove_columns.""" 2 | 3 | from typing import Hashable, Iterable, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import deprecated_alias, refactored_function 9 | 10 | 11 | @pf.register_dataframe_method 12 | @refactored_function( 13 | message=( 14 | "This function will be deprecated in a 1.x release. " 15 | "Please use `pd.DataFrame.drop` instead." 16 | ) 17 | ) 18 | @deprecated_alias(columns="column_names") 19 | def remove_columns( 20 | df: pd.DataFrame, 21 | column_names: Union[str, Iterable[str], Hashable], 22 | ) -> pd.DataFrame: 23 | """Remove the set of columns specified in `column_names`. 24 | 25 | This method does not mutate the original DataFrame. 26 | 27 | Intended to be the method-chaining alternative to `del df[col]`. 28 | 29 | !!!note 30 | 31 | This function will be deprecated in a 1.x release. 32 | Kindly use `pd.DataFrame.drop` instead. 33 | 34 | Examples: 35 | >>> import pandas as pd 36 | >>> import janitor 37 | >>> df = pd.DataFrame({"a": [2, 4, 6], "b": [1, 3, 5], "c": [7, 8, 9]}) 38 | >>> df 39 | a b c 40 | 0 2 1 7 41 | 1 4 3 8 42 | 2 6 5 9 43 | >>> df.remove_columns(column_names=['a', 'c']) 44 | b 45 | 0 1 46 | 1 3 47 | 2 5 48 | 49 | Args: 50 | df: A pandas DataFrame. 51 | column_names: The columns to remove. 52 | 53 | Returns: 54 | A pandas DataFrame. 55 | """ 56 | 57 | return df.drop(columns=column_names) 58 | -------------------------------------------------------------------------------- /janitor/functions/remove_empty.py: -------------------------------------------------------------------------------- 1 | """Implementation of remove_empty.""" 2 | 3 | import pandas as pd 4 | import pandas_flavor as pf 5 | 6 | 7 | @pf.register_dataframe_method 8 | def remove_empty(df: pd.DataFrame, reset_index: bool = True) -> pd.DataFrame: 9 | """Drop all rows and columns that are completely null. 10 | 11 | This method does not mutate the original DataFrame. 12 | 13 | Implementation is inspired from [StackOverflow][so]. 14 | 15 | [so]: https://stackoverflow.com/questions/38884538/python-pandas-find-all-rows-where-all-values-are-nan 16 | 17 | Examples: 18 | >>> import numpy as np 19 | >>> import pandas as pd 20 | >>> import janitor 21 | >>> df = pd.DataFrame({ 22 | ... "a": [1, np.nan, 2], 23 | ... "b": [3, np.nan, 4], 24 | ... "c": [np.nan, np.nan, np.nan], 25 | ... }) 26 | >>> df 27 | a b c 28 | 0 1.0 3.0 NaN 29 | 1 NaN NaN NaN 30 | 2 2.0 4.0 NaN 31 | >>> df.remove_empty() 32 | a b 33 | 0 1.0 3.0 34 | 1 2.0 4.0 35 | 36 | Args: 37 | df: The pandas DataFrame object. 38 | reset_index: Determines if the index is reset. 39 | 40 | Returns: 41 | A pandas DataFrame. 
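A short sketch of the `reset_index` flag, following the implementation below: with `reset_index=False`, the surviving rows keep their original index labels.

```python
import numpy as np
import pandas as pd
import janitor  # noqa: F401

df = pd.DataFrame({"a": [1, np.nan, 2], "b": [3, np.nan, 4]})

print(df.remove_empty())                   # index: 0, 1
print(df.remove_empty(reset_index=False))  # index: 0, 2
```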
42 | """ # noqa: E501 43 | outcome = df.isna() 44 | outcome = df.loc[~outcome.all(axis=1), ~outcome.all(axis=0)] 45 | if reset_index: 46 | return outcome.reset_index(drop=True) 47 | return outcome 48 | -------------------------------------------------------------------------------- /janitor/functions/reorder_columns.py: -------------------------------------------------------------------------------- 1 | """Implementation source for `reorder_columns`.""" 2 | 3 | from typing import Hashable, Iterable, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import check 9 | 10 | 11 | @pf.register_dataframe_method 12 | def reorder_columns( 13 | df: pd.DataFrame, column_order: Union[Iterable[str], pd.Index, Hashable] 14 | ) -> pd.DataFrame: 15 | """Reorder DataFrame columns by specifying desired order as list of col names. 16 | 17 | Columns not specified retain their order and follow after the columns specified 18 | in `column_order`. 19 | 20 | All columns specified within the `column_order` list must be present within `df`. 21 | 22 | This method does not mutate the original DataFrame. 23 | 24 | Examples: 25 | >>> import pandas as pd 26 | >>> import janitor 27 | >>> df = pd.DataFrame({"col1": [1, 1, 1], "col2": [2, 2, 2], "col3": [3, 3, 3]}) 28 | >>> df 29 | col1 col2 col3 30 | 0 1 2 3 31 | 1 1 2 3 32 | 2 1 2 3 33 | >>> df.reorder_columns(['col3', 'col1']) 34 | col3 col1 col2 35 | 0 3 1 2 36 | 1 3 1 2 37 | 2 3 1 2 38 | 39 | Notice that the column order of `df` is now `col3`, `col1`, `col2`. 40 | 41 | Internally, this function uses `DataFrame.reindex` with `copy=False` 42 | to avoid unnecessary data duplication. 43 | 44 | Args: 45 | df: `DataFrame` to reorder 46 | column_order: A list of column names or Pandas `Index` 47 | specifying their order in the returned `DataFrame`. 48 | 49 | Raises: 50 | IndexError: If a column within `column_order` is not found 51 | within the DataFrame. 52 | 53 | Returns: 54 | A pandas DataFrame with reordered columns. 55 | """ # noqa: E501 56 | check("column_order", column_order, [list, tuple, pd.Index]) 57 | 58 | if any(col not in df.columns for col in column_order): 59 | raise IndexError( 60 | "One or more columns in `column_order` were not found in the " 61 | "DataFrame." 62 | ) 63 | 64 | # if column_order is a Pandas index, needs conversion to list: 65 | column_order = list(column_order) 66 | 67 | return df.reindex( 68 | columns=( 69 | column_order 70 | + [col for col in df.columns if col not in column_order] 71 | ), 72 | copy=False, 73 | ) 74 | -------------------------------------------------------------------------------- /janitor/functions/round_to_fraction.py: -------------------------------------------------------------------------------- 1 | """Implementation of `round_to_fraction`""" 2 | 3 | from typing import Hashable 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pandas_flavor as pf 8 | 9 | from janitor.utils import check, check_column, deprecated_alias 10 | 11 | 12 | @pf.register_dataframe_method 13 | @deprecated_alias(col_name="column_name") 14 | def round_to_fraction( 15 | df: pd.DataFrame, 16 | column_name: Hashable, 17 | denominator: float, 18 | digits: float = np.inf, 19 | ) -> pd.DataFrame: 20 | """Round all values in a column to a fraction. 21 | 22 | This method mutates the original DataFrame. 23 | 24 | Taken from [the R package](https://github.com/sfirke/janitor/issues/235). 25 | 26 | Also, optionally round to a specified number of digits. 
27 | 28 | Examples: 29 | Round numeric column to the nearest 1/4 value. 30 | 31 | >>> import numpy as np 32 | >>> import pandas as pd 33 | >>> import janitor 34 | >>> df = pd.DataFrame({ 35 | ... "a1": [1.263, 2.499, np.nan], 36 | ... "a2": ["x", "y", "z"], 37 | ... }) 38 | >>> df 39 | a1 a2 40 | 0 1.263 x 41 | 1 2.499 y 42 | 2 NaN z 43 | >>> df.round_to_fraction("a1", denominator=4) 44 | a1 a2 45 | 0 1.25 x 46 | 1 2.50 y 47 | 2 NaN z 48 | 49 | Args: 50 | df: A pandas DataFrame. 51 | column_name: Name of column to round to fraction. 52 | denominator: The denominator of the fraction for rounding. Must be 53 | a positive number. 54 | digits: The number of digits for rounding after rounding to the 55 | fraction. Default is np.inf (i.e. no subsequent rounding). 56 | 57 | Raises: 58 | ValueError: If `denominator` is not a positive number. 59 | 60 | Returns: 61 | A pandas DataFrame with a column's values rounded. 62 | """ 63 | check_column(df, column_name) 64 | check("denominator", denominator, [float, int]) 65 | check("digits", digits, [float, int]) 66 | 67 | if denominator <= 0: 68 | raise ValueError("denominator is expected to be a positive number.") 69 | 70 | df[column_name] = round(df[column_name] * denominator, 0) / denominator 71 | if not np.isinf(digits): 72 | df[column_name] = round(df[column_name], digits) 73 | 74 | return df 75 | -------------------------------------------------------------------------------- /janitor/functions/shuffle.py: -------------------------------------------------------------------------------- 1 | """Implementation of `shuffle` functions.""" 2 | 3 | from typing import Any 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | 9 | @pf.register_dataframe_method 10 | def shuffle( 11 | df: pd.DataFrame, random_state: Any = None, reset_index: bool = True 12 | ) -> pd.DataFrame: 13 | """Shuffle the rows of the DataFrame. 14 | 15 | This method does not mutate the original DataFrame. 16 | 17 | Super-sugary syntax! Underneath the hood, we use `df.sample(frac=1)`, 18 | with the option to set the random state. 19 | 20 | Examples: 21 | >>> import pandas as pd 22 | >>> import janitor 23 | >>> df = pd.DataFrame({ 24 | ... "col1": range(5), 25 | ... "col2": list("abcde"), 26 | ... }) 27 | >>> df 28 | col1 col2 29 | 0 0 a 30 | 1 1 b 31 | 2 2 c 32 | 3 3 d 33 | 4 4 e 34 | >>> df.shuffle(random_state=42) 35 | col1 col2 36 | 0 1 b 37 | 1 4 e 38 | 2 2 c 39 | 3 0 a 40 | 4 3 d 41 | 42 | Args: 43 | df: A pandas DataFrame. 44 | random_state: If provided, set a seed for the random number 45 | generator. Passed to `pd.DataFrame.sample()`. 46 | reset_index: If True, reset the dataframe index to the default 47 | RangeIndex. 48 | 49 | Returns: 50 | A shuffled pandas DataFrame. 51 | """ 52 | result = df.sample(frac=1, random_state=random_state) 53 | if reset_index: 54 | result = result.reset_index(drop=True) 55 | return result 56 | -------------------------------------------------------------------------------- /janitor/functions/sort_naturally.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `sort_naturally` function.""" 2 | 3 | from typing import Any 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | from natsort import index_natsorted 8 | 9 | 10 | @pf.register_dataframe_method 11 | def sort_naturally( 12 | df: pd.DataFrame, column_name: str, **natsorted_kwargs: Any 13 | ) -> pd.DataFrame: 14 | """Sort a DataFrame by a column using *natural* sorting. 
15 | 16 | Natural sorting is distinct from 17 | the default lexicographical sorting provided by `pandas`. 18 | For example, given the following list of items: 19 | 20 | ```python 21 | ["A1", "A11", "A3", "A2", "A10"] 22 | ``` 23 | 24 | Lexicographical sorting would give us: 25 | 26 | ```python 27 | ["A1", "A10", "A11", "A2", "A3"] 28 | ``` 29 | 30 | By contrast, "natural" sorting would give us: 31 | 32 | ```python 33 | ["A1", "A2", "A3", "A10", "A11"] 34 | ``` 35 | 36 | This function thus provides *natural* sorting 37 | on a single column of a dataframe. 38 | 39 | To accomplish this, we compute 40 | the naturally sorted order of the column's values, 41 | and then reindex the entire dataframe 42 | in that naturally sorted order. 43 | 44 | Natural sorting is provided by the Python package 45 | [natsort](https://natsort.readthedocs.io/en/master/index.html). 46 | 47 | All keyword arguments for `natsort` should be provided 48 | after the column name to sort by. 49 | They are passed through to natsort's `index_natsorted` function. 50 | 51 | Examples: 52 | >>> import pandas as pd 53 | >>> import janitor 54 | >>> df = pd.DataFrame( 55 | ... { 56 | ... "Well": ["A21", "A3", "A21", "B2", "B51", "B12"], 57 | ... "Value": [1, 2, 13, 3, 4, 7], 58 | ... } 59 | ... ) 60 | >>> df 61 | Well Value 62 | 0 A21 1 63 | 1 A3 2 64 | 2 A21 13 65 | 3 B2 3 66 | 4 B51 4 67 | 5 B12 7 68 | >>> df.sort_naturally("Well") 69 | Well Value 70 | 1 A3 2 71 | 0 A21 1 72 | 2 A21 13 73 | 3 B2 3 74 | 5 B12 7 75 | 4 B51 4 76 | 77 | Args: 78 | df: A pandas DataFrame. 79 | column_name: The column on which natural sorting should take place. 80 | **natsorted_kwargs: Keyword arguments to be passed 81 | to natsort's `index_natsorted` function. 82 | 83 | Returns: 84 | A sorted pandas DataFrame. 85 | """ 86 | new_order = index_natsorted(df[column_name], **natsorted_kwargs) 87 | return df.iloc[new_order, :] 88 | -------------------------------------------------------------------------------- /janitor/functions/take_first.py: -------------------------------------------------------------------------------- 1 | """Implementation of take_first function.""" 2 | 3 | from typing import Hashable, Iterable, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | 9 | @pf.register_dataframe_method 10 | def take_first( 11 | df: pd.DataFrame, 12 | subset: Union[Hashable, Iterable[Hashable]], 13 | by: Hashable, 14 | ascending: bool = True, 15 | ) -> pd.DataFrame: 16 | """Take the first row within each group specified by `subset`. 17 | 18 | Examples: 19 | >>> import pandas as pd 20 | >>> import janitor 21 | >>> df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [0, 1, 2, 3]}) 22 | >>> df 23 | a b 24 | 0 x 0 25 | 1 x 1 26 | 2 y 2 27 | 3 y 3 28 | >>> df.take_first(subset="a", by="b") 29 | a b 30 | 0 x 0 31 | 2 y 2 32 | 33 | Args: 34 | df: A pandas DataFrame. 35 | subset: Column(s) defining the group. 36 | by: Column to sort by. 37 | ascending: Whether to sort in ascending order. 38 | 39 | Returns: 40 | A pandas DataFrame.
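Because `take_first` sorts before dropping duplicates (see the implementation that follows), passing `ascending=False` keeps the row with the *largest* `by` value in each group. A minimal sketch:

```python
import pandas as pd
import janitor  # noqa: F401

df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [0, 1, 2, 3]})
print(df.take_first(subset="a", by="b", ascending=False))
#    a  b
# 3  y  3
# 1  x  1
```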
41 | """ 42 | result = df.sort_values(by=by, ascending=ascending).drop_duplicates( 43 | subset=subset, keep="first" 44 | ) 45 | 46 | return result 47 | -------------------------------------------------------------------------------- /janitor/functions/then.py: -------------------------------------------------------------------------------- 1 | """Implementation source for `then`.""" 2 | 3 | from typing import Callable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import refactored_function 9 | 10 | 11 | @pf.register_dataframe_method 12 | @refactored_function( 13 | message="This function will be deprecated in a 1.x release. " 14 | "Kindly use `pd.DataFrame.pipe` instead." 15 | ) 16 | def then(df: pd.DataFrame, func: Callable) -> pd.DataFrame: 17 | """Add an arbitrary function to run in the `pyjanitor` method chain. 18 | 19 | This method does not mutate the original DataFrame. 20 | 21 | !!!note 22 | 23 | This function will be deprecated in a 1.x release. 24 | Please use `pd.DataFrame.pipe` instead. 25 | 26 | Examples: 27 | A trivial example using a lambda `func`. 28 | 29 | >>> import pandas as pd 30 | >>> import janitor 31 | >>> (pd.DataFrame({"a": [1, 2, 3], "b": [7, 8, 9]}) 32 | ... .then(lambda df: df * 2)) 33 | a b 34 | 0 2 14 35 | 1 4 16 36 | 2 6 18 37 | 38 | Args: 39 | df: A pandas DataFrame. 40 | func: A function you would like to run in the method chain. 41 | It should take one parameter and return one parameter, each being 42 | the DataFrame object. After that, do whatever you want in the 43 | middle. Go crazy. 44 | 45 | Returns: 46 | A pandas DataFrame. 47 | """ 48 | df = func(df) 49 | return df 50 | -------------------------------------------------------------------------------- /janitor/functions/to_datetime.py: -------------------------------------------------------------------------------- 1 | """Implementation source for `to_datetime`.""" 2 | 3 | from typing import Any, Hashable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import deprecated_alias, refactored_function 9 | 10 | 11 | @pf.register_dataframe_method 12 | @deprecated_alias(column="column_name") 13 | @refactored_function( 14 | message=( 15 | "This function will be deprecated in a 1.x release. " 16 | "Please use `jn.transform_columns` instead." 17 | ) 18 | ) 19 | def to_datetime( 20 | df: pd.DataFrame, column_name: Hashable, **kwargs: Any 21 | ) -> pd.DataFrame: 22 | """Convert column to a datetime type, in-place. 23 | 24 | Intended to be the method-chaining equivalent of: 25 | 26 | ```python 27 | df[column_name] = pd.to_datetime(df[column_name], **kwargs) 28 | ``` 29 | 30 | This method mutates the original DataFrame. 31 | 32 | !!!note 33 | 34 | This function will be deprecated in a 1.x release. 35 | Please use [`jn.transform_column`][janitor.functions.transform_columns.transform_column] 36 | instead. 37 | 38 | Examples: 39 | Converting a string column to datetime type with custom format. 40 | 41 | >>> import pandas as pd 42 | >>> import janitor 43 | >>> df = pd.DataFrame({'date': ['20200101', '20200202', '20200303']}) 44 | >>> df 45 | date 46 | 0 20200101 47 | 1 20200202 48 | 2 20200303 49 | >>> df.to_datetime('date', format='%Y%m%d') 50 | date 51 | 0 2020-01-01 52 | 1 2020-02-02 53 | 2 2020-03-03 54 | 55 | Read the pandas documentation for [`to_datetime`][pd_docs] for more information. 56 | 57 | [pd_docs]: https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html 58 | 59 | Args: 60 | df: A pandas DataFrame. 
61 | column_name: Column name. 62 | **kwargs: Provide any kwargs that `pd.to_datetime` can take. 63 | 64 | Returns: 65 | A pandas DataFrame with updated datetime data. 66 | """ # noqa: E501 67 | df[column_name] = pd.to_datetime(df[column_name], **kwargs) 68 | 69 | return df 70 | -------------------------------------------------------------------------------- /janitor/functions/toset.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `toset` function.""" 2 | 3 | from typing import Set 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import refactored_function 9 | 10 | 11 | @pf.register_series_method 12 | @refactored_function( 13 | message=( 14 | "This function will be deprecated in a 1.x release. " 15 | "Please use `set(df[column])` instead." 16 | ) 17 | ) 18 | def toset(series: pd.Series) -> Set: 19 | """Return a set of the values. 20 | 21 | !!!note 22 | 23 | This function will be deprecated in a 1.x release. 24 | Please use `set(df[column])` instead. 25 | 26 | These are each a scalar type, which is a Python scalar 27 | (for str, int, float) or a pandas scalar 28 | (for Timestamp/Timedelta/Interval/Period) 29 | 30 | Examples: 31 | >>> import pandas as pd 32 | >>> import janitor 33 | >>> s = pd.Series([1, 2, 3, 5, 5], index=["a", "b", "c", "d", "e"]) 34 | >>> s 35 | a 1 36 | b 2 37 | c 3 38 | d 5 39 | e 5 40 | dtype: int64 41 | >>> s.toset() 42 | {1, 2, 3, 5} 43 | 44 | Args: 45 | series: A pandas series. 46 | 47 | Returns: 48 | A set of values. 49 | """ 50 | 51 | return set(series.tolist()) 52 | -------------------------------------------------------------------------------- /janitor/functions/truncate_datetime.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `truncate_datetime` family of functions.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pandas_flavor as pf 6 | from pandas.api.types import is_datetime64_any_dtype 7 | 8 | 9 | @pf.register_dataframe_method 10 | def truncate_datetime_dataframe( 11 | df: pd.DataFrame, 12 | datepart: str, 13 | ) -> pd.DataFrame: 14 | """Truncate times down to a user-specified precision of 15 | year, month, day, hour, minute, or second. 16 | 17 | This method does not mutate the original DataFrame. 18 | 19 | Examples: 20 | >>> import pandas as pd 21 | >>> import janitor 22 | >>> df = pd.DataFrame({ 23 | ... "foo": ["xxxx", "yyyy", "zzzz"], 24 | ... "dt": pd.date_range("2020-03-11", periods=3, freq="15H"), 25 | ... }) 26 | >>> df 27 | foo dt 28 | 0 xxxx 2020-03-11 00:00:00 29 | 1 yyyy 2020-03-11 15:00:00 30 | 2 zzzz 2020-03-12 06:00:00 31 | >>> df.truncate_datetime_dataframe("day") 32 | foo dt 33 | 0 xxxx 2020-03-11 34 | 1 yyyy 2020-03-11 35 | 2 zzzz 2020-03-12 36 | 37 | Args: 38 | df: The pandas DataFrame on which to truncate datetime. 39 | datepart: Truncation precision, YEAR, MONTH, DAY, 40 | HOUR, MINUTE, SECOND. (String is automagically 41 | capitalized) 42 | 43 | Raises: 44 | ValueError: If an invalid `datepart` precision is passed in. 45 | 46 | Returns: 47 | A pandas DataFrame with all valid datetimes truncated down 48 | to the specified precision. 
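The core trick in the implementation below is that casting `datetime64` values to a coarser NumPy unit drops everything finer than that unit. A standalone illustration:

```python
import numpy as np

stamps = np.array(["2020-03-11T15:27:45"], dtype="datetime64[s]")
print(stamps.astype("datetime64[D]"))  # ['2020-03-11']    -- day precision
print(stamps.astype("datetime64[h]"))  # ['2020-03-11T15'] -- hour precision
```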
49 | """ 50 | # idea from Stack Overflow 51 | # https://stackoverflow.com/a/28783971/7175713 52 | # https://numpy.org/doc/stable/reference/arrays.datetime.html 53 | ACCEPTABLE_DATEPARTS = { 54 | "YEAR": "datetime64[Y]", 55 | "MONTH": "datetime64[M]", 56 | "DAY": "datetime64[D]", 57 | "HOUR": "datetime64[h]", 58 | "MINUTE": "datetime64[m]", 59 | "SECOND": "datetime64[s]", 60 | } 61 | datepart = datepart.upper() 62 | if datepart not in ACCEPTABLE_DATEPARTS: 63 | raise ValueError( 64 | "Received an invalid `datepart` precision. " 65 | f"Please enter any one of {ACCEPTABLE_DATEPARTS}." 66 | ) 67 | 68 | dictionary = {} 69 | 70 | for label, series in df.items(): 71 | if is_datetime64_any_dtype(series): 72 | dtype = ACCEPTABLE_DATEPARTS[datepart] 73 | # TODO: add branch for pyarrow arrays 74 | series = np.array(series._values, dtype=dtype) 75 | dictionary[label] = series 76 | 77 | return pd.DataFrame(dictionary) 78 | -------------------------------------------------------------------------------- /janitor/functions/update_where.py: -------------------------------------------------------------------------------- 1 | """Function for updating values based on other column values.""" 2 | 3 | from typing import Any, Hashable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | from pandas.api.types import is_bool_dtype 8 | 9 | from janitor.utils import deprecated_alias 10 | 11 | 12 | @pf.register_dataframe_method 13 | @deprecated_alias(target_col="target_column_name") 14 | def update_where( 15 | df: pd.DataFrame, 16 | conditions: Any, 17 | target_column_name: Hashable, 18 | target_val: Any, 19 | ) -> pd.DataFrame: 20 | """Add multiple conditions to update a column in the dataframe. 21 | 22 | This method does not mutate the original DataFrame. 23 | 24 | Examples: 25 | >>> import janitor 26 | >>> data = { 27 | ... "a": [1, 2, 3, 4], 28 | ... "b": [5, 6, 7, 8], 29 | ... "c": [0, 0, 0, 0], 30 | ... } 31 | >>> df = pd.DataFrame(data) 32 | >>> df 33 | a b c 34 | 0 1 5 0 35 | 1 2 6 0 36 | 2 3 7 0 37 | 3 4 8 0 38 | >>> df.update_where( 39 | ... conditions = (df.a > 2) & (df.b < 8), 40 | ... target_column_name = 'c', 41 | ... target_val = 10 42 | ... ) 43 | a b c 44 | 0 1 5 0 45 | 1 2 6 0 46 | 2 3 7 10 47 | 3 4 8 0 48 | >>> df.update_where( # supports pandas *query* style string expressions 49 | ... conditions = "a > 2 and b < 8", 50 | ... target_column_name = 'c', 51 | ... target_val = 10 52 | ... ) 53 | a b c 54 | 0 1 5 0 55 | 1 2 6 0 56 | 2 3 7 10 57 | 3 4 8 0 58 | 59 | Args: 60 | df: The pandas DataFrame object. 61 | conditions: Conditions used to update a target column 62 | and target value. 63 | target_column_name: Column to be updated. If column does not exist 64 | in DataFrame, a new column will be created; note that entries 65 | that do not get set in the new column will be null. 66 | target_val: Value to be updated. 67 | 68 | Raises: 69 | ValueError: If `conditions` does not return a boolean array-like 70 | data structure. 71 | 72 | Returns: 73 | A pandas DataFrame. 74 | """ 75 | 76 | df = df.copy() 77 | 78 | # use query mode if a string expression is passed 79 | if isinstance(conditions, str): 80 | conditions = df.eval(conditions) 81 | 82 | if not is_bool_dtype(conditions): 83 | raise ValueError( 84 | """ 85 | Kindly ensure that `conditions` passed 86 | evaluates to a Boolean dtype. 
87 | """ 88 | ) 89 | 90 | df.loc[conditions, target_column_name] = target_val 91 | 92 | return df 93 | -------------------------------------------------------------------------------- /janitor/ml.py: -------------------------------------------------------------------------------- 1 | """Machine learning specific functions.""" 2 | 3 | from typing import Hashable, Iterable, List, Optional, Tuple, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from .utils import deprecated_alias 9 | 10 | 11 | @pf.register_dataframe_method 12 | @deprecated_alias( 13 | target_columns="target_column_names", 14 | feature_columns="feature_column_names", 15 | ) 16 | def get_features_targets( 17 | df: pd.DataFrame, 18 | target_column_names: Union[str, Union[List, Tuple], Hashable], 19 | feature_column_names: Optional[Union[str, Iterable[str], Hashable]] = None, 20 | ) -> Tuple[pd.DataFrame, pd.DataFrame]: 21 | """Get the features and targets as separate DataFrames/Series. 22 | 23 | This method does not mutate the original DataFrame. 24 | 25 | The behaviour is as such: 26 | 27 | - `target_column_names` is mandatory. 28 | - If `feature_column_names` is present, then we will respect the column 29 | names inside there. 30 | - If `feature_column_names` is not passed in, then we will assume that 31 | the rest of the columns are feature columns, and return them. 32 | 33 | Examples: 34 | >>> import pandas as pd 35 | >>> import janitor.ml 36 | >>> df = pd.DataFrame( 37 | ... {"a": [1, 2, 3], "b": [-2, 0, 4], "c": [1.23, 7.89, 4.56]} 38 | ... ) 39 | >>> X, Y = df.get_features_targets(target_column_names=["a", "c"]) 40 | >>> X 41 | b 42 | 0 -2 43 | 1 0 44 | 2 4 45 | >>> Y 46 | a c 47 | 0 1 1.23 48 | 1 2 7.89 49 | 2 3 4.56 50 | 51 | Args: 52 | df: The pandas DataFrame object. 53 | target_column_names: Either a column name or an 54 | iterable (list or tuple) of column names that are the target(s) to 55 | be predicted. 56 | feature_column_names: The column name or 57 | iterable of column names that are the features (a.k.a. predictors) 58 | used to predict the targets. 59 | 60 | Returns: 61 | `(X, Y)` the feature matrix (`X`) and the target matrix (`Y`). 62 | Both are pandas DataFrames. 
63 | """ 64 | Y = df[target_column_names] 65 | 66 | if feature_column_names: 67 | X = df[feature_column_names] 68 | else: 69 | if isinstance(target_column_names, (list, tuple)): # noqa: W503 70 | xcols = [c for c in df.columns if c not in target_column_names] 71 | else: 72 | xcols = [c for c in df.columns if target_column_names != c] 73 | 74 | X = df[xcols] 75 | return X, Y 76 | -------------------------------------------------------------------------------- /janitor/polars/__init__.py: -------------------------------------------------------------------------------- 1 | from .clean_names import clean_names, make_clean_names 2 | from .complete import complete, expand 3 | from .dates_to_polars import convert_excel_date, convert_matlab_date 4 | from .pivot_longer import pivot_longer, pivot_longer_spec 5 | from .row_to_names import row_to_names 6 | 7 | __all__ = [ 8 | "pivot_longer_spec", 9 | "pivot_longer", 10 | "clean_names", 11 | "make_clean_names", 12 | "row_to_names", 13 | "expand", 14 | "complete", 15 | "convert_excel_date", 16 | "convert_matlab_date", 17 | ] 18 | -------------------------------------------------------------------------------- /janitor/spark/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * # noqa: F403, F401 2 | -------------------------------------------------------------------------------- /janitor/spark/backend.py: -------------------------------------------------------------------------------- 1 | """Backend functions for pyspark.""" 2 | 3 | from functools import wraps 4 | 5 | try: 6 | from pyspark.pandas.extensions import register_dataframe_accessor 7 | 8 | except ImportError: 9 | from janitor.utils import import_message 10 | 11 | import_message( 12 | submodule="spark", 13 | package="pyspark", 14 | conda_channel="conda-forge", 15 | pip_install=True, 16 | ) 17 | 18 | 19 | def register_dataframe_method(method): 20 | """Register a function as a method attached to the Pyspark DataFrame. 21 | 22 | !!! note 23 | 24 | Modified based on pandas_flavor.register. 
25 | 26 | 30 | """ 31 | 32 | def inner(*args, **kwargs): 33 | class AccessorMethod: 34 | def __init__(self, pyspark_obj): 35 | self._obj = pyspark_obj 36 | 37 | @wraps(method) 38 | def __call__(self, *args, **kwargs): 39 | return method(self._obj, *args, **kwargs) 40 | 41 | register_dataframe_accessor(method.__name__)(AccessorMethod) 42 | 43 | return method 44 | 45 | return inner() 46 | -------------------------------------------------------------------------------- /janitor/testing_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/janitor/testing_utils/__init__.py -------------------------------------------------------------------------------- /janitor/xarray/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * # noqa: F403, F401 2 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | # Project information 2 | site_name: pyjanitor documentation 3 | site_url: https://pyjanitor-devs.github.io/pyjanitor 4 | site_description: >- 5 | Python implementation of the R package janitor 6 | 7 | # Repository 8 | repo_name: "pyjanitor-devs/pyjanitor" 9 | repo_url: "https://github.com/pyjanitor-devs/pyjanitor" 10 | 11 | # Configuration 12 | docs_dir: mkdocs/ 13 | watch: 14 | - janitor/ 15 | 16 | theme: 17 | name: "material" 18 | palette: 19 | - media: "(prefers-color-scheme: light)" 20 | scheme: default 21 | primary: "blue grey" 22 | accent: "light blue" 23 | icon: 24 | logo: "fontawesome/solid/book" 25 | features: 26 | - navigation.instant 27 | # - navigation.tabs 28 | - navigation.top 29 | - toc.follow 30 | - content.code.copy 31 | language: en 32 | 33 | # Page tree 34 | # We customize the navigation by hand to control the order 35 | # in which pages show up. 
36 | nav: 37 | - Home: index.md 38 | - API Reference: 39 | - Functions: api/functions.md 40 | - Biology: api/biology.md 41 | - Chemistry: api/chemistry.md 42 | - Engineering: api/engineering.md 43 | - Finance: api/finance.md 44 | - Input/Output (io): api/io.md 45 | - Machine Learning: api/ml.md 46 | - Math: api/math.md 47 | # - PySpark: api/pyspark.md # will be added back later 48 | - Polars: api/polars.md 49 | - Timeseries: api/timeseries.md 50 | - XArray: api/xarray.md 51 | - Development Guide: devguide.md 52 | - Changelog: CHANGELOG.md 53 | - Authors: AUTHORS.md 54 | 55 | plugins: 56 | - search 57 | - autorefs 58 | - mkdocstrings: 59 | default_handler: python 60 | handlers: 61 | python: 62 | options: 63 | docstring_style: "google" 64 | docstring_options: 65 | trim_doctest_flags: true 66 | show_if_no_docstring: false 67 | show_root_toc_entry: false 68 | show_root_heading: false 69 | show_submodules: true 70 | show_source: true 71 | members_order: alphabetical 72 | # - mknotebooks: 73 | # execute: true 74 | # write_markdown: true 75 | # allow_errors: true 76 | # timeout: 1200 77 | # binder: true 78 | # binder_service_name: "gh" 79 | # binder_branch: "master" 80 | 81 | markdown_extensions: 82 | - admonition 83 | - pymdownx.highlight: 84 | use_pygments: true 85 | - pymdownx.inlinehilite 86 | # - pymdownx.tabbed: 87 | # alternate_style: true 88 | - pymdownx.superfences 89 | 90 | extra_javascript: 91 | - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML 92 | 93 | extra_css: 94 | - css/apidocs.css 95 | -------------------------------------------------------------------------------- /mkdocs/AUTHORS.md: -------------------------------------------------------------------------------- 1 | ../AUTHORS.md -------------------------------------------------------------------------------- /mkdocs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ../CHANGELOG.md -------------------------------------------------------------------------------- /mkdocs/api/biology.md: -------------------------------------------------------------------------------- 1 | # Biology 2 | 3 | ::: janitor.biology 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/chemistry.md: -------------------------------------------------------------------------------- 1 | # Chemistry 2 | 3 | ::: janitor.chemistry 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/engineering.md: -------------------------------------------------------------------------------- 1 | # Engineering 2 | 3 | ::: janitor.engineering 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/finance.md: -------------------------------------------------------------------------------- 1 | # Finance 2 | 3 | ::: janitor.finance 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/functions.md: -------------------------------------------------------------------------------- 1 | # Functions 2 | 3 | ::: janitor.functions 4 | options: 5 | filters: 6 | - "!^_" 7 | members: 8 | - add_columns 9 | - alias 10 | - also 11 | - bin_numeric 12 | - case_when 13 | - change_index_dtype 14 | - change_type 15 | - clean_names 16 | - coalesce 17 | - collapse_levels 18 | - complete 19 
| - concatenate_columns 20 | - conditional_join 21 | - convert_date 22 | - count_cumulative_unique 23 | - currency_column_to_numeric 24 | - deconcatenate_column 25 | - drop_constant_columns 26 | - drop_duplicate_columns 27 | - dropnotnull 28 | - encode_categorical 29 | - expand_column 30 | - expand_grid 31 | - explode_index 32 | - factorize_columns 33 | - fill 34 | - filter 35 | - find_replace 36 | - flag_nulls 37 | - get_dupes 38 | - groupby_agg 39 | - groupby_topk 40 | - impute 41 | - jitter 42 | - join_apply 43 | - label_encode 44 | - limit_column_characters 45 | - min_max_scale 46 | - move 47 | - mutate 48 | - pivot 49 | - process_text 50 | - remove_columns 51 | - remove_empty 52 | - rename_columns 53 | - reorder_columns 54 | - round_to_fraction 55 | - row_to_names 56 | - select 57 | - shuffle 58 | - sort_column_value_order 59 | - sort_naturally 60 | - summarise 61 | - take_first 62 | - then 63 | - to_datetime 64 | - toset 65 | - transform_columns 66 | - truncate_datetime 67 | - update_where 68 | - utils 69 | -------------------------------------------------------------------------------- /mkdocs/api/io.md: -------------------------------------------------------------------------------- 1 | # Input/Output (io) 2 | 3 | ::: janitor.io 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/math.md: -------------------------------------------------------------------------------- 1 | # Math 2 | 3 | ::: janitor.math 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/ml.md: -------------------------------------------------------------------------------- 1 | # Machine Learning 2 | 3 | ::: janitor.ml 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/polars.md: -------------------------------------------------------------------------------- 1 | # Polars 2 | 3 | ::: janitor.polars 4 | options: 5 | filters: 6 | - "!^_" 7 | members: 8 | - clean_names 9 | - complete 10 | - pivot_longer 11 | - row_to_names 12 | -------------------------------------------------------------------------------- /mkdocs/api/timeseries.md: -------------------------------------------------------------------------------- 1 | # Timeseries 2 | 3 | ::: janitor.timeseries 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/xarray.md: -------------------------------------------------------------------------------- 1 | # XArray 2 | 3 | ::: janitor.xarray.functions 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/css/apidocs.css: -------------------------------------------------------------------------------- 1 | /* https://mkdocstrings.github.io/theming/#css-classes */ 2 | .doc-property { 3 | border-radius: 15px; 4 | padding: 0 5px; 5 | } 6 | .doc-property-special { 7 | background-color: blue; 8 | color: white; 9 | } 10 | .doc-property-private { 11 | background-color: red; 12 | color: white; 13 | } 14 | .doc-property-property { 15 | background-color: green; 16 | color: white; 17 | } 18 | .doc-property-read-only { 19 | background-color: yellow; 20 | color: black; 21 | } 22 | 23 | /* https://mkdocstrings.github.io/handlers/python/#recommended-style-material */ 24 | /* Indentation. 
*/ 25 | div.doc-contents:not(.first) { 26 | padding-left: 25px; 27 | border-left: 4px solid rgba(230, 230, 230); 28 | margin-bottom: 80px; 29 | } 30 | 31 | /* add a keyboard shortcut icon for search bar, 32 | * https://github.com/squidfunk/mkdocs-material/issues/2574#issuecomment-821979698 33 | */ 34 | [data-md-toggle="search"]:not(:checked) ~ .md-header .md-search__form::after { 35 | position: absolute; 36 | top: 0.3rem; 37 | right: 0.3rem; 38 | display: block; 39 | padding: 0.1rem 0.4rem; 40 | color: var(--md-default-bg-color--lighter); 41 | font-weight: bold; 42 | font-size: 0.8rem; 43 | border: 0.05rem solid var(--md-default-bg-color--lighter); 44 | border-radius: 0.1rem; 45 | content: "/"; 46 | } 47 | 48 | /* prevent selection of chevron in example blocks 49 | * cf. https://mkdocstrings.github.io/recipes/#prevent-selection-of-prompts-and-output-in-python-code-blocks 50 | */ 51 | .highlight .gp, .highlight .go { /* Generic.Prompt, Generic.Output */ 52 | user-select: none; 53 | } 54 | -------------------------------------------------------------------------------- /mkdocs/development/lazy_imports.md: -------------------------------------------------------------------------------- 1 | # Lazy Imports 2 | 3 | In `pyjanitor`, we use lazy imports to speed up `import janitor`. 4 | Prior to using lazy imports, `import janitor` would take about 1-2 seconds to complete, 5 | thereby causing significant delays for downstream consumers of `pyjanitor`. 6 | Slow importing is undesirable, as it slows down programs that demand low latency. 7 | 8 | ## A brief history of the decision 9 | 10 | The original issue was raised by @ericmjl 11 | in issue ([#1059](https://github.com/pyjanitor-devs/pyjanitor/issues/1059)). 12 | The background there is that the scientific Python community 13 | was struggling with imports that took a long time, 14 | especially in packages that depended on SciPy and Pandas. 15 | As `pyjanitor` is a package that depends on `pandas`, 16 | it was important for us to see if we could improve the speed at which imports happened. 17 | 18 | ## Current Speed Benchmark 19 | 20 | As of 5 April 2022, imports take about ~0.5 seconds (give or take) to complete 21 | on a GitHub Codespaces workspace. 22 | This is much more desirable than the original 1-2 seconds, 23 | also measured on a GitHub Codespaces workspace. 24 | 25 | ## How to benchmark 26 | 27 | To benchmark, we run the following line: 28 | 29 | ```bash 30 | python -X importtime -c "import janitor" 2> timing.log 31 | ``` 32 | 33 | Then, using the `tuna` CLI tool, we can view the timing log: 34 | 35 | ```bash 36 | tuna timing.log 37 | ``` 38 | 39 | Note: You may need to install tuna using `pip install -U tuna`. 40 | `tuna`'s development repository is [on GitHub][tuna]. 41 | 42 | [tuna]: https://github.com/nschloe/tuna 43 | 44 | You'll be redirected to your browser, 45 | where the web UI will allow you to see 46 | which imports are causing time delays. 47 | 48 | ![Tuna's Web UI](./images/tuna.png) 49 | 50 | ## Which imports to lazily load 51 | 52 | Generally speaking, the _external_ imports are the ones that, 53 | when lazily loaded, will give the maximal gain in speed. 54 | You can also opt to lazily load `pyjanitor` submodules, 55 | but we doubt they will give much advantage in speed. 56 | -------------------------------------------------------------------------------- /mkdocs/environment.yaml: -------------------------------------------------------------------------------- 1 | # 14 August 2022: Temporarily commenting out.
2 | # See: https://github.com/pyjanitor-devs/pyjanitor/pull/1147#issuecomment-1214508157 3 | # for more context on why. 4 | # name: pyjanitor-doc 5 | # channels: 6 | #   - conda-forge 7 | # dependencies: 8 | #   - python 9 | #   # required 10 | #   - pandas 11 | #   - pandas-flavor 12 | #   - multipledispatch 13 | #   - scipy 14 | #   # optional 15 | #   - biopython 16 | #   - natsort 17 | #   - pyspark>=3.2.0 18 | #   - rdkit 19 | #   - tqdm 20 | #   - unyt 21 | #   - xarray 22 | #   - numba 23 | #   # doc 24 | #   - mkdocs 25 | #   - mkdocs-material 26 | #   # To fix #1146 27 | #   # - mkdocstrings-python 28 | #   - mkdocstrings=0.18.1 29 | #   - mkdocstrings-python-legacy=0.2.2 30 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | exclude = ''' 3 | /( 4 | \.git 5 | | \.hg 6 | | \.mypy_cache 7 | | \.tox 8 | | \.venv 9 | | _build 10 | | buck-out 11 | | build 12 | | dist 13 | | env 14 | | venv 15 | )/ 16 | ''' 17 | include = '\.pyi?$' 18 | line-length = 79 19 | target-version = ['py36', 'py37', 'py38'] 20 | 21 | [tool.interrogate] 22 | exclude = ["setup.py", "docs", "nbconvert_config.py"] 23 | fail-under = 55 24 | ignore-init-method = true 25 | ignore-init-module = true 26 | ignore-module = false 27 | ignore-private = false 28 | ignore-semiprivate = false 29 | quiet = false 30 | verbose = 2 31 | 32 | # https://docs.pytest.org/en/6.2.x/mark.html#registering-marks 33 | [tool.pytest.ini_options] 34 | markers = [ 35 | "functions: test for general functions", 36 | "biology: tests for biology", 37 | "chemistry: tests for chemistry", 38 | "finance: tests for finance", 39 | "utils: utility tests", 40 | "engineering: tests for engineering", 41 | "ml: tests for machine learning", 42 | "polars: tests for polars methods", 43 | "spark_functions: tests for pyspark functions", 44 | "xarray: tests for xarray functions", 45 | "timeseries: tests for timeseries", 46 | "documentation: tests for documentation", 47 | "turtle: tests that take more than 5 seconds to execute", 48 | ] 49 | 50 | 51 | [tool.ruff] 52 | # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 53 | lint.select = ["E", "F", "I"] 54 | lint.ignore = [] 55 | 56 | # Allow fix for all enabled rules (when `--fix` is provided). 57 | lint.fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] 58 | lint.unfixable = [] 59 | 60 | # Exclude a variety of commonly ignored directories. 61 | exclude = [ 62 | ".bzr", 63 | ".direnv", 64 | ".eggs", 65 | ".git", 66 | ".git-rewrite", 67 | ".hg", 68 | ".mypy_cache", 69 | ".nox", 70 | ".pants.d", 71 | ".pytype", 72 | ".ruff_cache", 73 | ".svn", 74 | ".tox", 75 | ".venv", 76 | "__pypackages__", 77 | "_build", 78 | "buck-out", 79 | "build", 80 | "dist", 81 | "node_modules", 82 | "venv", 83 | "nbconvert_config.py", 84 | ] 85 | 86 | # Same as Black. 87 | line-length = 88 88 | 89 | # Allow unused variables when underscore-prefixed. 90 | lint.dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 91 | 92 | # Assume Python 3.10 93 | target-version = "py310" 94 | 95 | [tool.ruff.lint.mccabe] 96 | # Unlike Flake8, default to a complexity level of 10.
97 | max-complexity = 10 98 | -------------------------------------------------------------------------------- /scripts/ci/build_environment.sh: -------------------------------------------------------------------------------- 1 | conda install -c conda-forge mamba 2 | mamba env create -f environment-dev.yml 3 | -------------------------------------------------------------------------------- /scripts/ci/unpack_environment.sh: -------------------------------------------------------------------------------- 1 | mkdir -p /tmp/pyjanitor-dev_env 2 | tar -xzf pyjanitor-dev.tar.gz -C /tmp/pyjanitor-dev_env 3 | source /tmp/pyjanitor-dev_env/bin/activate 4 | conda-unpack 5 | -------------------------------------------------------------------------------- /scripts/count_functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to count the number of functions inside each source file. 3 | 4 | Can be used for many purposes. 5 | 6 | Intended to be run from pyjanitor's top-level directory. 7 | 8 | 9 | """ 10 | 11 | import ast 12 | import os 13 | from pathlib import Path 14 | 15 | 16 | def count_number_of_functions(filepath): 17 | """Count number of functions inside a .py file.""" 18 | # Taken from: https://stackoverflow.com/a/37514895/1274908 19 | with open(filepath, "r+") as f: 20 | tree = ast.parse(f.read()) 21 | return sum(isinstance(exp, ast.FunctionDef) for exp in tree.body) 22 | 23 | 24 | def janitor_submodules(): 25 | """Yield a list of janitor submodules and their full paths.""" 26 | files = [f for f in os.listdir("janitor") if f.endswith(".py")] 27 | 28 | for file in files: 29 | yield Path("janitor") / file 30 | 31 | 32 | def main(): 33 | """Main executable function.""" 34 | for filepath in janitor_submodules(): 35 | num_funcs = count_number_of_functions(filepath) 36 | print(filepath, num_funcs) 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /scripts/docker_deploy.sh: -------------------------------------------------------------------------------- 1 | echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin 2 | docker push ericmjl/pyjanitor:devcontainer 3 | -------------------------------------------------------------------------------- /talks/scipy2019/friends.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/talks/scipy2019/friends.png -------------------------------------------------------------------------------- /talks/scipy2019/readthedocs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/talks/scipy2019/readthedocs.png -------------------------------------------------------------------------------- /talks/scipy2019/slides.md: -------------------------------------------------------------------------------- 1 | 6 | 7 | # Clean APIs for Data Cleaning 8 | 9 | Eric J. 
Ma 10 | Novartis Institutes for Biomedical Research 11 | SciPy 2019 12 | 13 | --- 14 | 15 | ## pandasshee 16 | 17 | --- 18 | 19 | ## readable code 20 | 21 | --- 22 | 23 | ## side-by-side 24 | 25 | - pop-up pandas code 26 | - pop-up pyjanitor code 27 | 28 | --- 29 | 30 | ## live demo 31 | 32 | --- 33 | 34 | ## history 35 | 36 | --- 37 | 38 | ## welcoming newcomers 39 | 40 | --- 41 | 42 | ## 43 | -------------------------------------------------------------------------------- /talks/scipy2019/sprints.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/talks/scipy2019/sprints.jpg -------------------------------------------------------------------------------- /talks/scipy2019/twitter-wars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/talks/scipy2019/twitter-wars.png -------------------------------------------------------------------------------- /tests/biology/test_join_fasta.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | import pytest 5 | from helpers import running_on_ci 6 | 7 | import janitor.biology # noqa: F403, F401 8 | 9 | # Skip all tests if Biopython not installed 10 | pytestmark = pytest.mark.skipif( 11 | (importlib.util.find_spec("Bio") is None) & ~running_on_ci(), 12 | reason="Biology tests relying on Biopython only required for CI", 13 | ) 14 | 15 | 16 | @pytest.mark.biology 17 | def test_join_fasta(biodf): 18 | """Test adding sequence from FASTA file in `sequence` column.""" 19 | df = biodf.join_fasta( 20 | filename=os.path.join(pytest.TEST_DATA_DIR, "sequences.fasta"), 21 | id_col="sequence_accession", 22 | column_name="sequence", 23 | ) 24 | 25 | assert "sequence" in df.columns 26 | -------------------------------------------------------------------------------- /tests/chemistry/test_maccs_keys_fingerprint.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import pytest 4 | from helpers import running_on_ci 5 | 6 | import janitor.chemistry # noqa: F401 7 | 8 | # Skip all tests if rdkit not installed 9 | pytestmark = pytest.mark.skipif( 10 | (importlib.util.find_spec("rdkit") is None) & ~running_on_ci(), 11 | reason="rdkit tests only required for CI", 12 | ) 13 | 14 | 15 | @pytest.mark.chemistry 16 | def test_maccs_keys_fingerprint(chemdf): 17 | """Test conversion of SMILES strings to MACCS keys fingerprints.""" 18 | maccs_keys = chemdf.smiles2mol("smiles", "mol").maccs_keys_fingerprint( 19 | "mol" 20 | ) 21 | assert maccs_keys.shape == (10, 167) 22 | assert set(maccs_keys.to_numpy().flatten().tolist()) == set([0, 1]) 23 | -------------------------------------------------------------------------------- /tests/chemistry/test_molecular_descriptors.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import pytest 4 | from helpers import running_on_ci 5 | 6 | # Skip all tests if rdkit not installed 7 | pytestmark = pytest.mark.skipif( 8 | (importlib.util.find_spec("rdkit") is None) & ~running_on_ci(), 9 | reason="rdkit tests only required for CI", 10 | ) 11 | 12 | 13 | @pytest.mark.chemistry 14 | def test_molecular_descriptors(chemdf): 15 | """Test conversion of Mol objects to 39 column molecular 
descriptors.""" 16 | mol_desc = chemdf.smiles2mol("smiles", "mol").molecular_descriptors("mol") 17 | assert mol_desc.shape == (10, 39) 18 | -------------------------------------------------------------------------------- /tests/chemistry/test_morgan_fingerprint.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import pytest 4 | from helpers import running_on_ci 5 | 6 | pytestmark = pytest.mark.skipif( 7 | (importlib.util.find_spec("rdkit") is None) & ~running_on_ci(), 8 | reason="rdkit tests only required for CI", 9 | ) 10 | 11 | 12 | @pytest.mark.chemistry 13 | def test_morgan_fingerprint_counts(chemdf): 14 | """Test counts of Morgan Fingerprints converted from Mol objects.""" 15 | morgans = chemdf.smiles2mol("smiles", "mol").morgan_fingerprint( 16 | "mol", kind="counts" 17 | ) 18 | assert morgans.shape == (10, 2048) 19 | assert (morgans.to_numpy() >= 0).all() 20 | 21 | 22 | @pytest.mark.chemistry 23 | def test_morgan_fingerprint_bits(chemdf): 24 | """Test bits of Morgan Fingerprints converted from Mol objects.""" 25 | morgans = chemdf.smiles2mol("smiles", "mol").morgan_fingerprint( 26 | "mol", kind="bits" 27 | ) 28 | assert morgans.shape == (10, 2048) 29 | assert set(morgans.to_numpy().flatten().tolist()) == set([0, 1]) 30 | 31 | 32 | @pytest.mark.chemistry 33 | def test_morgan_fingerprint_kind_error(chemdf): 34 | """Test `morgan_fingerprint` raises exception for invalid `kind`.""" 35 | with pytest.raises(ValueError): 36 | chemdf.smiles2mol("smiles", "mol").morgan_fingerprint( 37 | "mol", kind="invalid-kind" 38 | ) 39 | -------------------------------------------------------------------------------- /tests/chemistry/test_smiles2mol.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import pytest 4 | from helpers import running_on_ci 5 | 6 | # Skip all tests if rdkit not installed 7 | pytestmark = pytest.mark.skipif( 8 | (importlib.util.find_spec("rdkit") is None) & ~running_on_ci(), 9 | reason="rdkit tests only required for CI", 10 | ) 11 | 12 | 13 | @pytest.mark.parametrize("progressbar", [None, "terminal", "notebook"]) 14 | @pytest.mark.chemistry 15 | def test_smiles2mol(chemdf, progressbar): 16 | """Test each SMILES properly converted to Mol object.""" 17 | from rdkit import Chem 18 | 19 | chemdf = chemdf.smiles2mol("smiles", "mol", progressbar) 20 | assert "mol" in chemdf.columns 21 | for elem in chemdf["mol"]: 22 | assert isinstance(elem, Chem.rdchem.Mol) 23 | 24 | 25 | def test_smiles2mol_bad_progressbar(chemdf): 26 | """Test that bad progressbar value raises error.""" 27 | with pytest.raises(ValueError): 28 | chemdf = chemdf.smiles2mol("smiles", "mol", progressbar="blah") 29 | -------------------------------------------------------------------------------- /tests/finance/test_convert_currency.py: -------------------------------------------------------------------------------- 1 | """Tests for convert_currency() in finance module.""" 2 | 3 | from datetime import date, datetime 4 | 5 | import pytest 6 | import requests 7 | 8 | from janitor.finance import convert_currency # noqa: F401 9 | 10 | 11 | @pytest.mark.finance 12 | @pytest.mark.xfail(reason="changes made to web API prevent this from running") 13 | def test_make_currency_api_request(): 14 | """ 15 | Test for currency API request. 16 | 17 | This test exists because we rely solely on the service by 18 | exchangeratesapi. 
That said, we also mark it as expected to fail because 19 | it sometimes pings the exchange rates API too frequently and causes 20 | tests to fail. 21 | 22 | For an example of how this test fails, see: 23 | https://github.com/pyjanitor-devs/pyjanitor/issues/147 24 | """ 25 | r = requests.get("https://api.exchangeratesapi.io") 26 | assert r.status_code == 200 27 | 28 | 29 | @pytest.mark.xfail(reason="changes made to web API prevent this from running") 30 | @pytest.mark.finance 31 | def test_make_new_currency_col(dataframe): 32 | """Test converting to same currency equals original currency column.""" 33 | df = dataframe.convert_currency("a", "USD", "USD", make_new_column=True) 34 | assert all(df["a"] == df["a_USD"]) 35 | 36 | 37 | @pytest.mark.finance 38 | @pytest.mark.xfail(reason="changes made to web API prevent this from running") 39 | def test_historical_datetime(dataframe): 40 | """Test conversion raises exception for datetime outside API range.""" 41 | with pytest.raises(ValueError): 42 | assert dataframe.convert_currency( 43 | "a", 44 | "USD", 45 | "AUD", 46 | make_new_column=True, 47 | historical_date=datetime(1982, 10, 27), 48 | ) 49 | 50 | 51 | @pytest.mark.finance 52 | @pytest.mark.xfail(reason="changes made to web API prevent this from running") 53 | def test_historical_date(dataframe): 54 | """Test conversion raises exception for date outside API range.""" 55 | with pytest.raises(ValueError): 56 | assert dataframe.convert_currency( 57 | "a", 58 | "USD", 59 | "AUD", 60 | make_new_column=True, 61 | historical_date=date(1982, 10, 27), 62 | ) 63 | 64 | 65 | @pytest.mark.finance 66 | @pytest.mark.xfail(reason="changes made to web API prevent this from running") 67 | def test_currency_check(dataframe): 68 | """Test conversion raises exception for invalid currency.""" 69 | with pytest.raises(ValueError): 70 | assert dataframe.convert_currency("a", "USD", "INVALID-CURRENCY") 71 | -------------------------------------------------------------------------------- /tests/finance/test_convert_stock.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from janitor.finance import get_symbol 4 | 5 | 6 | @pytest.mark.xfail(reason="Flaky because it depends on internet connectivity.") 7 | def test_convert_stock(): 8 | """ 9 | Tests the get_symbol function; 10 | get_symbol should return the appropriate string 11 | corresponding to the abbreviation. 12 | This string will be a company's full name, 13 | and the abbreviation will be the NYSE 14 | symbol for the company. 15 | 16 | Example: 17 | print(get_symbol("aapl")) 18 | console >> Apple Inc. 19 | 20 | If the symbol does not have a corresponding 21 | company, NoneType should be returned. 22 | """ 23 | assert get_symbol("GME") == "GameStop Corp." 24 | assert get_symbol("AAPL") != "Aramark" 25 | assert get_symbol("ASNF") is None 26 | -------------------------------------------------------------------------------- /tests/finance/test_get_symbol.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from janitor.finance import get_symbol 4 | 5 | """ 6 | Tests the get_symbol helper function. 7 | 8 | Test 1: GME is GameStop Corp. Test should run fine. 9 | Test 2: GME is not Globus Medical Inc. 10 | Test 3: A little redundant, but it's another 11 | 'happy path' to show get_symbol works for more 12 | abbreviations than just the one tested so far.
13 | Test 4: ZZZZ does not belong to any company, 14 | therefore it should be None 15 | """ 16 | 17 | 18 | @pytest.mark.xfail( 19 | reason="Flaky, because it depends on internet connectivity." 20 | ) 21 | def test_get_symbol(): 22 | assert get_symbol("GME") == "GameStop Corp." 23 | assert get_symbol("GME") != "Globus Medical Inc." 24 | assert get_symbol("F") == "Ford Motor Company" 25 | assert get_symbol("ZZZZ") is None 26 | -------------------------------------------------------------------------------- /tests/functions/test_add_columns.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from hypothesis import given, settings 5 | from hypothesis import strategies as st 6 | from pandas.testing import assert_series_equal 7 | 8 | from janitor.testing_utils.strategies import df_strategy 9 | 10 | 11 | @pytest.mark.functions 12 | @given( 13 | df=df_strategy(), 14 | x_vals=st.floats(), 15 | n_yvals=st.integers(min_value=0, max_value=100), 16 | ) 17 | @settings(deadline=None, max_examples=10) 18 | def test_add_columns(df, x_vals, n_yvals): 19 | """ 20 | Test for adding multiple columns at the same time. 21 | """ 22 | y_vals = np.linspace(0, 42, n_yvals) 23 | 24 | if n_yvals != len(df) or n_yvals == 0: 25 | with pytest.raises(ValueError): 26 | df = df.add_columns(x=x_vals, y=y_vals) 27 | 28 | else: 29 | df = df.add_columns(x=x_vals, y=y_vals) 30 | series = pd.Series([x_vals] * len(df)) 31 | series.name = "x" 32 | assert_series_equal(df["x"], series) 33 | 34 | series = pd.Series(y_vals) 35 | series.name = "y" 36 | assert_series_equal(df["y"], series) 37 | -------------------------------------------------------------------------------- /tests/functions/test_alias.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas.testing import assert_series_equal 3 | 4 | 5 | def test_alias_no_name(): 6 | """Test output if Series does not have a name""" 7 | series = pd.Series([1, 2, 3]) 8 | assert_series_equal(series, series.alias()) 9 | 10 | 11 | def test_alias_callable(): 12 | """Test output if alias is a callable""" 13 | series = pd.Series([1, 2, 3], name="UPPER") 14 | assert_series_equal(series.rename("upper"), series.alias(str.lower)) 15 | 16 | 17 | def test_alias_scalar(): 18 | """Test output if alias is a scalar""" 19 | series = pd.Series([1, 2, 3], name="UPPER") 20 | assert_series_equal(series.rename("upper"), series.alias("upper")) 21 | -------------------------------------------------------------------------------- /tests/functions/test_also.py: -------------------------------------------------------------------------------- 1 | """Unit tests for `.also()`.""" 2 | 3 | from unittest.mock import Mock 4 | 5 | import pytest 6 | 7 | 8 | def remove_first_two_letters_from_col_names(df): 9 | """Helper function to mutate dataframe by changing column names.""" 10 | col_names = df.columns 11 | col_names = [name[2:] for name in col_names] 12 | df.columns = col_names 13 | return df 14 | 15 | 16 | def remove_rows_3_and_4(df): 17 | """Helper function to mutate dataframe by removing rows.""" 18 | df = df.drop(3, axis=0) 19 | df = df.drop(4, axis=0) 20 | return df 21 | 22 | 23 | def drop_inplace(df): 24 | """ 25 | Helper function to mutate dataframe by dropping a column.
26 | 27 | We usually would not use `inplace=True` in a block, 28 | but the intent here is to test that 29 | the in-place modification of a dataframe 30 | doesn't get passed through in the `.also()` function. 31 | Hence, we tell Flake8 to skip checking `PD002` on that line. 32 | 33 | .. # noqa: DAR101 34 | """ 35 | df.drop(columns=[df.columns[0]], inplace=True) # noqa: PD002 36 | 37 | 38 | @pytest.mark.functions 39 | def test_also_column_manipulation_no_change(dataframe): 40 | """Test that changed dataframe inside `.also()` doesn't get returned.""" 41 | cols = tuple(dataframe.columns) 42 | df = dataframe.also(remove_first_two_letters_from_col_names) 43 | assert dataframe is df 44 | assert cols == tuple(df.columns) 45 | 46 | 47 | @pytest.mark.functions 48 | def test_also_remove_rows_no_change(dataframe): 49 | """Test that changed dataframe inside `.also()` doesn't get returned.""" 50 | df = dataframe.also(remove_rows_3_and_4) 51 | rows = tuple(df.index) 52 | assert rows == (0, 1, 2, 3, 4, 5, 6, 7, 8) 53 | 54 | 55 | @pytest.mark.functions 56 | def test_also_runs_function(dataframe): 57 | """Test that `.also()` executes the function.""" 58 | method = Mock(return_value=None) 59 | df = dataframe.also(method) 60 | assert id(df) == id(dataframe) 61 | assert method.call_count == 1 62 | 63 | 64 | @pytest.mark.functions 65 | def test_also_args(dataframe): 66 | """Test that the args are passed through to the function.""" 67 | method = Mock(return_value=None) 68 | _ = dataframe.also(method, 5) 69 | 70 | assert method.call_args[0][1] == 5 71 | 72 | 73 | @pytest.mark.functions 74 | def test_also_kwargs(dataframe): 75 | """Test that the kwargs are passed through to the function.""" 76 | method = Mock(return_value=None) 77 | _ = dataframe.also(method, n=5) 78 | 79 | assert method.call_args[1] == {"n": 5} 80 | 81 | 82 | @pytest.mark.functions 83 | def test_also_drop_inplace(dataframe): 84 | """Test that in-place modification of dataframe does not pass through.""" 85 | cols = tuple(dataframe.columns) 86 | df = dataframe.also(drop_inplace) 87 | assert tuple(df.columns) == cols 88 | -------------------------------------------------------------------------------- /tests/functions/test_bin_numeric.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import given, settings 3 | 4 | from janitor.testing_utils.strategies import df_strategy 5 | 6 | 7 | @pytest.mark.functions 8 | @given(df=df_strategy()) 9 | @settings(deadline=None, max_examples=10) 10 | def test_bin_numeric_expected_columns(df): 11 | df = df.bin_numeric(from_column_name="a", to_column_name="a_bin") 12 | expected_columns = [ 13 | "a", 14 | "Bell__Chart", 15 | "decorated-elephant", 16 | "animals@#$%^", 17 | "cities", 18 | "a_bin", 19 | ] 20 | 21 | assert set(df.columns) == set(expected_columns) 22 | 23 | 24 | @pytest.mark.functions 25 | @given(df=df_strategy()) 26 | @settings(deadline=None, max_examples=10) 27 | def test_bin_numeric_kwargs_has_no_retbins(df): 28 | with pytest.raises(ValueError): 29 | labels = ["a", "b", "c", "d", "e"] 30 | df.bin_numeric( 31 | from_column_name="a", 32 | to_column_name="a_bin", 33 | bins=5, 34 | labels=labels, 35 | retbins=True, 36 | ) 37 | -------------------------------------------------------------------------------- /tests/functions/test_coalesce.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from pandas.testing import assert_frame_equal 5 | 
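# Illustrative sketch (an editor-added assumption, not part of the original
# file): the tests below exercise `coalesce`, which acts like SQL's COALESCE,
# filling each row with the first non-null value across the listed columns:
#
#     pd.DataFrame({"a": [1, None], "b": [9, 9]}).coalesce("a", "b")
#     # column "a" becomes [1.0, 9.0]
#
# `test_coalesce_without_target` below pins this behavior down via
# `combine_first`.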
6 | 7 | @pytest.fixture 8 | def df(): 9 | "Base DataFrame fixture" 10 | return pd.DataFrame( 11 | {"a": [1, np.nan, 3], "b": [2, 3, 1], "c": [2, np.nan, 9]} 12 | ) 13 | 14 | 15 | @pytest.mark.xfail(reason="column_names is a variable args") 16 | def test_wrong_type_column_names(df): 17 | """Raise Error if wrong type is provided for `column_names`.""" 18 | with pytest.raises(TypeError): 19 | df.coalesce("a", "b") 20 | 21 | 22 | def test_wrong_type_target_column_name(df): 23 | """Raise TypeError if wrong type is provided for `target_column_name`.""" 24 | with pytest.raises(TypeError): 25 | df.coalesce("a", "b", target_column_name=["new_name"]) 26 | 27 | 28 | def test_wrong_type_default_value(df): 29 | """Raise TypeError if wrong type is provided for `default_value`.""" 30 | with pytest.raises(TypeError): 31 | df.coalesce( 32 | "a", "b", target_column_name="new_name", default_value=[1, 2, 3] 33 | ) 34 | 35 | 36 | def test_len_column_names_less_than_2(df): 37 | """Raise Error if column_names length is less than 2.""" 38 | with pytest.raises(ValueError): 39 | df.coalesce("a") 40 | 41 | 42 | def test_empty_column_names(df): 43 | """Return dataframe if `column_names` is empty.""" 44 | assert_frame_equal(df.coalesce(), df) 45 | 46 | 47 | @pytest.mark.functions 48 | def test_coalesce_without_target(df): 49 | """Test output if `target_column_name` is not provided.""" 50 | result = df.coalesce("a", "b", "c") 51 | expected_output = df.assign( 52 | a=df["a"].combine_first(df["b"].combine_first(df["c"])) 53 | ) 54 | assert_frame_equal(result, expected_output) 55 | 56 | 57 | @pytest.mark.functions 58 | def test_coalesce_without_delete(): 59 | """Test output if nulls remain and `default_value` is provided.""" 60 | df = pd.DataFrame( 61 | {"s1": [np.nan, np.nan, 6, 9, 9], "s2": [np.nan, 8, 7, 9, 9]} 62 | ) 63 | expected = df.assign(s3=df.s1.combine_first(df.s2).fillna(0)) 64 | result = df.coalesce("s1", "s2", target_column_name="s3", default_value=0) 65 | assert_frame_equal(result, expected) 66 | 67 | 68 | def test_coalesce_duplicate_columns(): 69 | """ 70 | Test output on duplicate columns. 71 | """ 72 | df = pd.DataFrame( 73 | np.array([[1.0, 2.0, 2.0], [np.nan, 3.0, np.nan], [3.0, 1.0, 9.0]]), 74 | columns=["a", "a", "c"], 75 | ) 76 | 77 | expected = pd.DataFrame( 78 | np.array([[1.0, 2.0, 2.0], [3, 3.0, np.nan], [3.0, 1.0, 9.0]]), 79 | columns=["a", "a", "c"], 80 | ) 81 | 82 | actual = df.coalesce("a") 83 | 84 | assert_frame_equal(expected, actual) 85 | -------------------------------------------------------------------------------- /tests/functions/test_concatenate_columns.py: -------------------------------------------------------------------------------- 1 | """Tests for concatenate_columns.""" 2 | 3 | import pytest 4 | 5 | from janitor.errors import JanitorError 6 | 7 | 8 | @pytest.mark.functions 9 | def test_concatenate_columns(dataframe): 10 | """Basic test for concatenate_columns.""" 11 | df = dataframe.concatenate_columns( 12 | column_names=["a", "decorated-elephant"], 13 | sep="-", 14 | new_column_name="index", 15 | ) 16 | assert "index" in df.columns 17 | 18 | 19 | @pytest.mark.functions 20 | def test_concatenate_columns_null_values(missingdata_df): 21 | """Test for concatenating columns with null values. 22 | 23 | Null values ought to show up as "nan" in strings 24 | in the concatenated column. 
25 | """ 26 | df = missingdata_df.concatenate_columns( 27 | column_names=["a", "decorated-elephant"], 28 | sep="-", 29 | new_column_name="index", 30 | ignore_empty=True, 31 | ) 32 | expected_values = ["1.0-1", "2.0-2", "nan-3"] * 3 33 | assert expected_values == df["index"].tolist() 34 | 35 | 36 | @pytest.mark.functions 37 | @pytest.mark.parametrize("column_names", [["a"], []]) 38 | def test_concatenate_columns_errors(dataframe, column_names): 39 | """ 40 | Test that an error is raised when fewer than two columns are specified. 41 | """ 42 | with pytest.raises( 43 | JanitorError, match="At least two columns must be specified" 44 | ): 45 | dataframe.concatenate_columns( 46 | column_names=column_names, new_column_name="index" 47 | ) 48 | -------------------------------------------------------------------------------- /tests/functions/test_convert_excel_date.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | 7 | @pytest.mark.functions 8 | def test_convert_excel_date(): 9 | # using openpyxl as the engine staves off an error that crops up 10 | # during the CI build with xlrd 11 | df = ( 12 | pd.read_excel( 13 | Path(pytest.EXAMPLES_DIR) / "notebooks" / "dirty_data.xlsx", 14 | engine="openpyxl", 15 | ) 16 | .clean_names() 17 | .convert_excel_date("hire_date") 18 | ) 19 | 20 | assert df["hire_date"].dtype == "M8[ns]" 21 | -------------------------------------------------------------------------------- /tests/functions/test_convert_matlab_date.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_convert_matlab_date(): 7 | mlab = [ 8 | 733_301.0, 9 | 729_159.0, 10 | 734_471.0, 11 | 737_299.563_296_356_5, 12 | 737_300.000_000_000_0, 13 | ] 14 | df = pd.DataFrame(mlab, columns=["dates"]).convert_matlab_date("dates") 15 | 16 | assert df["dates"].dtype == "M8[ns]" 17 | -------------------------------------------------------------------------------- /tests/functions/test_convert_unix_date.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | 7 | @pytest.mark.skipif( 8 | os.name == "nt", reason="Skip *nix-specific tests on Windows" 9 | ) 10 | def test_convert_unix_date(): 11 | unix = [ 12 | "1284101485", 13 | 1_284_101_486, 14 | "1284101487000", 15 | 1_284_101_488_000, 16 | "1284101489", 17 | "1284101490", 18 | -2_147_483_648, 19 | 2_147_483_648, 20 | ] 21 | df = pd.DataFrame(unix, columns=["dates"]).convert_unix_date("dates") 22 | 23 | assert df["dates"].dtype == "M8[ns]" 24 | -------------------------------------------------------------------------------- /tests/functions/test_drop_constant_columns.py: -------------------------------------------------------------------------------- 1 | """Tests for drop_constant_columns.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | from pandas.testing import assert_frame_equal 6 | 7 | 8 | @pytest.mark.functions 9 | def test_drop_constant_columns(df_constant_columns): 10 | """Test that executes drop_constant_columns function.""" 11 | processed_df = df_constant_columns.drop_constant_columns() 12 | expected_col_list = ["Bell__Chart", "decorated-elephant", "cities"] 13 | assert processed_df.columns.to_list() == expected_col_list 14 | data = { 15 | "Bell__Chart": [1.234_523_45, 2.456_234, 3.234_612_5] * 3, 16 | "decorated-elephant":
[1, 2, 3] * 3, 17 | "cities": ["Cambridge", "Shanghai", "Basel"] * 3, 18 | } 19 | expected_df = pd.DataFrame(data) 20 | assert_frame_equal(processed_df, expected_df) 21 | -------------------------------------------------------------------------------- /tests/functions/test_drop_duplicate_columns.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pandas.testing import assert_frame_equal 4 | 5 | import janitor # noqa: F401 6 | 7 | 8 | @pytest.mark.functions 9 | def test_drop_duplicate_columns(df_duplicated_columns): 10 | # df_duplicated_columns contains columns 'a', duplicated three times 11 | clean_df = df_duplicated_columns.drop_duplicate_columns(column_name="a") 12 | assert clean_df.columns.to_list() == ["b", "a", "a"] 13 | expected_df = pd.DataFrame( 14 | {"b": range(10), "a": range(10, 20), "a*": range(20, 30)} 15 | ).clean_names(remove_special=True) 16 | assert_frame_equal(clean_df, expected_df) 17 | 18 | 19 | @pytest.mark.functions 20 | def test_drop_duplicate_columns_for_second_duplicated_column( 21 | df_duplicated_columns, 22 | ): 23 | clean_df = df_duplicated_columns.drop_duplicate_columns( 24 | column_name="a", nth_index=1 25 | ) 26 | expected_df = pd.DataFrame( 27 | {"a": range(10), "b": range(10), "a*": range(20, 30)} 28 | ).clean_names(remove_special=True) 29 | assert clean_df.columns.to_list() == ["a", "b", "a"] 30 | assert_frame_equal(clean_df, expected_df) 31 | 32 | 33 | @pytest.mark.functions 34 | def test_drop_duplicate_columns_for_third_duplicated_column( 35 | df_duplicated_columns, 36 | ): 37 | clean_df = df_duplicated_columns.drop_duplicate_columns( 38 | column_name="a", nth_index=2 39 | ) 40 | expected_df = pd.DataFrame( 41 | {"a": range(10), "b": range(10), "A": range(10, 20)} 42 | ).clean_names(remove_special=True) 43 | assert clean_df.columns.to_list() == ["a", "b", "a"] 44 | assert_frame_equal(clean_df, expected_df) 45 | 46 | 47 | @pytest.mark.functions 48 | def test_drop_duplicate_columns_with_error(df_duplicated_columns): 49 | with pytest.raises(IndexError): 50 | df_duplicated_columns.drop_duplicate_columns( 51 | column_name="a", nth_index=3 52 | ) 53 | -------------------------------------------------------------------------------- /tests/functions/test_dropnotnull.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pandas.testing import assert_frame_equal 4 | 5 | 6 | @pytest.mark.functions 7 | def test_dropnotnull(missingdata_df): 8 | df = missingdata_df.clean_names() 9 | df_drop = df.dropnotnull("bell_chart") 10 | 11 | assert pd.isna(df_drop["bell_chart"]).all() 12 | 13 | assert_frame_equal(df.loc[df_drop.index], df_drop) 14 | -------------------------------------------------------------------------------- /tests/functions/test_expand_column.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_expand_column(): 7 | data = { 8 | "col1": ["A, B", "B, C, D", "E, F", "A, E, F"], 9 | "col2": [1, 2, 3, 4], 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | expanded_df = df.expand_column(column_name="col1", sep=", ", concat=False) 14 | assert expanded_df.shape[1] == 6 15 | 16 | 17 | @pytest.mark.functions 18 | def test_expand_and_concat(): 19 | data = { 20 | "col1": ["A, B", "B, C, D", "E, F", "A, E, F"], 21 | "col2": [1, 2, 3, 4], 22 | } 23 | 24 | df = pd.DataFrame(data).expand_column( 
25 | column_name="col1", sep=", ", concat=True 26 | ) 27 | assert df.shape[1] == 8 28 | 29 | 30 | @pytest.mark.functions 31 | def test_sep_default_parameter(): 32 | """Test that the default parameter is a pipe character `|`.""" 33 | df = pd.DataFrame( 34 | { 35 | "col1": ["A|B", "B|C|D", "E|F", "A|E|F"], 36 | "col2": [1, 2, 3, 4], 37 | } 38 | ) 39 | result = df.expand_column("col1") 40 | 41 | assert result.shape[1] == 8 42 | -------------------------------------------------------------------------------- /tests/functions/test_fill_empty.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_fill_empty(null_df): 7 | df = null_df.fill_empty(column_names=["2"], value=3) 8 | assert set(df.loc[:, "2"]) == set([3]) 9 | 10 | 11 | @pytest.mark.functions 12 | def test_fill_empty_column_string(null_df): 13 | df = null_df.fill_empty(column_names="2", value=3) 14 | assert set(df.loc[:, "2"]) == set([3]) 15 | 16 | 17 | @pytest.mark.functions 18 | @pytest.mark.parametrize( 19 | "column_names", 20 | [ 21 | (0, 1, "2", "3"), # tuple 22 | [0, 1, "2", "3"], # list 23 | {0, 1, "2", "3"}, # set 24 | ({0: 0, 1: 1, "2": "2", "3": "3"}).keys(), # dict key 25 | ({0: 0, 1: 1, "2": "2", "3": "3"}).values(), # dict value 26 | pd.Index([0, 1, "2", "3"]), # Index 27 | ], 28 | ) 29 | def test_column_names_iterable_type(null_df, column_names): 30 | result = null_df.fill_empty(column_names=column_names, value=3) 31 | excepted = null_df.fillna(3) 32 | 33 | assert result.equals(excepted) 34 | -------------------------------------------------------------------------------- /tests/functions/test_filter_column_isin.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import assume, given, settings 3 | 4 | from janitor.testing_utils.strategies import ( 5 | categoricaldf_strategy, 6 | names_strategy, 7 | ) 8 | 9 | 10 | @pytest.mark.functions 11 | @given(df=categoricaldf_strategy(), iterable=names_strategy()) 12 | @settings(deadline=None, max_examples=10) 13 | def test_filter_column_isin(df, iterable): 14 | """ 15 | `filter_column_isin` should return the property that the column of 16 | interest's set of values should be a subset of the iterable provided. 17 | This encompasses a few scenarios: 18 | 19 | - Each element in iterable is present in the column. 20 | - No elements of iterable are present in the column. 21 | - A subset of elements in iterable are present in the column. 22 | 23 | All 3 cases can be caught by using subsets. 
24 | """ 25 | assume(len(iterable) >= 1) 26 | df = df.filter_column_isin("names", iterable) 27 | assert set(df["names"]).issubset(iterable) 28 | -------------------------------------------------------------------------------- /tests/functions/test_filter_on.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.functions 5 | @pytest.mark.parametrize("complement,expected", [(True, 6), (False, 3)]) 6 | def test_filter_on(dataframe, complement, expected): 7 | df = dataframe.filter_on("a == 3", complement=complement) 8 | assert len(df) == expected 9 | 10 | 11 | @pytest.mark.functions 12 | @pytest.mark.parametrize("complement,expected", [(True, 3), (False, 6)]) 13 | def test_filter_on_with_multiple_criteria(dataframe, complement, expected): 14 | df = dataframe.filter_on("(a == 3) | (a == 1)", complement=complement) 15 | assert len(df) == expected 16 | -------------------------------------------------------------------------------- /tests/functions/test_filter_string.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.functions 5 | def test_filter_string(dataframe): 6 | df = dataframe.filter_string( 7 | column_name="animals@#$%^", 8 | search_string="bbit", 9 | ) 10 | 11 | assert len(df) == 3 12 | 13 | 14 | def test_filter_string_complement(dataframe): 15 | df = dataframe.filter_string( 16 | column_name="cities", 17 | search_string="hang", 18 | complement=True, 19 | ) 20 | 21 | assert len(df) == 6 22 | 23 | 24 | def test_filter_string_case(dataframe): 25 | df = dataframe.filter_string( 26 | column_name="cities", 27 | search_string="B", 28 | case=False, 29 | ) 30 | 31 | assert len(df) == 6 32 | 33 | 34 | def test_filter_string_regex(dataframe): 35 | df = dataframe.change_type("Bell__Chart", str).filter_string( 36 | column_name="Bell__Chart", 37 | search_string="1.", 38 | regex=False, 39 | ) 40 | 41 | assert len(df) == 3 42 | -------------------------------------------------------------------------------- /tests/functions/test_find_replace.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def df(): 8 | return pd.DataFrame( 9 | {"a": [1, np.nan, 3], "b": [2, 3, 1], "c": [2, np.nan, 2]} 10 | ) 11 | 12 | 13 | @pytest.mark.functions 14 | def test_find_replace_single(df): 15 | assert df["a"].iloc[2] == 3 16 | df.find_replace(a={3: 5}) 17 | assert df["a"].iloc[2] == 5 18 | 19 | assert sum(df["c"] == 2) == 2 20 | assert sum(df["c"] == 5) == 0 21 | df.find_replace(c={2: 5}) 22 | assert sum(df["c"] == 2) == 0 23 | assert sum(df["c"] == 5) == 2 24 | 25 | 26 | @pytest.mark.functions 27 | def test_find_replace_null_raises_error(df): 28 | with pytest.raises(ValueError): 29 | df.find_replace(a={np.nan: 5}) 30 | 31 | 32 | @pytest.fixture 33 | def df_orders(): 34 | return pd.DataFrame( 35 | { 36 | "customer": ["Mary", "Tom", "Lila"], 37 | "order": ["ice coffee", "lemonade", "regular coffee"], 38 | } 39 | ) 40 | 41 | 42 | @pytest.mark.functions 43 | def test_find_replace_regex(df_orders): 44 | df_orders.find_replace(order={"coffee$": "latte"}, match="regex") 45 | assert df_orders["order"].iloc[0] == "latte" 46 | assert df_orders["order"].iloc[1] == "lemonade" 47 | assert df_orders["order"].iloc[-1] == "latte" 48 | 49 | 50 | @pytest.mark.functions 51 | def test_find_replace_regex_match_raises_error(df_orders): 52 | with 
pytest.raises(ValueError): 53 | df_orders.find_replace(order={"lemonade": "orange juice"}, match="bla") 54 | -------------------------------------------------------------------------------- /tests/functions/test_get_dupes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_get_dupes(): 7 | df = pd.DataFrame() 8 | df["a"] = [1, 2, 1] 9 | df["b"] = [1, 2, 1] 10 | df_dupes = df.get_dupes() 11 | assert df_dupes.shape == (2, 2) 12 | 13 | df2 = pd.DataFrame() 14 | df2["a"] = [1, 2, 3] 15 | df2["b"] = [1, 2, 3] 16 | df2_dupes = df2.get_dupes() 17 | assert df2_dupes.shape == (0, 2) 18 | -------------------------------------------------------------------------------- /tests/functions/test_impute.py: -------------------------------------------------------------------------------- 1 | """Tests for the `impute` functions""" 2 | 3 | import pytest 4 | from pandas.testing import assert_frame_equal 5 | 6 | 7 | @pytest.mark.functions 8 | def test_impute_single_value(missingdata_df): 9 | """Check if constant value is imputed correctly.""" 10 | df = missingdata_df.impute("a", 5) 11 | assert set(df["a"]) == set([1, 2, 5]) 12 | 13 | 14 | @pytest.mark.functions 15 | def test_impute_single_value_multiple_columns(missingdata_df): 16 | """Check if constant value is imputed correctly.""" 17 | df = missingdata_df.impute(["a", "Bell__Chart"], 5) 18 | assert_frame_equal( 19 | missingdata_df.assign(**df.loc[:, ["a", "Bell__Chart"]].fillna(5)), df 20 | ) 21 | 22 | 23 | @pytest.mark.functions 24 | @pytest.mark.parametrize( 25 | "statistic,expected", 26 | [ 27 | ("mean", set([1, 2, 1.5])), 28 | ("average", set([1, 2, 1.5])), 29 | ("median", set([1, 2, 1.5])), 30 | ("mode", set([1, 2])), 31 | ("min", set([1, 2])), 32 | ("minimum", set([1, 2])), 33 | ("max", set([1, 2])), 34 | ("maximum", set([1, 2])), 35 | ], 36 | ) 37 | def test_impute_statistical(missingdata_df, statistic, expected): 38 | """Check if imputing via statistic_column_name works correctly.""" 39 | df = missingdata_df.impute("a", statistic_column_name=statistic) 40 | assert set(df["a"]) == expected 41 | 42 | 43 | @pytest.mark.functions 44 | def test_impute_error_with_invalid_inputs(missingdata_df): 45 | """Check errors are properly raised with invalid inputs.""" 46 | with pytest.raises( 47 | ValueError, 48 | match="Only one of `value` or " 49 | "`statistic_column_name` " 50 | "should be provided.", 51 | ): 52 | missingdata_df.impute( 53 | "a", 54 | value=0, 55 | statistic_column_name="mean", 56 | ) 57 | 58 | with pytest.raises( 59 | KeyError, match="`statistic_column_name` must be one of.+" 60 | ): 61 | missingdata_df.impute("a", statistic_column_name="foobar") 62 | 63 | with pytest.raises( 64 | ValueError, match="Kindly specify a value or a statistic_column_name" 65 | ): 66 | missingdata_df.impute("a") 67 | -------------------------------------------------------------------------------- /tests/functions/test_join_apply.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pandas.testing import assert_frame_equal 4 | 5 | import janitor # noqa: F401 6 | 7 | 8 | @pytest.mark.functions 9 | def test_join_apply(): 10 | df = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}).join_apply( 11 | lambda x: 2 * x["a"] + x["b"], new_column_name="2a+b" 12 | ) 13 | 14 | expected = df.copy() 15 | expected["2a+b"] = [4, 7, 10] 16 | 17 | assert_frame_equal(df, expected) 18 | 
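# Illustrative sketch (an editor-added assumption, not part of the original
# file): because `join_apply` evaluates the function once per row, the same
# pattern shown above can also derive boolean flag columns:
#
#     flagged = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}).join_apply(
#         lambda x: x["a"] < x["b"], new_column_name="a_lt_b"
#     )
#     assert flagged["a_lt_b"].all()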
-------------------------------------------------------------------------------- /tests/functions/test_label_encode.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_single_column_label_encode(): 7 | df = pd.DataFrame( 8 | {"a": ["hello", "hello", "sup"], "b": [1, 2, 3]} 9 | ).label_encode(column_names="a") 10 | assert "a_enc" in df.columns 11 | 12 | 13 | @pytest.mark.functions 14 | def test_single_column_fail_label_encode(): 15 | with pytest.raises(ValueError): 16 | pd.DataFrame( 17 | {"a": ["hello", "hello", "sup"], "b": [1, 2, 3]} 18 | ).label_encode( 19 | column_names="c" 20 | ) # noqa: 841 21 | 22 | 23 | @pytest.mark.functions 24 | def test_multicolumn_label_encode(): 25 | df = pd.DataFrame( 26 | { 27 | "a": ["hello", "hello", "sup"], 28 | "b": [1, 2, 3], 29 | "c": ["aloha", "nihao", "nihao"], 30 | } 31 | ).label_encode(column_names=["a", "c"]) 32 | assert "a_enc" in df.columns 33 | assert "c_enc" in df.columns 34 | 35 | 36 | @pytest.mark.functions 37 | def test_label_encode_invalid_input(dataframe): 38 | with pytest.raises(NotImplementedError): 39 | dataframe.label_encode(1) 40 | -------------------------------------------------------------------------------- /tests/functions/test_limit_column_characters.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.functions 5 | def test_limit_column_characters(dataframe): 6 | df = dataframe.limit_column_characters(1) 7 | assert df.columns[0] == "a" 8 | assert df.columns[1] == "B" 9 | assert df.columns[2] == "d" 10 | assert df.columns[3] == "a_1" 11 | assert df.columns[4] == "c" 12 | 13 | 14 | @pytest.mark.functions 15 | def test_limit_column_characters_different_positions(dataframe): 16 | df = dataframe 17 | df.columns = ["first", "first", "second", "second", "first"] 18 | df.limit_column_characters(3) 19 | 20 | assert df.columns[0] == "fir" 21 | assert df.columns[1] == "fir_1" 22 | assert df.columns[2] == "sec" 23 | assert df.columns[3] == "sec_1" 24 | assert df.columns[4] == "fir_2" 25 | 26 | 27 | @pytest.mark.functions 28 | def test_limit_column_characters_different_positions_different_separator( 29 | dataframe, 30 | ): 31 | df = dataframe 32 | df.columns = ["first", "first", "second", "second", "first"] 33 | df.limit_column_characters(3, ".") 34 | 35 | assert df.columns[0] == "fir" 36 | assert df.columns[1] == "fir.1" 37 | assert df.columns[2] == "sec" 38 | assert df.columns[3] == "sec.1" 39 | assert df.columns[4] == "fir.2" 40 | 41 | 42 | @pytest.mark.functions 43 | def test_limit_column_characters_all_unique(dataframe): 44 | df = dataframe.limit_column_characters(2) 45 | assert df.columns[0] == "a" 46 | assert df.columns[1] == "Be" 47 | assert df.columns[2] == "de" 48 | assert df.columns[3] == "an" 49 | assert df.columns[4] == "ci" 50 | -------------------------------------------------------------------------------- /tests/functions/test_remove_columns.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.functions 5 | def test_remove_columns_one_col(dataframe): 6 | df = dataframe.remove_columns(column_names=["a"]) 7 | assert len(df.columns) == 4 8 | 9 | 10 | @pytest.mark.functions 11 | def test_remove_columns_mult_cols(dataframe): 12 | df = dataframe.remove_columns(column_names=["a", "Bell__Chart"]) 13 | assert len(df.columns) == 3 14 | 15 | 16 | 
@pytest.mark.functions 17 | def test_remove_columns_no_cols(dataframe): 18 | df = dataframe.remove_columns(column_names=[]) 19 | assert len(df.columns) == 5 20 | 21 | 22 | @pytest.mark.functions 23 | def test_remove_columns_all_cols(dataframe): 24 | df = dataframe.remove_columns( 25 | column_names=[ 26 | "a", 27 | "Bell__Chart", 28 | "decorated-elephant", 29 | "animals@#$%^", 30 | "cities", 31 | ] 32 | ) 33 | assert len(df.columns) == 0 34 | 35 | 36 | @pytest.mark.skip(reason="Not sure why this is failing") 37 | def test_remove_columns_strange_cols(dataframe): 38 | df = dataframe.remove_columns( 39 | column_names=[ 40 | "a", 41 | ["Bell__Chart", "decorated-elephant", "animals@#$%^", "cities"], 42 | ] 43 | ) 44 | assert len(df.columns) == 0 45 | 46 | 47 | @pytest.mark.functions 48 | def test_remove_columns_strange_cols_multilevel(multilevel_dataframe): 49 | # When creating a multi level dataframe with 4 columns * 2 columns 50 | # (16 columns in total) 51 | # From input 52 | 53 | # If 2 columns (2 tuples = 4 codes) are removed 54 | df = multilevel_dataframe.remove_columns( 55 | column_names=[("bar", "one"), ("baz", "two")] 56 | ) 57 | 58 | # Then the total number of codes must be 12 (16-4) 59 | assert ( 60 | len([item for sublist in df.columns.codes for item in sublist]) == 12 61 | ) 62 | -------------------------------------------------------------------------------- /tests/functions/test_remove_empty.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from hypothesis import given, settings 5 | 6 | from janitor.testing_utils.strategies import df_strategy 7 | 8 | 9 | @pytest.mark.functions 10 | @given(df=df_strategy()) 11 | @settings(deadline=None, max_examples=10) 12 | def test_remove_empty(df): 13 | """This test ensures that there are no columns that are completely null""" 14 | df = df.remove_empty() 15 | for col in df.columns: 16 | assert not pd.isna(df[col]).all() 17 | for r, d in df.iterrows(): 18 | assert not pd.isna(d).all() 19 | 20 | 21 | @pytest.mark.functions 22 | def test_index_after_remove_empty(): 23 | """This test ensures that the index is reset correctly""" 24 | df = pd.DataFrame() 25 | df["a"] = [1, np.nan, np.nan, 3, np.nan, 6] 26 | df["b"] = [1, np.nan, 1, 3, np.nan, 6] 27 | df_nonempty = df.remove_empty() 28 | assert np.array_equal( 29 | np.asarray(df_nonempty.index), np.asarray(range(0, len(df_nonempty))) 30 | ) 31 | 32 | 33 | @pytest.mark.functions 34 | def test_reset_index_false(): 35 | """Test output when reset_index is False""" 36 | df = pd.DataFrame() 37 | df["a"] = [1, np.nan, np.nan, 3, np.nan, 6] 38 | df["b"] = [1, np.nan, 1, 3, np.nan, 6] 39 | df_nonempty = df.remove_empty(reset_index=False) 40 | assert np.array_equal( 41 | df.notna().any(axis=1).to_numpy().nonzero()[0], 42 | df_nonempty.index.to_numpy(), 43 | ) 44 | -------------------------------------------------------------------------------- /tests/functions/test_rename_column.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import given # noqa: F401 3 | 4 | 5 | @pytest.mark.functions 6 | def test_rename_column(dataframe): 7 | df = dataframe.clean_names().rename_column("a", "index") 8 | assert set(df.columns) == set( 9 | ["index", "bell_chart", "decorated_elephant", "animals@#$%^", "cities"] 10 | ) 11 | assert "a" not in set(df.columns) 12 | 13 | 14 | @pytest.mark.functions 15 | def test_rename_column_absent_column(dataframe): 16 | """ 
17 | rename_column should raise an error if the column is absent. 18 | """ 19 | with pytest.raises(ValueError): 20 | dataframe.clean_names().rename_column("bb", "index") 21 | -------------------------------------------------------------------------------- /tests/functions/test_rename_columns.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import given # noqa: F401 3 | 4 | 5 | @pytest.mark.functions 6 | def test_rename_columns(dataframe): 7 | """ 8 | Tests that rename_columns renames multiple columns based on the 9 | dictionary mappings. 10 | """ 11 | df = dataframe.clean_names().rename_columns( 12 | {"a": "index", "bell_chart": "chart"} 13 | ) 14 | assert set(df.columns) == set( 15 | ["index", "chart", "decorated_elephant", "animals@#$%^", "cities"] 16 | ) 17 | assert "a" not in set(df.columns) 18 | 19 | 20 | @pytest.mark.functions 21 | def test_rename_columns_absent_column(dataframe): 22 | """ 23 | rename_columns should raise an error if any column to rename is absent. 24 | """ 25 | df = dataframe.copy() 26 | with pytest.raises(ValueError): 27 | df.clean_names().rename_columns({"a": "index", "bb": "chart"}) 28 | 29 | assert set(df.columns) == set(dataframe.columns) 30 | 31 | 32 | @pytest.mark.functions 33 | def test_rename_columns_function(dataframe): 34 | """ 35 | rename_columns should apply the given function to each column name 36 | """ 37 | df = dataframe.clean_names().rename_columns(function=str.upper) 38 | assert set(df.columns) == set( 39 | ["A", "BELL_CHART", "DECORATED_ELEPHANT", "ANIMALS@#$%^", "CITIES"] 40 | ) 41 | 42 | assert "a" not in set(df.columns) 43 | 44 | 45 | @pytest.mark.functions 46 | def test_rename_columns_no_args(dataframe): 47 | """ 48 | rename_columns should raise an error when neither column mappings nor a 49 | function is provided. 50 | """ 51 | df = dataframe.copy() 52 | with pytest.raises(ValueError): 53 | df.rename_columns() 54 | 55 | assert set(df.columns) == set(dataframe.columns) 56 | -------------------------------------------------------------------------------- /tests/functions/test_reorder_columns.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import given, settings 3 | 4 | from janitor.testing_utils.strategies import df_strategy 5 | 6 | 7 | @pytest.mark.functions 8 | @given(df=df_strategy()) 9 | @settings(deadline=None, max_examples=10) 10 | def test_reorder_columns(df): 11 | # NOTE: This test essentially has four different tests underneath it. 12 | # We should be able to refactor this using pytest.mark.parametrize.
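# A hypothetical parametrized refactor (a sketch only, not part of the
# suite; the test name and parameter names below are made up) could look
# roughly like this for the two error cases checked further down:
#
#     @pytest.mark.parametrize(
#         "bad_input,error",
#         [("a", TypeError), (["notpresent"], IndexError)],
#     )
#     def test_reorder_columns_invalid(df, bad_input, error):
#         with pytest.raises(error):
#             df.reorder_columns(bad_input)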
13 | 14 | # sanity checking of inputs 15 | 16 | # input is not a list or pd.Index 17 | with pytest.raises(TypeError): 18 | df.reorder_columns("a") 19 | 20 | # one of the columns is not present in the DataFrame 21 | with pytest.raises(IndexError): 22 | df.reorder_columns(["notpresent"]) 23 | 24 | # reordering functionality 25 | 26 | # sanity check when desired order matches current order 27 | # this also tests whether the function can take Pandas Index objects 28 | assert all(df.reorder_columns(df.columns).columns == df.columns) 29 | 30 | # when columns are a list & not all columns of the DataFrame are included 31 | assert all( 32 | df.reorder_columns(["animals@#$%^", "Bell__Chart"]).columns 33 | == ["animals@#$%^", "Bell__Chart", "a", "decorated-elephant", "cities"] 34 | ) 35 | -------------------------------------------------------------------------------- /tests/functions/test_round_to_fraction.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.functions 5 | def test_round_to_nearest_half(dataframe): 6 | """Checks output for rounding to the nearest 1/2.""" 7 | df = dataframe.round_to_fraction("Bell__Chart", 2) 8 | assert df.iloc[0, 1] == 1.0 9 | assert df.iloc[1, 1] == 2.5 10 | assert df.iloc[2, 1] == 3.0 11 | assert df.iloc[3, 1] == 1.0 12 | assert df.iloc[4, 1] == 2.5 13 | assert df.iloc[5, 1] == 3.0 14 | assert df.iloc[6, 1] == 1.0 15 | assert df.iloc[7, 1] == 2.5 16 | assert df.iloc[8, 1] == 3.0 17 | 18 | 19 | @pytest.mark.functions 20 | def test_round_digits(dataframe): 21 | """Checks rounding to the specified number of digits.""" 22 | df = dataframe.round_to_fraction("Bell__Chart", 7, digits=3) 23 | assert df.iloc[0, 1] == 1.286 24 | assert df.iloc[1, 1] == 2.429 25 | assert df.iloc[2, 1] == 3.286 26 | 27 | 28 | @pytest.mark.functions 29 | @pytest.mark.parametrize( 30 | "denominator", 31 | [0, -5, -0.25], 32 | ) 33 | def test_invalid_denominator_args(dataframe, denominator): 34 | """Ensure a ValueError is raised if the denominator value passed in 35 | is invalid. 36 | """ 37 | with pytest.raises(ValueError): 38 | dataframe.round_to_fraction("Bell__Chart", denominator) 39 | -------------------------------------------------------------------------------- /tests/functions/test_shuffle.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.functions 5 | def test_shuffle_without_index_reset(dataframe): 6 | """ 7 | Test the shuffle function. 8 | 9 | This test checks that the set of indices in the shuffled dataframe is 10 | identical to the set of indices in the original. 11 | """ 12 | df = dataframe.shuffle(reset_index=False) 13 | assert set(df.index) == set(dataframe.index) 14 | 15 | 16 | @pytest.mark.functions 17 | def test_shuffle(dataframe): 18 | """ 19 | Test the shuffle function. 20 | 21 | This test checks that the shuffled dataframe has the same columns and 22 | number of rows as the original. 23 | """ 24 | df = dataframe.shuffle() 25 | assert len(df) == len(dataframe) 26 | assert set(df.columns) == set(dataframe.columns) 27 | -------------------------------------------------------------------------------- /tests/functions/test_sort_naturally.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for sort_naturally. 3 | 4 | Some places where this test suite could be improved: 5 | 6 | - Replace the example-based test 7 | with a Hypothesis-generated property-based test.
[intermediate] 8 | - Provide another example-based test of something 9 | that needs to be naturally rather than lexicographically sorted. 10 | """ 11 | 12 | import pandas as pd 13 | import pytest 14 | from natsort import natsorted 15 | from pandas.testing import assert_frame_equal 16 | 17 | import janitor # noqa: F401 18 | 19 | 20 | @pytest.fixture 21 | def well_dataframe(): 22 | data = { 23 | "Well": ["A21", "A3", "A21", "B2", "B51", "B12"], 24 | "Value": [1, 2, 13, 3, 4, 7], 25 | } 26 | df = pd.DataFrame(data) 27 | return df 28 | 29 | 30 | def test_sort_naturally(well_dataframe): 31 | """Example-based test for sort_naturally. 32 | 33 | We check that: 34 | 35 | - the resultant dataframe is sorted identically 36 | to what natsorted would provide, 37 | - the data in the dataframe are not corrupted. 38 | """ 39 | sorted_df = well_dataframe.sort_naturally("Well") 40 | assert sorted_df["Well"].tolist() == natsorted(well_dataframe["Well"]) 41 | assert_frame_equal(sorted_df.sort_index(), well_dataframe) 42 | -------------------------------------------------------------------------------- /tests/functions/test_take_first.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pandas.testing import assert_frame_equal 4 | 5 | 6 | @pytest.mark.functions 7 | def test_take_first(): 8 | df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [0, 1, 2, 3]}) 9 | 10 | res = df.take_first(subset="a", by="b") 11 | exp = df.iloc[[0, 2], :] 12 | 13 | assert_frame_equal(res, exp) 14 | -------------------------------------------------------------------------------- /tests/functions/test_then.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def remove_first_two_letters_from_col_names(df): 5 | col_names = df.columns 6 | col_names = [name[2:] for name in col_names] 7 | df.columns = col_names 8 | return df 9 | 10 | 11 | def remove_rows_3_and_4(df): 12 | df = df.drop(3, axis=0) 13 | df = df.drop(4, axis=0) 14 | return df 15 | 16 | 17 | @pytest.mark.functions 18 | def test_then_column_names(dataframe): 19 | df = dataframe.then(remove_first_two_letters_from_col_names) 20 | cols = tuple(df.columns) 21 | assert cols == ("", "ll__Chart", "corated-elephant", "imals@#$%^", "ties") 22 | 23 | 24 | @pytest.mark.functions 25 | def test_then_remove_rows(dataframe): 26 | df = dataframe.then(remove_rows_3_and_4) 27 | rows = tuple(df.index) 28 | assert rows == (0, 1, 2, 5, 6, 7, 8) 29 | -------------------------------------------------------------------------------- /tests/functions/test_to_datetime.py: -------------------------------------------------------------------------------- 1 | """Tests for `to_datetime` function.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | 8 | @pytest.mark.functions 9 | def test_to_datetime(): 10 | """Checks to_datetime functionality is as expected.""" 11 | 12 | df = pd.DataFrame( 13 | {"date1": ["20190101", "20190102", "20190304", np.nan]} 14 | ).to_datetime("date1", format="%Y%m%d") 15 | assert df["date1"].dtype == np.dtype("datetime64[ns]") 16 | assert df["date1"].iloc[0].isoformat() == "2019-01-01T00:00:00" 17 | -------------------------------------------------------------------------------- /tests/functions/test_toset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_toset(): 7 | s =
pd.Series([1, 2, 3, 5, 5], index=["a", "b", "c", "d", "e"]).toset() 8 | 9 | assert isinstance(s, set) 10 | assert len(s) == 4 11 | assert s == set([1, 2, 3, 5]) 12 | -------------------------------------------------------------------------------- /tests/functions/test_truncate_datetime.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from pandas.testing import assert_frame_equal 7 | 8 | 9 | @pytest.mark.functions 10 | def test_truncate_datetime_dataframe_invalid_datepart(): 11 | """Checks if a ValueError is appropriately raised when datepart is 12 | not a valid enumeration. 13 | """ 14 | with pytest.raises(ValueError, match=r"invalid `datepart`"): 15 | pd.DataFrame().truncate_datetime_dataframe("INVALID") 16 | 17 | 18 | @pytest.mark.functions 19 | def test_truncate_datetime_dataframe_all_parts(): 20 | """Test for truncate_datetime_dataframe, for all valid dateparts. 21 | Also only passes if `truncate_datetime_dataframe` method is idempotent. 22 | """ 23 | x = datetime(2022, 3, 21, 9, 1, 15, 666) 24 | df = pd.DataFrame({"dt": [x], "foo": [np.nan]}, copy=False) 25 | 26 | result = df.truncate_datetime_dataframe("second") 27 | assert result.loc[0, "dt"] == datetime(2022, 3, 21, 9, 1, 15, 0) 28 | result = df.truncate_datetime_dataframe("minute") 29 | assert result.loc[0, "dt"] == datetime(2022, 3, 21, 9, 1) 30 | result = df.truncate_datetime_dataframe("HOUR") 31 | assert result.loc[0, "dt"] == datetime(2022, 3, 21, 9) 32 | result = df.truncate_datetime_dataframe("Day") 33 | assert result.loc[0, "dt"] == datetime(2022, 3, 21) 34 | result = df.truncate_datetime_dataframe("month") 35 | assert result.loc[0, "dt"] == datetime(2022, 3, 1) 36 | result = df.truncate_datetime_dataframe("yeaR") 37 | assert result.loc[0, "dt"] == datetime(2022, 1, 1) 38 | 39 | 40 | # bad data 41 | @pytest.mark.functions 42 | def test_truncate_datetime_dataframe_do_nothing(): 43 | """Ensure nothing changes (and no errors raised) if there are no datetime- 44 | compatible columns. 
45 | """ 46 | in_data = { 47 | "a": [1, 0], 48 | "b": ["foo", ""], 49 | "c": [np.nan, 3.0], 50 | "d": [True, False], 51 | } 52 | 53 | result = pd.DataFrame(in_data).truncate_datetime_dataframe("year") 54 | expected = pd.DataFrame(in_data) 55 | 56 | assert_frame_equal(result, expected) 57 | 58 | 59 | @pytest.mark.functions 60 | def test_truncate_datetime_containing_NaT(): 61 | """Ensure NaT is ignored safely (no-op) and no TypeError is thrown.""" 62 | x = datetime(2022, 3, 21, 9, 1, 15, 666) 63 | df = pd.DataFrame({"dt": [x, pd.NaT], "foo": [np.nan, 3]}) 64 | expected = pd.DataFrame( 65 | {"dt": [x.replace(microsecond=0), pd.NaT], "foo": [np.nan, 3]} 66 | ) 67 | 68 | result = df.truncate_datetime_dataframe("second").assign( 69 | dt=lambda df: df["dt"].dt.as_unit("ns") 70 | ) 71 | assert_frame_equal(result, expected) 72 | -------------------------------------------------------------------------------- /tests/functions/test_update_where.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pandas.testing import assert_frame_equal 4 | 5 | from janitor.functions import update_where 6 | 7 | 8 | @pytest.mark.functions 9 | def test_update_where(dataframe): 10 | """ 11 | Test that it accepts conditional parameters 12 | """ 13 | assert_frame_equal( 14 | dataframe.update_where( 15 | (dataframe["decorated-elephant"] == 1) 16 | & (dataframe["animals@#$%^"] == "rabbit"), 17 | "cities", 18 | "Durham", 19 | ), 20 | dataframe.replace("Cambridge", "Durham"), 21 | ) 22 | 23 | 24 | @pytest.fixture 25 | def df(): 26 | return pd.DataFrame( 27 | {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [0, 0, 0, 0]} 28 | ) 29 | 30 | 31 | def test_update_where_query(df): 32 | """Test that function works with pandas query-style string expression.""" 33 | 34 | expected = pd.DataFrame( 35 | {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [0, 0, 10, 0]} 36 | ) 37 | result = update_where( 38 | df, conditions="a > 2 and b < 8", target_column_name="c", target_val=10 39 | ) 40 | 41 | assert_frame_equal(result, expected) 42 | 43 | 44 | def test_not_boolean_conditions(df): 45 | """Raise Error if `conditions` is not a boolean type.""" 46 | with pytest.raises(ValueError): 47 | df.update_where( 48 | conditions=(df.a + 5), 49 | target_column_name="c", 50 | target_val=10, 51 | ) 52 | -------------------------------------------------------------------------------- /tests/helpers.py: -------------------------------------------------------------------------------- 1 | """Helper functions for running tests.""" 2 | 3 | import os 4 | 5 | 6 | def running_on_ci() -> bool: 7 | """Return True if running on CI machine.""" 8 | return os.environ.get("JANITOR_CI_MACHINE") is not None 9 | -------------------------------------------------------------------------------- /tests/io/test_read_commandline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | from subprocess import CalledProcessError 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | import janitor.io 10 | 11 | 12 | def test_read_commandline(dataframe): 13 | """ 14 | Test asserts that the dataframe made 15 | from the read_commandline function is 16 | identical to the test dataframe from 17 | which the .csv file was created. 
18 | 19 | """ 20 | # create a temporary .csv file from test data 21 | temp_dir = tempfile.gettempdir() 22 | 23 | dataframe.to_csv(f"{temp_dir}/dataframe.csv", index=0) 24 | 25 | # create a new dataframe from the temporary .csv using 26 | # the cat command from the bash commandline 27 | 28 | if sys.platform in ["win32"]: 29 | # cat is not an operable command for Windows command line 30 | # "type" is a similar call 31 | df = janitor.io.read_commandline(f"type {temp_dir}\\dataframe.csv") 32 | else: 33 | df = janitor.io.read_commandline(f"cat {temp_dir}/dataframe.csv") 34 | 35 | # Make assertion that new dataframe created with read_commandline 36 | # is equal to the test dataframe 37 | assert df.equals(dataframe) 38 | 39 | # clean up after the test 40 | os.unlink(f"{temp_dir}/dataframe.csv") 41 | 42 | 43 | def test_read_commandline_bad_cmd(dataframe): 44 | """ 45 | Test 1 raises a TypeError if read_commandline 46 | is given an input that is not a string. 47 | 48 | Test 2 raises a CalledProcessError if 49 | read_commandline is given a string 50 | which is not a valid bash command. 51 | 52 | Test 3 raises an EmptyDataError if 53 | read_commandline is given a string which 54 | is a valid bash command, however results 55 | in the shell not creating a dataframe. 56 | """ 57 | temp_dir = tempfile.gettempdir() 58 | 59 | # create a temporary .csv file 60 | dataframe.to_csv(f"{temp_dir}/dataframe.csv") 61 | 62 | # Test 1 63 | with pytest.raises(TypeError): 64 | janitor.io.read_commandline(6) 65 | 66 | # Test 2 67 | with pytest.raises(CalledProcessError): 68 | janitor.io.read_commandline("bad command") 69 | 70 | # Test 3 71 | # windows does not support "cat" in commandline 72 | # "type" command must be used and it returns a different error 73 | cmd = "cat" 74 | 75 | ExpectedError = pd.errors.EmptyDataError 76 | if sys.platform in ["win32"]: 77 | cmd = "type" 78 | ExpectedError = CalledProcessError 79 | 80 | with pytest.raises(ExpectedError): 81 | janitor.io.read_commandline(cmd) 82 | 83 | # clean up after the tests 84 | os.unlink(f"{temp_dir}/dataframe.csv") 85 | -------------------------------------------------------------------------------- /tests/math/test_ecdf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from hypothesis import given, settings 4 | from hypothesis.extra.pandas import series 5 | 6 | 7 | @given(s=series(dtype=np.number)) 8 | @settings(deadline=None) 9 | def test_ecdf(s): 10 | """A simple execution test.""" 11 | if s.isna().sum() > 0: 12 | with pytest.raises(ValueError): 13 | x, y = s.ecdf() 14 | else: 15 | x, y = s.ecdf() 16 | assert len(x) == len(y) 17 | 18 | 19 | @given(s=series(dtype=str)) 20 | def test_ecdf_string(s): 21 | """Test that type enforcement is in place.""" 22 | with pytest.raises(TypeError): 23 | x, y = s.ecdf() 24 | -------------------------------------------------------------------------------- /tests/math/test_exp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | 6 | @pytest.mark.functions 7 | def test_exp(): 8 | s = pd.Series([0, 1, 2, 3, -1]) 9 | out = s.exp() 10 | assert (out == np.exp(s)).all() 11 | assert (s.index == out.index).all() 12 | -------------------------------------------------------------------------------- /tests/math/test_log.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 
import pytest 4 | 5 | 6 | @pytest.mark.functions 7 | def test_log(): 8 | s = pd.Series([0, 1, 2, 3, -1]) 9 | 10 | with pytest.raises(RuntimeError): 11 | s.log(error="raise") 12 | 13 | with pytest.warns(RuntimeWarning): 14 | out = s.log(error="warn") 15 | 16 | assert out[s <= 0].isna().all() 17 | assert (out.index == s.index).all() 18 | assert (out[s > 0] == np.log(np.array([1, 2, 3]))).all() 19 | 20 | out = s.log(error="ignore") 21 | 22 | assert out[s <= 0].isna().all() 23 | assert (out.index == s.index).all() 24 | assert (out[s > 0] == np.log(np.array([1, 2, 3]))).all() 25 | -------------------------------------------------------------------------------- /tests/math/test_logit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | 6 | @pytest.mark.functions 7 | def test_logit(): 8 | s = pd.Series([0, 0.1, 0.2, 0.3, 0.5, 0.9, 1, 2]) 9 | inside = (0 < s) & (s < 1) 10 | valid = np.array([0.1, 0.2, 0.3, 0.5, 0.9]) 11 | ans = np.log(valid / (1 - valid)) 12 | 13 | with pytest.raises(RuntimeError): 14 | s.logit(error="raise") 15 | 16 | with pytest.warns(RuntimeWarning): 17 | out = s.logit(error="warn") 18 | 19 | assert out[inside].notna().all() 20 | assert out[inside].to_numpy() == pytest.approx(ans) 21 | assert (out.index == s.index).all() 22 | assert out[~inside].isna().all() 23 | 24 | out = s.logit(error="ignore") 25 | 26 | assert out[inside].notna().all() 27 | assert out[inside].to_numpy() == pytest.approx(ans) 28 | assert (out.index == s.index).all() 29 | assert out[~inside].isna().all() 30 | -------------------------------------------------------------------------------- /tests/math/test_normal_cdf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from scipy.stats import norm 4 | 5 | 6 | @pytest.mark.functions 7 | def test_normal_cdf(): 8 | s = pd.Series([0, 1, 2, 3, -1]) 9 | out = s.normal_cdf() 10 | assert (out == norm.cdf(s)).all() 11 | assert (s.index == out.index).all() 12 | -------------------------------------------------------------------------------- /tests/math/test_probit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from scipy.stats import norm 5 | 6 | 7 | @pytest.mark.functions 8 | def test_probit(): 9 | s = pd.Series([-1, 0, 0.1, 0.2, 0.3, 1, 2]) 10 | inside = (0 < s) & (s < 1) 11 | valid = np.array([0.1, 0.2, 0.3]) 12 | ans = norm.ppf(valid) 13 | 14 | with pytest.raises(RuntimeError): 15 | s.probit(error="raise") 16 | 17 | with pytest.warns(RuntimeWarning): 18 | out = s.probit(error="warn") 19 | 20 | assert out[inside].notna().all() 21 | assert (out[inside] == ans).all() 22 | assert (out.index == s.index).all() 23 | assert out[~inside].isna().all() 24 | 25 | out = s.probit(error="ignore") 26 | 27 | assert out[inside].notna().all() 28 | assert (out[inside] == ans).all() 29 | assert (out.index == s.index).all() 30 | assert out[~inside].isna().all() 31 | -------------------------------------------------------------------------------- /tests/math/test_sigmoid.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from scipy.special import expit 4 | 5 | 6 | @pytest.mark.functions 7 | def test_sigmoid(): 8 | s = pd.Series([0, 1, 2, 3, -1]) 9 | out = s.sigmoid() 10 | assert (out == expit(s)).all() 11 | assert (s.index == 
out.index).all() 12 | -------------------------------------------------------------------------------- /tests/math/test_softmax.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from scipy.special import softmax as scipy_softmax 4 | 5 | 6 | @pytest.mark.functions 7 | def test_softmax(): 8 | s = pd.Series([0, 1, 2, 3, -1]) 9 | out = s.softmax() 10 | assert (out == scipy_softmax(s)).all() 11 | assert (s.index == out.index).all() 12 | assert out.sum() == 1.0 13 | -------------------------------------------------------------------------------- /tests/math/test_z_score.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_z_score(): 7 | s = pd.Series([0, 1, 2, 3, -1]) 8 | 9 | m = s.mean() 10 | st = s.std() 11 | 12 | ans = (s - m) / st 13 | 14 | d = {} 15 | 16 | assert (s.z_score(moments_dict=d) == ans).all() 17 | assert (s.z_score().index == s.index).all() 18 | 19 | assert d["mean"] == m 20 | assert d["std"] == st 21 | -------------------------------------------------------------------------------- /tests/ml/test_get_features_targets.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import given, settings 3 | 4 | import janitor.ml # noqa: F401 5 | from janitor.testing_utils.strategies import df_strategy 6 | 7 | 8 | @pytest.mark.ml 9 | @given(df=df_strategy()) 10 | @settings(deadline=None, max_examples=10) 11 | def test_get_features_targets(df): 12 | """Test one column returned as target and rest as features.""" 13 | X, y = df.clean_names().get_features_targets( 14 | target_column_names="bell_chart" 15 | ) 16 | assert X.shape[1] == 4 17 | assert len(y.shape) == 1 18 | 19 | 20 | @pytest.mark.ml 21 | @given(df=df_strategy()) 22 | @settings(deadline=None, max_examples=10) 23 | def test_get_features_targets_multi_features(df): 24 | """Test one column returned as target and two as features.""" 25 | X, y = df.clean_names().get_features_targets( 26 | feature_column_names=["animals@#$%^", "cities"], 27 | target_column_names="bell_chart", 28 | ) 29 | assert X.shape[1] == 2 30 | assert len(y.shape) == 1 31 | 32 | 33 | @pytest.mark.ml 34 | @given(df=df_strategy()) 35 | @settings(deadline=None, max_examples=10) 36 | def test_get_features_target_multi_columns(df): 37 | """Test two columns returned as target and rest as features.""" 38 | X, y = df.clean_names().get_features_targets( 39 | target_column_names=["a", "bell_chart"] 40 | ) 41 | assert X.shape[1] == 3 42 | assert y.shape[1] == 2 43 | -------------------------------------------------------------------------------- /tests/polars/functions/test_convert_excel_date_polars.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | import janitor.polars # noqa: F401 4 | 5 | 6 | def test_convert_excel_date(): 7 | df = pl.DataFrame({"dates": [42580.3333333333]}) 8 | 9 | expression = pl.col("dates").convert_excel_date().alias("dd") 10 | expression = df.with_columns(expression).get_column("dd") 11 | assert expression.dtype.is_temporal() is True 12 | -------------------------------------------------------------------------------- /tests/polars/functions/test_convert_matlab_date_polars.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | import janitor.polars # noqa: F401 
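# Background note (added context, not part of the original test file):
# MATLAB serial date numbers count days from the proleptic year 0, so the
# Unix epoch 1970-01-01 corresponds to datenum 719529. A rough pandas
# equivalent of the conversion, assuming that offset, would be:
#
#     import pandas as pd
#     pd.to_datetime(datenums - 719529, unit="D")  # datenums: MATLAB dates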
4 | 5 | 6 | def test_convert_matlab_date(): 7 | df = pl.DataFrame( 8 | { 9 | "dates": [ 10 | 733_301.0, 11 | 729_159.0, 12 | 734_471.0, 13 | 737_299.563_296_356_5, 14 | 737_300.000_000_000_0, 15 | ] 16 | } 17 | ) 18 | expression = pl.col("dates").convert_matlab_date().alias("dd") 19 | expression = df.with_columns(expression).get_column("dd") 20 | assert expression.dtype.is_temporal() is True 21 | -------------------------------------------------------------------------------- /tests/polars/functions/test_expand_polars.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | import pytest 3 | from polars.testing import assert_frame_equal 4 | 5 | import janitor.polars # noqa: F401 6 | 7 | 8 | @pytest.fixture 9 | def df(): 10 | """pytest fixture""" 11 | return pl.DataFrame( 12 | dict( 13 | group=(1, 2, 1, 2), 14 | item_id=(1, 2, 2, 3), 15 | item_name=("a", "a", "b", "b"), 16 | value1=(1, None, 3, 4), 17 | value2=range(4, 8), 18 | ) 19 | ) 20 | 21 | 22 | def test_column_None(df): 23 | """Test output if *columns is empty.""" 24 | assert_frame_equal(df.expand(), df) 25 | 26 | 27 | def test_empty_groups(df): 28 | """Raise TypeError if wrong column type is passed.""" 29 | msg = "The argument passed to the columns parameter " 30 | msg += "should either be a string, a column selector, " 31 | msg += "a polars expression, or a polars Series; instead got.+" 32 | with pytest.raises(TypeError, match=msg): 33 | df.complete("group", {}) 34 | 35 | 36 | def test_type_sort(df): 37 | """Raise TypeError if `sort` is not boolean.""" 38 | with pytest.raises(TypeError): 39 | df.complete("group", "item_id", sort=11) 40 | 41 | 42 | def test_expand_1(df): 43 | """ 44 | Test output for janitor.expand. 45 | """ 46 | expected = df.expand("group", "item_id", "item_name", sort=True) 47 | actual = ( 48 | df.select(pl.col("group").unique()) 49 | .join(df.select(pl.col("item_id").unique()), how="cross") 50 | .join(df.select(pl.col("item_name").unique()), how="cross") 51 | .sort(by=pl.all()) 52 | ) 53 | assert_frame_equal(actual, expected) 54 | 55 | 56 | def test_expand_2(df): 57 | """ 58 | Test output for janitor.expand. 
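Unlike test_expand_1 above, this variant also passes a polars Series
(df.get_column("item_id")) alongside plain column names, checking that
Series inputs are accepted too.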
59 | """ 60 | expected = df.expand( 61 | "group", df.get_column("item_id"), "item_name", sort=True 62 | ) 63 | actual = ( 64 | df.select(pl.col("group").unique()) 65 | .join(df.select(pl.col("item_id").unique()), how="cross") 66 | .join(df.select(pl.col("item_name").unique()), how="cross") 67 | .sort(by=pl.all()) 68 | ) 69 | assert_frame_equal(actual, expected) 70 | -------------------------------------------------------------------------------- /tests/spark/conftest.py: -------------------------------------------------------------------------------- 1 | """Spark fixtures.""" 2 | 3 | import pytest 4 | 5 | try: 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql.types import ( 8 | FloatType, 9 | IntegerType, 10 | StringType, 11 | StructField, 12 | StructType, 13 | ) 14 | except ImportError: 15 | pass 16 | 17 | 18 | @pytest.fixture # (scope="session") 19 | def spark(): 20 | """Create spark session.""" 21 | spark = SparkSession.builder.getOrCreate() 22 | yield spark 23 | spark.stop() 24 | 25 | 26 | @pytest.fixture 27 | def spark_df(spark): 28 | """Create spark dataframe.""" 29 | schema = StructType( 30 | [ 31 | StructField("a", IntegerType(), True), 32 | StructField("Bell__Chart", FloatType(), True), 33 | StructField("decorated-elephant", IntegerType(), True), 34 | StructField("animals@#$%^", StringType(), True), 35 | StructField("cities", StringType(), True), 36 | ] 37 | ) 38 | return spark.createDataFrame([], schema) 39 | 40 | 41 | @pytest.fixture 42 | def spark_dataframe(spark, dataframe): 43 | """Another function to create spark dataframe.""" 44 | return spark.createDataFrame(dataframe) 45 | -------------------------------------------------------------------------------- /tests/test_data/016-MSPTDA-Excel.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/tests/test_data/016-MSPTDA-Excel.xlsx -------------------------------------------------------------------------------- /tests/test_data/excel_without_headers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/tests/test_data/excel_without_headers.xlsx -------------------------------------------------------------------------------- /tests/test_data/file_example_XLSX_10.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/tests/test_data/file_example_XLSX_10.xlsx -------------------------------------------------------------------------------- /tests/test_data/worked-examples.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/tests/test_data/worked-examples.xlsx -------------------------------------------------------------------------------- /tests/test_documentation_build.py: -------------------------------------------------------------------------------- 1 | """Tests for documentation build.""" 2 | 3 | import os 4 | 5 | import pytest 6 | 7 | # If `mkdocs` wasn't installed in environment, just skip. 8 | # Can't use `pytest.importorskip("mkdocs")`, 'mkdocs' is also 9 | # a folder name to pyjanitor project. 
10 | pytest.importorskip("mkdocstrings") 11 | 12 | 13 | @pytest.mark.documentation 14 | def test_docs_general_functions_present(): 15 | """Test that all docs pages build correctly. 16 | 17 | TODO: There has to be a better way to automatically check that 18 | all of the functions are present in the docs. 19 | This is an awesome thing that we could use help with in the future. 20 | """ 21 | # Build docs using mkdocs 22 | os.system("mkdocs build --clean") 23 | 24 | # We want to check that the following keywords are present. 25 | # I put in a subsample of general functions. 26 | # This can be made much more robust. 27 | rendered_correctly = False 28 | with open("./site/api/functions/index.html", "r+") as f: 29 | for line in f.readlines(): 30 | if "add_columns" in line or "update_where" in line: 31 | rendered_correctly = True 32 | assert rendered_correctly 33 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | """Tests for test helper functions.""" 2 | 3 | from helpers import running_on_ci 4 | 5 | 6 | def test_running_on_ci_local(monkeypatch): 7 | """Test that running_on_ci returns False on a local machine.""" 8 | monkeypatch.delenv("JANITOR_CI_MACHINE", raising=False) 9 | assert running_on_ci() is False 10 | 11 | 12 | def test_running_on_ci_ci(monkeypatch): 13 | """Test that running_on_ci returns True on a CI machine.""" 14 | monkeypatch.setenv("JANITOR_CI_MACHINE", "1") 15 | assert running_on_ci() is True 16 | -------------------------------------------------------------------------------- /tests/timeseries/test_fill_missing_timestamps.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from janitor.timeseries import _get_missing_timestamps, fill_missing_timestamps 7 | 8 | 9 | # Random data for testing 10 | @pytest.fixture 11 | def timeseries_dataframe() -> pd.DataFrame: 12 | """ 13 | Returns a time series dataframe 14 | """ 15 | ts_index = pd.date_range("1/1/2019", periods=1000, freq="1h") 16 | v1 = [randint(1, 2000) for i in range(1000)] 17 | test_df = pd.DataFrame({"v1": v1}, index=ts_index) 18 | return test_df 19 | 20 | 21 | @pytest.mark.timeseries 22 | def test_fill_missing_timestamps(timeseries_dataframe): 23 | """Test that filling missing timestamps works as expected.""" 24 | # Remove a random row; randint is inclusive on both ends, so cap at len - 1 25 | random_number = randint(1, len(timeseries_dataframe) - 1) 26 | df1 = timeseries_dataframe.drop(timeseries_dataframe.index[random_number]) 27 | 28 | # Fill missing timestamps 29 | # the fix for GH#1184 is to use the start and end from 30 | # timeseries_dataframe; 31 | # imagine that the last row of df1 is removed, or the first entry: 32 | # the length check in the assert line would fail 33 | result = fill_missing_timestamps( 34 | df1, 35 | frequency="1h", 36 | first_time_stamp=timeseries_dataframe.index.min(), 37 | last_time_stamp=timeseries_dataframe.index.max(), 38 | ) 39 | 40 | # Testing if the missing timestamp has been filled 41 | assert len(result) == len(timeseries_dataframe) 42 | 43 | # Testing if indices are exactly the same after filling 44 | original_index = timeseries_dataframe.index 45 | new_index = result.index 46 | delta = original_index.difference(new_index) 47 | 48 | assert delta.empty is True 49 | 50 | 51 | @pytest.mark.timeseries 52 | def test__get_missing_timestamps(timeseries_dataframe): 53 | """Test utility
function for identifying the missing timestamps.""" 54 | from random import sample 55 | 56 | timeseries_dataframe.index.freq = None 57 | timestamps_to_drop = sample(timeseries_dataframe.index.tolist(), 3) 58 | df = timeseries_dataframe.drop(index=timestamps_to_drop) 59 | missing_timestamps = _get_missing_timestamps(df, "1h") 60 | assert set(missing_timestamps.index) == set(timestamps_to_drop) 61 | -------------------------------------------------------------------------------- /tests/timeseries/test_sort_timestamps_monotonically.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import janitor # noqa: F401 7 | import janitor.timeseries # noqa: F401 8 | 9 | 10 | @pytest.fixture 11 | def timeseries_dataframe() -> pd.DataFrame: 12 | """ 13 | Returns a time series dataframe 14 | """ 15 | ts_index = pd.date_range("1/1/2019", periods=1000, freq="1h") 16 | v1 = [randint(1, 2000) for i in range(1000)] 17 | test_df = pd.DataFrame({"v1": v1}, index=ts_index) 18 | return test_df 19 | 20 | 21 | # NOTE: The tests possibly can be merged back together later 22 | # if they are parametrized properly. 23 | # NOTE: These tests use `df.equals(other_df)`, 24 | # because the desired `pd.assert_frame_equal(df, other_df)` 25 | # constantly failed on the CI systems. 26 | # It's a task for later to fix. 27 | 28 | 29 | @pytest.mark.timeseries 30 | def test_sort_timestamps_monotonically(timeseries_dataframe): 31 | """Test sort_timestamps_monotonically for ascending order""" 32 | df = timeseries_dataframe.shuffle( 33 | reset_index=False 34 | ).sort_timestamps_monotonically() 35 | assert df.equals(timeseries_dataframe) 36 | 37 | 38 | @pytest.mark.timeseries 39 | def test_sort_timestamps_monotonically_decreasing(timeseries_dataframe): 40 | """Test sort_timestamps_monotonically for descending order""" 41 | df2 = timeseries_dataframe.sort_index(ascending=False) 42 | df3 = df2.sort_timestamps_monotonically("decreasing") 43 | assert df3.equals(df2) 44 | 45 | 46 | @pytest.mark.timeseries 47 | def test_sort_timestamps_monotonically_strict(timeseries_dataframe): 48 | """Test sort_timestamps_monotonically for index duplication handling""" 49 | df = timeseries_dataframe.shuffle(reset_index=False) 50 | random_number = df.index[randint(1, len(timeseries_dataframe) - 1)] 51 | df = pd.concat( 52 | [df, df.loc[[random_number], :]] 53 | ).sort_timestamps_monotonically(direction="increasing", strict=True) 54 | assert df.equals(timeseries_dataframe) 55 | -------------------------------------------------------------------------------- /tests/utils/test_check_column.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from janitor.utils import check_column 4 | 5 | 6 | @pytest.mark.utils 7 | def test_check_column(dataframe): 8 | """ 9 | check_column should return None if the column exists 10 | """ 11 | assert check_column(dataframe, ["a"]) is None 12 | 13 | 14 | @pytest.mark.utils 15 | def test_check_column_single(dataframe): 16 | """ 17 | Check that it works with a single input 18 | """ 19 | 20 | assert check_column(dataframe, "a") is None 21 | 22 | with pytest.raises(ValueError): 23 | check_column(dataframe, "b") 24 | 25 | # should also work with non-string inputs 26 | 27 | with pytest.raises(ValueError): 28 | check_column(dataframe, 2) 29 | 30 | dataframe[2] = "asdf" 31 | 32 | assert check_column(dataframe, 2) is None 33 | 34 | 35 | @pytest.mark.utils 36 | def
test_check_column_absent_column(dataframe): 37 | """ 38 | check_column should raise an error if the column is absent. 39 | """ 40 | with pytest.raises(ValueError): 41 | check_column(dataframe, ["b"]) 42 | 43 | 44 | @pytest.mark.utils 45 | def test_check_column_excludes(dataframe): 46 | """ 47 | check_column should return None if the column is absent and present is False 48 | """ 49 | assert check_column(dataframe, ["b"], present=False) is None 50 | 51 | 52 | @pytest.mark.utils 53 | def test_check_column_present_column_excludes(dataframe): 54 | """ 55 | check_column should raise an error if the column is present and present is 56 | False 57 | """ 58 | with pytest.raises(ValueError): 59 | check_column(dataframe, ["a"], present=False) 60 | -------------------------------------------------------------------------------- /tests/utils/test_deprecated_alias.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import pytest 4 | 5 | from janitor.utils import deprecated_alias 6 | 7 | 8 | @deprecated_alias(a="alpha", b="beta") 9 | def simple_sum(alpha, beta): 10 | gamma = alpha + beta 11 | return gamma 12 | 13 | 14 | @pytest.mark.utils 15 | def test_old_aliases(): 16 | """ 17 | Using old aliases should result in `DeprecationWarning` 18 | """ 19 | with pytest.warns(DeprecationWarning): 20 | simple_sum(a=2, b=6) 21 | 22 | 23 | @pytest.mark.utils 24 | def test_new_aliases(): 25 | """ 26 | Using new aliases should not result in errors or warnings 27 | """ 28 | # https://github.com/scikit-learn/scikit-learn/issues/22572#issuecomment-1047316960 29 | with warnings.catch_warnings(record=True) as record: 30 | simple_sum(alpha=2, beta=6) 31 | assert not record 32 | 33 | assert simple_sum(alpha=2, beta=6) 34 | 35 | 36 | @pytest.mark.utils 37 | def test_mixed_aliases(): 38 | """ 39 | Using mixed aliases should result in errors 40 | """ 41 | with pytest.raises(TypeError): 42 | assert simple_sum(alpha=2, beta=6, a=5) 43 | -------------------------------------------------------------------------------- /tests/utils/test_idempotent.py: -------------------------------------------------------------------------------- 1 | from math import fabs, floor 2 | 3 | import pytest 4 | 5 | from janitor.utils import idempotent 6 | 7 | 8 | @pytest.mark.utils 9 | @pytest.mark.parametrize("func,data", [(fabs, -5), (floor, 10.45)]) 10 | def test__idempotence(func, data): 11 | idempotent(func, data) 12 | -------------------------------------------------------------------------------- /tests/utils/test_import_message.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pytest 5 | 6 | from janitor.utils import import_message 7 | 8 | 9 | @pytest.mark.utils 10 | def test_import_message(capsys): 11 | is_conda = os.path.exists(os.path.join(sys.prefix, "conda-meta")) 12 | if is_conda: 13 | message = ( 14 | "To use the janitor submodule biology, you need to install " 15 | "biopython.\n\n" 16 | "To do so, use the following command:\n\n" 17 | "    conda install -c conda-forge biopython\n" 18 | ) 19 | else: 20 | message = ( 21 | "To use the janitor submodule biology, you need to install " 22 | "biopython.\n\n" 23 | "To do so, use the following command:\n\n" 24 | "    pip install biopython\n" 25 | ) 26 | import_message( 27 | submodule="biology", 28 | package="biopython", 29 | conda_channel="conda-forge", 30 | pip_install=True, 31 | ) 32 | captured = capsys.readouterr() 33 | assert captured.out == message 34 |
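# For context, a sketch (assumed usage, not code from this repo) of how an
# optional-dependency submodule would produce the message asserted above:
#
#     try:
#         from Bio import SeqIO  # hypothetical optional import
#     except ImportError:
#         import_message(
#             submodule="biology",
#             package="biopython",
#             conda_channel="conda-forge",
#             pip_install=True,
#         )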
-------------------------------------------------------------------------------- /tests/utils/test_is_connected.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | import pytest 4 | 5 | from janitor.utils import is_connected 6 | 7 | """ 8 | Tests the is_connected helper function, 9 | which checks whether the client 10 | is connected to the internet. 11 | 12 | Example: 13 | print(is_connected("www.google.com")) 14 | console >> True 15 | 16 | Test 1: happy path, ensures the function works 17 | 18 | Test 2: web addresses that are not recognized 19 | raise a socket.gaierror (comzzz is not a TLD). 20 | 21 | Test 3: web addresses that are not recognized 22 | raise a socket.gaierror (aadsfff.com does not exist 23 | at the time of testing). 24 | 25 | If test 3 fails, perhaps this is because 26 | the website now exists. If that is the case, 27 | alter or delete the test. 28 | """ 29 | 30 | 31 | def test_is_connected(): 32 | assert is_connected("www.google.com") 33 | with pytest.raises(socket.gaierror): 34 | assert is_connected("www.google.comzzz") is False 35 | with pytest.raises(socket.gaierror): 36 | assert is_connected("aadsfff.com") is False 37 | -------------------------------------------------------------------------------- /tests/utils/test_replace_empty_string_with_none.py: -------------------------------------------------------------------------------- 1 | """Tests for _replace_empty_string_with_none helper function.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | from pandas.testing import assert_series_equal 6 | 7 | from janitor.functions.currency_column_to_numeric import ( 8 | _replace_empty_string_with_none, 9 | _replace_original_empty_string_with_none, 10 | ) 11 | 12 | 13 | @pytest.mark.utils 14 | def test_replace_empty_string_with_none(): 15 | """Example-based test for _replace_empty_string_with_none.""" 16 | df = pd.DataFrame({"a": ["", 1, 0.34, "6.5", ""]}) 17 | df_expected = pd.DataFrame({"a": [None, 1, 0.34, "6.5", None]}) 18 | 19 | df["a"] = _replace_empty_string_with_none(df["a"]) 20 | assert_series_equal(df["a"], df_expected["a"]) 21 | 22 | 23 | @pytest.mark.utils 24 | def test_replace_original_empty_string_with_none(): 25 | """ 26 | Example test for the "original" _replace_empty_string_with_none. 27 | 28 | NOTE: This should be deprecated, I think? 29 | TODO: Investigate whether this should be deprecated. 30 | """ 31 | df = pd.DataFrame({"a": [1, 0.34, "6.5", None, "ORIGINAL_NA", "foo"]}) 32 | df_expected = pd.DataFrame({"a": [1, 0.34, "6.5", None, None, "foo"]}) 33 | 34 | df["a"] = _replace_original_empty_string_with_none(df["a"]) 35 | assert_series_equal(df["a"], df_expected["a"]) 36 | -------------------------------------------------------------------------------- /tests/utils/test_skiperror.py: -------------------------------------------------------------------------------- 1 | """Tests for skiperror.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | from janitor.utils import skiperror 8 | 9 | 10 | @pytest.mark.functions 11 | def test_skiperror(): 12 | """ 13 | Overall test for skiperror. 14 | 15 | TODO: I believe this test should be refactored into smaller "unit" tests.
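One possible split (a sketch, not implemented): one test per mode, i.e.
the default NaN fill, return_x=True, and return_val=5, each asserting
against the same small frame defined below.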
16 | """ 17 | df = pd.DataFrame({"x": [1, 2, 3, "a"], "y": [1, 2, 3, "b"]}) 18 | 19 | def func(s): 20 | """Dummy helper function.""" 21 | return s + 1 22 | 23 | # Verify that applying function causes error 24 | with pytest.raises(Exception): 25 | df["x"].apply(func) 26 | 27 | result = df["x"].apply(skiperror(func)) 28 | assert (result.to_numpy()[:-1] == np.array([2, 3, 4])).all() and np.isnan( 29 | result.to_numpy()[-1] 30 | ) 31 | 32 | result = df["x"].apply(skiperror(func, return_x=True)) 33 | assert (result.to_numpy() == np.array([2, 3, 4, "a"], dtype=object)).all() 34 | 35 | result = df["x"].apply(skiperror(func, return_x=False, return_val=5)) 36 | assert (result.to_numpy() == np.array([2, 3, 4, 5])).all() 37 | -------------------------------------------------------------------------------- /tests/utils/test_skipna.py: -------------------------------------------------------------------------------- 1 | """Tests for skipna.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | from janitor.utils import skipna 8 | 9 | 10 | @pytest.mark.functions 11 | def test_skipna(): 12 | """ 13 | Overall test for skipna. 14 | 15 | TODO: Should be refactored into separate tests. 16 | """ 17 | df = pd.DataFrame({"x": ["a", "b", "c", np.nan], "y": [1, 2, 3, np.nan]}) 18 | 19 | def func(s): 20 | """Dummy helper func.""" 21 | return s + "1" 22 | 23 | # Verify that applying function causes error 24 | with pytest.raises(Exception): 25 | df["x"].apply(func) 26 | 27 | result = df["x"].apply(skipna(func)) 28 | assert ( 29 | result.to_numpy()[:-1] == np.array(["a1", "b1", "c1"]) 30 | ).all() and np.isnan(result.to_numpy()[-1]) 31 | -------------------------------------------------------------------------------- /tests/xarray/conftest.py: -------------------------------------------------------------------------------- 1 | """Fixtures for xarray tests.""" 2 | 3 | import numpy as np 4 | import pytest 5 | import xarray as xr 6 | 7 | 8 | @pytest.fixture 9 | def da(): 10 | """ 11 | Input testing DataArray for clone_using and convert_datetime_to_number. 12 | 13 | It creates a two-dimensional array of random integers adds axis coordinates 14 | that are either linearly or log-spaced increments. 15 | 16 | Included is a simple metadata dictionary passed as `attrs`. 17 | 18 | .. 
# noqa: DAR201 19 | """ 20 | da = xr.DataArray( 21 | np.random.randint(0, 100, size=(512, 1024)), 22 | dims=["random_ax_1", "random_ax_2"], 23 | coords=dict( 24 | random_ax_1=np.linspace(0, 1, 512), 25 | random_ax_2=np.logspace(-2, 2, 1024), 26 | ), 27 | name="blarg", 28 | attrs=dict(a=3, b=["asdf", "fdsa"]), 29 | ) 30 | return da 31 | -------------------------------------------------------------------------------- /tests/xarray/test_clone_using.py: -------------------------------------------------------------------------------- 1 | """Tests for clone_using.""" 2 | 3 | import numpy as np 4 | import pytest 5 | import xarray as xr 6 | 7 | import janitor # noqa: F401 8 | 9 | 10 | @pytest.mark.xarray 11 | def test_successful_cloning_coords(da): 12 | """Test that clone_using coordinates works correctly.""" 13 | 14 | # with copying coords 15 | new_da: xr.DataArray = da.clone_using(np.random.randn(*da.data.shape)) 16 | 17 | with pytest.raises(AssertionError): 18 | np.testing.assert_equal(new_da.data, da.data) 19 | 20 | assert all( 21 | ( 22 | new_coord == old_coord 23 | for new_coord, old_coord in zip(new_da.coords, da.coords) 24 | ) 25 | ) 26 | assert new_da.dims == da.dims 27 | 28 | 29 | @pytest.mark.xarray 30 | def test_successful_cloning_no_coords(da): 31 | """Test that cloning works without coordinates.""" 32 | 33 | new_da: xr.DataArray = da.clone_using( 34 | np.random.randn(*da.data.shape), use_coords=False 35 | ) 36 | 37 | with pytest.raises(AssertionError): 38 | np.testing.assert_equal(new_da.data, da.data) 39 | 40 | assert new_da.dims == da.dims 41 | 42 | 43 | @pytest.mark.xarray 44 | def test_metadata_cloning(da): 45 | """Test that metadata gets cloned over.""" 46 | new_da: xr.DataArray = da.clone_using( 47 | np.random.randn(*da.data.shape), use_attrs=True, new_name="new_name" 48 | ) 49 | 50 | assert new_da.name != da.name 51 | assert new_da.attrs == da.attrs 52 | 53 | 54 | @pytest.mark.xarray 55 | def test_no_coords_errors(da: xr.DataArray): 56 | """Test that errors are raised when dims do not match.""" 57 | # number of dims should match 58 | with pytest.raises(ValueError): 59 | da.clone_using(np.random.randn(10, 10, 10), use_coords=False) 60 | 61 | # shape of each axis does not need to match 62 | da.clone_using(np.random.randn(10, 10), use_coords=False) 63 | 64 | 65 | @pytest.mark.xarray 66 | def test_coords_errors(da: xr.DataArray): 67 | # number of dims should match 68 | with pytest.raises(ValueError): 69 | da.clone_using(np.random.randn(10, 10, 10), use_coords=False) 70 | 71 | # shape of each axis must match when using coords 72 | with pytest.raises(ValueError): 73 | da.clone_using(np.random.randn(10, 10), use_coords=True) 74 | -------------------------------------------------------------------------------- /tests/xarray/test_convert_datetime_to_number.py: -------------------------------------------------------------------------------- 1 | """Tests for datetime_conversion.""" 2 | 3 | import numpy as np 4 | import pytest 5 | import xarray as xr 6 | 7 | 8 | @pytest.mark.xarray 9 | def test_datetime_conversion(da): 10 | """Test that datetime conversion works on DataArrays.""" 11 | seconds_arr = np.arange(512) 12 | 13 | # dataarrays 14 | new_da = da.assign_coords( 15 | random_ax_1=1e9 * seconds_arr * np.timedelta64(1, "ns") 16 | ).convert_datetime_to_number("m", dim="random_ax_1") 17 | 18 | # account for rounding errors 19 | np.testing.assert_array_almost_equal( 20 | new_da.coords["random_ax_1"].data, 1 / 60 * seconds_arr 21 | ) 22 | 23 | # datasets 24 | new_ds = xr.Dataset( 25 | dict( 
26 | array=da.assign_coords( 27 | random_ax_1=1e9 * seconds_arr * np.timedelta64(1, "ns") 28 | ) 29 | ) 30 | ).convert_datetime_to_number("m", dim="random_ax_1") 31 | 32 | np.testing.assert_array_almost_equal( 33 | new_ds.coords["random_ax_1"].data, 1 / 60 * seconds_arr 34 | ) 35 | --------------------------------------------------------------------------------