├── .azure-pipelines.old ├── pipeline-master.yml └── templates │ ├── create-env.yml │ ├── release.yml │ └── run-tests.yml ├── .bumpversion.cfg ├── .codecov.yml ├── .darglint ├── .deepsource.toml ├── .devcontainer ├── Dockerfile ├── devcontainer.json └── noop.txt ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation_fix.md │ ├── new_examples.md │ └── new_proposed_feature.md ├── config.yml ├── config.yml.save ├── pull_request_template.md ├── stale.yaml └── workflows │ ├── auto-release.yml │ ├── auto-update.yml │ ├── docs.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pyup.yml ├── .requirements ├── all.in ├── all.txt ├── base.in ├── base.txt ├── biology.in ├── biology.txt ├── chemistry.in ├── chemistry.txt ├── dev.in ├── dev.txt ├── docs.in ├── docs.txt ├── engineering.in ├── engineering.txt ├── spark.in ├── spark.txt ├── testing.in └── testing.txt ├── AUTHORS.md ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── environment-dev.yml ├── examples ├── add_column.md ├── convert_currency.md ├── data │ └── medium_franchise_raw_table.csv ├── filter_date.md ├── limit_column_characters.md ├── make_currency_column_numeric.md ├── notebooks │ ├── Pivot Data from Long to Wide Form.ipynb │ ├── Pivoting Data from Wide to Long.ipynb │ ├── README.rst │ ├── Row_to_Names.ipynb │ ├── anime.ipynb │ ├── bad_values.ipynb │ ├── bird_call.ipynb │ ├── board_games.ipynb │ ├── case_when.ipynb │ ├── coalesce.ipynb │ ├── complete.ipynb │ ├── dirty_data.ipynb │ ├── dirty_data.xlsx │ ├── encode_categorical.ipynb │ ├── expand_grid.ipynb │ ├── fill_direction.ipynb │ ├── french_trains.ipynb │ ├── groupby_agg.ipynb │ ├── inflating_converting_currency.ipynb │ ├── medium_franchise.ipynb │ ├── normalize.ipynb │ ├── process_text.ipynb │ ├── pyjanitor_intro.ipynb │ ├── select_columns.ipynb │ ├── sort_columns.ipynb │ ├── sort_naturally.ipynb │ ├── teacher_pupil.ipynb │ └── transform_column.ipynb ├── round_to_fraction.md ├── row_to_names.md └── then.md ├── janitor ├── __init__.py ├── accessors │ ├── __init__.py │ └── data_description.py ├── biology.py ├── chemistry.py ├── engineering.py ├── errors.py ├── finance.py ├── functions │ ├── __init__.py │ ├── _numba.py │ ├── add_columns.py │ ├── alias.py │ ├── also.py │ ├── bin_numeric.py │ ├── case_when.py │ ├── change_index_dtype.py │ ├── change_type.py │ ├── clean_names.py │ ├── coalesce.py │ ├── collapse_levels.py │ ├── complete.py │ ├── concatenate_columns.py │ ├── conditional_join.py │ ├── convert_date.py │ ├── count_cumulative_unique.py │ ├── currency_column_to_numeric.py │ ├── deconcatenate_column.py │ ├── drop_constant_columns.py │ ├── drop_duplicate_columns.py │ ├── dropnotnull.py │ ├── encode_categorical.py │ ├── expand_column.py │ ├── expand_grid.py │ ├── explode_index.py │ ├── factorize_columns.py │ ├── fill.py │ ├── filter.py │ ├── find_replace.py │ ├── flag_nulls.py │ ├── get_dupes.py │ ├── groupby_agg.py │ ├── groupby_topk.py │ ├── impute.py │ ├── jitter.py │ ├── join_apply.py │ ├── label_encode.py │ ├── limit_column_characters.py │ ├── min_max_scale.py │ ├── move.py │ ├── mutate.py │ ├── pivot.py │ ├── process_text.py │ ├── remove_columns.py │ ├── remove_empty.py │ ├── rename_columns.py │ ├── reorder_columns.py │ ├── round_to_fraction.py │ ├── row_to_names.py │ ├── select.py │ ├── shuffle.py │ ├── sort_column_value_order.py │ ├── sort_naturally.py │ ├── summarise.py │ ├── take_first.py │ ├── then.py │ ├── to_datetime.py │ ├── toset.py │ ├── transform_columns.py │ ├── 
truncate_datetime.py │ ├── update_where.py │ └── utils.py ├── io.py ├── math.py ├── ml.py ├── polars │ ├── __init__.py │ ├── clean_names.py │ ├── complete.py │ ├── dates_to_polars.py │ ├── pivot_longer.py │ ├── polars_flavor.py │ └── row_to_names.py ├── spark │ ├── __init__.py │ ├── backend.py │ └── functions.py ├── testing_utils │ ├── __init__.py │ ├── date_data.py │ └── strategies.py ├── timeseries.py ├── utils.py └── xarray │ ├── __init__.py │ └── functions.py ├── mkdocs.yml ├── mkdocs ├── AUTHORS.md ├── CHANGELOG.md ├── api │ ├── biology.md │ ├── chemistry.md │ ├── engineering.md │ ├── finance.md │ ├── functions.md │ ├── io.md │ ├── math.md │ ├── ml.md │ ├── polars.md │ ├── timeseries.md │ └── xarray.md ├── css │ └── apidocs.css ├── development │ └── lazy_imports.md ├── devguide.md ├── environment.yaml └── index.md ├── nbconvert_config.py ├── pyproject.toml ├── scripts ├── ci │ ├── build_environment.sh │ └── unpack_environment.sh ├── count_functions.py └── docker_deploy.sh ├── setup.py ├── talks └── scipy2019 │ ├── friends.png │ ├── readthedocs.png │ ├── slides.ipynb │ ├── slides.md │ ├── sprints.jpg │ └── twitter-wars.png └── tests ├── biology └── test_join_fasta.py ├── chemistry ├── test_maccs_keys_fingerprint.py ├── test_molecular_descriptors.py ├── test_morgan_fingerprint.py └── test_smiles2mol.py ├── conftest.py ├── engineering └── test_convert_units.py ├── finance ├── test_convert_currency.py ├── test_convert_stock.py ├── test_get_symbol.py └── test_inflate_currency.py ├── functions ├── test_add_column.py ├── test_add_columns.py ├── test_alias.py ├── test_also.py ├── test_bin_numeric.py ├── test_cartesian_product.py ├── test_case_when.py ├── test_change_index_dtype.py ├── test_change_type.py ├── test_clean_names.py ├── test_coalesce.py ├── test_collapse_levels.py ├── test_complete.py ├── test_concatenate_columns.py ├── test_conditional_join.py ├── test_convert_excel_date.py ├── test_convert_matlab_date.py ├── test_convert_unix_date.py ├── test_count_cumulative_unique.py ├── test_currency_column_to_numeric.py ├── test_data_description.py ├── test_deconcatenate_column.py ├── test_drop_constant_columns.py ├── test_drop_duplicate_columns.py ├── test_dropnotnull.py ├── test_encode_categorical.py ├── test_expand.py ├── test_expand_column.py ├── test_expand_grid.py ├── test_explode_index.py ├── test_factorize_columns.py ├── test_fill_direction.py ├── test_fill_empty.py ├── test_filter_column_isin.py ├── test_filter_date.py ├── test_filter_on.py ├── test_filter_string.py ├── test_find_replace.py ├── test_flag_nulls.py ├── test_get_dupes.py ├── test_groupby_agg.py ├── test_groupby_topk.py ├── test_impute.py ├── test_jitter.py ├── test_join_apply.py ├── test_label_encode.py ├── test_limit_column_characters.py ├── test_min_max_scale.py ├── test_move.py ├── test_mutate.py ├── test_pivot_longer.py ├── test_pivot_longer_spec.py ├── test_pivot_wider.py ├── test_pivot_wider_spec.py ├── test_process_text.py ├── test_remove_columns.py ├── test_remove_empty.py ├── test_rename_column.py ├── test_rename_columns.py ├── test_reorder_columns.py ├── test_round_to_fraction.py ├── test_row_to_names.py ├── test_select.py ├── test_select_columns.py ├── test_select_rows.py ├── test_shuffle.py ├── test_sort_column_value_order.py ├── test_sort_naturally.py ├── test_summarise.py ├── test_take_first.py ├── test_then.py ├── test_to_datetime.py ├── test_toset.py ├── test_transform_column.py ├── test_transform_columns.py ├── test_truncate_datetime.py ├── test_unionize_dataframe_categories.py └── test_update_where.py 
├── helpers.py ├── io ├── test_read_commandline.py ├── test_read_csvs.py ├── test_tidyxl.py └── test_xlsx_table.py ├── math ├── test_ecdf.py ├── test_exp.py ├── test_log.py ├── test_logit.py ├── test_normal_cdf.py ├── test_probit.py ├── test_sigmoid.py ├── test_softmax.py └── test_z_score.py ├── ml └── test_get_features_targets.py ├── polars └── functions │ ├── test_clean_names_polars.py │ ├── test_complete_polars.py │ ├── test_convert_excel_date_polars.py │ ├── test_convert_matlab_date_polars.py │ ├── test_expand_polars.py │ ├── test_pivot_longer_polars.py │ ├── test_pivot_longer_spec_polars.py │ └── test_row_to_names_polars.py ├── spark ├── conftest.py └── functions │ ├── test_clean_names_spark.py │ └── test_update_where_spark.py ├── test_data ├── 016-MSPTDA-Excel.xlsx ├── corrected_smiles.txt ├── excel_without_headers.xlsx ├── file_example_XLSX_10.xlsx ├── sequences.fasta ├── sequences.tsv └── worked-examples.xlsx ├── test_documentation_build.py ├── test_helpers.py ├── timeseries ├── test_fill_missing_timestamps.py ├── test_flag_jumps.py └── test_sort_timestamps_monotonically.py ├── utils ├── test_check_column.py ├── test_deprecated_alias.py ├── test_deprecated_kwargs.py ├── test_idempotent.py ├── test_import_message.py ├── test_is_connected.py ├── test_replace_empty_string_with_none.py ├── test_skiperror.py └── test_skipna.py └── xarray ├── conftest.py ├── test_clone_using.py └── test_convert_datetime_to_number.py /.azure-pipelines.old/pipeline-master.yml: -------------------------------------------------------------------------------- 1 | pr: 2 | - dev 3 | 4 | jobs: 5 | - job: linux 6 | variables: 7 | activate.command: "source activate" 8 | JANITOR_CI_MACHINE: 1 9 | strategy: 10 | matrix: 11 | py37: 12 | python.version: "3.7" 13 | 14 | pool: 15 | vmImage: ubuntu-16.04 16 | 17 | steps: 18 | - bash: echo "##vso[task.prependpath]$CONDA/bin" 19 | displayName: Add conda to PATH 20 | - template: templates/create-env.yml 21 | - template: templates/run-tests.yml 22 | # - template: templates/release.yml 23 | 24 | - job: macos 25 | variables: 26 | activate.command: "source activate" 27 | JANITOR_CI_MACHINE: 1 28 | strategy: 29 | matrix: 30 | py37: 31 | python.version: "3.7" 32 | 33 | pool: 34 | vmImage: macOS-10.14 35 | 36 | steps: 37 | - bash: echo "##vso[task.prependpath]$CONDA/bin" 38 | displayName: Add conda to PATH 39 | 40 | # On Hosted macOS, the agent user doesn't have ownership of Miniconda's installation directory/ 41 | # We need to take ownership if we want to update conda or install packages globally 42 | - bash: sudo chown -R $USER $CONDA 43 | displayName: Take ownership of conda installation 44 | - template: templates/create-env.yml 45 | - template: templates/run-tests.yml 46 | # - template: templates/release.yml 47 | 48 | # Commenting out Windows build because it never fails when it should... 
49 | # - job: windows 50 | # variables: 51 | # activate.command: "activate" 52 | # JANITOR_CI_MACHINE: 1 53 | # strategy: 54 | # matrix: 55 | # py37: 56 | # python.version: "3.7" 57 | 58 | # pool: 59 | # vmImage: vs2017-win2016 60 | 61 | # steps: 62 | # - powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" 63 | # displayName: Add conda to PATH 64 | # - template: templates/create-env.yml 65 | # - template: templates/run-tests.yml 66 | # # - template: templates/release.yml 67 | -------------------------------------------------------------------------------- /.azure-pipelines.old/templates/create-env.yml: -------------------------------------------------------------------------------- 1 | steps: 2 | - script: | 3 | conda env create -f environment-dev.yml 4 | conda install -y python=$(python.version) 5 | $(activate.command) pyjanitor-dev 6 | python -m ipykernel install --user --name pyjanitor-dev 7 | python setup.py develop 8 | displayName: 'Install kernel, package and dependencies' 9 | -------------------------------------------------------------------------------- /.azure-pipelines.old/templates/release.yml: -------------------------------------------------------------------------------- 1 | # Unconditionally release pyjanitor. 2 | # If no version bump has happened, then nothing happens. 3 | # If a version bump has happened, then the build will automatically deploy 4 | # a release to PyPI. 5 | 6 | steps: 7 | - script: | 8 | $(activate.command) pyjanitor-dev 9 | conda install twine setuptools 10 | displayName: 'Install Twine and setuptools' 11 | - script: | 12 | $(activate.command) pyjanitor-dev 13 | python setup.py sdist bdist_wheel 14 | displayName: 'Build artifacts to deploy.' 15 | - script: | 16 | $(activate.command) pyjanitor-dev 17 | twine upload dist/* --skip-existing --username $(pypi.username) --password $(pypi.password) 18 | displayName: 'Upload built artifacts.' 19 | -------------------------------------------------------------------------------- /.azure-pipelines.old/templates/run-tests.yml: -------------------------------------------------------------------------------- 1 | steps: 2 | - script: | 3 | $(activate.command) pyjanitor-dev 4 | flake8 . --exclude ./nbconvert_config.py 5 | displayName: 'Code style: flake8' 6 | - script: | 7 | $(activate.command) pyjanitor-dev 8 | black 9 | displayName: 'Code style: black' 10 | - script: | 11 | $(activate.command) pyjanitor-dev 12 | interrogate 13 | displayName: 'Docstring coverage: interrogate' 14 | - script: | 15 | $(activate.command) pyjanitor-dev 16 | darglint janitor -v 2 17 | displayName: 'Docstring linter: darglint' 18 | - script: | 19 | $(activate.command) pyjanitor-dev 20 | pytest 21 | displayName: 'Unit tests.' 22 | - script: | 23 | $(activate.command) pyjanitor-dev 24 | python scripts/check-autodoc.py 25 | displayName: 'Check that all general functions have been added to docs.' 26 | - script: | 27 | $(activate.command) pyjanitor-dev 28 | bash <(curl -s https://codecov.io/bash) -t c4aaeb6c-be8f-44b2-a529-6871f3537261 29 | displayName: 'Upload code coverage.' 30 | - script: | 31 | $(activate.command) pyjanitor-dev 32 | python -m ipykernel install --user --name pyjanitor-dev 33 | jupyter nbconvert --to notebook --config nbconvert_config.py --execute --template full 34 | displayName: 'Test that all notebooks execute correctly.' 35 | - script: | 36 | $(activate.command) pyjanitor-dev 37 | cd docs && make html 38 | displayName: 'Test that HTML docs all build correctly.' 
39 | 
--------------------------------------------------------------------------------
/.bumpversion.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | current_version = 0.31.0
3 | commit = True
4 | tag = True
5 | 
6 | [bumpversion:file:setup.py]
7 | 
8 | [bumpversion:file:janitor/__init__.py]
9 | 
--------------------------------------------------------------------------------
/.codecov.yml:
--------------------------------------------------------------------------------
1 | codecov:
2 |   notify:
3 |     require_ci_to_pass: yes
4 | 
5 | coverage:
6 |   precision: 2
7 |   round: down
8 |   range: "70...100"
9 | 
10 |   status:
11 |     project: yes
12 |     patch: yes
13 |     changes: no
14 | 
15 | parsers:
16 |   gcov:
17 |     branch_detection:
18 |       conditional: yes
19 |       loop: yes
20 |       method: no
21 |       macro: no
22 | 
23 | comment:
24 |   layout: "header, diff"
25 |   behavior: default
26 |   require_changes: no
27 | 
--------------------------------------------------------------------------------
/.darglint:
--------------------------------------------------------------------------------
1 | [darglint]
2 | docstring_style=google
3 | strictness=short
4 | ignore_regex=^(test_|_)(.*)
5 | 
--------------------------------------------------------------------------------
/.deepsource.toml:
--------------------------------------------------------------------------------
1 | version = 1
2 | 
3 | test_patterns = [
4 |   "tests/**",
5 |   "test_*.py",
6 |   "scripts/check-autodoc.py",
7 | ]
8 | 
9 | [[analyzers]]
10 | name = "python"
11 | enabled = true
12 | 
13 | [analyzers.meta]
14 | runtime_version = "3.x.x"
15 | 
--------------------------------------------------------------------------------
/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information.
4 | #-------------------------------------------------------------------------------------------------------------
5 | 
6 | FROM continuumio/miniconda3
7 | 
8 | # Avoid warnings by switching to noninteractive
9 | ENV DEBIAN_FRONTEND=noninteractive
10 | 
11 | # This Dockerfile adds a non-root user with sudo access. Use the "remoteUser"
12 | # property in devcontainer.json to use it. On Linux, the container user's GID/UID
13 | # will be updated to match your local UID/GID (when using the dockerFile property).
14 | # See https://aka.ms/vscode-remote/containers/non-root-user for details.
15 | ARG USERNAME=vscode
16 | ARG USER_UID=1000
17 | ARG USER_GID=$USER_UID
18 | 
19 | # Copy environment-dev.yml (if found) to a temp location so we can update the environment. Also
20 | # copy "noop.txt" so the COPY instruction does not fail if no environment-dev.yml exists.
21 | COPY environment-dev.yml* .devcontainer/noop.txt /tmp/conda-tmp/
22 | 
23 | # Configure apt and install packages
24 | RUN apt-get update \
25 |     && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \
26 |     #
27 |     # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed
28 |     && apt-get -y install git openssh-client less iproute2 procps lsb-release gcc build-essential \
29 |     #
30 |     # Install pylint
31 |     && /opt/conda/bin/pip install pylint \
32 |     && /opt/conda/bin/conda install mamba gh -c conda-forge \
33 |     #
34 |     # Update Python environment based on environment-dev.yml (if present)
35 |     && if [ -f "/tmp/conda-tmp/environment-dev.yml" ]; then /opt/conda/bin/mamba env update -n base -f /tmp/conda-tmp/environment-dev.yml; fi \
36 |     && rm -rf /tmp/conda-tmp \
37 |     #
38 |     # Create a non-root user to use if preferred - see https://aka.ms/vscode-remote/containers/non-root-user.
39 |     && groupadd --gid $USER_GID $USERNAME \
40 |     && useradd -s /bin/bash --uid $USER_UID --gid $USER_GID -m $USERNAME \
41 |     # [Optional] Add sudo support for the non-root user
42 |     && apt-get install -y sudo \
43 |     && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
44 |     && chmod 0440 /etc/sudoers.d/$USERNAME \
45 |     # [Additional Customization]
46 |     && apt-get install -y nano vim emacs \
47 |     # Clean up
48 |     && apt-get autoremove -y \
49 |     && apt-get clean -y \
50 |     && rm -rf /var/lib/apt/lists/*
51 | 
52 | # Switch back to dialog for any ad-hoc use of apt-get
53 | ENV DEBIAN_FRONTEND=dialog
54 | 
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | // For format details, see https://aka.ms/vscode-remote/devcontainer.json or this file's README at:
2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.117.1/containers/python-3-miniconda
3 | {
4 |     "name": "pyjanitor dev container",
5 |     // "context": "..",
6 |     // "image": "registry.hub.docker.com/ericmjl/pyjanitor:devcontainer",
7 |     "build": {
8 |         "dockerfile": "Dockerfile",
9 |         "context": ".."
10 |     },
11 |     // Set *default* container specific settings.json values on container create.
12 |     "settings": {
13 |         "terminal.integrated.defaultProfile.linux": "bash",
14 |         "python.defaultInterpreterPath": "/opt/conda/bin/python",
15 |         "python.linting.enabled": true,
16 |         "python.linting.pylintEnabled": true,
17 |         "python.linting.pylintPath": "/opt/conda/bin/pylint",
18 |         "python.formatting.provider": "black",
19 |         "python.formatting.blackArgs": [
20 |             "--config",
21 |             "pyproject.toml",
22 |         ],
23 |         "editor.formatOnSave": true,
24 |         "files.insertFinalNewline": true,
25 |         "files.trimFinalNewlines": true,
26 |         "files.trimTrailingWhitespace": true,
27 |         "[python]": {
28 |             "editor.formatOnSaveMode": "file",
29 |         },
30 |     },
31 |     // Add the IDs of extensions you want installed when the container is created.
32 |     "extensions": [
33 |         "ms-python.python",
34 |         "ms-python.vscode-pylance",
35 |         "ms-vsliveshare.vsliveshare-pack",
36 |         "arcticicestudio.nord-visual-studio-code",
37 |         "ms-vsliveshare.vsliveshare",
38 |         "ms-vsliveshare.vsliveshare-audio"
39 |     ],
40 |     // Use 'forwardPorts' to make a list of ports inside the container available locally.
41 |     "forwardPorts": [
42 |         8000
43 |     ],
44 |     // Use 'postCreateCommand' to run commands after the container is created.
45 |     "postCreateCommand": "pre-commit install --install-hooks && python setup.py develop"
46 |     // Uncomment to connect as a non-root user. See https://aka.ms/vscode-remote/containers/non-root.
47 |     // "remoteUser": "vscode"
48 | }
49 | 
--------------------------------------------------------------------------------
/.devcontainer/noop.txt:
--------------------------------------------------------------------------------
1 | This file is copied into the container along with environment-dev.yml* from the
2 | parent folder. This is done to prevent the Dockerfile COPY instruction from
3 | failing if no environment-dev.yml is found.
4 | 
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | per-file-ignores =
3 |     janitor/functions/__init__.py:F401
4 |     janitor/accessors/__init__.py:F401
5 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug Report
3 | about: Please use this issue template if you are filing a bug report.
4 | ---
5 | 
6 | # Brief Description
7 | 
8 | 
9 | 
10 | # System Information
11 | 
12 | 
14 | 
15 | - Operating system: macOS/Linux/Windows
16 | - OS details (optional):
17 | - Python version (required):
18 | 
19 | # Minimally Reproducible Code
20 | 
21 | 
25 | 
26 | # Error Messages
27 | 
28 | 
29 | 
30 | ```
31 | 
32 | ```
33 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation_fix.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Propose a Documentation Fix
3 | about: Use this issue tracker template if you'd like to propose a fix to the documentation.
4 | ---
5 | 
6 | # Brief Description of Fix
7 | 
8 | 
12 | 
13 | Currently, the docs...
14 | 
15 | I would like to propose a change, such that now the docs...
16 | 
17 | # Relevant Context
18 | 
19 | 
21 | 
22 | - [Link to documentation page](https://pyjanitor-devs.github.io/pyjanitor/)
23 | - [Link to exact file to be edited](https://github.com/pyjanitor-devs/pyjanitor/blob/dev/AUTHORS.md)
24 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/new_examples.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Add/Modify Notebooks
3 | about: Use this specific template if you'd like to contribute a notebook to the examples gallery or modify an existing one.
4 | ---
5 | 
6 | # Brief Description
7 | 
8 | 
9 | 
10 | I'd like to write a notebook that...
11 | 
12 | (optional but encouraged) This notebook would likely cover the following pyjanitor functions:
13 | 
14 | -
15 | -
16 | -
17 | 
18 | 
19 | 
20 | # Dataset
21 | 
22 | 
23 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/new_proposed_feature.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Propose New Feature
3 | about: If you'd like to propose a new feature, please use this template.
4 | ---
5 | 
6 | # Brief Description
7 | 
8 | 
9 | 
10 | I would like to propose...
11 | 
12 | # Example API
13 | 
14 | 
17 | 
18 | Please modify the example API below to illustrate your proposed API, and then delete this sentence.
19 | 
20 | ```python
21 | # transform only one column, while creating a new column name for it.
22 | df.transform_columns(column_names=['col1'], function=np.abs, new_column_names=['col1_abs'])
23 | 
24 | # transform multiple columns by the same function, without creating a new column name.
25 | df.transform_columns(column_names=['col1', 'col2'], function=np.abs)
26 | 
27 | # more examples below
28 | # ...
29 | ```
30 | 
--------------------------------------------------------------------------------
/.github/config.yml:
--------------------------------------------------------------------------------
1 | # Configuration for the-welcome-bot - https://github.com/bash-bot/the-welcome-bot
2 | 
3 | # Message for a new user who opens an issue
4 | issueOpen: >
5 |   Hello World! Welcome to the project!
6 |   Thanks and congrats for opening your very first issue in this project.
7 |   You may submit a PR on the same if you like!
8 |   Hope you have a great time here :)
9 | 
10 | # Message for a new user who comments on an issue
11 | issueComment: >
12 |   Hello World! Welcome to the project!
13 |   Thanks and congrats for your very first comment on this project.
14 |   Check out the README for more details on it.
15 |   Want to contribute? Make an issue or submit a PR.
16 |   Hope you have a great time here :)
17 | 
18 | # Message for a new user who opens a PR
19 | prOpen: >
20 |   Hello World! Welcome to the project!
21 |   Thank you and congrats for your first PR on this project.
22 |   We will review it soon!
23 |   Till then you can check out the README for more details on it.
24 |   Hope you have a great time here :)
25 | 
26 | # Default message for a new user
27 | welcomeMessage: >
28 |   Hello World! Welcome to the project! Feel free to explore it.
29 |   Check out the README for more details on it.
30 |   Want to contribute? Make an issue or submit a PR.
31 |   You can check the contributing guides and code of conduct for the same.
32 |   Hope you have a great time here :)
33 | 
--------------------------------------------------------------------------------
/.github/config.yml.save:
--------------------------------------------------------------------------------
1 | # Configuration for the-welcome-bot - https://github.com/bash-bot/the-welcome-bot
2 | 
3 | # Message for a new user who opens an issue
4 | issueOpen: >
5 |   Hello World! Welcome to the project!
6 |   Thanks and congrats for opening your very first issue in this project.
7 |   You may submit a PR on the same if you like!
8 |   Hope you have a great time here :)
9 | 
10 | # Message for a new user who comments on an issue
11 | issueComment: >
12 |   Hello World! Welcome to the project!
13 |   Thanks and congrats for your very first comment on this project.
14 |   Check out the README for more details on it.
15 |   Want to contribute? Make an issue or submit a PR.
16 |   Hope you have a great time here :)
17 | 
18 | # Message for a new user who opens a PR
19 | prOpen: >
20 |   Hello World! Welcome to the project!
21 |   Thank you and congrats for your first PR on this project.
22 |   We will review it soon!
23 |   Till then you can check out the README for more details on it.
24 |   Hope you have a great time here :)
25 | 
26 | # Default message for a new user
27 | welcomeMessage: >
28 |   Hello World! Welcome to the project! Feel free to explore it.
29 |   Check out the README for more details on it.
30 |   Want to contribute? Make an issue or submit a PR.
31 |   You can check the contributing guides and code of conduct for the same.
32 |   Hope you have a great time here :)
33 | 
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | 
15 | 
16 | # PR Description
17 | 
18 | Please describe the changes proposed in the pull request:
19 | 
20 | -
21 | -
22 | -
23 | 
24 | 
25 | 
26 | 
27 | 
28 | **This PR resolves #(put issue number here, and remove parentheses).**
29 | 
30 | 
31 | 
32 | # PR Checklist
33 | 
34 | 
36 | 
37 | Please ensure that you have done the following:
38 | 
39 | 1. [ ] PR in from a fork off your branch. Do not PR from `<username>`:`dev`, but rather from `<username>`:`<feature-branch_name>`.
40 | 
41 | 2. [ ] If you're not on the contributors list, add yourself to `AUTHORS.md`.
42 | 
43 | 3. [ ] Add a line to `CHANGELOG.md` under the latest version header (i.e. the one that is "on deck") describing the contribution.
44 |    - Do use some discretion here; if there are multiple PRs that are related, keep them in a single line.
45 | 
46 | # Automatic checks
47 | 
48 | There will be automatic checks run on the PR. These include:
49 | 
50 | - Building a preview of the docs on Netlify
51 | - Automatically linting the code
52 | - Making sure the code is documented
53 | - Making sure that all tests pass
54 | - Making sure that code coverage doesn't go down
55 | 
56 | # Relevant Reviewers
57 | 
58 | 
59 | 
60 | Please tag maintainers to review.
61 | 
62 | - @ericmjl
63 | 
--------------------------------------------------------------------------------
/.github/stale.yaml:
--------------------------------------------------------------------------------
1 | # Number of days of inactivity before an issue becomes stale
2 | daysUntilStale: 30
3 | # Number of days of inactivity before a stale issue is closed
4 | daysUntilClose: 7
5 | # Issues with these labels will never be considered stale
6 | exemptLabels:
7 |   - pinned
8 |   - security
9 | # Label to use when marking an issue as stale
10 | staleLabel: wontfix
11 | # Comment to post when marking an issue as stale. Set to `false` to disable
12 | markComment: >
13 |   This issue has been automatically marked as stale because it has not had
14 |   recent activity. It will be closed if no further activity occurs. Thank you
15 |   for your contributions.
16 | # Comment to post when closing a stale issue. Set to `false` to disable
17 | closeComment: false
18 | 
--------------------------------------------------------------------------------
/.github/workflows/auto-update.yml:
--------------------------------------------------------------------------------
1 | # This workflow automatically updates PR branches with the latest changes on the target branch.
2 | # See: https://github.com/marketplace/actions/auto-update
3 | name: autoupdate
4 | on:
5 |   # This will trigger on pushes to the dev branch.
6 | push: 7 | branches: [dev] 8 | jobs: 9 | autoupdate: 10 | name: autoupdate 11 | runs-on: ubuntu-20.04 12 | steps: 13 | - uses: docker://chinthakagodawita/autoupdate-action:v1 14 | env: 15 | GITHUB_TOKEN: "${{ secrets.GHPAGES_TOKEN }}" 16 | PR_READY_STATE: "ready_for_review" 17 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - dev 7 | pull_request: 8 | branches: 9 | - dev 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell 16 | defaults: 17 | run: 18 | shell: bash -l {0} 19 | 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v2 23 | 24 | # See: https://github.com/marketplace/actions/setup-miniconda 25 | - name: Setup miniconda 26 | uses: conda-incubator/setup-miniconda@v3 27 | with: 28 | auto-update-conda: true 29 | environment-file: environment-dev.yml 30 | miniforge-version: latest 31 | 32 | - name: Install pyjanitor 33 | # use editable mode to avoid _pytest.pathlib.ImportPathMismatchError 34 | run: pip install -e . 35 | 36 | - name: Build docs 37 | run: mkdocs build 38 | 39 | - uses: actions/upload-artifact@v4 40 | with: 41 | name: website 42 | path: site/ 43 | 44 | - name: Test docs 45 | run: pytest -m "documentation" 46 | 47 | - name: Docs preview 48 | if: ${{ github.event_name == 'pull_request' }} 49 | uses: nwtgck/actions-netlify@v1.1 50 | with: 51 | publish-dir: "./site" 52 | production-deploy: false 53 | github-token: ${{ secrets.GHPAGES_TOKEN }} 54 | deploy-message: "Deploy from GitHub Actions" 55 | enable-pull-request-comment: true 56 | enable-commit-comment: false 57 | overwrites-pull-request-comment: true 58 | alias: deploy-preview-${{ github.event.number }} 59 | env: 60 | NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} 61 | NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} 62 | timeout-minutes: 1 63 | 64 | - name: Deploy website 65 | if: ${{ github.event_name == 'push' }} 66 | uses: peaceiris/actions-gh-pages@v3 67 | with: 68 | # https://github.com/peaceiris/actions-gh-pages#%EF%B8%8F-set-personal-access-token-personal_token 69 | personal_token: ${{ secrets.GHPAGES_TOKEN }} 70 | publish_dir: ./site/ 71 | publish_branch: gh-pages 72 | # destination_dir: manuscript 73 | allow_empty_commit: false 74 | keep_files: false 75 | force_orphan: true 76 | enable_jekyll: false 77 | disable_nojekyll: false 78 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: pyjanitor tests 2 | 3 | on: 4 | # only run tests and coverage when src-code changes 5 | push: 6 | branches: 7 | - dev 8 | paths: 9 | - "janitor/**" 10 | - "tests/**" 11 | - ".codecov.yml" 12 | - ".github/workflows/tests.yml" 13 | pull_request: 14 | branches: 15 | - dev 16 | paths: 17 | - "janitor/**" 18 | - "tests/**" 19 | - ".codecov.yml" 20 | - ".github/workflows/tests.yml" 21 | 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | run-tests: 28 | strategy: 29 | fail-fast: false 30 | runs-on: ubuntu-latest 31 | name: Run pyjanitor test suite 32 | 33 | # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell 34 | defaults: 35 | run: 36 | shell: bash -l {0} 37 | 38 | steps: 
39 | - name: Checkout repository 40 | uses: actions/checkout@v4 41 | 42 | # See: https://github.com/marketplace/actions/setup-miniconda 43 | - name: Setup miniconda 44 | uses: conda-incubator/setup-miniconda@v3 45 | with: 46 | auto-update-conda: true 47 | environment-file: environment-dev.yml 48 | miniforge-version: latest 49 | 50 | - name: Install pyjanitor 51 | run: python -m pip install -e . 52 | 53 | - name: Run docstrings tests 54 | run: pytest -v -r a -n auto --color=yes --durations=0 --cov=janitor --cov-append --cov-report term-missing --cov-report xml --doctest-only janitor 55 | 56 | - name: Run unit tests 57 | run: pytest -v -r a -n auto --color=yes --durations=0 --cov=janitor --cov-append --cov-report term-missing --cov-report xml tests 58 | 59 | # https://github.com/codecov/codecov-action 60 | - name: Upload code coverage 61 | uses: codecov/codecov-action@v2 62 | with: 63 | # fail_ci_if_error: true # optional (default = false) 64 | verbose: true # optional (default = false) 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | .vscode/* 3 | test*.xml 4 | *.DS_Store 5 | docs/notebooks 6 | pip-wheel-metadata 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | function_test.ipynb 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest
38 | *.spec
39 | 
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 | 
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | .hypothesis/
54 | 
55 | # Translations
56 | *.mo
57 | *.pot
58 | 
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | 
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 | 
67 | # Scrapy stuff:
68 | .scrapy
69 | 
70 | # Sphinx documentation
71 | docs/_build/
72 | 
73 | # PyBuilder
74 | target/
75 | 
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 | 
79 | # pyenv
80 | .python-version
81 | 
82 | # celery beat schedule file
83 | celerybeat-schedule
84 | 
85 | # SageMath parsed files
86 | *.sage.py
87 | 
88 | # dotenv
89 | .env
90 | 
91 | # virtualenv
92 | .venv
93 | .venv*/
94 | venv/
95 | env/
96 | ENV/
97 | 
98 | # Spyder project settings
99 | .spyderproject
100 | .spyproject
101 | 
102 | # Rope project settings
103 | .ropeproject
104 | 
105 | # mkdocs documentation
106 | /site
107 | 
108 | # mypy
109 | .mypy_cache/
110 | 
111 | # pycharm
112 | .idea/
113 | 
114 | 
115 | # Custom
116 | .pytest_cache
117 | 
118 | # Ignore docs' symbolic link to notebooks
119 | docs/notebooks
120 | docs/*
121 | 
122 | 
123 | # Swap
124 | [._]*.s[a-v][a-z]
125 | [._]*.sw[a-p]
126 | [._]s[a-rt-v][a-z]
127 | [._]ss[a-gi-z]
128 | [._]sw[a-p]
129 | 
130 | # Session
131 | Session.vim
132 | Sessionx.vim
133 | 
134 | # Temporary
135 | .netrwhist
136 | *~
137 | # Auto-generated tag files
138 | tags
139 | # Persistent undo
140 | [._]*.un~
141 | 
142 | # Other stuff
143 | *.profraw
144 | /scratch.py
145 | midpoint.csv
146 | 
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 |   - repo: https://github.com/pre-commit/pre-commit-hooks
5 |     rev: v5.0.0
6 |     hooks:
7 |       - id: trailing-whitespace
8 |       - id: end-of-file-fixer
9 |       - id: check-yaml
10 |       - id: check-added-large-files
11 |   - repo: https://github.com/psf/black
12 |     rev: 25.1.0
13 |     hooks:
14 |       - id: black
15 |         args: [--config, pyproject.toml]
16 |   - repo: https://github.com/econchick/interrogate
17 |     rev: 1.7.0
18 |     hooks:
19 |       - id: interrogate
20 |         args: [-c, pyproject.toml]
21 |   # Taking out darglint because it takes too long to run.
22 |   # It may be superseded by ruff: https://github.com/astral-sh/ruff/issues/458
23 |   # - repo: https://github.com/terrencepreilly/darglint
24 |   #   rev: v1.8.1
25 |   #   hooks:
26 |   #     - id: darglint
27 |   #       args: [-v 2]  # this config makes the error messages a bit less cryptic.
28 | 
29 |   # The interim replacement for darglint is pydoclint.
30 |   - repo: https://github.com/jsh9/pydoclint
31 |     rev: 0.6.7
32 |     hooks:
33 |       - id: pydoclint
34 |         args:
35 |           - "--config=pyproject.toml"
36 |   - repo: https://github.com/astral-sh/ruff-pre-commit
37 |     # Ruff version.
38 | rev: v0.11.12 39 | hooks: 40 | - id: ruff 41 | args: [--fix] 42 | -------------------------------------------------------------------------------- /.pyup.yml: -------------------------------------------------------------------------------- 1 | # configure updates globally 2 | # default: all 3 | # allowed: all, insecure, False 4 | update: all 5 | 6 | # configure dependency pinning globally 7 | # default: True 8 | # allowed: True, False 9 | pin: True 10 | 11 | # update schedule 12 | # default: empty 13 | # allowed: "every day", "every week", .. 14 | schedule: "every week on Saturday" 15 | -------------------------------------------------------------------------------- /.requirements/all.in: -------------------------------------------------------------------------------- 1 | -r base.in 2 | -r biology.in 3 | -r chemistry.in 4 | -r dev.in 5 | -r docs.in 6 | -r engineering.in 7 | -r spark.in 8 | -r testing.in 9 | # -e . 10 | -------------------------------------------------------------------------------- /.requirements/base.in: -------------------------------------------------------------------------------- 1 | # ipykernel 2 | # jupyter_client 3 | # lxml 4 | natsort 5 | # seaborn 6 | pandas_flavor 7 | multipledispatch 8 | scipy 9 | -------------------------------------------------------------------------------- /.requirements/base.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/base.in 6 | # 7 | multipledispatch==0.6.0 8 | # via -r .requirements/base.in 9 | natsort==8.1.0 10 | # via -r .requirements/base.in 11 | numpy==1.22.3 12 | # via 13 | # pandas 14 | # scipy 15 | # xarray 16 | packaging==21.3 17 | # via xarray 18 | pandas==1.4.1 19 | # via 20 | # pandas-flavor 21 | # xarray 22 | pandas-flavor==0.2.0 23 | # via -r .requirements/base.in 24 | pyparsing==3.0.7 25 | # via packaging 26 | python-dateutil==2.8.2 27 | # via pandas 28 | pytz==2021.3 29 | # via pandas 30 | scipy==1.10.0 31 | # via -r .requirements/base.in 32 | six==1.16.0 33 | # via 34 | # multipledispatch 35 | # python-dateutil 36 | xarray==2022.3.0 37 | # via pandas-flavor 38 | -------------------------------------------------------------------------------- /.requirements/biology.in: -------------------------------------------------------------------------------- 1 | biopython 2 | -------------------------------------------------------------------------------- /.requirements/biology.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/biology.in 6 | # 7 | biopython==1.79 8 | # via -r .requirements/biology.in 9 | numpy==1.22.3 10 | # via biopython 11 | -------------------------------------------------------------------------------- /.requirements/chemistry.in: -------------------------------------------------------------------------------- 1 | # rdkit # needed fix https://github.com/rdkit/rdkit/issues/1812 2 | tqdm 3 | -------------------------------------------------------------------------------- /.requirements/chemistry.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/chemistry.in 6 | # 7 | tqdm==4.66.3 8 | # via -r chemistry.in 9 | 
-------------------------------------------------------------------------------- /.requirements/dev.in: -------------------------------------------------------------------------------- 1 | pip-tools 2 | pre-commit 3 | isort>=4.3.18 4 | black>=19.3b0 5 | darglint 6 | flake8 7 | -------------------------------------------------------------------------------- /.requirements/dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/dev.in 6 | # 7 | black==24.3.0 8 | # via -r dev.in 9 | cfgv==3.3.1 10 | # via pre-commit 11 | click==8.0.4 12 | # via 13 | # black 14 | # pip-tools 15 | darglint==1.8.1 16 | # via -r dev.in 17 | distlib==0.3.9 18 | # via virtualenv 19 | filelock==3.16.1 20 | # via virtualenv 21 | flake8==4.0.1 22 | # via -r dev.in 23 | identify==2.4.11 24 | # via pre-commit 25 | isort==5.10.1 26 | # via -r dev.in 27 | mccabe==0.6.1 28 | # via flake8 29 | mypy-extensions==0.4.3 30 | # via black 31 | nodeenv==1.6.0 32 | # via pre-commit 33 | packaging==24.0 34 | # via black 35 | pathspec==0.9.0 36 | # via black 37 | pep517==0.12.0 38 | # via pip-tools 39 | pip-tools==6.5.1 40 | # via -r dev.in 41 | platformdirs==4.3.6 42 | # via 43 | # black 44 | # virtualenv 45 | pre-commit==2.17.0 46 | # via -r dev.in 47 | pycodestyle==2.8.0 48 | # via flake8 49 | pyflakes==2.4.0 50 | # via flake8 51 | pyyaml==6.0 52 | # via pre-commit 53 | toml==0.10.2 54 | # via pre-commit 55 | tomli==2.0.1 56 | # via 57 | # black 58 | # pep517 59 | typing-extensions==4.10.0 60 | # via black 61 | virtualenv==20.26.6 62 | # via pre-commit 63 | wheel==0.38.1 64 | # via pip-tools 65 | 66 | # The following packages are considered to be unsafe in a requirements file: 67 | # pip 68 | # setuptools 69 | -------------------------------------------------------------------------------- /.requirements/docs.in: -------------------------------------------------------------------------------- 1 | mkdocs 2 | polars 3 | mkdocs-material 4 | mkdocstrings>=0.19.0 5 | mkdocstrings-python 6 | ipython>7.31.1 7 | -r biology.in 8 | -r chemistry.in 9 | -r engineering.in 10 | -r spark.in 11 | # -e . 
12 | -------------------------------------------------------------------------------- /.requirements/engineering.in: -------------------------------------------------------------------------------- 1 | unyt 2 | -------------------------------------------------------------------------------- /.requirements/engineering.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/engineering.in 6 | # 7 | mpmath==1.3.0 8 | # via sympy 9 | numpy==1.22.3 10 | # via unyt 11 | sympy==1.10 12 | # via unyt 13 | unyt==2.8.0 14 | # via -r engineering.in 15 | -------------------------------------------------------------------------------- /.requirements/spark.in: -------------------------------------------------------------------------------- 1 | pyspark 2 | -------------------------------------------------------------------------------- /.requirements/spark.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/spark.in 6 | # 7 | py4j==0.10.9.5 8 | # via pyspark 9 | pyspark==3.3.2 10 | # via -r spark.in 11 | -------------------------------------------------------------------------------- /.requirements/testing.in: -------------------------------------------------------------------------------- 1 | pytest-cov 2 | pytest-xdist 3 | pytest>=3.4.2 4 | hypothesis>=4.4.0 5 | interrogate 6 | pandas-vet 7 | polars 8 | py>=1.10.0 9 | -------------------------------------------------------------------------------- /.requirements/testing.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile .requirements/testing.in 6 | # 7 | attrs==21.4.0 8 | # via 9 | # hypothesis 10 | # interrogate 11 | # pandas-vet 12 | # pytest 13 | click==8.0.4 14 | # via interrogate 15 | colorama==0.4.4 16 | # via interrogate 17 | coverage[toml]==6.3.2 18 | # via pytest-cov 19 | execnet==1.9.0 20 | # via pytest-xdist 21 | flake8==4.0.1 22 | # via pandas-vet 23 | hypothesis==6.39.3 24 | # via -r .requirements/testing.in 25 | iniconfig==1.1.1 26 | # via pytest 27 | interrogate==1.5.0 28 | # via -r .requirements/testing.in 29 | mccabe==0.6.1 30 | # via flake8 31 | packaging==21.3 32 | # via pytest 33 | pandas-vet==0.2.3 34 | # via -r .requirements/testing.in 35 | pluggy==1.0.0 36 | # via pytest 37 | py==1.11.0 38 | # via 39 | # -r .requirements/testing.in 40 | # interrogate 41 | # pytest 42 | # pytest-forked 43 | pycodestyle==2.8.0 44 | # via flake8 45 | pyflakes==2.4.0 46 | # via flake8 47 | pyparsing==3.0.7 48 | # via packaging 49 | pytest==7.0.1 50 | # via 51 | # -r .requirements/testing.in 52 | # pytest-cov 53 | # pytest-forked 54 | # pytest-xdist 55 | pytest-cov==3.0.0 56 | # via -r .requirements/testing.in 57 | pytest-forked==1.4.0 58 | # via pytest-xdist 59 | pytest-xdist==2.5.0 60 | # via -r .requirements/testing.in 61 | sortedcontainers==2.4.0 62 | # via hypothesis 63 | tabulate==0.8.9 64 | # via interrogate 65 | toml==0.10.2 66 | # via interrogate 67 | tomli==2.0.1 68 | # via 69 | # coverage 70 | # pytest 71 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: 
-------------------------------------------------------------------------------- 1 | mkdocs/devguide.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018-onwards pyjanitor devs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include .requirements/* 2 | include *.md 3 | include LICENSE 4 | include mkdocs/* 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/bash 2 | ACTIVATE=source activate pyjanitor-dev 3 | 4 | release: 5 | rm -f dist/* 6 | python setup.py sdist bdist_wheel 7 | twine upload dist/* 8 | 9 | # Note to self: 10 | # makefile has not been fully tested. 11 | # DO NOT COMMIT until testing is done. 12 | # 13 | # ALSO, remove this comment once it's tested!!!!!!!!!!! 14 | 15 | .PHONY: format test lint docs isort check style notebooks install 16 | 17 | format: 18 | @echo "Applying Black Python code formatting..." 19 | pre-commit run black --all-files 20 | 21 | test: 22 | @echo "Running test suite..." 23 | pytest -v -n auto --color=yes 24 | 25 | lint: 26 | @echo "Checking code formatting..." 27 | pre-commit run flake8 --all-files 28 | 29 | docs: 30 | @echo "Building documentation..." 31 | mkdocs build 32 | 33 | isort: 34 | @echo "Sorting imports..." 35 | isort --check-only --use-parentheses --trailing-comma --multi-line 3 --line-length 79 . 36 | 37 | check: test docs notebooks isort format lint 38 | @echo "checks complete" 39 | 40 | style: isort format 41 | @echo "styling complete" 42 | 43 | install: 44 | @echo "Creating Conda environment..." 45 | conda env create -f environment-dev.yml 46 | 47 | @echo "Installing PyJanitor in development mode..." 48 | $(ACTIVATE) && python setup.py develop 49 | 50 | @echo "Registering current virtual environment as a Jupyter Python kernel..." 51 | $(ACTIVATE) && python -m ipykernel install --user --name pyjanitor-dev --display-name "PyJanitor development" 52 | 53 | @echo "Installing pre-commit hooks" 54 | $(ACTIVATE) && pre-commit install 55 | 56 | compile-requirements: 57 | @echo "pip-compiling requirements files..." 
58 | 	find .requirements -type f -name '*.in' | xargs -I {} sh -c\
59 | 	'echo "compiling" {} && pip-compile {} --upgrade -q'
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | mkdocs/index.md
--------------------------------------------------------------------------------
/environment-dev.yml:
--------------------------------------------------------------------------------
1 | name: pyjanitor-dev
2 | channels:
3 |   - conda-forge
4 | dependencies:
5 |   - python=3.10
6 |   - biopython
7 |   - black=22.12.0  # keep this in sync with `.pre-commit-config.yaml`
8 |   - bump2version=1.0.1
9 |   - cairo
10 |   - conda
11 |   - hypothesis
12 |   - ipykernel
13 |   - ipython
14 |   - isort
15 |   - jinja2>=3.1.0
16 |   - jupyterlab
17 |   - lxml
18 |   - make
19 |   - mkdocs
20 |   - mkdocs-material
21 |   - mkdocstrings>=0.19.0
22 |   - mkdocstrings-python
23 |   - missingno
24 |   - multipledispatch
25 |   - mypy
26 |   - natsort
27 |   - numba
28 |   - numpy==1.24.4
29 |   - openpyxl
30 |   - pandas-flavor
31 |   - pandas-vet
32 |   - pandas>=2.0
33 |   - pip
34 |   - pipreqs
35 |   - pip-tools
36 |   - pre-commit
37 |   - polars
38 |   - pyspark>=3.2.0
39 |   - pytest
40 |   - pytest-cov
41 |   - pytest-xdist
42 |   - pytest-doctestplus
43 |   - python-language-server
44 |   - rdkit
45 |   - recommonmark
46 |   - seaborn
47 |   - twine
48 |   - unyt
49 |   - xarray
50 |   - xlrd
51 |   - xorg-libxrender
52 |   - pip:
53 |       - mknotebooks
54 |       # Temporarily pinned to fix CI
55 |       - setuptools==70.3.0
56 | 
--------------------------------------------------------------------------------
/examples/limit_column_characters.md:
--------------------------------------------------------------------------------
1 | # df.limit_column_characters()
2 | 
3 | ## Description
4 | This method truncates column names to a given character length. In the case of duplicated column names, numbers are appended to the columns with a character separator (default is "_").
5 | 
6 | ## Parameters
7 | ### df
8 | A pandas DataFrame.
9 | 
10 | ### column_length
11 | The character length to which all column names are truncated. The column
12 | separator and the number appended to duplicate column names do not count
13 | toward this length. Therefore, if all columns are truncated to 10
14 | characters, the first distinct column will be 10 characters and the
15 | remaining duplicates will be 12 characters (assuming a column separator of one character).
16 | 
17 | ### col_separator
18 | The separator placed between a truncated column name and the number used to distinguish duplicates. Default is "_". Supply an empty string (i.e. '') to remove the
19 | separator.
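For example, with an empty separator the duplicate counter attaches directly to the truncated name. A quick illustrative sketch (the column names below are invented for this example, and the resulting names follow from the behavior described above):

```python
import pandas as pd
import janitor  # registers the DataFrame method

df = pd.DataFrame(columns=["long_name_a", "long_name_b"], data=[[1, 2]])

# Truncating to 9 characters makes both names "long_name";
# with an empty separator the second duplicate should become "long_name1".
df.limit_column_characters(9, "")
```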
20 | 
21 | ## Setup
22 | ```python
23 | import pandas as pd
24 | import janitor
25 | 
26 | data_dict = {
27 |     "really_long_name_for_a_column": range(10),
28 |     "another_really_long_name_for_a_column": [2 * item for item in range(10)],
29 |     "another_really_longer_name_for_a_column": list("lllongname"),
30 |     "this_is_getting_out_of_hand": list("longername"),
31 | }
32 | ```
33 | 
34 | ## Example 1: Standard truncation
35 | ```python
36 | example_dataframe = pd.DataFrame(data_dict)
37 | 
38 | example_dataframe.limit_column_characters(7)
39 | ```
40 | 
41 | ### Output
42 | 
43 |        really_  another another_1 this_is
44 |     0        0        0         l       l
45 |     1        1        2         l       o
46 |     2        2        4         l       n
47 |     3        3        6         o       g
48 |     4        4        8         n       e
49 |     5        5       10         g       r
50 |     6        6       12         n       n
51 |     7        7       14         a       a
52 |     8        8       16         m       m
53 |     9        9       18         e       e
54 | 
55 | ## Example 2: Standard truncation with different separator character
56 | 
57 | ```python
58 | 
59 | example_dataframe2 = pd.DataFrame(data_dict)
60 | 
61 | example_dataframe2.limit_column_characters(7, ".")
62 | ```
63 | 
64 | ### Output
65 | 
66 |        really_  another another.1 this_is
67 |     0        0        0         l       l
68 |     1        1        2         l       o
69 |     2        2        4         l       n
70 |     3        3        6         o       g
71 |     4        4        8         n       e
72 |     5        5       10         g       r
73 |     6        6       12         n       n
74 |     7        7       14         a       a
75 |     8        8       16         m       m
76 |     9        9       18         e       e
77 | 
--------------------------------------------------------------------------------
/examples/notebooks/README.rst:
--------------------------------------------------------------------------------
1 | ========
2 | Examples
3 | ========
4 | 
5 | This folder contains Jupyter notebooks demonstrating different ways to
6 | implement pyjanitor in your workflow.
7 | 
8 | Guidelines
9 | ~~~~~~~~~~
10 | 
11 | When contributing example notebooks, please include a short explanation of
12 | where the data came from and what it contains. Then go through your
13 | demonstration of data cleaning with pyjanitor in a step-by-step manner, with
14 | clear documentation of what is being done. Please try to elaborate on what the
15 | benefits of pyjanitor are and why it should be implemented in your use case.
16 | Optionally, feel free to add examples of analysis for the cleaned data.
17 | 
--------------------------------------------------------------------------------
/examples/notebooks/dirty_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/examples/notebooks/dirty_data.xlsx
--------------------------------------------------------------------------------
/janitor/__init__.py:
--------------------------------------------------------------------------------
1 | """Top-level janitor API lives here."""
2 | 
3 | import pandas_flavor as pf  # noqa: F401
4 | 
5 | from .accessors import *  # noqa: F403, F401
6 | from .functions import *  # noqa: F403, F401
7 | from .io import *  # noqa: F403, F401
8 | from .math import *  # noqa: F403, F401
9 | from .ml import get_features_targets as _get_features_targets
10 | from .utils import refactored_function
11 | from .xarray import *  # noqa: F403, F401
12 | 
13 | 
14 | @refactored_function(
15 |     "get_features_targets() has moved. Please use ml.get_features_targets()."
16 | )
17 | def get_features_targets(*args, **kwargs):
18 |     """Wrapper for get_features_targets."""
19 |     return _get_features_targets(*args, **kwargs)
20 | 
21 | 
22 | __version__ = "0.31.0"
23 | 
--------------------------------------------------------------------------------
/janitor/accessors/__init__.py:
--------------------------------------------------------------------------------
1 | """Custom data accessors for pandas DataFrames."""
2 | 
3 | from janitor.accessors.data_description import DataDescription  # noqa: F401
--------------------------------------------------------------------------------
/janitor/accessors/data_description.py:
--------------------------------------------------------------------------------
1 | """DataDescription class for the DataDescription accessor."""
2 | 
3 | from typing import Dict, List, Union
4 | 
5 | import pandas as pd
6 | import pandas_flavor as pf
7 | 
8 | 
9 | @pf.register_dataframe_accessor("data_description")
10 | class DataDescription:
11 |     """High-level description of data present in this DataFrame.
12 | 
13 |     This is a custom data accessor.
14 |     """
15 | 
16 |     def __init__(self, data):
17 |         self._data = data
18 |         self._desc = {}
19 | 
20 |     def _get_data_df(self) -> pd.DataFrame:
21 |         """Get a table of descriptive information in a DataFrame format.
22 | 
23 |         Returns:
24 |             A DataFrame containing the descriptive information.
25 |         """
26 |         df = self._data
27 |         data_dict = {}
28 |         data_dict["column_name"] = df.columns.tolist()
29 |         data_dict["type"] = df.dtypes.tolist()
30 |         data_dict["count"] = df.count().tolist()
31 |         data_dict["pct_missing"] = (1 - (df.count() / len(df))).tolist()
32 |         data_dict["description"] = [self._desc.get(c, "") for c in df.columns]
33 | 
34 |         return pd.DataFrame(data_dict).set_index("column_name")
35 | 
36 |     @property
37 |     def df(self) -> pd.DataFrame:
38 |         """Get a table of descriptive information in a DataFrame format."""
39 |         return self._get_data_df()
40 | 
41 |     def __repr__(self):
42 |         """Human-readable representation of the `DataDescription` object."""
43 |         return str(self._get_data_df())
44 | 
45 |     def display(self):
46 |         """Print the table of descriptive information about this DataFrame."""
47 |         print(self)
48 | 
49 |     def set_description(self, desc: Union[List, Dict]):
50 |         """Update the description for each of the columns in the DataFrame.
51 | 
52 |         Args:
53 |             desc: The structure containing the descriptions to update.
54 | 
55 |         Raises:
56 |             ValueError: If length of description list does not match
57 |                 number of columns in DataFrame.
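        Example (the frame and descriptions below are purely illustrative):

            >>> import pandas as pd
            >>> import janitor
            >>> df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
            >>> df.data_description.set_description(
            ...     ["an integer column", "a float column"]
            ... )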
58 | """ 59 | if isinstance(desc, list): 60 | if len(desc) != len(self._data.columns): 61 | raise ValueError( 62 | "Length of description list " 63 | f"({len(desc)}) does not match number of columns in " 64 | f"DataFrame ({len(self._data.columns)})" 65 | ) 66 | 67 | self._desc = dict(zip(self._data.columns, desc)) 68 | 69 | elif isinstance(desc, dict): 70 | self._desc = desc 71 | -------------------------------------------------------------------------------- /janitor/biology.py: -------------------------------------------------------------------------------- 1 | """Biology and bioinformatics-oriented data cleaning functions.""" 2 | 3 | import pandas as pd 4 | import pandas_flavor as pf 5 | 6 | from .utils import deprecated_alias, import_message 7 | 8 | try: 9 | from Bio import SeqIO 10 | except ImportError: 11 | import_message( 12 | submodule="biology", 13 | package="biopython", 14 | conda_channel="conda-forge", 15 | pip_install=True, 16 | ) 17 | 18 | 19 | @pf.register_dataframe_method 20 | @deprecated_alias(col_name="column_name") 21 | def join_fasta( 22 | df: pd.DataFrame, filename: str, id_col: str, column_name: str 23 | ) -> pd.DataFrame: 24 | """Convenience method to join in a FASTA file as a column. 25 | 26 | This allows us to add the string sequence of a FASTA file as a new column 27 | of data in the dataframe. 28 | 29 | This method only attaches the string representation of the SeqRecord.Seq 30 | object from Biopython. Does not attach the full SeqRecord. Alphabet is 31 | also not stored, under the assumption that the data scientist has domain 32 | knowledge of what kind of sequence is being read in (nucleotide vs. amino 33 | acid.) 34 | 35 | This method mutates the original DataFrame. 36 | 37 | For more advanced functions, please use phylopandas. 38 | 39 | Examples: 40 | >>> import tempfile 41 | >>> import pandas as pd 42 | >>> import janitor.biology 43 | >>> tf = tempfile.NamedTemporaryFile() 44 | >>> tf.write('''>SEQUENCE_1 45 | ... MTEITAAMVKELRESTGAGMMDCK 46 | ... >SEQUENCE_2 47 | ... SATVSEINSETDFVAKN'''.encode('utf8')) 48 | 66 49 | >>> tf.seek(0) 50 | 0 51 | >>> df = pd.DataFrame({"sequence_accession": 52 | ... ["SEQUENCE_1", "SEQUENCE_2", ]}) 53 | >>> df = df.join_fasta( # doctest: +SKIP 54 | ... filename=tf.name, 55 | ... id_col='sequence_accession', 56 | ... column_name='sequence', 57 | ... ) 58 | >>> df.sequence # doctest: +SKIP 59 | 0 MTEITAAMVKELRESTGAGMMDCK 60 | 1 SATVSEINSETDFVAKN 61 | Name: sequence, dtype: object 62 | 63 | Args: 64 | df: A pandas DataFrame. 65 | filename: Path to the FASTA file. 66 | id_col: The column in the DataFrame that houses sequence IDs. 67 | column_name: The name of the new column. 68 | 69 | Returns: 70 | A pandas DataFrame with new FASTA string sequence column. 
71 | """ 72 | seqrecords = { 73 | x.id: x.seq.__str__() for x in SeqIO.parse(filename, "fasta") 74 | } 75 | seq_col = [seqrecords[i] for i in df[id_col]] 76 | df[column_name] = seq_col 77 | return df 78 | -------------------------------------------------------------------------------- /janitor/errors.py: -------------------------------------------------------------------------------- 1 | class JanitorError(Exception): 2 | pass 3 | -------------------------------------------------------------------------------- /janitor/functions/alias.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `toset` function.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any 6 | 7 | import pandas as pd 8 | import pandas_flavor as pf 9 | 10 | 11 | @pf.register_series_method 12 | def alias(series: pd.Series, alias: Any = None) -> pd.Series: 13 | """Return a Series with a new name. Accepts either a scalar or a callable. 14 | 15 | 16 | Examples: 17 | >>> import pandas as pd 18 | >>> import janitor 19 | >>> s = pd.Series([1, 2, 3], name='series') 20 | >>> s 21 | 0 1 22 | 1 2 23 | 2 3 24 | Name: series, dtype: int64 25 | >>> s.alias('series_new') 26 | 0 1 27 | 1 2 28 | 2 3 29 | Name: series_new, dtype: int64 30 | >>> s.alias(str.upper) 31 | 0 1 32 | 1 2 33 | 2 3 34 | Name: SERIES, dtype: int64 35 | 36 | Args: 37 | series: A pandas Series. 38 | alias: scalar or callable to create a new name for the pandas Series. 39 | 40 | Returns: 41 | A new pandas Series. 42 | """ 43 | series = series[:] 44 | if alias is None: 45 | return series 46 | if callable(alias): 47 | alias = alias(series.name) 48 | series.name = alias 49 | return series 50 | -------------------------------------------------------------------------------- /janitor/functions/also.py: -------------------------------------------------------------------------------- 1 | """Implementation source for chainable function `also`.""" 2 | 3 | from typing import Any, Callable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | 9 | @pf.register_dataframe_method 10 | def also( 11 | df: pd.DataFrame, func: Callable, *args: Any, **kwargs: Any 12 | ) -> pd.DataFrame: 13 | """Run a function with side effects. 14 | 15 | This function allows you to run an arbitrary function 16 | in the `pyjanitor` method chain. 17 | Doing so will let you do things like save the dataframe to disk midway 18 | while continuing to modify the dataframe afterwards. 19 | 20 | Examples: 21 | >>> import pandas as pd 22 | >>> import janitor 23 | >>> df = ( 24 | ... pd.DataFrame({"a": [1, 2, 3], "b": list("abc")}) 25 | ... .query("a > 1") 26 | ... .also(lambda df: print(f"DataFrame shape is: {df.shape}")) 27 | ... .rename_column(old_column_name="a", new_column_name="a_new") 28 | ... .also(lambda df: df.to_csv("midpoint.csv")) 29 | ... .also( 30 | ... lambda df: print(f"Columns: {df.columns}") 31 | ... ) 32 | ... ) 33 | DataFrame shape is: (2, 2) 34 | Columns: Index(['a_new', 'b'], dtype='object') 35 | 36 | Args: 37 | df: A pandas DataFrame. 38 | func: A function you would like to run in the method chain. 39 | It should take one DataFrame object as a parameter and have no return. 40 | If there is a return, it will be ignored. 41 | *args: Optional arguments for `func`. 42 | **kwargs: Optional keyword arguments for `func`. 43 | 44 | Returns: 45 | The input pandas DataFrame, unmodified. 
46 | """ # noqa: E501 47 | func(df.copy(), *args, **kwargs) 48 | return df 49 | -------------------------------------------------------------------------------- /janitor/functions/bin_numeric.py: -------------------------------------------------------------------------------- 1 | """Implementation source for `bin_numeric`.""" 2 | 3 | from typing import Any, Optional, Sequence, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import check, check_column, deprecated_alias 9 | 10 | ScalarSequence = Sequence[float] 11 | 12 | 13 | @pf.register_dataframe_method 14 | @deprecated_alias( 15 | from_column="from_column_name", 16 | to_column="to_column_name", 17 | num_bins="bins", 18 | ) 19 | def bin_numeric( 20 | df: pd.DataFrame, 21 | from_column_name: str, 22 | to_column_name: str, 23 | bins: Optional[Union[int, ScalarSequence, pd.IntervalIndex]] = 5, 24 | **kwargs: Any, 25 | ) -> pd.DataFrame: 26 | """Generate a new column that labels bins for a specified numeric column. 27 | 28 | This method does not mutate the original DataFrame. 29 | 30 | A wrapper around the pandas [`cut()`][pd_cut_docs] function to bin data of 31 | one column, generating a new column with the results. 32 | 33 | [pd_cut_docs]: https://pandas.pydata.org/docs/reference/api/pandas.cut.html 34 | 35 | Examples: 36 | Binning a numeric column with specific bin edges. 37 | 38 | >>> import pandas as pd 39 | >>> import janitor 40 | >>> df = pd.DataFrame({"a": [3, 6, 9, 12, 15]}) 41 | >>> df.bin_numeric( 42 | ... from_column_name="a", to_column_name="a_binned", 43 | ... bins=[0, 5, 11, 15], 44 | ... ) 45 | a a_binned 46 | 0 3 (0, 5] 47 | 1 6 (5, 11] 48 | 2 9 (5, 11] 49 | 3 12 (11, 15] 50 | 4 15 (11, 15] 51 | 52 | Args: 53 | df: A pandas DataFrame. 54 | from_column_name: The column whose data you want binned. 55 | to_column_name: The new column to be created with the binned data. 56 | bins: The binning strategy to be utilized. Read the `pd.cut` 57 | documentation for more details. 58 | **kwargs: Additional kwargs to pass to `pd.cut`, except `retbins`. 59 | 60 | Raises: 61 | ValueError: If `retbins` is passed in as a kwarg. 62 | 63 | Returns: 64 | A pandas DataFrame. 65 | """ 66 | if "retbins" in kwargs: 67 | raise ValueError("`retbins` is not an acceptable keyword argument.") 68 | 69 | check("from_column_name", from_column_name, [str]) 70 | check("to_column_name", to_column_name, [str]) 71 | check_column(df, from_column_name) 72 | 73 | df = df.assign( 74 | **{ 75 | to_column_name: pd.cut(df[from_column_name], bins=bins, **kwargs), 76 | } 77 | ) 78 | 79 | return df 80 | -------------------------------------------------------------------------------- /janitor/functions/concatenate_columns.py: -------------------------------------------------------------------------------- 1 | from typing import Hashable, List 2 | 3 | import pandas as pd 4 | import pandas_flavor as pf 5 | 6 | from janitor.errors import JanitorError 7 | from janitor.utils import deprecated_alias 8 | 9 | 10 | @pf.register_dataframe_method 11 | @deprecated_alias(columns="column_names") 12 | def concatenate_columns( 13 | df: pd.DataFrame, 14 | column_names: List[Hashable], 15 | new_column_name: Hashable, 16 | sep: str = "-", 17 | ignore_empty: bool = True, 18 | ) -> pd.DataFrame: 19 | """Concatenates the set of columns into a single column. 20 | 21 | Used to quickly generate an index based on a group of columns. 22 | 23 | This method mutates the original DataFrame. 24 | 25 | Examples: 26 | Concatenate two columns row-wise. 
27 | 28 | >>> import pandas as pd 29 | >>> import janitor 30 | >>> df = pd.DataFrame({"a": [1, 3, 5], "b": list("xyz")}) 31 | >>> df 32 | a b 33 | 0 1 x 34 | 1 3 y 35 | 2 5 z 36 | >>> df.concatenate_columns( 37 | ... column_names=["a", "b"], new_column_name="m", 38 | ... ) 39 | a b m 40 | 0 1 x 1-x 41 | 1 3 y 3-y 42 | 2 5 z 5-z 43 | 44 | Args: 45 | df: A pandas DataFrame. 46 | column_names: A list of columns to concatenate together. 47 | new_column_name: The name of the new column. 48 | sep: The separator between each column's data. 49 | ignore_empty: Whether to ignore null/empty values if they exist. 50 | 51 | Raises: 52 | JanitorError: If fewer than two columns are provided 53 | within `column_names`. 54 | 55 | Returns: 56 | A pandas DataFrame with concatenated columns. 57 | """ 58 | if len(column_names) < 2: 59 | raise JanitorError("At least two columns must be specified") 60 | 61 | df[new_column_name] = ( 62 | df[column_names].astype(str).fillna("").agg(sep.join, axis=1) 63 | ) 64 | 65 | if ignore_empty: 66 | 67 | def remove_empty_string(x): 68 | """Remove empty/null string values from the concatenated output.""" 69 | return sep.join(x for x in x.split(sep) if x) 70 | 71 | df[new_column_name] = df[new_column_name].transform( 72 | remove_empty_string 73 | ) 74 | 75 | return df 76 | -------------------------------------------------------------------------------- /janitor/functions/drop_constant_columns.py: -------------------------------------------------------------------------------- 1 | """Implementation of drop_constant_columns.""" 2 | 3 | import pandas as pd 4 | import pandas_flavor as pf 5 | 6 | 7 | @pf.register_dataframe_method 8 | def drop_constant_columns(df: pd.DataFrame) -> pd.DataFrame: 9 | """Find and drop the constant columns from a Pandas DataFrame. 10 | 11 | Examples: 12 | >>> import pandas as pd 13 | >>> import janitor 14 | >>> data_dict = { 15 | ... "a": [1, 1, 1], 16 | ... "b": [1, 2, 3], 17 | ... "c": [1, 1, 1], 18 | ... "d": ["rabbit", "leopard", "lion"], 19 | ... "e": ["Cambridge", "Shanghai", "Basel"] 20 | ... } 21 | >>> df = pd.DataFrame(data_dict) 22 | >>> df 23 | a b c d e 24 | 0 1 1 1 rabbit Cambridge 25 | 1 1 2 1 leopard Shanghai 26 | 2 1 3 1 lion Basel 27 | >>> df.drop_constant_columns() 28 | b d e 29 | 0 1 rabbit Cambridge 30 | 1 2 leopard Shanghai 31 | 2 3 lion Basel 32 | 33 | Args: 34 | df: Input Pandas DataFrame. 35 | 36 | Returns: 37 | The Pandas DataFrame with the constant columns dropped. 38 | """ 39 | return df.loc[:, df.nunique().ne(1)] 40 | -------------------------------------------------------------------------------- /janitor/functions/drop_duplicate_columns.py: -------------------------------------------------------------------------------- 1 | """Implementation for `drop_duplicate_columns`.""" 2 | 3 | from typing import Hashable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | 9 | @pf.register_dataframe_method 10 | def drop_duplicate_columns( 11 | df: pd.DataFrame, column_name: Hashable, nth_index: int = 0 12 | ) -> pd.DataFrame: 13 | """Remove a duplicated column specified by `column_name`. 14 | 15 | Specifying `nth_index=0` will remove the first of the duplicated columns, 16 | `nth_index=1` will remove the second, 17 | and so on. 18 | 19 | The corresponding operation in R's tidyverse is: 20 | `select(-_)` 21 | 22 | Examples: 23 | >>> import pandas as pd 24 | >>> import janitor 25 | >>> df = pd.DataFrame({ 26 | ... "a": range(2, 5), 27 | ... "b": range(3, 6), 28 | ... "A": range(4, 7), 29 | ... "a*": range(6, 9), 30 | ...
}).clean_names(remove_special=True) 31 | >>> df 32 | a b a a 33 | 0 2 3 4 6 34 | 1 3 4 5 7 35 | 2 4 5 6 8 36 | >>> df.drop_duplicate_columns(column_name="a", nth_index=1) 37 | a b a 38 | 0 2 3 6 39 | 1 3 4 7 40 | 2 4 5 8 41 | 42 | Args: 43 | df: A pandas DataFrame 44 | column_name: Name of duplicated columns. 45 | nth_index: Among the duplicated columns, 46 | select the nth column to drop. 47 | 48 | Returns: 49 | A pandas DataFrame 50 | """ 51 | col_indexes = [ 52 | col_idx 53 | for col_idx, col_name in enumerate(df.columns) 54 | if col_name == column_name 55 | ] 56 | 57 | # Select the column to remove based on nth_index. 58 | removed_col_idx = col_indexes[nth_index] 59 | # Filter out columns except for the one to be removed. 60 | filtered_cols = [ 61 | c_i for c_i, _ in enumerate(df.columns) if c_i != removed_col_idx 62 | ] 63 | 64 | return df.iloc[:, filtered_cols] 65 | -------------------------------------------------------------------------------- /janitor/functions/dropnotnull.py: -------------------------------------------------------------------------------- 1 | """Implementation source for `dropnotnull`.""" 2 | 3 | from typing import Hashable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import deprecated_alias 9 | 10 | 11 | @pf.register_dataframe_method 12 | @deprecated_alias(column="column_name") 13 | def dropnotnull(df: pd.DataFrame, column_name: Hashable) -> pd.DataFrame: 14 | """Drop rows that do *not* have null values in the given column. 15 | 16 | This method does not mutate the original DataFrame. 17 | 18 | Examples: 19 | >>> import numpy as np 20 | >>> import pandas as pd 21 | >>> import janitor 22 | >>> df = pd.DataFrame({"a": [1., np.NaN, 3.], "b": [None, "y", "z"]}) 23 | >>> df 24 | a b 25 | 0 1.0 None 26 | 1 NaN y 27 | 2 3.0 z 28 | >>> df.dropnotnull("a") 29 | a b 30 | 1 NaN y 31 | >>> df.dropnotnull("b") 32 | a b 33 | 0 1.0 None 34 | 35 | Args: 36 | df: A pandas DataFrame. 37 | column_name: The column name to drop rows from. 38 | 39 | Returns: 40 | A pandas DataFrame with dropped rows. 41 | """ 42 | return df[pd.isna(df[column_name])] 43 | -------------------------------------------------------------------------------- /janitor/functions/expand_column.py: -------------------------------------------------------------------------------- 1 | """Implementation for expand_column.""" 2 | 3 | from typing import Hashable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import deprecated_alias 9 | 10 | 11 | @pf.register_dataframe_method 12 | @deprecated_alias(column="column_name") 13 | def expand_column( 14 | df: pd.DataFrame, 15 | column_name: Hashable, 16 | sep: str = "|", 17 | concat: bool = True, 18 | ) -> pd.DataFrame: 19 | """Expand a categorical column with multiple labels into dummy-coded columns. 20 | 21 | Super sugary syntax that wraps `pandas.Series.str.get_dummies`. 22 | 23 | This method does not mutate the original DataFrame. 24 | 25 | Examples: 26 | Functional usage syntax: 27 | 28 | >>> import pandas as pd 29 | >>> df = pd.DataFrame( 30 | ... { 31 | ... "col1": ["A, B", "B, C, D", "E, F", "A, E, F"], 32 | ... "col2": [1, 2, 3, 4], 33 | ... } 34 | ... ) 35 | >>> df = expand_column( 36 | ... df, 37 | ... column_name="col1", 38 | ... sep=", " # note space in sep 39 | ... 
) 40 | >>> df 41 | col1 col2 A B C D E F 42 | 0 A, B 1 1 1 0 0 0 0 43 | 1 B, C, D 2 0 1 1 1 0 0 44 | 2 E, F 3 0 0 0 0 1 1 45 | 3 A, E, F 4 1 0 0 0 1 1 46 | 47 | Method chaining syntax: 48 | 49 | >>> import pandas as pd 50 | >>> import janitor 51 | >>> df = ( 52 | ... pd.DataFrame( 53 | ... { 54 | ... "col1": ["A, B", "B, C, D", "E, F", "A, E, F"], 55 | ... "col2": [1, 2, 3, 4], 56 | ... } 57 | ... ) 58 | ... .expand_column( 59 | ... column_name='col1', 60 | ... sep=', ' 61 | ... ) 62 | ... ) 63 | >>> df 64 | col1 col2 A B C D E F 65 | 0 A, B 1 1 1 0 0 0 0 66 | 1 B, C, D 2 0 1 1 1 0 0 67 | 2 E, F 3 0 0 0 0 1 1 68 | 3 A, E, F 4 1 0 0 0 1 1 69 | 70 | Args: 71 | df: A pandas DataFrame. 72 | column_name: Which column to expand. 73 | sep: The delimiter, same as 74 | `pandas.Series.str.get_dummies`'s `sep`. 75 | concat: Whether to return the expanded column concatenated to 76 | the original dataframe (`concat=True`), or to return it standalone 77 | (`concat=False`). 78 | 79 | Returns: 80 | A pandas DataFrame with an expanded column. 81 | """ # noqa: E501 82 | expanded_df = df[column_name].str.get_dummies(sep=sep) 83 | if concat: 84 | return df.join(expanded_df) 85 | return expanded_df 86 | -------------------------------------------------------------------------------- /janitor/functions/factorize_columns.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `factorize_columns` function""" 2 | 3 | from typing import Any, Hashable, Iterable, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.functions.utils import _factorize 9 | 10 | 11 | @pf.register_dataframe_method 12 | def factorize_columns( 13 | df: pd.DataFrame, 14 | column_names: Union[str, Iterable[str], Hashable], 15 | suffix: str = "_enc", 16 | **kwargs: Any, 17 | ) -> pd.DataFrame: 18 | """Convert labels into numerical data. 19 | 20 | This method will create a new column with the string `_enc` appended 21 | after the original column's name. 22 | This can be overridden with the `suffix` parameter. 23 | 24 | Internally, this method uses pandas' `factorize` method. 25 | It also accepts an optional suffix and keyword arguments. 26 | An empty string as the suffix will overwrite the existing column. 27 | 28 | This method does not mutate the original DataFrame. 29 | 30 | Examples: 31 | >>> import pandas as pd 32 | >>> import janitor 33 | >>> df = pd.DataFrame({ 34 | ... "foo": ["b", "b", "a", "c", "b"], 35 | ... "bar": range(4, 9), 36 | ... }) 37 | >>> df 38 | foo bar 39 | 0 b 4 40 | 1 b 5 41 | 2 a 6 42 | 3 c 7 43 | 4 b 8 44 | >>> df.factorize_columns(column_names="foo") 45 | foo bar foo_enc 46 | 0 b 4 0 47 | 1 b 5 0 48 | 2 a 6 1 49 | 3 c 7 2 50 | 4 b 8 0 51 | 52 | Args: 53 | df: The pandas DataFrame object. 54 | column_names: A column name or an iterable (list or tuple) of 55 | column names. 56 | suffix: Suffix to be used for the new column. 57 | An empty string suffix means the existing column will be overwritten. 58 | **kwargs: Keyword arguments. Accepts any of the keyword arguments 59 | that the pandas `factorize` method takes, such as `sort`, 60 | `na_sentinel`, or `size_hint`. 61 | 62 | Returns: 63 | A pandas DataFrame.
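A sketch of the suffix and keyword-argument behaviour described above. The `_factorize` helper is not shown here, so its exact internals are assumed; the `sort` keyword is a standard `pd.factorize` argument that is simply forwarded.

```python
import pandas as pd
import janitor  # noqa: F401

df = pd.DataFrame({"foo": ["b", "b", "a", "c", "b"]})

# An empty-string suffix overwrites the original column (on the returned copy).
print(df.factorize_columns(column_names="foo", suffix=""))

# Extra kwargs are forwarded to pd.factorize, e.g. to sort codes by label.
print(df.factorize_columns(column_names="foo", sort=True))
```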
64 | """ 65 | df = _factorize(df.copy(), column_names, suffix, **kwargs) 66 | return df 67 | -------------------------------------------------------------------------------- /janitor/functions/get_dupes.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `get_dupes` function""" 2 | 3 | from typing import Hashable, Iterable, Optional, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import deprecated_alias 9 | 10 | 11 | @pf.register_dataframe_method 12 | @deprecated_alias(columns="column_names") 13 | def get_dupes( 14 | df: pd.DataFrame, 15 | column_names: Optional[Union[str, Iterable[str], Hashable]] = None, 16 | ) -> pd.DataFrame: 17 | """ 18 | Return all duplicate rows. 19 | 20 | This method does not mutate the original DataFrame. 21 | 22 | Examples: 23 | Method chaining syntax: 24 | 25 | >>> import pandas as pd 26 | >>> import janitor 27 | >>> df = pd.DataFrame({ 28 | ... "item": ["shoe", "shoe", "bag", "shoe", "bag"], 29 | ... "quantity": [100, 100, 75, 200, 75], 30 | ... }) 31 | >>> df 32 | item quantity 33 | 0 shoe 100 34 | 1 shoe 100 35 | 2 bag 75 36 | 3 shoe 200 37 | 4 bag 75 38 | >>> df.get_dupes() 39 | item quantity 40 | 0 shoe 100 41 | 1 shoe 100 42 | 2 bag 75 43 | 4 bag 75 44 | 45 | Optional `column_names` usage: 46 | 47 | >>> import pandas as pd 48 | >>> import janitor 49 | >>> df = pd.DataFrame({ 50 | ... "item": ["shoe", "shoe", "bag", "shoe", "bag"], 51 | ... "quantity": [100, 100, 75, 200, 75], 52 | ... }) 53 | >>> df 54 | item quantity 55 | 0 shoe 100 56 | 1 shoe 100 57 | 2 bag 75 58 | 3 shoe 200 59 | 4 bag 75 60 | >>> df.get_dupes(column_names=["item"]) 61 | item quantity 62 | 0 shoe 100 63 | 1 shoe 100 64 | 2 bag 75 65 | 3 shoe 200 66 | 4 bag 75 67 | >>> df.get_dupes(column_names=["quantity"]) 68 | item quantity 69 | 0 shoe 100 70 | 1 shoe 100 71 | 2 bag 75 72 | 4 bag 75 73 | 74 | Args: 75 | df: The pandas DataFrame object. 76 | column_names: A column name or an iterable 77 | (list or tuple) of column names. Following pandas API, this only 78 | considers certain columns for identifying duplicates. Defaults 79 | to using all columns. 80 | 81 | Returns: 82 | The duplicate rows, as a pandas DataFrame. 83 | """ 84 | return df.loc[df.duplicated(subset=column_names, keep=False)] 85 | -------------------------------------------------------------------------------- /janitor/functions/join_apply.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `join_apply` function""" 2 | 3 | from typing import Callable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | 9 | @pf.register_dataframe_method 10 | def join_apply( 11 | df: pd.DataFrame, 12 | func: Callable, 13 | new_column_name: str, 14 | ) -> pd.DataFrame: 15 | """Join the result of applying a function across dataframe rows. 16 | 17 | This method does not mutate the original DataFrame. 18 | 19 | This is a convenience function that allows us to apply arbitrary functions 20 | that take any combination of information from any of the columns. The only 21 | requirement is that the function signature takes in a row from the 22 | DataFrame. 23 | 24 | Examples: 25 | Sum the result of two columns into a new column. 26 | 27 | >>> import pandas as pd 28 | >>> import janitor 29 | >>> df = pd.DataFrame({"a":[1, 2, 3], "b": [2, 3, 4]}) 30 | >>> df 31 | a b 32 | 0 1 2 33 | 1 2 3 34 | 2 3 4 35 | >>> df.join_apply( 36 | ... func=lambda x: 2 * x["a"] + x["b"], 37 | ... 
new_column_name="2a+b", 38 | ... ) 39 | a b 2a+b 40 | 0 1 2 4 41 | 1 2 3 7 42 | 2 3 4 10 43 | 44 | Incorporating conditionals in `func`. 45 | 46 | >>> import pandas as pd 47 | >>> import janitor 48 | >>> df = pd.DataFrame({"a": [1, 2, 3], "b": [20, 30, 40]}) 49 | >>> df 50 | a b 51 | 0 1 20 52 | 1 2 30 53 | 2 3 40 54 | >>> def take_a_if_even(x): 55 | ... if x["a"] % 2 == 0: 56 | ... return x["a"] 57 | ... else: 58 | ... return x["b"] 59 | >>> df.join_apply(take_a_if_even, "a_if_even") 60 | a b a_if_even 61 | 0 1 20 20 62 | 1 2 30 2 63 | 2 3 40 40 64 | 65 | Args: 66 | df: A pandas DataFrame. 67 | func: A function that is applied elementwise across all rows of the 68 | DataFrame. 69 | new_column_name: Name of the resulting column. 70 | 71 | Returns: 72 | A pandas DataFrame with new column appended. 73 | """ # noqa: E501 74 | df = df.copy().join(df.apply(func, axis=1).rename(new_column_name)) 75 | return df 76 | -------------------------------------------------------------------------------- /janitor/functions/label_encode.py: -------------------------------------------------------------------------------- 1 | """Implementation of `label_encode` function""" 2 | 3 | import warnings 4 | from typing import Hashable, Iterable, Union 5 | 6 | import pandas as pd 7 | import pandas_flavor as pf 8 | 9 | from janitor.functions.utils import _factorize 10 | from janitor.utils import deprecated_alias, refactored_function 11 | 12 | 13 | @pf.register_dataframe_method 14 | @refactored_function( 15 | message=( 16 | "This function will be deprecated in a 1.x release. " 17 | "Please use `janitor.factorize_columns` instead." 18 | ) 19 | ) 20 | @deprecated_alias(columns="column_names") 21 | def label_encode( 22 | df: pd.DataFrame, 23 | column_names: Union[str, Iterable[str], Hashable], 24 | ) -> pd.DataFrame: 25 | """Convert labels into numerical data. 26 | 27 | This method will create a new column with the string `_enc` appended 28 | after the original column's name. 29 | Consider this to be syntactic sugar. 30 | This function uses the `factorize` pandas function under the hood. 31 | 32 | This method behaves differently from 33 | [`encode_categorical`][janitor.functions.encode_categorical.encode_categorical]. 34 | This method creates a new column of numeric data. 35 | [`encode_categorical`][janitor.functions.encode_categorical.encode_categorical] 36 | replaces the dtype of the original column with a *categorical* dtype. 37 | 38 | This method mutates the original DataFrame. 39 | 40 | !!!note 41 | 42 | This function will be deprecated in a 1.x release. 43 | Please use [`factorize_columns`][janitor.functions.factorize_columns.factorize_columns] 44 | instead. 45 | 46 | Examples: 47 | >>> import pandas as pd 48 | >>> import janitor 49 | >>> df = pd.DataFrame({ 50 | ... "foo": ["b", "b", "a", "c", "b"], 51 | ... "bar": range(4, 9), 52 | ... }) 53 | >>> df 54 | foo bar 55 | 0 b 4 56 | 1 b 5 57 | 2 a 6 58 | 3 c 7 59 | 4 b 8 60 | >>> df.label_encode(column_names="foo") 61 | foo bar foo_enc 62 | 0 b 4 0 63 | 1 b 5 0 64 | 2 a 6 1 65 | 3 c 7 2 66 | 4 b 8 0 67 | 68 | Args: 69 | df: The pandas DataFrame object. 70 | column_names: A column name or an iterable (list 71 | or tuple) of column names. 72 | 73 | Returns: 74 | A pandas DataFrame. 75 | """ # noqa: E501 76 | warnings.warn( 77 | "`label_encode` will be deprecated in a 1.x release. " 78 | "Please use `factorize_columns` instead." 
79 | ) 80 | df = _factorize(df, column_names, "_enc") 81 | return df 82 | -------------------------------------------------------------------------------- /janitor/functions/remove_columns.py: -------------------------------------------------------------------------------- 1 | """Implementation of remove_columns.""" 2 | 3 | from typing import Hashable, Iterable, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import deprecated_alias, refactored_function 9 | 10 | 11 | @pf.register_dataframe_method 12 | @refactored_function( 13 | message=( 14 | "This function will be deprecated in a 1.x release. " 15 | "Please use `pd.DataFrame.drop` instead." 16 | ) 17 | ) 18 | @deprecated_alias(columns="column_names") 19 | def remove_columns( 20 | df: pd.DataFrame, 21 | column_names: Union[str, Iterable[str], Hashable], 22 | ) -> pd.DataFrame: 23 | """Remove the set of columns specified in `column_names`. 24 | 25 | This method does not mutate the original DataFrame. 26 | 27 | Intended to be the method-chaining alternative to `del df[col]`. 28 | 29 | !!!note 30 | 31 | This function will be deprecated in a 1.x release. 32 | Kindly use `pd.DataFrame.drop` instead. 33 | 34 | Examples: 35 | >>> import pandas as pd 36 | >>> import janitor 37 | >>> df = pd.DataFrame({"a": [2, 4, 6], "b": [1, 3, 5], "c": [7, 8, 9]}) 38 | >>> df 39 | a b c 40 | 0 2 1 7 41 | 1 4 3 8 42 | 2 6 5 9 43 | >>> df.remove_columns(column_names=['a', 'c']) 44 | b 45 | 0 1 46 | 1 3 47 | 2 5 48 | 49 | Args: 50 | df: A pandas DataFrame. 51 | column_names: The columns to remove. 52 | 53 | Returns: 54 | A pandas DataFrame. 55 | """ 56 | 57 | return df.drop(columns=column_names) 58 | -------------------------------------------------------------------------------- /janitor/functions/remove_empty.py: -------------------------------------------------------------------------------- 1 | """Implementation of remove_empty.""" 2 | 3 | import pandas as pd 4 | import pandas_flavor as pf 5 | 6 | 7 | @pf.register_dataframe_method 8 | def remove_empty(df: pd.DataFrame, reset_index: bool = True) -> pd.DataFrame: 9 | """Drop all rows and columns that are completely null. 10 | 11 | This method does not mutate the original DataFrame. 12 | 13 | Implementation is inspired from [StackOverflow][so]. 14 | 15 | [so]: https://stackoverflow.com/questions/38884538/python-pandas-find-all-rows-where-all-values-are-nan 16 | 17 | Examples: 18 | >>> import numpy as np 19 | >>> import pandas as pd 20 | >>> import janitor 21 | >>> df = pd.DataFrame({ 22 | ... "a": [1, np.nan, 2], 23 | ... "b": [3, np.nan, 4], 24 | ... "c": [np.nan, np.nan, np.nan], 25 | ... }) 26 | >>> df 27 | a b c 28 | 0 1.0 3.0 NaN 29 | 1 NaN NaN NaN 30 | 2 2.0 4.0 NaN 31 | >>> df.remove_empty() 32 | a b 33 | 0 1.0 3.0 34 | 1 2.0 4.0 35 | 36 | Args: 37 | df: The pandas DataFrame object. 38 | reset_index: Determines if the index is reset. 39 | 40 | Returns: 41 | A pandas DataFrame. 
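A short sketch of the `reset_index` flag, following the implementation below: with `reset_index=False`, the surviving rows keep their original index labels.

```python
import numpy as np
import pandas as pd
import janitor  # noqa: F401

df = pd.DataFrame({"a": [1, np.nan, 2], "b": [3, np.nan, 4]})

print(df.remove_empty())                   # index: 0, 1
print(df.remove_empty(reset_index=False))  # index: 0, 2
```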
42 | """ # noqa: E501 43 | outcome = df.isna() 44 | outcome = df.loc[~outcome.all(axis=1), ~outcome.all(axis=0)] 45 | if reset_index: 46 | return outcome.reset_index(drop=True) 47 | return outcome 48 | -------------------------------------------------------------------------------- /janitor/functions/reorder_columns.py: -------------------------------------------------------------------------------- 1 | """Implementation source for `reorder_columns`.""" 2 | 3 | from typing import Hashable, Iterable, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import check 9 | 10 | 11 | @pf.register_dataframe_method 12 | def reorder_columns( 13 | df: pd.DataFrame, column_order: Union[Iterable[str], pd.Index, Hashable] 14 | ) -> pd.DataFrame: 15 | """Reorder DataFrame columns by specifying desired order as list of col names. 16 | 17 | Columns not specified retain their order and follow after the columns specified 18 | in `column_order`. 19 | 20 | All columns specified within the `column_order` list must be present within `df`. 21 | 22 | This method does not mutate the original DataFrame. 23 | 24 | Examples: 25 | >>> import pandas as pd 26 | >>> import janitor 27 | >>> df = pd.DataFrame({"col1": [1, 1, 1], "col2": [2, 2, 2], "col3": [3, 3, 3]}) 28 | >>> df 29 | col1 col2 col3 30 | 0 1 2 3 31 | 1 1 2 3 32 | 2 1 2 3 33 | >>> df.reorder_columns(['col3', 'col1']) 34 | col3 col1 col2 35 | 0 3 1 2 36 | 1 3 1 2 37 | 2 3 1 2 38 | 39 | Notice that the column order of `df` is now `col3`, `col1`, `col2`. 40 | 41 | Internally, this function uses `DataFrame.reindex` with `copy=False` 42 | to avoid unnecessary data duplication. 43 | 44 | Args: 45 | df: `DataFrame` to reorder 46 | column_order: A list of column names or Pandas `Index` 47 | specifying their order in the returned `DataFrame`. 48 | 49 | Raises: 50 | IndexError: If a column within `column_order` is not found 51 | within the DataFrame. 52 | 53 | Returns: 54 | A pandas DataFrame with reordered columns. 55 | """ # noqa: E501 56 | check("column_order", column_order, [list, tuple, pd.Index]) 57 | 58 | if any(col not in df.columns for col in column_order): 59 | raise IndexError( 60 | "One or more columns in `column_order` were not found in the " 61 | "DataFrame." 62 | ) 63 | 64 | # if column_order is a Pandas index, needs conversion to list: 65 | column_order = list(column_order) 66 | 67 | return df.reindex( 68 | columns=( 69 | column_order 70 | + [col for col in df.columns if col not in column_order] 71 | ), 72 | copy=False, 73 | ) 74 | -------------------------------------------------------------------------------- /janitor/functions/round_to_fraction.py: -------------------------------------------------------------------------------- 1 | """Implementation of `round_to_fraction`""" 2 | 3 | from typing import Hashable 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pandas_flavor as pf 8 | 9 | from janitor.utils import check, check_column, deprecated_alias 10 | 11 | 12 | @pf.register_dataframe_method 13 | @deprecated_alias(col_name="column_name") 14 | def round_to_fraction( 15 | df: pd.DataFrame, 16 | column_name: Hashable, 17 | denominator: float, 18 | digits: float = np.inf, 19 | ) -> pd.DataFrame: 20 | """Round all values in a column to a fraction. 21 | 22 | This method mutates the original DataFrame. 23 | 24 | Taken from [the R package](https://github.com/sfirke/janitor/issues/235). 25 | 26 | Also, optionally round to a specified number of digits. 
27 | 28 | Examples: 29 | Round numeric column to the nearest 1/4 value. 30 | 31 | >>> import numpy as np 32 | >>> import pandas as pd 33 | >>> import janitor 34 | >>> df = pd.DataFrame({ 35 | ... "a1": [1.263, 2.499, np.nan], 36 | ... "a2": ["x", "y", "z"], 37 | ... }) 38 | >>> df 39 | a1 a2 40 | 0 1.263 x 41 | 1 2.499 y 42 | 2 NaN z 43 | >>> df.round_to_fraction("a1", denominator=4) 44 | a1 a2 45 | 0 1.25 x 46 | 1 2.50 y 47 | 2 NaN z 48 | 49 | Args: 50 | df: A pandas DataFrame. 51 | column_name: Name of column to round to fraction. 52 | denominator: The denominator of the fraction for rounding. Must be 53 | a positive number. 54 | digits: The number of digits for rounding after rounding to the 55 | fraction. Default is np.inf (i.e. no subsequent rounding). 56 | 57 | Raises: 58 | ValueError: If `denominator` is not a positive number. 59 | 60 | Returns: 61 | A pandas DataFrame with a column's values rounded. 62 | """ 63 | check_column(df, column_name) 64 | check("denominator", denominator, [float, int]) 65 | check("digits", digits, [float, int]) 66 | 67 | if denominator <= 0: 68 | raise ValueError("denominator is expected to be a positive number.") 69 | 70 | df[column_name] = round(df[column_name] * denominator, 0) / denominator 71 | if not np.isinf(digits): 72 | df[column_name] = round(df[column_name], digits) 73 | 74 | return df 75 | -------------------------------------------------------------------------------- /janitor/functions/shuffle.py: -------------------------------------------------------------------------------- 1 | """Implementation of `shuffle` functions.""" 2 | 3 | from typing import Any 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | 9 | @pf.register_dataframe_method 10 | def shuffle( 11 | df: pd.DataFrame, random_state: Any = None, reset_index: bool = True 12 | ) -> pd.DataFrame: 13 | """Shuffle the rows of the DataFrame. 14 | 15 | This method does not mutate the original DataFrame. 16 | 17 | Super-sugary syntax! Underneath the hood, we use `df.sample(frac=1)`, 18 | with the option to set the random state. 19 | 20 | Examples: 21 | >>> import pandas as pd 22 | >>> import janitor 23 | >>> df = pd.DataFrame({ 24 | ... "col1": range(5), 25 | ... "col2": list("abcde"), 26 | ... }) 27 | >>> df 28 | col1 col2 29 | 0 0 a 30 | 1 1 b 31 | 2 2 c 32 | 3 3 d 33 | 4 4 e 34 | >>> df.shuffle(random_state=42) 35 | col1 col2 36 | 0 1 b 37 | 1 4 e 38 | 2 2 c 39 | 3 0 a 40 | 4 3 d 41 | 42 | Args: 43 | df: A pandas DataFrame. 44 | random_state: If provided, set a seed for the random number 45 | generator. Passed to `pd.DataFrame.sample()`. 46 | reset_index: If True, reset the dataframe index to the default 47 | RangeIndex. 48 | 49 | Returns: 50 | A shuffled pandas DataFrame. 51 | """ 52 | result = df.sample(frac=1, random_state=random_state) 53 | if reset_index: 54 | result = result.reset_index(drop=True) 55 | return result 56 | -------------------------------------------------------------------------------- /janitor/functions/sort_naturally.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `sort_naturally` function.""" 2 | 3 | from typing import Any 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | from natsort import index_natsorted 8 | 9 | 10 | @pf.register_dataframe_method 11 | def sort_naturally( 12 | df: pd.DataFrame, column_name: str, **natsorted_kwargs: Any 13 | ) -> pd.DataFrame: 14 | """Sort a DataFrame by a column using *natural* sorting. 
15 | 16 | Natural sorting is distinct from 17 | the default lexicographical sorting provided by `pandas`. 18 | For example, given the following list of items: 19 | 20 | ```python 21 | ["A1", "A11", "A3", "A2", "A10"] 22 | ``` 23 | 24 | Lexicographical sorting would give us: 25 | 26 | ```python 27 | ["A1", "A10", "A11", "A2", "A3"] 28 | ``` 29 | 30 | By contrast, "natural" sorting would give us: 31 | 32 | ```python 33 | ["A1", "A2", "A3", "A10", "A11"] 34 | ``` 35 | 36 | This function thus provides *natural* sorting 37 | on a single column of a dataframe. 38 | 39 | To accomplish this, we compute 40 | the naturally sorted order of the column's values, 41 | and then reindex the entire dataframe 42 | in that naturally sorted order. 43 | 44 | Natural sorting is provided by the Python package 45 | [natsort](https://natsort.readthedocs.io/en/master/index.html). 46 | 47 | All keyword arguments for `natsort` should be provided 48 | after the column name to sort by. 49 | They are passed through to natsort's `index_natsorted` function. 50 | 51 | Examples: 52 | >>> import pandas as pd 53 | >>> import janitor 54 | >>> df = pd.DataFrame( 55 | ... { 56 | ... "Well": ["A21", "A3", "A21", "B2", "B51", "B12"], 57 | ... "Value": [1, 2, 13, 3, 4, 7], 58 | ... } 59 | ... ) 60 | >>> df 61 | Well Value 62 | 0 A21 1 63 | 1 A3 2 64 | 2 A21 13 65 | 3 B2 3 66 | 4 B51 4 67 | 5 B12 7 68 | >>> df.sort_naturally("Well") 69 | Well Value 70 | 1 A3 2 71 | 0 A21 1 72 | 2 A21 13 73 | 3 B2 3 74 | 5 B12 7 75 | 4 B51 4 76 | 77 | Args: 78 | df: A pandas DataFrame. 79 | column_name: The column on which natural sorting should take place. 80 | **natsorted_kwargs: Keyword arguments to be passed 81 | to natsort's `index_natsorted` function. 82 | 83 | Returns: 84 | A sorted pandas DataFrame. 85 | """ 86 | new_order = index_natsorted(df[column_name], **natsorted_kwargs) 87 | return df.iloc[new_order, :] 88 | -------------------------------------------------------------------------------- /janitor/functions/take_first.py: -------------------------------------------------------------------------------- 1 | """Implementation of take_first function.""" 2 | 3 | from typing import Hashable, Iterable, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | 9 | @pf.register_dataframe_method 10 | def take_first( 11 | df: pd.DataFrame, 12 | subset: Union[Hashable, Iterable[Hashable]], 13 | by: Hashable, 14 | ascending: bool = True, 15 | ) -> pd.DataFrame: 16 | """Take the first row within each group specified by `subset`. 17 | 18 | Examples: 19 | >>> import pandas as pd 20 | >>> import janitor 21 | >>> df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [0, 1, 2, 3]}) 22 | >>> df 23 | a b 24 | 0 x 0 25 | 1 x 1 26 | 2 y 2 27 | 3 y 3 28 | >>> df.take_first(subset="a", by="b") 29 | a b 30 | 0 x 0 31 | 2 y 2 32 | 33 | Args: 34 | df: A pandas DataFrame. 35 | subset: Column(s) defining the group. 36 | by: Column to sort by. 37 | ascending: Whether to sort in ascending order. 38 | 39 | Returns: 40 | A pandas DataFrame.
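Because `take_first` sorts before dropping duplicates (see the implementation that follows), passing `ascending=False` keeps the row with the *largest* `by` value in each group. A minimal sketch:

```python
import pandas as pd
import janitor  # noqa: F401

df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [0, 1, 2, 3]})
print(df.take_first(subset="a", by="b", ascending=False))
#    a  b
# 3  y  3
# 1  x  1
```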
41 | """ 42 | result = df.sort_values(by=by, ascending=ascending).drop_duplicates( 43 | subset=subset, keep="first" 44 | ) 45 | 46 | return result 47 | -------------------------------------------------------------------------------- /janitor/functions/then.py: -------------------------------------------------------------------------------- 1 | """Implementation source for `then`.""" 2 | 3 | from typing import Callable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import refactored_function 9 | 10 | 11 | @pf.register_dataframe_method 12 | @refactored_function( 13 | message="This function will be deprecated in a 1.x release. " 14 | "Kindly use `pd.DataFrame.pipe` instead." 15 | ) 16 | def then(df: pd.DataFrame, func: Callable) -> pd.DataFrame: 17 | """Add an arbitrary function to run in the `pyjanitor` method chain. 18 | 19 | This method does not mutate the original DataFrame. 20 | 21 | !!!note 22 | 23 | This function will be deprecated in a 1.x release. 24 | Please use `pd.DataFrame.pipe` instead. 25 | 26 | Examples: 27 | A trivial example using a lambda `func`. 28 | 29 | >>> import pandas as pd 30 | >>> import janitor 31 | >>> (pd.DataFrame({"a": [1, 2, 3], "b": [7, 8, 9]}) 32 | ... .then(lambda df: df * 2)) 33 | a b 34 | 0 2 14 35 | 1 4 16 36 | 2 6 18 37 | 38 | Args: 39 | df: A pandas DataFrame. 40 | func: A function you would like to run in the method chain. 41 | It should take one parameter and return one parameter, each being 42 | the DataFrame object. After that, do whatever you want in the 43 | middle. Go crazy. 44 | 45 | Returns: 46 | A pandas DataFrame. 47 | """ 48 | df = func(df) 49 | return df 50 | -------------------------------------------------------------------------------- /janitor/functions/to_datetime.py: -------------------------------------------------------------------------------- 1 | """Implementation source for `to_datetime`.""" 2 | 3 | from typing import Any, Hashable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import deprecated_alias, refactored_function 9 | 10 | 11 | @pf.register_dataframe_method 12 | @deprecated_alias(column="column_name") 13 | @refactored_function( 14 | message=( 15 | "This function will be deprecated in a 1.x release. " 16 | "Please use `jn.transform_columns` instead." 17 | ) 18 | ) 19 | def to_datetime( 20 | df: pd.DataFrame, column_name: Hashable, **kwargs: Any 21 | ) -> pd.DataFrame: 22 | """Convert column to a datetime type, in-place. 23 | 24 | Intended to be the method-chaining equivalent of: 25 | 26 | ```python 27 | df[column_name] = pd.to_datetime(df[column_name], **kwargs) 28 | ``` 29 | 30 | This method mutates the original DataFrame. 31 | 32 | !!!note 33 | 34 | This function will be deprecated in a 1.x release. 35 | Please use [`jn.transform_column`][janitor.functions.transform_columns.transform_column] 36 | instead. 37 | 38 | Examples: 39 | Converting a string column to datetime type with custom format. 40 | 41 | >>> import pandas as pd 42 | >>> import janitor 43 | >>> df = pd.DataFrame({'date': ['20200101', '20200202', '20200303']}) 44 | >>> df 45 | date 46 | 0 20200101 47 | 1 20200202 48 | 2 20200303 49 | >>> df.to_datetime('date', format='%Y%m%d') 50 | date 51 | 0 2020-01-01 52 | 1 2020-02-02 53 | 2 2020-03-03 54 | 55 | Read the pandas documentation for [`to_datetime`][pd_docs] for more information. 56 | 57 | [pd_docs]: https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html 58 | 59 | Args: 60 | df: A pandas DataFrame. 
61 | column_name: Column name. 62 | **kwargs: Provide any kwargs that `pd.to_datetime` can take. 63 | 64 | Returns: 65 | A pandas DataFrame with updated datetime data. 66 | """ # noqa: E501 67 | df[column_name] = pd.to_datetime(df[column_name], **kwargs) 68 | 69 | return df 70 | -------------------------------------------------------------------------------- /janitor/functions/toset.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `toset` function.""" 2 | 3 | from typing import Set 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from janitor.utils import refactored_function 9 | 10 | 11 | @pf.register_series_method 12 | @refactored_function( 13 | message=( 14 | "This function will be deprecated in a 1.x release. " 15 | "Please use `set(df[column])` instead." 16 | ) 17 | ) 18 | def toset(series: pd.Series) -> Set: 19 | """Return a set of the values. 20 | 21 | !!!note 22 | 23 | This function will be deprecated in a 1.x release. 24 | Please use `set(df[column])` instead. 25 | 26 | These are each a scalar type, which is a Python scalar 27 | (for str, int, float) or a pandas scalar 28 | (for Timestamp/Timedelta/Interval/Period) 29 | 30 | Examples: 31 | >>> import pandas as pd 32 | >>> import janitor 33 | >>> s = pd.Series([1, 2, 3, 5, 5], index=["a", "b", "c", "d", "e"]) 34 | >>> s 35 | a 1 36 | b 2 37 | c 3 38 | d 5 39 | e 5 40 | dtype: int64 41 | >>> s.toset() 42 | {1, 2, 3, 5} 43 | 44 | Args: 45 | series: A pandas series. 46 | 47 | Returns: 48 | A set of values. 49 | """ 50 | 51 | return set(series.tolist()) 52 | -------------------------------------------------------------------------------- /janitor/functions/truncate_datetime.py: -------------------------------------------------------------------------------- 1 | """Implementation of the `truncate_datetime` family of functions.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pandas_flavor as pf 6 | from pandas.api.types import is_datetime64_any_dtype 7 | 8 | 9 | @pf.register_dataframe_method 10 | def truncate_datetime_dataframe( 11 | df: pd.DataFrame, 12 | datepart: str, 13 | ) -> pd.DataFrame: 14 | """Truncate times down to a user-specified precision of 15 | year, month, day, hour, minute, or second. 16 | 17 | This method does not mutate the original DataFrame. 18 | 19 | Examples: 20 | >>> import pandas as pd 21 | >>> import janitor 22 | >>> df = pd.DataFrame({ 23 | ... "foo": ["xxxx", "yyyy", "zzzz"], 24 | ... "dt": pd.date_range("2020-03-11", periods=3, freq="15H"), 25 | ... }) 26 | >>> df 27 | foo dt 28 | 0 xxxx 2020-03-11 00:00:00 29 | 1 yyyy 2020-03-11 15:00:00 30 | 2 zzzz 2020-03-12 06:00:00 31 | >>> df.truncate_datetime_dataframe("day") 32 | foo dt 33 | 0 xxxx 2020-03-11 34 | 1 yyyy 2020-03-11 35 | 2 zzzz 2020-03-12 36 | 37 | Args: 38 | df: The pandas DataFrame on which to truncate datetime. 39 | datepart: Truncation precision, YEAR, MONTH, DAY, 40 | HOUR, MINUTE, SECOND. (String is automagically 41 | capitalized) 42 | 43 | Raises: 44 | ValueError: If an invalid `datepart` precision is passed in. 45 | 46 | Returns: 47 | A pandas DataFrame with all valid datetimes truncated down 48 | to the specified precision. 
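The core trick in the implementation below is that casting `datetime64` values to a coarser NumPy unit drops everything finer than that unit. A standalone illustration:

```python
import numpy as np

stamps = np.array(["2020-03-11T15:27:45"], dtype="datetime64[s]")
print(stamps.astype("datetime64[D]"))  # ['2020-03-11']    -- day precision
print(stamps.astype("datetime64[h]"))  # ['2020-03-11T15'] -- hour precision
```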
49 | """ 50 | # idea from Stack Overflow 51 | # https://stackoverflow.com/a/28783971/7175713 52 | # https://numpy.org/doc/stable/reference/arrays.datetime.html 53 | ACCEPTABLE_DATEPARTS = { 54 | "YEAR": "datetime64[Y]", 55 | "MONTH": "datetime64[M]", 56 | "DAY": "datetime64[D]", 57 | "HOUR": "datetime64[h]", 58 | "MINUTE": "datetime64[m]", 59 | "SECOND": "datetime64[s]", 60 | } 61 | datepart = datepart.upper() 62 | if datepart not in ACCEPTABLE_DATEPARTS: 63 | raise ValueError( 64 | "Received an invalid `datepart` precision. " 65 | f"Please enter any one of {ACCEPTABLE_DATEPARTS}." 66 | ) 67 | 68 | dictionary = {} 69 | 70 | for label, series in df.items(): 71 | if is_datetime64_any_dtype(series): 72 | dtype = ACCEPTABLE_DATEPARTS[datepart] 73 | # TODO: add branch for pyarrow arrays 74 | series = np.array(series._values, dtype=dtype) 75 | dictionary[label] = series 76 | 77 | return pd.DataFrame(dictionary) 78 | -------------------------------------------------------------------------------- /janitor/functions/update_where.py: -------------------------------------------------------------------------------- 1 | """Function for updating values based on other column values.""" 2 | 3 | from typing import Any, Hashable 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | from pandas.api.types import is_bool_dtype 8 | 9 | from janitor.utils import deprecated_alias 10 | 11 | 12 | @pf.register_dataframe_method 13 | @deprecated_alias(target_col="target_column_name") 14 | def update_where( 15 | df: pd.DataFrame, 16 | conditions: Any, 17 | target_column_name: Hashable, 18 | target_val: Any, 19 | ) -> pd.DataFrame: 20 | """Add multiple conditions to update a column in the dataframe. 21 | 22 | This method does not mutate the original DataFrame. 23 | 24 | Examples: 25 | >>> import janitor 26 | >>> data = { 27 | ... "a": [1, 2, 3, 4], 28 | ... "b": [5, 6, 7, 8], 29 | ... "c": [0, 0, 0, 0], 30 | ... } 31 | >>> df = pd.DataFrame(data) 32 | >>> df 33 | a b c 34 | 0 1 5 0 35 | 1 2 6 0 36 | 2 3 7 0 37 | 3 4 8 0 38 | >>> df.update_where( 39 | ... conditions = (df.a > 2) & (df.b < 8), 40 | ... target_column_name = 'c', 41 | ... target_val = 10 42 | ... ) 43 | a b c 44 | 0 1 5 0 45 | 1 2 6 0 46 | 2 3 7 10 47 | 3 4 8 0 48 | >>> df.update_where( # supports pandas *query* style string expressions 49 | ... conditions = "a > 2 and b < 8", 50 | ... target_column_name = 'c', 51 | ... target_val = 10 52 | ... ) 53 | a b c 54 | 0 1 5 0 55 | 1 2 6 0 56 | 2 3 7 10 57 | 3 4 8 0 58 | 59 | Args: 60 | df: The pandas DataFrame object. 61 | conditions: Conditions used to update a target column 62 | and target value. 63 | target_column_name: Column to be updated. If column does not exist 64 | in DataFrame, a new column will be created; note that entries 65 | that do not get set in the new column will be null. 66 | target_val: Value to be updated. 67 | 68 | Raises: 69 | ValueError: If `conditions` does not return a boolean array-like 70 | data structure. 71 | 72 | Returns: 73 | A pandas DataFrame. 74 | """ 75 | 76 | df = df.copy() 77 | 78 | # use query mode if a string expression is passed 79 | if isinstance(conditions, str): 80 | conditions = df.eval(conditions) 81 | 82 | if not is_bool_dtype(conditions): 83 | raise ValueError( 84 | """ 85 | Kindly ensure that `conditions` passed 86 | evaluates to a Boolean dtype. 
87 | """ 88 | ) 89 | 90 | df.loc[conditions, target_column_name] = target_val 91 | 92 | return df 93 | -------------------------------------------------------------------------------- /janitor/ml.py: -------------------------------------------------------------------------------- 1 | """Machine learning specific functions.""" 2 | 3 | from typing import Hashable, Iterable, List, Optional, Tuple, Union 4 | 5 | import pandas as pd 6 | import pandas_flavor as pf 7 | 8 | from .utils import deprecated_alias 9 | 10 | 11 | @pf.register_dataframe_method 12 | @deprecated_alias( 13 | target_columns="target_column_names", 14 | feature_columns="feature_column_names", 15 | ) 16 | def get_features_targets( 17 | df: pd.DataFrame, 18 | target_column_names: Union[str, Union[List, Tuple], Hashable], 19 | feature_column_names: Optional[Union[str, Iterable[str], Hashable]] = None, 20 | ) -> Tuple[pd.DataFrame, pd.DataFrame]: 21 | """Get the features and targets as separate DataFrames/Series. 22 | 23 | This method does not mutate the original DataFrame. 24 | 25 | The behaviour is as such: 26 | 27 | - `target_column_names` is mandatory. 28 | - If `feature_column_names` is present, then we will respect the column 29 | names inside there. 30 | - If `feature_column_names` is not passed in, then we will assume that 31 | the rest of the columns are feature columns, and return them. 32 | 33 | Examples: 34 | >>> import pandas as pd 35 | >>> import janitor.ml 36 | >>> df = pd.DataFrame( 37 | ... {"a": [1, 2, 3], "b": [-2, 0, 4], "c": [1.23, 7.89, 4.56]} 38 | ... ) 39 | >>> X, Y = df.get_features_targets(target_column_names=["a", "c"]) 40 | >>> X 41 | b 42 | 0 -2 43 | 1 0 44 | 2 4 45 | >>> Y 46 | a c 47 | 0 1 1.23 48 | 1 2 7.89 49 | 2 3 4.56 50 | 51 | Args: 52 | df: The pandas DataFrame object. 53 | target_column_names: Either a column name or an 54 | iterable (list or tuple) of column names that are the target(s) to 55 | be predicted. 56 | feature_column_names: The column name or 57 | iterable of column names that are the features (a.k.a. predictors) 58 | used to predict the targets. 59 | 60 | Returns: 61 | `(X, Y)` the feature matrix (`X`) and the target matrix (`Y`). 62 | Both are pandas DataFrames. 
63 | """ 64 | Y = df[target_column_names] 65 | 66 | if feature_column_names: 67 | X = df[feature_column_names] 68 | else: 69 | if isinstance(target_column_names, (list, tuple)): # noqa: W503 70 | xcols = [c for c in df.columns if c not in target_column_names] 71 | else: 72 | xcols = [c for c in df.columns if target_column_names != c] 73 | 74 | X = df[xcols] 75 | return X, Y 76 | -------------------------------------------------------------------------------- /janitor/polars/__init__.py: -------------------------------------------------------------------------------- 1 | from .clean_names import clean_names, make_clean_names 2 | from .complete import complete, expand 3 | from .dates_to_polars import convert_excel_date, convert_matlab_date 4 | from .pivot_longer import pivot_longer, pivot_longer_spec 5 | from .row_to_names import row_to_names 6 | 7 | __all__ = [ 8 | "pivot_longer_spec", 9 | "pivot_longer", 10 | "clean_names", 11 | "make_clean_names", 12 | "row_to_names", 13 | "expand", 14 | "complete", 15 | "convert_excel_date", 16 | "convert_matlab_date", 17 | ] 18 | -------------------------------------------------------------------------------- /janitor/spark/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * # noqa: F403, F401 2 | -------------------------------------------------------------------------------- /janitor/spark/backend.py: -------------------------------------------------------------------------------- 1 | """Backend functions for pyspark.""" 2 | 3 | from functools import wraps 4 | 5 | try: 6 | from pyspark.pandas.extensions import register_dataframe_accessor 7 | 8 | except ImportError: 9 | from janitor.utils import import_message 10 | 11 | import_message( 12 | submodule="spark", 13 | package="pyspark", 14 | conda_channel="conda-forge", 15 | pip_install=True, 16 | ) 17 | 18 | 19 | def register_dataframe_method(method): 20 | """Register a function as a method attached to the Pyspark DataFrame. 21 | 22 | !!! note 23 | 24 | Modified based on pandas_flavor.register. 
25 | 26 | 30 | """ 31 | 32 | def inner(*args, **kwargs): 33 | class AccessorMethod: 34 | def __init__(self, pyspark_obj): 35 | self._obj = pyspark_obj 36 | 37 | @wraps(method) 38 | def __call__(self, *args, **kwargs): 39 | return method(self._obj, *args, **kwargs) 40 | 41 | register_dataframe_accessor(method.__name__)(AccessorMethod) 42 | 43 | return method 44 | 45 | return inner() 46 | -------------------------------------------------------------------------------- /janitor/testing_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/janitor/testing_utils/__init__.py -------------------------------------------------------------------------------- /janitor/xarray/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * # noqa: F403, F401 2 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | # Project information 2 | site_name: pyjanitor documentation 3 | site_url: https://pyjanitor-devs.github.io/pyjanitor 4 | site_description: >- 5 | Python implementation of the R package janitor 6 | 7 | # Repository 8 | repo_name: "pyjanitor-devs/pyjanitor" 9 | repo_url: "https://github.com/pyjanitor-devs/pyjanitor" 10 | 11 | # Configuration 12 | docs_dir: mkdocs/ 13 | watch: 14 | - janitor/ 15 | 16 | theme: 17 | name: "material" 18 | palette: 19 | - media: "(prefers-color-scheme: light)" 20 | scheme: default 21 | primary: "blue grey" 22 | accent: "light blue" 23 | icon: 24 | logo: "fontawesome/solid/book" 25 | features: 26 | - navigation.instant 27 | # - navigation.tabs 28 | - navigation.top 29 | - toc.follow 30 | - content.code.copy 31 | language: en 32 | 33 | # Page tree 34 | # We customize the navigation by hand to control the order 35 | # in which pages show up. 
36 | nav: 37 | - Home: index.md 38 | - API Reference: 39 | - Functions: api/functions.md 40 | - Biology: api/biology.md 41 | - Chemistry: api/chemistry.md 42 | - Engineering: api/engineering.md 43 | - Finance: api/finance.md 44 | - Input/Output (io): api/io.md 45 | - Machine Learning: api/ml.md 46 | - Math: api/math.md 47 | # - PySpark: api/pyspark.md # will be added back later 48 | - Polars: api/polars.md 49 | - Timeseries: api/timeseries.md 50 | - XArray: api/xarray.md 51 | - Development Guide: devguide.md 52 | - Changelog: CHANGELOG.md 53 | - Authors: AUTHORS.md 54 | 55 | plugins: 56 | - search 57 | - autorefs 58 | - mkdocstrings: 59 | default_handler: python 60 | handlers: 61 | python: 62 | options: 63 | docstring_style: "google" 64 | docstring_options: 65 | trim_doctest_flags: true 66 | show_if_no_docstring: false 67 | show_root_toc_entry: false 68 | show_root_heading: false 69 | show_submodules: true 70 | show_source: true 71 | members_order: alphabetical 72 | # - mknotebooks: 73 | # execute: true 74 | # write_markdown: true 75 | # allow_errors: true 76 | # timeout: 1200 77 | # binder: true 78 | # binder_service_name: "gh" 79 | # binder_branch: "master" 80 | 81 | markdown_extensions: 82 | - admonition 83 | - pymdownx.highlight: 84 | use_pygments: true 85 | - pymdownx.inlinehilite 86 | # - pymdownx.tabbed: 87 | # alternate_style: true 88 | - pymdownx.superfences 89 | 90 | extra_javascript: 91 | - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML 92 | 93 | extra_css: 94 | - css/apidocs.css 95 | -------------------------------------------------------------------------------- /mkdocs/AUTHORS.md: -------------------------------------------------------------------------------- 1 | ../AUTHORS.md -------------------------------------------------------------------------------- /mkdocs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ../CHANGELOG.md -------------------------------------------------------------------------------- /mkdocs/api/biology.md: -------------------------------------------------------------------------------- 1 | # Biology 2 | 3 | ::: janitor.biology 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/chemistry.md: -------------------------------------------------------------------------------- 1 | # Chemistry 2 | 3 | ::: janitor.chemistry 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/engineering.md: -------------------------------------------------------------------------------- 1 | # Engineering 2 | 3 | ::: janitor.engineering 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/finance.md: -------------------------------------------------------------------------------- 1 | # Finance 2 | 3 | ::: janitor.finance 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/functions.md: -------------------------------------------------------------------------------- 1 | # Functions 2 | 3 | ::: janitor.functions 4 | options: 5 | filters: 6 | - "!^_" 7 | members: 8 | - add_columns 9 | - alias 10 | - also 11 | - bin_numeric 12 | - case_when 13 | - change_index_dtype 14 | - change_type 15 | - clean_names 16 | - coalesce 17 | - collapse_levels 18 | - complete 19 
| - concatenate_columns 20 | - conditional_join 21 | - convert_date 22 | - count_cumulative_unique 23 | - currency_column_to_numeric 24 | - deconcatenate_column 25 | - drop_constant_columns 26 | - drop_duplicate_columns 27 | - dropnotnull 28 | - encode_categorical 29 | - expand_column 30 | - expand_grid 31 | - explode_index 32 | - factorize_columns 33 | - fill 34 | - filter 35 | - find_replace 36 | - flag_nulls 37 | - get_dupes 38 | - groupby_agg 39 | - groupby_topk 40 | - impute 41 | - jitter 42 | - join_apply 43 | - label_encode 44 | - limit_column_characters 45 | - min_max_scale 46 | - move 47 | - mutate 48 | - pivot 49 | - process_text 50 | - remove_columns 51 | - remove_empty 52 | - rename_columns 53 | - reorder_columns 54 | - round_to_fraction 55 | - row_to_names 56 | - select 57 | - shuffle 58 | - sort_column_value_order 59 | - sort_naturally 60 | - summarise 61 | - take_first 62 | - then 63 | - to_datetime 64 | - toset 65 | - transform_columns 66 | - truncate_datetime 67 | - update_where 68 | - utils 69 | -------------------------------------------------------------------------------- /mkdocs/api/io.md: -------------------------------------------------------------------------------- 1 | # Input/Output (io) 2 | 3 | ::: janitor.io 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/math.md: -------------------------------------------------------------------------------- 1 | # Math 2 | 3 | ::: janitor.math 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/ml.md: -------------------------------------------------------------------------------- 1 | # Machine Learning 2 | 3 | ::: janitor.ml 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/polars.md: -------------------------------------------------------------------------------- 1 | # Polars 2 | 3 | ::: janitor.polars 4 | options: 5 | filters: 6 | - "!^_" 7 | members: 8 | - clean_names 9 | - complete 10 | - pivot_longer 11 | - row_to_names 12 | -------------------------------------------------------------------------------- /mkdocs/api/timeseries.md: -------------------------------------------------------------------------------- 1 | # Timeseries 2 | 3 | ::: janitor.timeseries 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/api/xarray.md: -------------------------------------------------------------------------------- 1 | # XArray 2 | 3 | ::: janitor.xarray.functions 4 | options: 5 | filters: 6 | - "!^_" 7 | -------------------------------------------------------------------------------- /mkdocs/css/apidocs.css: -------------------------------------------------------------------------------- 1 | /* https://mkdocstrings.github.io/theming/#css-classes */ 2 | .doc-property { 3 | border-radius: 15px; 4 | padding: 0 5px; 5 | } 6 | .doc-property-special { 7 | background-color: blue; 8 | color: white; 9 | } 10 | .doc-property-private { 11 | background-color: red; 12 | color: white; 13 | } 14 | .doc-property-property { 15 | background-color: green; 16 | color: white; 17 | } 18 | .doc-property-read-only { 19 | background-color: yellow; 20 | color: black; 21 | } 22 | 23 | /* https://mkdocstrings.github.io/handlers/python/#recommended-style-material */ 24 | /* Indentation. 
*/ 25 | div.doc-contents:not(.first) { 26 | padding-left: 25px; 27 | border-left: 4px solid rgba(230, 230, 230); 28 | margin-bottom: 80px; 29 | } 30 | 31 | /* add a keyboard shortcut icon for search bar, 32 | * https://github.com/squidfunk/mkdocs-material/issues/2574#issuecomment-821979698 33 | */ 34 | [data-md-toggle="search"]:not(:checked) ~ .md-header .md-search__form::after { 35 | position: absolute; 36 | top: 0.3rem; 37 | right: 0.3rem; 38 | display: block; 39 | padding: 0.1rem 0.4rem; 40 | color: var(--md-default-bg-color--lighter); 41 | font-weight: bold; 42 | font-size: 0.8rem; 43 | border: 0.05rem solid var(--md-default-bg-color--lighter); 44 | border-radius: 0.1rem; 45 | content: "/"; 46 | } 47 | 48 | /* prevent selection of chevron in example blocks 49 | * cf. https://mkdocstrings.github.io/recipes/#prevent-selection-of-prompts-and-output-in-python-code-blocks 50 | */ 51 | .highlight .gp, .highlight .go { /* Generic.Prompt, Generic.Output */ 52 | user-select: none; 53 | } 54 | -------------------------------------------------------------------------------- /mkdocs/development/lazy_imports.md: -------------------------------------------------------------------------------- 1 | # Lazy Imports 2 | 3 | In `pyjanitor`, we use lazy imports to speed up `import janitor`. 4 | Prior to using lazy imports, `import janitor` would take about 1-2 seconds to complete, 5 | thereby causing significant delays for downstream consumers of `pyjanitor`. 6 | Slow importing is undesirable, as it slows down programs that demand low latency. 7 | 8 | ## A brief history of the decision 9 | 10 | The original issue was raised by @ericmjl 11 | in issue ([#1059](https://github.com/pyjanitor-devs/pyjanitor/issues/1059)). 12 | The background there is that the scientific Python community 13 | was struggling with imports that took a long time, 14 | especially in packages that depended on SciPy and Pandas. 15 | As `pyjanitor` is a package that depends on `pandas`, 16 | it was important for us to see if we could improve the speed at which imports happened. 17 | 18 | ## Current Speed Benchmark 19 | 20 | As of 5 April 2022, imports take about ~0.5 seconds (give or take) to complete 21 | on a GitHub Codespaces workspace. 22 | This is much more desirable than the original 1-2 seconds, 23 | also measured on a GitHub Codespaces workspace. 24 | 25 | ## How to benchmark 26 | 27 | To benchmark, we run the following line: 28 | 29 | ```bash 30 | python -X importtime -c "import janitor" 2> timing.log 31 | ``` 32 | 33 | Then, using the `tuna` CLI tool, we can view the timing log: 34 | 35 | ```bash 36 | tuna timing.log 37 | ``` 38 | 39 | Note: You may need to install tuna using `pip install -U tuna`. 40 | `tuna`'s development repository is [on GitHub][tuna]. 41 | 42 | [tuna]: https://github.com/nschloe/tuna 43 | 44 | You'll be redirected to your browser, 45 | where the web UI will allow you to see 46 | which imports are causing time delays. 47 | 48 | ![Tuna's Web UI](./images/tuna.png) 49 | 50 | ## Which imports to lazily load 51 | 52 | Generally speaking, the _external_ imports are the ones that, 53 | when lazily loaded, will give the maximal gain in speed. 54 | You can also opt to lazily load `pyjanitor` submodules, 55 | but we doubt they will give much advantage in speed. 56 | -------------------------------------------------------------------------------- /mkdocs/environment.yaml: -------------------------------------------------------------------------------- 1 | # 14 August 2022: Temporarily commenting out.
2 | # See: https://github.com/pyjanitor-devs/pyjanitor/pull/1147#issuecomment-1214508157 3 | # for more context on why. 4 | # name: pyjanitor-doc 5 | # channels: 6 | #   - conda-forge 7 | # dependencies: 8 | #   - python 9 | #   # required 10 | #   - pandas 11 | #   - pandas-flavor 12 | #   - multipledispatch 13 | #   - scipy 14 | #   # optional 15 | #   - biopython 16 | #   - natsort 17 | #   - pyspark>=3.2.0 18 | #   - rdkit 19 | #   - tqdm 20 | #   - unyt 21 | #   - xarray 22 | #   - numba 23 | #   # doc 24 | #   - mkdocs 25 | #   - mkdocs-material 26 | #   # To fix #1146 27 | #   # - mkdocstrings-python 28 | #   - mkdocstrings=0.18.1 29 | #   - mkdocstrings-python-legacy=0.2.2 30 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | exclude = ''' 3 | /( 4 | \.git 5 | | \.hg 6 | | \.mypy_cache 7 | | \.tox 8 | | \.venv 9 | | _build 10 | | buck-out 11 | | build 12 | | dist 13 | | env 14 | | venv 15 | )/ 16 | ''' 17 | include = '\.pyi?$' 18 | line-length = 79 19 | target-version = ['py36', 'py37', 'py38'] 20 | 21 | [tool.interrogate] 22 | exclude = ["setup.py", "docs", "nbconvert_config.py"] 23 | fail-under = 55 24 | ignore-init-method = true 25 | ignore-init-module = true 26 | ignore-module = false 27 | ignore-private = false 28 | ignore-semiprivate = false 29 | quiet = false 30 | verbose = 2 31 | 32 | # https://docs.pytest.org/en/6.2.x/mark.html#registering-marks 33 | [tool.pytest.ini_options] 34 | markers = [ 35 | "functions: test for general functions", 36 | "biology: tests for biology", 37 | "chemistry: tests for chemistry", 38 | "finance: tests for finance", 39 | "utils: utility tests", 40 | "engineering: tests for engineering", 41 | "ml: tests for machine learning", 42 | "polars: tests for polars methods", 43 | "spark_functions: tests for pyspark functions", 44 | "xarray: tests for xarray functions", 45 | "timeseries: tests for timeseries", 46 | "documentation: tests for documentation", 47 | "turtle: tests that take more than 5 seconds to execute", 48 | ] 49 | 50 | 51 | [tool.ruff] 52 | # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 53 | lint.select = ["E", "F", "I"] 54 | lint.ignore = [] 55 | 56 | # Allow fix for all enabled rules (when `--fix` is provided). 57 | lint.fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] 58 | lint.unfixable = [] 59 | 60 | # Exclude a variety of commonly ignored directories. 61 | exclude = [ 62 | ".bzr", 63 | ".direnv", 64 | ".eggs", 65 | ".git", 66 | ".git-rewrite", 67 | ".hg", 68 | ".mypy_cache", 69 | ".nox", 70 | ".pants.d", 71 | ".pytype", 72 | ".ruff_cache", 73 | ".svn", 74 | ".tox", 75 | ".venv", 76 | "__pypackages__", 77 | "_build", 78 | "buck-out", 79 | "build", 80 | "dist", 81 | "node_modules", 82 | "venv", 83 | "nbconvert_config.py", 84 | ] 85 | 86 | # Same as Black. 87 | line-length = 88 88 | 89 | # Allow unused variables when underscore-prefixed. 90 | lint.dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 91 | 92 | # Assume Python 3.10 93 | target-version = "py310" 94 | 95 | [tool.ruff.lint.mccabe] 96 | # Unlike Flake8, default to a complexity level of 10.
97 | max-complexity = 10 98 | -------------------------------------------------------------------------------- /scripts/ci/build_environment.sh: -------------------------------------------------------------------------------- 1 | conda install -c conda-forge mamba 2 | mamba env create -f environment-dev.yml 3 | -------------------------------------------------------------------------------- /scripts/ci/unpack_environment.sh: -------------------------------------------------------------------------------- 1 | mkdir -p /tmp/pyjanitor-dev_env 2 | tar -xzf pyjanitor-dev.tar.gz -C /tmp/pyjanitor-dev_env 3 | source /tmp/pyjanitor-dev_env/bin/activate 4 | conda-unpack 5 | -------------------------------------------------------------------------------- /scripts/count_functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to count the number of functions inside each source file. 3 | 4 | Can be used for many purposes. 5 | 6 | Intended to be run from pyjanitor's top-level directory. 7 | 8 | 9 | """ 10 | 11 | import ast 12 | import os 13 | from pathlib import Path 14 | 15 | 16 | def count_number_of_functions(filepath): 17 | """Count number of functions inside a .py file.""" 18 | # Taken from: https://stackoverflow.com/a/37514895/1274908 19 | with open(filepath, "r+") as f: 20 | tree = ast.parse(f.read()) 21 | return sum(isinstance(exp, ast.FunctionDef) for exp in tree.body) 22 | 23 | 24 | def janitor_submodules(): 25 | """Yield a list of janitor submodules and their full paths.""" 26 | files = [f for f in os.listdir("janitor") if f.endswith(".py")] 27 | 28 | for file in files: 29 | yield Path("janitor") / file 30 | 31 | 32 | def main(): 33 | """Main executable function.""" 34 | for filepath in janitor_submodules(): 35 | num_funcs = count_number_of_functions(filepath) 36 | print(filepath, num_funcs) 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /scripts/docker_deploy.sh: -------------------------------------------------------------------------------- 1 | echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin 2 | docker push ericmjl/pyjanitor:devcontainer 3 | -------------------------------------------------------------------------------- /talks/scipy2019/friends.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/talks/scipy2019/friends.png -------------------------------------------------------------------------------- /talks/scipy2019/readthedocs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/talks/scipy2019/readthedocs.png -------------------------------------------------------------------------------- /talks/scipy2019/slides.md: -------------------------------------------------------------------------------- 1 | 6 | 7 | # Clean APIs for Data Cleaning 8 | 9 | Eric J. 
Ma 10 | Novartis Institutes for Biomedical Research 11 | SciPy 2019 12 | 13 | --- 14 | 15 | ## pandasshee 16 | 17 | --- 18 | 19 | ## readable code 20 | 21 | --- 22 | 23 | ## side-by-side 24 | 25 | - pop-up pandas code 26 | - pop-up pyjanitor code 27 | 28 | --- 29 | 30 | ## live demo 31 | 32 | --- 33 | 34 | ## history 35 | 36 | --- 37 | 38 | ## welcoming newcomers 39 | 40 | --- 41 | 42 | ## 43 | -------------------------------------------------------------------------------- /talks/scipy2019/sprints.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/talks/scipy2019/sprints.jpg -------------------------------------------------------------------------------- /talks/scipy2019/twitter-wars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/talks/scipy2019/twitter-wars.png -------------------------------------------------------------------------------- /tests/biology/test_join_fasta.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | import pytest 5 | from helpers import running_on_ci 6 | 7 | import janitor.biology # noqa: F403, F401 8 | 9 | # Skip all tests if Biopython not installed 10 | pytestmark = pytest.mark.skipif( 11 | (importlib.util.find_spec("Bio") is None) & ~running_on_ci(), 12 | reason="Biology tests relying on Biopython only required for CI", 13 | ) 14 | 15 | 16 | @pytest.mark.biology 17 | def test_join_fasta(biodf): 18 | """Test adding sequence from FASTA file in `sequence` column.""" 19 | df = biodf.join_fasta( 20 | filename=os.path.join(pytest.TEST_DATA_DIR, "sequences.fasta"), 21 | id_col="sequence_accession", 22 | column_name="sequence", 23 | ) 24 | 25 | assert "sequence" in df.columns 26 | -------------------------------------------------------------------------------- /tests/chemistry/test_maccs_keys_fingerprint.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import pytest 4 | from helpers import running_on_ci 5 | 6 | import janitor.chemistry # noqa: F401 7 | 8 | # Skip all tests if rdkit not installed 9 | pytestmark = pytest.mark.skipif( 10 | (importlib.util.find_spec("rdkit") is None) & ~running_on_ci(), 11 | reason="rdkit tests only required for CI", 12 | ) 13 | 14 | 15 | @pytest.mark.chemistry 16 | def test_maccs_keys_fingerprint(chemdf): 17 | """Test conversion of SMILES strings to MACCS keys fingerprints.""" 18 | maccs_keys = chemdf.smiles2mol("smiles", "mol").maccs_keys_fingerprint( 19 | "mol" 20 | ) 21 | assert maccs_keys.shape == (10, 167) 22 | assert set(maccs_keys.to_numpy().flatten().tolist()) == set([0, 1]) 23 | -------------------------------------------------------------------------------- /tests/chemistry/test_molecular_descriptors.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import pytest 4 | from helpers import running_on_ci 5 | 6 | # Skip all tests if rdkit not installed 7 | pytestmark = pytest.mark.skipif( 8 | (importlib.util.find_spec("rdkit") is None) & ~running_on_ci(), 9 | reason="rdkit tests only required for CI", 10 | ) 11 | 12 | 13 | @pytest.mark.chemistry 14 | def test_molecular_descriptors(chemdf): 15 | """Test conversion of Mol objects to 39 column molecular 
descriptors.""" 16 | mol_desc = chemdf.smiles2mol("smiles", "mol").molecular_descriptors("mol") 17 | assert mol_desc.shape == (10, 39) 18 | -------------------------------------------------------------------------------- /tests/chemistry/test_morgan_fingerprint.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import pytest 4 | from helpers import running_on_ci 5 | 6 | pytestmark = pytest.mark.skipif( 7 | (importlib.util.find_spec("rdkit") is None) & ~running_on_ci(), 8 | reason="rdkit tests only required for CI", 9 | ) 10 | 11 | 12 | @pytest.mark.chemistry 13 | def test_morgan_fingerprint_counts(chemdf): 14 | """Test counts of Morgan Fingerprints converted from Mol objects.""" 15 | morgans = chemdf.smiles2mol("smiles", "mol").morgan_fingerprint( 16 | "mol", kind="counts" 17 | ) 18 | assert morgans.shape == (10, 2048) 19 | assert (morgans.to_numpy() >= 0).all() 20 | 21 | 22 | @pytest.mark.chemistry 23 | def test_morgan_fingerprint_bits(chemdf): 24 | """Test bits of Morgan Fingerprints converted from Mol objects.""" 25 | morgans = chemdf.smiles2mol("smiles", "mol").morgan_fingerprint( 26 | "mol", kind="bits" 27 | ) 28 | assert morgans.shape == (10, 2048) 29 | assert set(morgans.to_numpy().flatten().tolist()) == set([0, 1]) 30 | 31 | 32 | @pytest.mark.chemistry 33 | def test_morgan_fingerprint_kind_error(chemdf): 34 | """Test `morgan_fingerprint` raises exception for invalid `kind`.""" 35 | with pytest.raises(ValueError): 36 | chemdf.smiles2mol("smiles", "mol").morgan_fingerprint( 37 | "mol", kind="invalid-kind" 38 | ) 39 | -------------------------------------------------------------------------------- /tests/chemistry/test_smiles2mol.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import pytest 4 | from helpers import running_on_ci 5 | 6 | # Skip all tests if rdkit not installed 7 | pytestmark = pytest.mark.skipif( 8 | (importlib.util.find_spec("rdkit") is None) & ~running_on_ci(), 9 | reason="rdkit tests only required for CI", 10 | ) 11 | 12 | 13 | @pytest.mark.parametrize("progressbar", [None, "terminal", "notebook"]) 14 | @pytest.mark.chemistry 15 | def test_smiles2mol(chemdf, progressbar): 16 | """Test each SMILES properly converted to Mol object.""" 17 | from rdkit import Chem 18 | 19 | chemdf = chemdf.smiles2mol("smiles", "mol", progressbar) 20 | assert "mol" in chemdf.columns 21 | for elem in chemdf["mol"]: 22 | assert isinstance(elem, Chem.rdchem.Mol) 23 | 24 | 25 | def test_smiles2mol_bad_progressbar(chemdf): 26 | """Test that bad progressbar value raises error.""" 27 | with pytest.raises(ValueError): 28 | chemdf = chemdf.smiles2mol("smiles", "mol", progressbar="blah") 29 | -------------------------------------------------------------------------------- /tests/finance/test_convert_currency.py: -------------------------------------------------------------------------------- 1 | """Tests for convert_currency() in finance module.""" 2 | 3 | from datetime import date, datetime 4 | 5 | import pytest 6 | import requests 7 | 8 | from janitor.finance import convert_currency # noqa: F401 9 | 10 | 11 | @pytest.mark.finance 12 | @pytest.mark.xfail(reason="changes made to web API prevent this from running") 13 | def test_make_currency_api_request(): 14 | """ 15 | Test for currency API request. 16 | 17 | This test exists because we rely solely on the service by 18 | exchangeratesapi. 
That said, we also mark it as expected to fail because 19 | it sometimes pings the exchange rates API too frequently and causes 20 | tests to fail. 21 | 22 | For an example of how this test fails, see: 23 | https://github.com/pyjanitor-devs/pyjanitor/issues/147 24 | """ 25 | r = requests.get("https://api.exchangeratesapi.io") 26 | assert r.status_code == 200 27 | 28 | 29 | @pytest.mark.xfail(reason="changes made to web API prevent this from running") 30 | @pytest.mark.finance 31 | def test_make_new_currency_col(dataframe): 32 | """Test converting to same currency equals original currency column.""" 33 | df = dataframe.convert_currency("a", "USD", "USD", make_new_column=True) 34 | assert all(df["a"] == df["a_USD"]) 35 | 36 | 37 | @pytest.mark.finance 38 | @pytest.mark.xfail(reason="changes made to web API prevent this from running") 39 | def test_historical_datetime(dataframe): 40 | """Test conversion raises exception for datetime outside API range.""" 41 | with pytest.raises(ValueError): 42 | assert dataframe.convert_currency( 43 | "a", 44 | "USD", 45 | "AUD", 46 | make_new_column=True, 47 | historical_date=datetime(1982, 10, 27), 48 | ) 49 | 50 | 51 | @pytest.mark.finance 52 | @pytest.mark.xfail(reason="changes made to web API prevent this from running") 53 | def test_historical_date(dataframe): 54 | """Test conversion raises exception for date outside API range.""" 55 | with pytest.raises(ValueError): 56 | assert dataframe.convert_currency( 57 | "a", 58 | "USD", 59 | "AUD", 60 | make_new_column=True, 61 | historical_date=date(1982, 10, 27), 62 | ) 63 | 64 | 65 | @pytest.mark.finance 66 | @pytest.mark.xfail(reason="changes made to web API prevent this from running") 67 | def test_currency_check(dataframe): 68 | """Test conversion raises exception for invalid currency.""" 69 | with pytest.raises(ValueError): 70 | assert dataframe.convert_currency("a", "USD", "INVALID-CURRENCY") 71 | -------------------------------------------------------------------------------- /tests/finance/test_convert_stock.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from janitor.finance import get_symbol 4 | 5 | 6 | @pytest.mark.xfail(reason="Flaky because it depends on internet connectivity.") 7 | def test_convert_stock(): 8 | """ 9 | Tests the get_symbol function; 10 | get_symbol should return the appropriate string 11 | corresponding to the abbreviation. 12 | This string will be a company's full name, 13 | and the abbreviation will be the NYSE 14 | symbol for the company. 15 | 16 | Example: 17 | print(get_symbol("aapl")) 18 | console >> Apple Inc. 19 | 20 | If the symbol does not have a corresponding 21 | company, NoneType should be returned. 22 | """ 23 | assert get_symbol("GME") == "GameStop Corp." 24 | assert get_symbol("AAPL") != "Aramark" 25 | assert get_symbol("ASNF") is None 26 | -------------------------------------------------------------------------------- /tests/finance/test_get_symbol.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from janitor.finance import get_symbol 4 | 5 | """ 6 | Tests the get_symbol helper function. 7 | 8 | Test 1: GME is GameStop Corp. Test should run fine. 9 | Test 2: GME is not Globus Medical Inc. 10 | Test 3: A little redundant, but it's another 11 | 'happy path' to show get_symbol works for more 12 | abbreviations than just the one tested so far.
13 | Test 4: ZZZZ does not belong to any company, 14 | therefore it should be None 15 | """ 16 | 17 | 18 | @pytest.mark.xfail( 19 | reason="Flaky, because it depends on internet connectivity." 20 | ) 21 | def test_get_symbol(): 22 | assert get_symbol("GME") == "GameStop Corp." 23 | assert get_symbol("GME") != "Globus Medical Inc." 24 | assert get_symbol("F") == "Ford Motor Company" 25 | assert get_symbol("ZZZZ") is None 26 | -------------------------------------------------------------------------------- /tests/functions/test_add_columns.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from hypothesis import given, settings 5 | from hypothesis import strategies as st 6 | from pandas.testing import assert_series_equal 7 | 8 | from janitor.testing_utils.strategies import df_strategy 9 | 10 | 11 | @pytest.mark.functions 12 | @given( 13 | df=df_strategy(), 14 | x_vals=st.floats(), 15 | n_yvals=st.integers(min_value=0, max_value=100), 16 | ) 17 | @settings(deadline=None, max_examples=10) 18 | def test_add_columns(df, x_vals, n_yvals): 19 | """ 20 | Test for adding multiple columns at the same time. 21 | """ 22 | y_vals = np.linspace(0, 42, n_yvals) 23 | 24 | if n_yvals != len(df) or n_yvals == 0: 25 | with pytest.raises(ValueError): 26 | df = df.add_columns(x=x_vals, y=y_vals) 27 | 28 | else: 29 | df = df.add_columns(x=x_vals, y=y_vals) 30 | series = pd.Series([x_vals] * len(df)) 31 | series.name = "x" 32 | assert_series_equal(df["x"], series) 33 | 34 | series = pd.Series(y_vals) 35 | series.name = "y" 36 | assert_series_equal(df["y"], series) 37 | -------------------------------------------------------------------------------- /tests/functions/test_alias.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas.testing import assert_series_equal 3 | 4 | 5 | def test_alias_no_name(): 6 | """Test output if Series does not have a name""" 7 | series = pd.Series([1, 2, 3]) 8 | assert_series_equal(series, series.alias()) 9 | 10 | 11 | def test_alias_callable(): 12 | """Test output if alias is a callable""" 13 | series = pd.Series([1, 2, 3], name="UPPER") 14 | assert_series_equal(series.rename("upper"), series.alias(str.lower)) 15 | 16 | 17 | def test_alias_scalar(): 18 | """Test output if alias is a scalar""" 19 | series = pd.Series([1, 2, 3], name="UPPER") 20 | assert_series_equal(series.rename("upper"), series.alias("upper")) 21 | -------------------------------------------------------------------------------- /tests/functions/test_also.py: -------------------------------------------------------------------------------- 1 | """Unit tests for `.also()`.""" 2 | 3 | from unittest.mock import Mock 4 | 5 | import pytest 6 | 7 | 8 | def remove_first_two_letters_from_col_names(df): 9 | """Helper function to mutate dataframe by changing column names.""" 10 | col_names = df.columns 11 | col_names = [name[2:] for name in col_names] 12 | df.columns = col_names 13 | return df 14 | 15 | 16 | def remove_rows_3_and_4(df): 17 | """Helper function to mutate dataframe by removing rows.""" 18 | df = df.drop(3, axis=0) 19 | df = df.drop(4, axis=0) 20 | return df 21 | 22 | 23 | def drop_inplace(df): 24 | """ 25 | Helper function to mutate dataframe by dropping a column.
26 | 27 | We usually would not use `inplace=True` in a block, 28 | but the intent here is to test that 29 | the in-place modification of a dataframe 30 | doesn't get passed through in the `.also()` function. 31 | Hence, we tell Flake8 to skip checking `PD002` on that line. 32 | 33 | .. # noqa: DAR101 34 | """ 35 | df.drop(columns=[df.columns[0]], inplace=True) # noqa: PD002 36 | 37 | 38 | @pytest.mark.functions 39 | def test_also_column_manipulation_no_change(dataframe): 40 | """Test that changed dataframe inside `.also()` doesn't get returned.""" 41 | cols = tuple(dataframe.columns) 42 | df = dataframe.also(remove_first_two_letters_from_col_names) 43 | assert dataframe is df 44 | assert cols == tuple(df.columns) 45 | 46 | 47 | @pytest.mark.functions 48 | def test_also_remove_rows_no_change(dataframe): 49 | """Test that changed dataframe inside `.also()` doesn't get returned.""" 50 | df = dataframe.also(remove_rows_3_and_4) 51 | rows = tuple(df.index) 52 | assert rows == (0, 1, 2, 3, 4, 5, 6, 7, 8) 53 | 54 | 55 | @pytest.mark.functions 56 | def test_also_runs_function(dataframe): 57 | """Test that `.also()` executes the function.""" 58 | method = Mock(return_value=None) 59 | df = dataframe.also(method) 60 | assert id(df) == id(dataframe) 61 | assert method.call_count == 1 62 | 63 | 64 | @pytest.mark.functions 65 | def test_also_args(dataframe): 66 | """Test that the args are passed through to the function.""" 67 | method = Mock(return_value=None) 68 | _ = dataframe.also(method, 5) 69 | 70 | assert method.call_args[0][1] == 5 71 | 72 | 73 | @pytest.mark.functions 74 | def test_also_kwargs(dataframe): 75 | """Test that the kwargs are passed through to the function.""" 76 | method = Mock(return_value=None) 77 | _ = dataframe.also(method, n=5) 78 | 79 | assert method.call_args[1] == {"n": 5} 80 | 81 | 82 | @pytest.mark.functions 83 | def test_also_drop_inplace(dataframe): 84 | """Test that in-place modification of dataframe does not pass through.""" 85 | cols = tuple(dataframe.columns) 86 | df = dataframe.also(drop_inplace) 87 | assert tuple(df.columns) == cols 88 | -------------------------------------------------------------------------------- /tests/functions/test_bin_numeric.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import given, settings 3 | 4 | from janitor.testing_utils.strategies import df_strategy 5 | 6 | 7 | @pytest.mark.functions 8 | @given(df=df_strategy()) 9 | @settings(deadline=None, max_examples=10) 10 | def test_bin_numeric_expected_columns(df): 11 | df = df.bin_numeric(from_column_name="a", to_column_name="a_bin") 12 | expected_columns = [ 13 | "a", 14 | "Bell__Chart", 15 | "decorated-elephant", 16 | "animals@#$%^", 17 | "cities", 18 | "a_bin", 19 | ] 20 | 21 | assert set(df.columns) == set(expected_columns) 22 | 23 | 24 | @pytest.mark.functions 25 | @given(df=df_strategy()) 26 | @settings(deadline=None, max_examples=10) 27 | def test_bin_numeric_kwargs_has_no_retbins(df): 28 | with pytest.raises(ValueError): 29 | labels = ["a", "b", "c", "d", "e"] 30 | df.bin_numeric( 31 | from_column_name="a", 32 | to_column_name="a_bin", 33 | bins=5, 34 | labels=labels, 35 | retbins=True, 36 | ) 37 | -------------------------------------------------------------------------------- /tests/functions/test_coalesce.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from pandas.testing import assert_frame_equal 5 | 
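# Illustrative sketch (an editor-added assumption, not part of the original
# file): the tests below exercise `coalesce`, which acts like SQL's COALESCE,
# filling each row with the first non-null value across the listed columns:
#
#     pd.DataFrame({"a": [1, None], "b": [9, 9]}).coalesce("a", "b")
#     # column "a" becomes [1.0, 9.0]
#
# `test_coalesce_without_target` below pins this behavior down via
# `combine_first`.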
6 | 7 | @pytest.fixture 8 | def df(): 9 | "Base DataFrame fixture" 10 | return pd.DataFrame( 11 | {"a": [1, np.nan, 3], "b": [2, 3, 1], "c": [2, np.nan, 9]} 12 | ) 13 | 14 | 15 | @pytest.mark.xfail(reason="column_names is a variable args") 16 | def test_wrong_type_column_names(df): 17 | """Raise Error if wrong type is provided for `column_names`.""" 18 | with pytest.raises(TypeError): 19 | df.coalesce("a", "b") 20 | 21 | 22 | def test_wrong_type_target_column_name(df): 23 | """Raise TypeError if wrong type is provided for `target_column_name`.""" 24 | with pytest.raises(TypeError): 25 | df.coalesce("a", "b", target_column_name=["new_name"]) 26 | 27 | 28 | def test_wrong_type_default_value(df): 29 | """Raise TypeError if wrong type is provided for `default_value`.""" 30 | with pytest.raises(TypeError): 31 | df.coalesce( 32 | "a", "b", target_column_name="new_name", default_value=[1, 2, 3] 33 | ) 34 | 35 | 36 | def test_len_column_names_less_than_2(df): 37 | """Raise Error if column_names length is less than 2.""" 38 | with pytest.raises(ValueError): 39 | df.coalesce("a") 40 | 41 | 42 | def test_empty_column_names(df): 43 | """Return dataframe if `column_names` is empty.""" 44 | assert_frame_equal(df.coalesce(), df) 45 | 46 | 47 | @pytest.mark.functions 48 | def test_coalesce_without_target(df): 49 | """Test output if `target_column_name` is not provided.""" 50 | result = df.coalesce("a", "b", "c") 51 | expected_output = df.assign( 52 | a=df["a"].combine_first(df["b"].combine_first(df["c"])) 53 | ) 54 | assert_frame_equal(result, expected_output) 55 | 56 | 57 | @pytest.mark.functions 58 | def test_coalesce_without_delete(): 59 | """Test output if nulls remain and `default_value` is provided.""" 60 | df = pd.DataFrame( 61 | {"s1": [np.nan, np.nan, 6, 9, 9], "s2": [np.nan, 8, 7, 9, 9]} 62 | ) 63 | expected = df.assign(s3=df.s1.combine_first(df.s2).fillna(0)) 64 | result = df.coalesce("s1", "s2", target_column_name="s3", default_value=0) 65 | assert_frame_equal(result, expected) 66 | 67 | 68 | def test_coalesce_duplicate_columns(): 69 | """ 70 | Test output on duplicate columns. 71 | """ 72 | df = pd.DataFrame( 73 | np.array([[1.0, 2.0, 2.0], [np.nan, 3.0, np.nan], [3.0, 1.0, 9.0]]), 74 | columns=["a", "a", "c"], 75 | ) 76 | 77 | expected = pd.DataFrame( 78 | np.array([[1.0, 2.0, 2.0], [3, 3.0, np.nan], [3.0, 1.0, 9.0]]), 79 | columns=["a", "a", "c"], 80 | ) 81 | 82 | actual = df.coalesce("a") 83 | 84 | assert_frame_equal(expected, actual) 85 | -------------------------------------------------------------------------------- /tests/functions/test_concatenate_columns.py: -------------------------------------------------------------------------------- 1 | """Tests for concatenate_columns.""" 2 | 3 | import pytest 4 | 5 | from janitor.errors import JanitorError 6 | 7 | 8 | @pytest.mark.functions 9 | def test_concatenate_columns(dataframe): 10 | """Basic test for concatenate_columns.""" 11 | df = dataframe.concatenate_columns( 12 | column_names=["a", "decorated-elephant"], 13 | sep="-", 14 | new_column_name="index", 15 | ) 16 | assert "index" in df.columns 17 | 18 | 19 | @pytest.mark.functions 20 | def test_concatenate_columns_null_values(missingdata_df): 21 | """Test for concatenating columns with null values. 22 | 23 | Null values ought to show up as "nan" in strings 24 | in the concatenated column. 
25 | """ 26 | df = missingdata_df.concatenate_columns( 27 | column_names=["a", "decorated-elephant"], 28 | sep="-", 29 | new_column_name="index", 30 | ignore_empty=True, 31 | ) 32 | expected_values = ["1.0-1", "2.0-2", "nan-3"] * 3 33 | assert expected_values == df["index"].tolist() 34 | 35 | 36 | @pytest.mark.functions 37 | @pytest.mark.parametrize("column_names", [["a"], []]) 38 | def test_concatenate_columns_errors(dataframe, column_names): 39 | """ 40 | Test that an error is raised when fewer than two columns are specified. 41 | """ 42 | with pytest.raises( 43 | JanitorError, match="At least two columns must be specified" 44 | ): 45 | dataframe.concatenate_columns( 46 | column_names=column_names, new_column_name="index" 47 | ) 48 | -------------------------------------------------------------------------------- /tests/functions/test_convert_excel_date.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | 7 | @pytest.mark.functions 8 | def test_convert_excel_date(): 9 | # using openpyxl as the engine staves off an error that crops up 10 | # during the CI build with xlrd 11 | df = ( 12 | pd.read_excel( 13 | Path(pytest.EXAMPLES_DIR) / "notebooks" / "dirty_data.xlsx", 14 | engine="openpyxl", 15 | ) 16 | .clean_names() 17 | .convert_excel_date("hire_date") 18 | ) 19 | 20 | assert df["hire_date"].dtype == "M8[ns]" 21 | -------------------------------------------------------------------------------- /tests/functions/test_convert_matlab_date.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_convert_matlab_date(): 7 | mlab = [ 8 | 733_301.0, 9 | 729_159.0, 10 | 734_471.0, 11 | 737_299.563_296_356_5, 12 | 737_300.000_000_000_0, 13 | ] 14 | df = pd.DataFrame(mlab, columns=["dates"]).convert_matlab_date("dates") 15 | 16 | assert df["dates"].dtype == "M8[ns]" 17 | -------------------------------------------------------------------------------- /tests/functions/test_convert_unix_date.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | 7 | @pytest.mark.skipif( 8 | os.name == "nt", reason="Skip *nix-specific tests on Windows" 9 | ) 10 | def test_convert_unix_date(): 11 | unix = [ 12 | "1284101485", 13 | 1_284_101_486, 14 | "1284101487000", 15 | 1_284_101_488_000, 16 | "1284101489", 17 | "1284101490", 18 | -2_147_483_648, 19 | 2_147_483_648, 20 | ] 21 | df = pd.DataFrame(unix, columns=["dates"]).convert_unix_date("dates") 22 | 23 | assert df["dates"].dtype == "M8[ns]" 24 | -------------------------------------------------------------------------------- /tests/functions/test_drop_constant_columns.py: -------------------------------------------------------------------------------- 1 | """Tests for drop_constant_columns.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | from pandas.testing import assert_frame_equal 6 | 7 | 8 | @pytest.mark.functions 9 | def test_drop_constant_columns(df_constant_columns): 10 | """Test that executes drop_constant_columns function.""" 11 | processed_df = df_constant_columns.drop_constant_columns() 12 | expected_col_list = ["Bell__Chart", "decorated-elephant", "cities"] 13 | assert processed_df.columns.to_list() == expected_col_list 14 | data = { 15 | "Bell__Chart": [1.234_523_45, 2.456_234, 3.234_612_5] * 3, 16 | "decorated-elephant":
[1, 2, 3] * 3, 17 | "cities": ["Cambridge", "Shanghai", "Basel"] * 3, 18 | } 19 | expected_df = pd.DataFrame(data) 20 | assert_frame_equal(processed_df, expected_df) 21 | -------------------------------------------------------------------------------- /tests/functions/test_drop_duplicate_columns.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pandas.testing import assert_frame_equal 4 | 5 | import janitor # noqa: F401 6 | 7 | 8 | @pytest.mark.functions 9 | def test_drop_duplicate_columns(df_duplicated_columns): 10 | # df_duplicated_columns contains columns 'a', duplicated three times 11 | clean_df = df_duplicated_columns.drop_duplicate_columns(column_name="a") 12 | assert clean_df.columns.to_list() == ["b", "a", "a"] 13 | expected_df = pd.DataFrame( 14 | {"b": range(10), "a": range(10, 20), "a*": range(20, 30)} 15 | ).clean_names(remove_special=True) 16 | assert_frame_equal(clean_df, expected_df) 17 | 18 | 19 | @pytest.mark.functions 20 | def test_drop_duplicate_columns_for_second_duplicated_column( 21 | df_duplicated_columns, 22 | ): 23 | clean_df = df_duplicated_columns.drop_duplicate_columns( 24 | column_name="a", nth_index=1 25 | ) 26 | expected_df = pd.DataFrame( 27 | {"a": range(10), "b": range(10), "a*": range(20, 30)} 28 | ).clean_names(remove_special=True) 29 | assert clean_df.columns.to_list() == ["a", "b", "a"] 30 | assert_frame_equal(clean_df, expected_df) 31 | 32 | 33 | @pytest.mark.functions 34 | def test_drop_duplicate_columns_for_third_duplicated_column( 35 | df_duplicated_columns, 36 | ): 37 | clean_df = df_duplicated_columns.drop_duplicate_columns( 38 | column_name="a", nth_index=2 39 | ) 40 | expected_df = pd.DataFrame( 41 | {"a": range(10), "b": range(10), "A": range(10, 20)} 42 | ).clean_names(remove_special=True) 43 | assert clean_df.columns.to_list() == ["a", "b", "a"] 44 | assert_frame_equal(clean_df, expected_df) 45 | 46 | 47 | @pytest.mark.functions 48 | def test_drop_duplicate_columns_with_error(df_duplicated_columns): 49 | with pytest.raises(IndexError): 50 | df_duplicated_columns.drop_duplicate_columns( 51 | column_name="a", nth_index=3 52 | ) 53 | -------------------------------------------------------------------------------- /tests/functions/test_dropnotnull.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pandas.testing import assert_frame_equal 4 | 5 | 6 | @pytest.mark.functions 7 | def test_dropnotnull(missingdata_df): 8 | df = missingdata_df.clean_names() 9 | df_drop = df.dropnotnull("bell_chart") 10 | 11 | assert pd.isna(df_drop["bell_chart"]).all() 12 | 13 | assert_frame_equal(df.loc[df_drop.index], df_drop) 14 | -------------------------------------------------------------------------------- /tests/functions/test_expand_column.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_expand_column(): 7 | data = { 8 | "col1": ["A, B", "B, C, D", "E, F", "A, E, F"], 9 | "col2": [1, 2, 3, 4], 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | expanded_df = df.expand_column(column_name="col1", sep=", ", concat=False) 14 | assert expanded_df.shape[1] == 6 15 | 16 | 17 | @pytest.mark.functions 18 | def test_expand_and_concat(): 19 | data = { 20 | "col1": ["A, B", "B, C, D", "E, F", "A, E, F"], 21 | "col2": [1, 2, 3, 4], 22 | } 23 | 24 | df = pd.DataFrame(data).expand_column( 
25 | column_name="col1", sep=", ", concat=True 26 | ) 27 | assert df.shape[1] == 8 28 | 29 | 30 | @pytest.mark.functions 31 | def test_sep_default_parameter(): 32 | """Test that the default parameter is a pipe character `|`.""" 33 | df = pd.DataFrame( 34 | { 35 | "col1": ["A|B", "B|C|D", "E|F", "A|E|F"], 36 | "col2": [1, 2, 3, 4], 37 | } 38 | ) 39 | result = df.expand_column("col1") 40 | 41 | assert result.shape[1] == 8 42 | -------------------------------------------------------------------------------- /tests/functions/test_fill_empty.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_fill_empty(null_df): 7 | df = null_df.fill_empty(column_names=["2"], value=3) 8 | assert set(df.loc[:, "2"]) == set([3]) 9 | 10 | 11 | @pytest.mark.functions 12 | def test_fill_empty_column_string(null_df): 13 | df = null_df.fill_empty(column_names="2", value=3) 14 | assert set(df.loc[:, "2"]) == set([3]) 15 | 16 | 17 | @pytest.mark.functions 18 | @pytest.mark.parametrize( 19 | "column_names", 20 | [ 21 | (0, 1, "2", "3"), # tuple 22 | [0, 1, "2", "3"], # list 23 | {0, 1, "2", "3"}, # set 24 | ({0: 0, 1: 1, "2": "2", "3": "3"}).keys(), # dict key 25 | ({0: 0, 1: 1, "2": "2", "3": "3"}).values(), # dict value 26 | pd.Index([0, 1, "2", "3"]), # Index 27 | ], 28 | ) 29 | def test_column_names_iterable_type(null_df, column_names): 30 | result = null_df.fill_empty(column_names=column_names, value=3) 31 | excepted = null_df.fillna(3) 32 | 33 | assert result.equals(excepted) 34 | -------------------------------------------------------------------------------- /tests/functions/test_filter_column_isin.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import assume, given, settings 3 | 4 | from janitor.testing_utils.strategies import ( 5 | categoricaldf_strategy, 6 | names_strategy, 7 | ) 8 | 9 | 10 | @pytest.mark.functions 11 | @given(df=categoricaldf_strategy(), iterable=names_strategy()) 12 | @settings(deadline=None, max_examples=10) 13 | def test_filter_column_isin(df, iterable): 14 | """ 15 | `filter_column_isin` should return the property that the column of 16 | interest's set of values should be a subset of the iterable provided. 17 | This encompasses a few scenarios: 18 | 19 | - Each element in iterable is present in the column. 20 | - No elements of iterable are present in the column. 21 | - A subset of elements in iterable are present in the column. 22 | 23 | All 3 cases can be caught by using subsets. 
24 | """ 25 | assume(len(iterable) >= 1) 26 | df = df.filter_column_isin("names", iterable) 27 | assert set(df["names"]).issubset(iterable) 28 | -------------------------------------------------------------------------------- /tests/functions/test_filter_on.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.functions 5 | @pytest.mark.parametrize("complement,expected", [(True, 6), (False, 3)]) 6 | def test_filter_on(dataframe, complement, expected): 7 | df = dataframe.filter_on("a == 3", complement=complement) 8 | assert len(df) == expected 9 | 10 | 11 | @pytest.mark.functions 12 | @pytest.mark.parametrize("complement,expected", [(True, 3), (False, 6)]) 13 | def test_filter_on_with_multiple_criteria(dataframe, complement, expected): 14 | df = dataframe.filter_on("(a == 3) | (a == 1)", complement=complement) 15 | assert len(df) == expected 16 | -------------------------------------------------------------------------------- /tests/functions/test_filter_string.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.functions 5 | def test_filter_string(dataframe): 6 | df = dataframe.filter_string( 7 | column_name="animals@#$%^", 8 | search_string="bbit", 9 | ) 10 | 11 | assert len(df) == 3 12 | 13 | 14 | def test_filter_string_complement(dataframe): 15 | df = dataframe.filter_string( 16 | column_name="cities", 17 | search_string="hang", 18 | complement=True, 19 | ) 20 | 21 | assert len(df) == 6 22 | 23 | 24 | def test_filter_string_case(dataframe): 25 | df = dataframe.filter_string( 26 | column_name="cities", 27 | search_string="B", 28 | case=False, 29 | ) 30 | 31 | assert len(df) == 6 32 | 33 | 34 | def test_filter_string_regex(dataframe): 35 | df = dataframe.change_type("Bell__Chart", str).filter_string( 36 | column_name="Bell__Chart", 37 | search_string="1.", 38 | regex=False, 39 | ) 40 | 41 | assert len(df) == 3 42 | -------------------------------------------------------------------------------- /tests/functions/test_find_replace.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def df(): 8 | return pd.DataFrame( 9 | {"a": [1, np.nan, 3], "b": [2, 3, 1], "c": [2, np.nan, 2]} 10 | ) 11 | 12 | 13 | @pytest.mark.functions 14 | def test_find_replace_single(df): 15 | assert df["a"].iloc[2] == 3 16 | df.find_replace(a={3: 5}) 17 | assert df["a"].iloc[2] == 5 18 | 19 | assert sum(df["c"] == 2) == 2 20 | assert sum(df["c"] == 5) == 0 21 | df.find_replace(c={2: 5}) 22 | assert sum(df["c"] == 2) == 0 23 | assert sum(df["c"] == 5) == 2 24 | 25 | 26 | @pytest.mark.functions 27 | def test_find_replace_null_raises_error(df): 28 | with pytest.raises(ValueError): 29 | df.find_replace(a={np.nan: 5}) 30 | 31 | 32 | @pytest.fixture 33 | def df_orders(): 34 | return pd.DataFrame( 35 | { 36 | "customer": ["Mary", "Tom", "Lila"], 37 | "order": ["ice coffee", "lemonade", "regular coffee"], 38 | } 39 | ) 40 | 41 | 42 | @pytest.mark.functions 43 | def test_find_replace_regex(df_orders): 44 | df_orders.find_replace(order={"coffee$": "latte"}, match="regex") 45 | assert df_orders["order"].iloc[0] == "latte" 46 | assert df_orders["order"].iloc[1] == "lemonade" 47 | assert df_orders["order"].iloc[-1] == "latte" 48 | 49 | 50 | @pytest.mark.functions 51 | def test_find_replace_regex_match_raises_error(df_orders): 52 | with 
pytest.raises(ValueError): 53 | df_orders.find_replace(order={"lemonade": "orange juice"}, match="bla") 54 | -------------------------------------------------------------------------------- /tests/functions/test_get_dupes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_get_dupes(): 7 | df = pd.DataFrame() 8 | df["a"] = [1, 2, 1] 9 | df["b"] = [1, 2, 1] 10 | df_dupes = df.get_dupes() 11 | assert df_dupes.shape == (2, 2) 12 | 13 | df2 = pd.DataFrame() 14 | df2["a"] = [1, 2, 3] 15 | df2["b"] = [1, 2, 3] 16 | df2_dupes = df2.get_dupes() 17 | assert df2_dupes.shape == (0, 2) 18 | -------------------------------------------------------------------------------- /tests/functions/test_impute.py: -------------------------------------------------------------------------------- 1 | """Tests for the `impute` functions""" 2 | 3 | import pytest 4 | from pandas.testing import assert_frame_equal 5 | 6 | 7 | @pytest.mark.functions 8 | def test_impute_single_value(missingdata_df): 9 | """Check if constant value is imputed correctly.""" 10 | df = missingdata_df.impute("a", 5) 11 | assert set(df["a"]) == set([1, 2, 5]) 12 | 13 | 14 | @pytest.mark.functions 15 | def test_impute_single_value_multiple_columns(missingdata_df): 16 | """Check if constant value is imputed correctly.""" 17 | df = missingdata_df.impute(["a", "Bell__Chart"], 5) 18 | assert_frame_equal( 19 | missingdata_df.assign(**df.loc[:, ["a", "Bell__Chart"]].fillna(5)), df 20 | ) 21 | 22 | 23 | @pytest.mark.functions 24 | @pytest.mark.parametrize( 25 | "statistic,expected", 26 | [ 27 | ("mean", set([1, 2, 1.5])), 28 | ("average", set([1, 2, 1.5])), 29 | ("median", set([1, 2, 1.5])), 30 | ("mode", set([1, 2])), 31 | ("min", set([1, 2])), 32 | ("minimum", set([1, 2])), 33 | ("max", set([1, 2])), 34 | ("maximum", set([1, 2])), 35 | ], 36 | ) 37 | def test_impute_statistical(missingdata_df, statistic, expected): 38 | """Check if imputing via statistic_column_name works correctly.""" 39 | df = missingdata_df.impute("a", statistic_column_name=statistic) 40 | assert set(df["a"]) == expected 41 | 42 | 43 | @pytest.mark.functions 44 | def test_impute_error_with_invalid_inputs(missingdata_df): 45 | """Check errors are properly raised with invalid inputs.""" 46 | with pytest.raises( 47 | ValueError, 48 | match="Only one of `value` or " 49 | "`statistic_column_name` " 50 | "should be provided.", 51 | ): 52 | missingdata_df.impute( 53 | "a", 54 | value=0, 55 | statistic_column_name="mean", 56 | ) 57 | 58 | with pytest.raises( 59 | KeyError, match="`statistic_column_name` must be one of.+" 60 | ): 61 | missingdata_df.impute("a", statistic_column_name="foobar") 62 | 63 | with pytest.raises( 64 | ValueError, match="Kindly specify a value or a statistic_column_name" 65 | ): 66 | missingdata_df.impute("a") 67 | -------------------------------------------------------------------------------- /tests/functions/test_join_apply.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pandas.testing import assert_frame_equal 4 | 5 | import janitor # noqa: F401 6 | 7 | 8 | @pytest.mark.functions 9 | def test_join_apply(): 10 | df = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}).join_apply( 11 | lambda x: 2 * x["a"] + x["b"], new_column_name="2a+b" 12 | ) 13 | 14 | expected = df.copy() 15 | expected["2a+b"] = [4, 7, 10] 16 | 17 | assert_frame_equal(df, expected) 18 | 
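# Illustrative sketch (an editor-added assumption, not part of the original
# file): because `join_apply` evaluates the function once per row, the same
# pattern shown above can also derive boolean flag columns:
#
#     flagged = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}).join_apply(
#         lambda x: x["a"] < x["b"], new_column_name="a_lt_b"
#     )
#     assert flagged["a_lt_b"].all()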
-------------------------------------------------------------------------------- /tests/functions/test_label_encode.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_single_column_label_encode(): 7 | df = pd.DataFrame( 8 | {"a": ["hello", "hello", "sup"], "b": [1, 2, 3]} 9 | ).label_encode(column_names="a") 10 | assert "a_enc" in df.columns 11 | 12 | 13 | @pytest.mark.functions 14 | def test_single_column_fail_label_encode(): 15 | with pytest.raises(ValueError): 16 | pd.DataFrame( 17 | {"a": ["hello", "hello", "sup"], "b": [1, 2, 3]} 18 | ).label_encode( 19 | column_names="c" 20 | ) # noqa: 841 21 | 22 | 23 | @pytest.mark.functions 24 | def test_multicolumn_label_encode(): 25 | df = pd.DataFrame( 26 | { 27 | "a": ["hello", "hello", "sup"], 28 | "b": [1, 2, 3], 29 | "c": ["aloha", "nihao", "nihao"], 30 | } 31 | ).label_encode(column_names=["a", "c"]) 32 | assert "a_enc" in df.columns 33 | assert "c_enc" in df.columns 34 | 35 | 36 | @pytest.mark.functions 37 | def test_label_encode_invalid_input(dataframe): 38 | with pytest.raises(NotImplementedError): 39 | dataframe.label_encode(1) 40 | -------------------------------------------------------------------------------- /tests/functions/test_limit_column_characters.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.functions 5 | def test_limit_column_characters(dataframe): 6 | df = dataframe.limit_column_characters(1) 7 | assert df.columns[0] == "a" 8 | assert df.columns[1] == "B" 9 | assert df.columns[2] == "d" 10 | assert df.columns[3] == "a_1" 11 | assert df.columns[4] == "c" 12 | 13 | 14 | @pytest.mark.functions 15 | def test_limit_column_characters_different_positions(dataframe): 16 | df = dataframe 17 | df.columns = ["first", "first", "second", "second", "first"] 18 | df.limit_column_characters(3) 19 | 20 | assert df.columns[0] == "fir" 21 | assert df.columns[1] == "fir_1" 22 | assert df.columns[2] == "sec" 23 | assert df.columns[3] == "sec_1" 24 | assert df.columns[4] == "fir_2" 25 | 26 | 27 | @pytest.mark.functions 28 | def test_limit_column_characters_different_positions_different_separator( 29 | dataframe, 30 | ): 31 | df = dataframe 32 | df.columns = ["first", "first", "second", "second", "first"] 33 | df.limit_column_characters(3, ".") 34 | 35 | assert df.columns[0] == "fir" 36 | assert df.columns[1] == "fir.1" 37 | assert df.columns[2] == "sec" 38 | assert df.columns[3] == "sec.1" 39 | assert df.columns[4] == "fir.2" 40 | 41 | 42 | @pytest.mark.functions 43 | def test_limit_column_characters_all_unique(dataframe): 44 | df = dataframe.limit_column_characters(2) 45 | assert df.columns[0] == "a" 46 | assert df.columns[1] == "Be" 47 | assert df.columns[2] == "de" 48 | assert df.columns[3] == "an" 49 | assert df.columns[4] == "ci" 50 | -------------------------------------------------------------------------------- /tests/functions/test_remove_columns.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.functions 5 | def test_remove_columns_one_col(dataframe): 6 | df = dataframe.remove_columns(column_names=["a"]) 7 | assert len(df.columns) == 4 8 | 9 | 10 | @pytest.mark.functions 11 | def test_remove_columns_mult_cols(dataframe): 12 | df = dataframe.remove_columns(column_names=["a", "Bell__Chart"]) 13 | assert len(df.columns) == 3 14 | 15 | 16 | 
@pytest.mark.functions 17 | def test_remove_columns_no_cols(dataframe): 18 | df = dataframe.remove_columns(column_names=[]) 19 | assert len(df.columns) == 5 20 | 21 | 22 | @pytest.mark.functions 23 | def test_remove_columns_all_cols(dataframe): 24 | df = dataframe.remove_columns( 25 | column_names=[ 26 | "a", 27 | "Bell__Chart", 28 | "decorated-elephant", 29 | "animals@#$%^", 30 | "cities", 31 | ] 32 | ) 33 | assert len(df.columns) == 0 34 | 35 | 36 | @pytest.mark.skip(reason="Not sure why this is failing") 37 | def test_remove_columns_strange_cols(dataframe): 38 | df = dataframe.remove_columns( 39 | column_names=[ 40 | "a", 41 | ["Bell__Chart", "decorated-elephant", "animals@#$%^", "cities"], 42 | ] 43 | ) 44 | assert len(df.columns) == 0 45 | 46 | 47 | @pytest.mark.functions 48 | def test_remove_columns_strange_cols_multilevel(multilevel_dataframe): 49 | # When creating a multi level dataframe with 4 columns * 2 columns 50 | # (16 columns in total) 51 | # From input 52 | 53 | # If 2 columns (2 tuples = 4 codes) are removed 54 | df = multilevel_dataframe.remove_columns( 55 | column_names=[("bar", "one"), ("baz", "two")] 56 | ) 57 | 58 | # Then the total number of codes must be 12 (16-4) 59 | assert ( 60 | len([item for sublist in df.columns.codes for item in sublist]) == 12 61 | ) 62 | -------------------------------------------------------------------------------- /tests/functions/test_remove_empty.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from hypothesis import given, settings 5 | 6 | from janitor.testing_utils.strategies import df_strategy 7 | 8 | 9 | @pytest.mark.functions 10 | @given(df=df_strategy()) 11 | @settings(deadline=None, max_examples=10) 12 | def test_remove_empty(df): 13 | """This test ensures that there are no columns that are completely null""" 14 | df = df.remove_empty() 15 | for col in df.columns: 16 | assert not pd.isna(df[col]).all() 17 | for r, d in df.iterrows(): 18 | assert not pd.isna(d).all() 19 | 20 | 21 | @pytest.mark.functions 22 | def test_index_after_remove_empty(): 23 | """This test ensures that the index is reset correctly""" 24 | df = pd.DataFrame() 25 | df["a"] = [1, np.nan, np.nan, 3, np.nan, 6] 26 | df["b"] = [1, np.nan, 1, 3, np.nan, 6] 27 | df_nonempty = df.remove_empty() 28 | assert np.array_equal( 29 | np.asarray(df_nonempty.index), np.asarray(range(0, len(df_nonempty))) 30 | ) 31 | 32 | 33 | @pytest.mark.functions 34 | def test_reset_index_false(): 35 | """Test output when reset_index is False""" 36 | df = pd.DataFrame() 37 | df["a"] = [1, np.nan, np.nan, 3, np.nan, 6] 38 | df["b"] = [1, np.nan, 1, 3, np.nan, 6] 39 | df_nonempty = df.remove_empty(reset_index=False) 40 | assert np.array_equal( 41 | df.notna().any(axis=1).to_numpy().nonzero()[0], 42 | df_nonempty.index.to_numpy(), 43 | ) 44 | -------------------------------------------------------------------------------- /tests/functions/test_rename_column.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import given # noqa: F401 3 | 4 | 5 | @pytest.mark.functions 6 | def test_rename_column(dataframe): 7 | df = dataframe.clean_names().rename_column("a", "index") 8 | assert set(df.columns) == set( 9 | ["index", "bell_chart", "decorated_elephant", "animals@#$%^", "cities"] 10 | ) 11 | assert "a" not in set(df.columns) 12 | 13 | 14 | @pytest.mark.functions 15 | def test_rename_column_absent_column(dataframe): 16 | """ 
17 | rename_column should raise an error if the column is absent. 18 | """ 19 | with pytest.raises(ValueError): 20 | dataframe.clean_names().rename_column("bb", "index") 21 | -------------------------------------------------------------------------------- /tests/functions/test_rename_columns.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import given # noqa: F401 3 | 4 | 5 | @pytest.mark.functions 6 | def test_rename_columns(dataframe): 7 | """ 8 | Tests that rename_columns renames multiple columns based on the 9 | dictionary mappings. 10 | """ 11 | df = dataframe.clean_names().rename_columns( 12 | {"a": "index", "bell_chart": "chart"} 13 | ) 14 | assert set(df.columns) == set( 15 | ["index", "chart", "decorated_elephant", "animals@#$%^", "cities"] 16 | ) 17 | assert "a" not in set(df.columns) 18 | 19 | 20 | @pytest.mark.functions 21 | def test_rename_columns_absent_column(dataframe): 22 | """ 23 | rename_columns should raise an error if any column to rename is absent. 24 | """ 25 | df = dataframe.copy() 26 | with pytest.raises(ValueError): 27 | df.clean_names().rename_columns({"a": "index", "bb": "chart"}) 28 | 29 | assert set(df.columns) == set(dataframe.columns) 30 | 31 | 32 | @pytest.mark.functions 33 | def test_rename_columns_function(dataframe): 34 | """ 35 | rename_columns should apply the given function to each column name 36 | """ 37 | df = dataframe.clean_names().rename_columns(function=str.upper) 38 | assert set(df.columns) == set( 39 | ["A", "BELL_CHART", "DECORATED_ELEPHANT", "ANIMALS@#$%^", "CITIES"] 40 | ) 41 | 42 | assert "a" not in set(df.columns) 43 | 44 | 45 | @pytest.mark.functions 46 | def test_rename_columns_no_args(dataframe): 47 | """ 48 | rename_columns should raise an error when neither column mappings nor a 49 | function is provided. 50 | """ 51 | df = dataframe.copy() 52 | with pytest.raises(ValueError): 53 | df.rename_columns() 54 | 55 | assert set(df.columns) == set(dataframe.columns) 56 | -------------------------------------------------------------------------------- /tests/functions/test_reorder_columns.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import given, settings 3 | 4 | from janitor.testing_utils.strategies import df_strategy 5 | 6 | 7 | @pytest.mark.functions 8 | @given(df=df_strategy()) 9 | @settings(deadline=None, max_examples=10) 10 | def test_reorder_columns(df): 11 | # NOTE: This test essentially has four different tests underneath it. 12 | # We should be able to refactor this using pytest.mark.parametrize.
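# A hypothetical parametrized refactor (a sketch only, not part of the
# suite; the test name and parameter names below are made up) could look
# roughly like this for the two error cases checked further down:
#
#     @pytest.mark.parametrize(
#         "bad_input,error",
#         [("a", TypeError), (["notpresent"], IndexError)],
#     )
#     def test_reorder_columns_invalid(df, bad_input, error):
#         with pytest.raises(error):
#             df.reorder_columns(bad_input)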
13 | 14 | # sanity checking of inputs 15 | 16 | # input is not a list or pd.Index 17 | with pytest.raises(TypeError): 18 | df.reorder_columns("a") 19 | 20 | # one of the columns is not present in the DataFrame 21 | with pytest.raises(IndexError): 22 | df.reorder_columns(["notpresent"]) 23 | 24 | # reordering functionality 25 | 26 | # sanity check when desired order matches current order 27 | # this also tests whether the function can take Pandas Index objects 28 | assert all(df.reorder_columns(df.columns).columns == df.columns) 29 | 30 | # when columns are a list & not all columns of the DataFrame are included 31 | assert all( 32 | df.reorder_columns(["animals@#$%^", "Bell__Chart"]).columns 33 | == ["animals@#$%^", "Bell__Chart", "a", "decorated-elephant", "cities"] 34 | ) 35 | -------------------------------------------------------------------------------- /tests/functions/test_round_to_fraction.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.functions 5 | def test_round_to_nearest_half(dataframe): 6 | """Checks output for rounding to the nearest 1/2.""" 7 | df = dataframe.round_to_fraction("Bell__Chart", 2) 8 | assert df.iloc[0, 1] == 1.0 9 | assert df.iloc[1, 1] == 2.5 10 | assert df.iloc[2, 1] == 3.0 11 | assert df.iloc[3, 1] == 1.0 12 | assert df.iloc[4, 1] == 2.5 13 | assert df.iloc[5, 1] == 3.0 14 | assert df.iloc[6, 1] == 1.0 15 | assert df.iloc[7, 1] == 2.5 16 | assert df.iloc[8, 1] == 3.0 17 | 18 | 19 | @pytest.mark.functions 20 | def test_round_digits(dataframe): 21 | """Checks rounding to the specified number of digits.""" 22 | df = dataframe.round_to_fraction("Bell__Chart", 7, digits=3) 23 | assert df.iloc[0, 1] == 1.286 24 | assert df.iloc[1, 1] == 2.429 25 | assert df.iloc[2, 1] == 3.286 26 | 27 | 28 | @pytest.mark.functions 29 | @pytest.mark.parametrize( 30 | "denominator", 31 | [0, -5, -0.25], 32 | ) 33 | def test_invalid_denominator_args(dataframe, denominator): 34 | """Ensure a ValueError is raised if the denominator value passed in 35 | is invalid. 36 | """ 37 | with pytest.raises(ValueError): 38 | dataframe.round_to_fraction("Bell__Chart", denominator) 39 | -------------------------------------------------------------------------------- /tests/functions/test_shuffle.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.functions 5 | def test_shuffle_without_index_reset(dataframe): 6 | """ 7 | Test the shuffle function. 8 | 9 | This test checks that the set of indices in the shuffled dataframe is 10 | identical to the set of indices in the original. 11 | """ 12 | df = dataframe.shuffle(reset_index=False) 13 | assert set(df.index) == set(dataframe.index) 14 | 15 | 16 | @pytest.mark.functions 17 | def test_shuffle(dataframe): 18 | """ 19 | Test the shuffle function. 20 | 21 | This test checks that the shuffled dataframe has the same columns and 22 | number of rows as the original. 23 | """ 24 | df = dataframe.shuffle() 25 | assert len(df) == len(dataframe) 26 | assert set(df.columns) == set(dataframe.columns) 27 | -------------------------------------------------------------------------------- /tests/functions/test_sort_naturally.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for sort_naturally. 3 | 4 | Some places where this test suite could be improved: 5 | 6 | - Replace the example-based test 7 | with a Hypothesis-generated property-based test.
[intermediate] 8 | - Provide another example-based test of something 9 | that needs to be naturally rather than lexicographically sorted. 10 | """ 11 | 12 | import pandas as pd 13 | import pytest 14 | from natsort import natsorted 15 | from pandas.testing import assert_frame_equal 16 | 17 | import janitor # noqa: F401 18 | 19 | 20 | @pytest.fixture 21 | def well_dataframe(): 22 | data = { 23 | "Well": ["A21", "A3", "A21", "B2", "B51", "B12"], 24 | "Value": [1, 2, 13, 3, 4, 7], 25 | } 26 | df = pd.DataFrame(data) 27 | return df 28 | 29 | 30 | def test_sort_naturally(well_dataframe): 31 | """Example-based test for sort_naturally. 32 | 33 | We check that: 34 | 35 | - the resultant dataframe is sorted identically 36 | to what natsorted would provide, 37 | - the data in the dataframe are not corrupted. 38 | """ 39 | sorted_df = well_dataframe.sort_naturally("Well") 40 | assert sorted_df["Well"].tolist() == natsorted(well_dataframe["Well"]) 41 | assert_frame_equal(sorted_df.sort_index(), well_dataframe) 42 | -------------------------------------------------------------------------------- /tests/functions/test_take_first.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pandas.testing import assert_frame_equal 4 | 5 | 6 | @pytest.mark.functions 7 | def test_take_first(): 8 | df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [0, 1, 2, 3]}) 9 | 10 | res = df.take_first(subset="a", by="b") 11 | exp = df.iloc[[0, 2], :] 12 | 13 | assert_frame_equal(res, exp) 14 | -------------------------------------------------------------------------------- /tests/functions/test_then.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def remove_first_two_letters_from_col_names(df): 5 | col_names = df.columns 6 | col_names = [name[2:] for name in col_names] 7 | df.columns = col_names 8 | return df 9 | 10 | 11 | def remove_rows_3_and_4(df): 12 | df = df.drop(3, axis=0) 13 | df = df.drop(4, axis=0) 14 | return df 15 | 16 | 17 | @pytest.mark.functions 18 | def test_then_column_names(dataframe): 19 | df = dataframe.then(remove_first_two_letters_from_col_names) 20 | cols = tuple(df.columns) 21 | assert cols == ("", "ll__Chart", "corated-elephant", "imals@#$%^", "ties") 22 | 23 | 24 | @pytest.mark.functions 25 | def test_then_remove_rows(dataframe): 26 | df = dataframe.then(remove_rows_3_and_4) 27 | rows = tuple(df.index) 28 | assert rows == (0, 1, 2, 5, 6, 7, 8) 29 | -------------------------------------------------------------------------------- /tests/functions/test_to_datetime.py: -------------------------------------------------------------------------------- 1 | """Tests for `to_datetime` function.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | 8 | @pytest.mark.functions 9 | def test_to_datetime(): 10 | """Checks to_datetime functionality is as expected.""" 11 | 12 | df = pd.DataFrame( 13 | {"date1": ["20190101", "20190102", "20190304", np.nan]} 14 | ).to_datetime("date1", format="%Y%m%d") 15 | assert df["date1"].dtype == np.dtype("datetime64[ns]") 16 | assert df["date1"].iloc[0].isoformat() == "2019-01-01T00:00:00" 17 | -------------------------------------------------------------------------------- /tests/functions/test_toset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_toset(): 7 | s =
pd.Series([1, 2, 3, 5, 5], index=["a", "b", "c", "d", "e"]).toset() 8 | 9 | assert isinstance(s, set) 10 | assert len(s) == 4 11 | assert s == set([1, 2, 3, 5]) 12 | -------------------------------------------------------------------------------- /tests/functions/test_truncate_datetime.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from pandas.testing import assert_frame_equal 7 | 8 | 9 | @pytest.mark.functions 10 | def test_truncate_datetime_dataframe_invalid_datepart(): 11 | """Checks if a ValueError is appropriately raised when datepart is 12 | not a valid enumeration. 13 | """ 14 | with pytest.raises(ValueError, match=r"invalid `datepart`"): 15 | pd.DataFrame().truncate_datetime_dataframe("INVALID") 16 | 17 | 18 | @pytest.mark.functions 19 | def test_truncate_datetime_dataframe_all_parts(): 20 | """Test for truncate_datetime_dataframe, for all valid dateparts. 21 | Also only passes if `truncate_datetime_dataframe` method is idempotent. 22 | """ 23 | x = datetime(2022, 3, 21, 9, 1, 15, 666) 24 | df = pd.DataFrame({"dt": [x], "foo": [np.nan]}, copy=False) 25 | 26 | result = df.truncate_datetime_dataframe("second") 27 | assert result.loc[0, "dt"] == datetime(2022, 3, 21, 9, 1, 15, 0) 28 | result = df.truncate_datetime_dataframe("minute") 29 | assert result.loc[0, "dt"] == datetime(2022, 3, 21, 9, 1) 30 | result = df.truncate_datetime_dataframe("HOUR") 31 | assert result.loc[0, "dt"] == datetime(2022, 3, 21, 9) 32 | result = df.truncate_datetime_dataframe("Day") 33 | assert result.loc[0, "dt"] == datetime(2022, 3, 21) 34 | result = df.truncate_datetime_dataframe("month") 35 | assert result.loc[0, "dt"] == datetime(2022, 3, 1) 36 | result = df.truncate_datetime_dataframe("yeaR") 37 | assert result.loc[0, "dt"] == datetime(2022, 1, 1) 38 | 39 | 40 | # bad data 41 | @pytest.mark.functions 42 | def test_truncate_datetime_dataframe_do_nothing(): 43 | """Ensure nothing changes (and no errors raised) if there are no datetime- 44 | compatible columns. 
45 | """ 46 | in_data = { 47 | "a": [1, 0], 48 | "b": ["foo", ""], 49 | "c": [np.nan, 3.0], 50 | "d": [True, False], 51 | } 52 | 53 | result = pd.DataFrame(in_data).truncate_datetime_dataframe("year") 54 | expected = pd.DataFrame(in_data) 55 | 56 | assert_frame_equal(result, expected) 57 | 58 | 59 | @pytest.mark.functions 60 | def test_truncate_datetime_containing_NaT(): 61 | """Ensure NaT is ignored safely (no-op) and no TypeError is thrown.""" 62 | x = datetime(2022, 3, 21, 9, 1, 15, 666) 63 | df = pd.DataFrame({"dt": [x, pd.NaT], "foo": [np.nan, 3]}) 64 | expected = pd.DataFrame( 65 | {"dt": [x.replace(microsecond=0), pd.NaT], "foo": [np.nan, 3]} 66 | ) 67 | 68 | result = df.truncate_datetime_dataframe("second").assign( 69 | dt=lambda df: df["dt"].dt.as_unit("ns") 70 | ) 71 | assert_frame_equal(result, expected) 72 | -------------------------------------------------------------------------------- /tests/functions/test_update_where.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pandas.testing import assert_frame_equal 4 | 5 | from janitor.functions import update_where 6 | 7 | 8 | @pytest.mark.functions 9 | def test_update_where(dataframe): 10 | """ 11 | Test that it accepts conditional parameters 12 | """ 13 | assert_frame_equal( 14 | dataframe.update_where( 15 | (dataframe["decorated-elephant"] == 1) 16 | & (dataframe["animals@#$%^"] == "rabbit"), 17 | "cities", 18 | "Durham", 19 | ), 20 | dataframe.replace("Cambridge", "Durham"), 21 | ) 22 | 23 | 24 | @pytest.fixture 25 | def df(): 26 | return pd.DataFrame( 27 | {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [0, 0, 0, 0]} 28 | ) 29 | 30 | 31 | def test_update_where_query(df): 32 | """Test that function works with pandas query-style string expression.""" 33 | 34 | expected = pd.DataFrame( 35 | {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [0, 0, 10, 0]} 36 | ) 37 | result = update_where( 38 | df, conditions="a > 2 and b < 8", target_column_name="c", target_val=10 39 | ) 40 | 41 | assert_frame_equal(result, expected) 42 | 43 | 44 | def test_not_boolean_conditions(df): 45 | """Raise Error if `conditions` is not a boolean type.""" 46 | with pytest.raises(ValueError): 47 | df.update_where( 48 | conditions=(df.a + 5), 49 | target_column_name="c", 50 | target_val=10, 51 | ) 52 | -------------------------------------------------------------------------------- /tests/helpers.py: -------------------------------------------------------------------------------- 1 | """Helper functions for running tests.""" 2 | 3 | import os 4 | 5 | 6 | def running_on_ci() -> bool: 7 | """Return True if running on CI machine.""" 8 | return os.environ.get("JANITOR_CI_MACHINE") is not None 9 | -------------------------------------------------------------------------------- /tests/io/test_read_commandline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | from subprocess import CalledProcessError 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | import janitor.io 10 | 11 | 12 | def test_read_commandline(dataframe): 13 | """ 14 | Test asserts that the dataframe made 15 | from the read_commandline function is 16 | identical to the test dataframe from 17 | which the .csv file was created. 
18 | 19 | """ 20 | # create a temporary .csv file from test data 21 | temp_dir = tempfile.gettempdir() 22 | 23 | dataframe.to_csv(f"{temp_dir}/dataframe.csv", index=0) 24 | 25 | # create a new dataframe from the temporary .csv using 26 | # the cat command from the bash commandline 27 | 28 | if sys.platform in ["win32"]: 29 | # cat is not an operable command for Windows command line 30 | # "type" is a similar call 31 | df = janitor.io.read_commandline(f"type {temp_dir}\\dataframe.csv") 32 | else: 33 | df = janitor.io.read_commandline(f"cat {temp_dir}/dataframe.csv") 34 | 35 | # Make assertion that new dataframe created with read_commandline 36 | # is equal to the test dataframe 37 | assert df.equals(dataframe) 38 | 39 | # clean up after the test 40 | os.unlink(f"{temp_dir}/dataframe.csv") 41 | 42 | 43 | def test_read_commandline_bad_cmd(dataframe): 44 | """ 45 | Test 1 raises a TypeError if read_commandline 46 | is given an input that is not a string. 47 | 48 | Test 2 raises a CalledProcessError if 49 | read_commandline is given a string 50 | which is not a valid bash command. 51 | 52 | Test 3 raises an EmptyDataError if 53 | read_commandline is given a string which 54 | is a valid bash command, however results 55 | in the shell not creating a dataframe. 56 | """ 57 | temp_dir = tempfile.gettempdir() 58 | 59 | # create a temporary .csv file 60 | dataframe.to_csv(f"{temp_dir}/dataframe.csv") 61 | 62 | # Test 1 63 | with pytest.raises(TypeError): 64 | janitor.io.read_commandline(6) 65 | 66 | # Test 2 67 | with pytest.raises(CalledProcessError): 68 | janitor.io.read_commandline("bad command") 69 | 70 | # Test 3 71 | # windows does not support "cat" in commandline 72 | # "type" command must be used and it returns a different error 73 | cmd = "cat" 74 | 75 | ExpectedError = pd.errors.EmptyDataError 76 | if sys.platform in ["win32"]: 77 | cmd = "type" 78 | ExpectedError = CalledProcessError 79 | 80 | with pytest.raises(ExpectedError): 81 | janitor.io.read_commandline(cmd) 82 | 83 | # clean up after the tests 84 | os.unlink(f"{temp_dir}/dataframe.csv") 85 | -------------------------------------------------------------------------------- /tests/math/test_ecdf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from hypothesis import given, settings 4 | from hypothesis.extra.pandas import series 5 | 6 | 7 | @given(s=series(dtype=np.number)) 8 | @settings(deadline=None) 9 | def test_ecdf(s): 10 | """A simple execution test.""" 11 | if s.isna().sum() > 0: 12 | with pytest.raises(ValueError): 13 | x, y = s.ecdf() 14 | else: 15 | x, y = s.ecdf() 16 | assert len(x) == len(y) 17 | 18 | 19 | @given(s=series(dtype=str)) 20 | def test_ecdf_string(s): 21 | """Test that type enforcement is in place.""" 22 | with pytest.raises(TypeError): 23 | x, y = s.ecdf() 24 | -------------------------------------------------------------------------------- /tests/math/test_exp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | 6 | @pytest.mark.functions 7 | def test_exp(): 8 | s = pd.Series([0, 1, 2, 3, -1]) 9 | out = s.exp() 10 | assert (out == np.exp(s)).all() 11 | assert (s.index == out.index).all() 12 | -------------------------------------------------------------------------------- /tests/math/test_log.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 
import pytest 4 | 5 | 6 | @pytest.mark.functions 7 | def test_log(): 8 | s = pd.Series([0, 1, 2, 3, -1]) 9 | 10 | with pytest.raises(RuntimeError): 11 | s.log(error="raise") 12 | 13 | with pytest.warns(RuntimeWarning): 14 | out = s.log(error="warn") 15 | 16 | assert out[s <= 0].isna().all() 17 | assert (out.index == s.index).all() 18 | assert (out[s > 0] == np.log(np.array([1, 2, 3]))).all() 19 | 20 | out = s.log(error="ignore") 21 | 22 | assert out[s <= 0].isna().all() 23 | assert (out.index == s.index).all() 24 | assert (out[s > 0] == np.log(np.array([1, 2, 3]))).all() 25 | -------------------------------------------------------------------------------- /tests/math/test_logit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | 6 | @pytest.mark.functions 7 | def test_logit(): 8 | s = pd.Series([0, 0.1, 0.2, 0.3, 0.5, 0.9, 1, 2]) 9 | inside = (0 < s) & (s < 1) 10 | valid = np.array([0.1, 0.2, 0.3, 0.5, 0.9]) 11 | ans = np.log(valid / (1 - valid)) 12 | 13 | with pytest.raises(RuntimeError): 14 | s.logit(error="raise") 15 | 16 | with pytest.warns(RuntimeWarning): 17 | out = s.logit(error="warn") 18 | 19 | assert out[inside].notna().all() 20 | assert out[inside].to_numpy() == pytest.approx(ans) 21 | assert (out.index == s.index).all() 22 | assert out[~inside].isna().all() 23 | 24 | out = s.logit(error="ignore") 25 | 26 | assert out[inside].notna().all() 27 | assert out[inside].to_numpy() == pytest.approx(ans) 28 | assert (out.index == s.index).all() 29 | assert out[~inside].isna().all() 30 | -------------------------------------------------------------------------------- /tests/math/test_normal_cdf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from scipy.stats import norm 4 | 5 | 6 | @pytest.mark.functions 7 | def test_normal_cdf(): 8 | s = pd.Series([0, 1, 2, 3, -1]) 9 | out = s.normal_cdf() 10 | assert (out == norm.cdf(s)).all() 11 | assert (s.index == out.index).all() 12 | -------------------------------------------------------------------------------- /tests/math/test_probit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from scipy.stats import norm 5 | 6 | 7 | @pytest.mark.functions 8 | def test_probit(): 9 | s = pd.Series([-1, 0, 0.1, 0.2, 0.3, 1, 2]) 10 | inside = (0 < s) & (s < 1) 11 | valid = np.array([0.1, 0.2, 0.3]) 12 | ans = norm.ppf(valid) 13 | 14 | with pytest.raises(RuntimeError): 15 | s.probit(error="raise") 16 | 17 | with pytest.warns(RuntimeWarning): 18 | out = s.probit(error="warn") 19 | 20 | assert out[inside].notna().all() 21 | assert (out[inside] == ans).all() 22 | assert (out.index == s.index).all() 23 | assert out[~inside].isna().all() 24 | 25 | out = s.probit(error="ignore") 26 | 27 | assert out[inside].notna().all() 28 | assert (out[inside] == ans).all() 29 | assert (out.index == s.index).all() 30 | assert out[~inside].isna().all() 31 | -------------------------------------------------------------------------------- /tests/math/test_sigmoid.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from scipy.special import expit 4 | 5 | 6 | @pytest.mark.functions 7 | def test_sigmoid(): 8 | s = pd.Series([0, 1, 2, 3, -1]) 9 | out = s.sigmoid() 10 | assert (out == expit(s)).all() 11 | assert (s.index == 
out.index).all() 12 | -------------------------------------------------------------------------------- /tests/math/test_softmax.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from scipy.special import softmax as scipy_softmax 4 | 5 | 6 | @pytest.mark.functions 7 | def test_softmax(): 8 | s = pd.Series([0, 1, 2, 3, -1]) 9 | out = s.softmax() 10 | assert (out == scipy_softmax(s)).all() 11 | assert (s.index == out.index).all() 12 | assert out.sum() == 1.0 13 | -------------------------------------------------------------------------------- /tests/math/test_z_score.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.mark.functions 6 | def test_z_score(): 7 | s = pd.Series([0, 1, 2, 3, -1]) 8 | 9 | m = s.mean() 10 | st = s.std() 11 | 12 | ans = (s - m) / st 13 | 14 | d = {} 15 | 16 | assert (s.z_score(moments_dict=d) == ans).all() 17 | assert (s.z_score().index == s.index).all() 18 | 19 | assert d["mean"] == m 20 | assert d["std"] == st 21 | -------------------------------------------------------------------------------- /tests/ml/test_get_features_targets.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import given, settings 3 | 4 | import janitor.ml # noqa: F401 5 | from janitor.testing_utils.strategies import df_strategy 6 | 7 | 8 | @pytest.mark.ml 9 | @given(df=df_strategy()) 10 | @settings(deadline=None, max_examples=10) 11 | def test_get_features_targets(df): 12 | """Test one column returned as target and rest as features.""" 13 | X, y = df.clean_names().get_features_targets( 14 | target_column_names="bell_chart" 15 | ) 16 | assert X.shape[1] == 4 17 | assert len(y.shape) == 1 18 | 19 | 20 | @pytest.mark.ml 21 | @given(df=df_strategy()) 22 | @settings(deadline=None, max_examples=10) 23 | def test_get_features_targets_multi_features(df): 24 | """Test one column returned as target and two as features.""" 25 | X, y = df.clean_names().get_features_targets( 26 | feature_column_names=["animals@#$%^", "cities"], 27 | target_column_names="bell_chart", 28 | ) 29 | assert X.shape[1] == 2 30 | assert len(y.shape) == 1 31 | 32 | 33 | @pytest.mark.ml 34 | @given(df=df_strategy()) 35 | @settings(deadline=None, max_examples=10) 36 | def test_get_features_target_multi_columns(df): 37 | """Test two columns returned as target and rest as features.""" 38 | X, y = df.clean_names().get_features_targets( 39 | target_column_names=["a", "bell_chart"] 40 | ) 41 | assert X.shape[1] == 3 42 | assert y.shape[1] == 2 43 | -------------------------------------------------------------------------------- /tests/polars/functions/test_convert_excel_date_polars.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | import janitor.polars # noqa: F401 4 | 5 | 6 | def test_convert_excel_date(): 7 | df = pl.DataFrame({"dates": [42580.3333333333]}) 8 | 9 | expression = pl.col("dates").convert_excel_date().alias("dd") 10 | expression = df.with_columns(expression).get_column("dd") 11 | assert expression.dtype.is_temporal() is True 12 | -------------------------------------------------------------------------------- /tests/polars/functions/test_convert_matlab_date_polars.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | import janitor.polars # noqa: F401 
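# Background note (added context, not part of the original test file):
# MATLAB serial date numbers count days from the proleptic year 0, so the
# Unix epoch 1970-01-01 corresponds to datenum 719529. A rough pandas
# equivalent of the conversion, assuming that offset, would be:
#
#     import pandas as pd
#     pd.to_datetime(datenums - 719529, unit="D")  # datenums: MATLAB dates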
4 | 5 | 6 | def test_convert_matlab_date(): 7 | df = pl.DataFrame( 8 | { 9 | "dates": [ 10 | 733_301.0, 11 | 729_159.0, 12 | 734_471.0, 13 | 737_299.563_296_356_5, 14 | 737_300.000_000_000_0, 15 | ] 16 | } 17 | ) 18 | expression = pl.col("dates").convert_matlab_date().alias("dd") 19 | expression = df.with_columns(expression).get_column("dd") 20 | assert expression.dtype.is_temporal() is True 21 | -------------------------------------------------------------------------------- /tests/polars/functions/test_expand_polars.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | import pytest 3 | from polars.testing import assert_frame_equal 4 | 5 | import janitor.polars # noqa: F401 6 | 7 | 8 | @pytest.fixture 9 | def df(): 10 | """pytest fixture""" 11 | return pl.DataFrame( 12 | dict( 13 | group=(1, 2, 1, 2), 14 | item_id=(1, 2, 2, 3), 15 | item_name=("a", "a", "b", "b"), 16 | value1=(1, None, 3, 4), 17 | value2=range(4, 8), 18 | ) 19 | ) 20 | 21 | 22 | def test_column_None(df): 23 | """Test output if *columns is empty.""" 24 | assert_frame_equal(df.expand(), df) 25 | 26 | 27 | def test_empty_groups(df): 28 | """Raise TypeError if wrong column type is passed.""" 29 | msg = "The argument passed to the columns parameter " 30 | msg += "should either be a string, a column selector, " 31 | msg += "a polars expression, or a polars Series; instead got.+" 32 | with pytest.raises(TypeError, match=msg): 33 | df.complete("group", {}) 34 | 35 | 36 | def test_type_sort(df): 37 | """Raise TypeError if `sort` is not boolean.""" 38 | with pytest.raises(TypeError): 39 | df.complete("group", "item_id", sort=11) 40 | 41 | 42 | def test_expand_1(df): 43 | """ 44 | Test output for janitor.expand. 45 | """ 46 | expected = df.expand("group", "item_id", "item_name", sort=True) 47 | actual = ( 48 | df.select(pl.col("group").unique()) 49 | .join(df.select(pl.col("item_id").unique()), how="cross") 50 | .join(df.select(pl.col("item_name").unique()), how="cross") 51 | .sort(by=pl.all()) 52 | ) 53 | assert_frame_equal(actual, expected) 54 | 55 | 56 | def test_expand_2(df): 57 | """ 58 | Test output for janitor.expand. 
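Unlike test_expand_1 above, this variant also passes a polars Series
(df.get_column("item_id")) alongside plain column names, checking that
Series inputs are accepted too.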
59 | """ 60 | expected = df.expand( 61 | "group", df.get_column("item_id"), "item_name", sort=True 62 | ) 63 | actual = ( 64 | df.select(pl.col("group").unique()) 65 | .join(df.select(pl.col("item_id").unique()), how="cross") 66 | .join(df.select(pl.col("item_name").unique()), how="cross") 67 | .sort(by=pl.all()) 68 | ) 69 | assert_frame_equal(actual, expected) 70 | -------------------------------------------------------------------------------- /tests/spark/conftest.py: -------------------------------------------------------------------------------- 1 | """Spark fixtures.""" 2 | 3 | import pytest 4 | 5 | try: 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql.types import ( 8 | FloatType, 9 | IntegerType, 10 | StringType, 11 | StructField, 12 | StructType, 13 | ) 14 | except ImportError: 15 | pass 16 | 17 | 18 | @pytest.fixture # (scope="session") 19 | def spark(): 20 | """Create spark session.""" 21 | spark = SparkSession.builder.getOrCreate() 22 | yield spark 23 | spark.stop() 24 | 25 | 26 | @pytest.fixture 27 | def spark_df(spark): 28 | """Create spark dataframe.""" 29 | schema = StructType( 30 | [ 31 | StructField("a", IntegerType(), True), 32 | StructField("Bell__Chart", FloatType(), True), 33 | StructField("decorated-elephant", IntegerType(), True), 34 | StructField("animals@#$%^", StringType(), True), 35 | StructField("cities", StringType(), True), 36 | ] 37 | ) 38 | return spark.createDataFrame([], schema) 39 | 40 | 41 | @pytest.fixture 42 | def spark_dataframe(spark, dataframe): 43 | """Another function to create spark dataframe.""" 44 | return spark.createDataFrame(dataframe) 45 | -------------------------------------------------------------------------------- /tests/test_data/016-MSPTDA-Excel.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/tests/test_data/016-MSPTDA-Excel.xlsx -------------------------------------------------------------------------------- /tests/test_data/excel_without_headers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/tests/test_data/excel_without_headers.xlsx -------------------------------------------------------------------------------- /tests/test_data/file_example_XLSX_10.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/tests/test_data/file_example_XLSX_10.xlsx -------------------------------------------------------------------------------- /tests/test_data/worked-examples.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyjanitor-devs/pyjanitor/7081f0de547bcc9fcb3209f7c29169f76a6977c1/tests/test_data/worked-examples.xlsx -------------------------------------------------------------------------------- /tests/test_documentation_build.py: -------------------------------------------------------------------------------- 1 | """Tests for documentation build.""" 2 | 3 | import os 4 | 5 | import pytest 6 | 7 | # If `mkdocs` wasn't installed in environment, just skip. 8 | # Can't use `pytest.importorskip("mkdocs")`, 'mkdocs' is also 9 | # a folder name to pyjanitor project. 
10 | pytest.importorskip("mkdocstrings") 11 | 12 | 13 | @pytest.mark.documentation 14 | def test_docs_general_functions_present(): 15 | """Test that all docs pages build correctly. 16 | 17 | TODO: There has to be a better way to automatically check that 18 | all of the functions are present in the docs. 19 | This is an awesome thing that we could use help with in the future. 20 | """ 21 | # Build docs using mkdocs 22 | os.system("mkdocs build --clean") 23 | 24 | # We want to check that the following keywords are present. 25 | # I put in a subsample of general functions. 26 | # This can be made much more robust. 27 | rendered_correctly = False 28 | with open("./site/api/functions/index.html", "r+") as f: 29 | for line in f.readlines(): 30 | if "add_columns" in line or "update_where" in line: 31 | rendered_correctly = True 32 | assert rendered_correctly 33 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | """Tests for test helper functions.""" 2 | 3 | from helpers import running_on_ci 4 | 5 | 6 | def test_running_on_ci_local(monkeypatch): 7 | """Test that running_on_ci returns False on a local machine.""" 8 | monkeypatch.delenv("JANITOR_CI_MACHINE", raising=False) 9 | assert running_on_ci() is False 10 | 11 | 12 | def test_running_on_ci_ci(monkeypatch): 13 | """Test that running_on_ci returns True on a CI machine.""" 14 | monkeypatch.setenv("JANITOR_CI_MACHINE", "1") 15 | assert running_on_ci() is True 16 | -------------------------------------------------------------------------------- /tests/timeseries/test_fill_missing_timestamps.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from janitor.timeseries import _get_missing_timestamps, fill_missing_timestamps 7 | 8 | 9 | # Random data for testing 10 | @pytest.fixture 11 | def timeseries_dataframe() -> pd.DataFrame: 12 | """ 13 | Returns a time series dataframe 14 | """ 15 | ts_index = pd.date_range("1/1/2019", periods=1000, freq="1h") 16 | v1 = [randint(1, 2000) for i in range(1000)] 17 | test_df = pd.DataFrame({"v1": v1}, index=ts_index) 18 | return test_df 19 | 20 | 21 | @pytest.mark.timeseries 22 | def test_fill_missing_timestamps(timeseries_dataframe): 23 | """Test that filling missing timestamps works as expected.""" 24 | # Remove a random row; randint is inclusive on both ends, so cap at len - 1 25 | random_number = randint(1, len(timeseries_dataframe) - 1) 26 | df1 = timeseries_dataframe.drop(timeseries_dataframe.index[random_number]) 27 | 28 | # Fill missing timestamps 29 | # the fix for GH#1184 is to use the start and end from 30 | # timeseries_dataframe; 31 | # imagine that the last row of df1 is removed, or the first entry: 32 | # the length check in the assert line would fail 33 | result = fill_missing_timestamps( 34 | df1, 35 | frequency="1h", 36 | first_time_stamp=timeseries_dataframe.index.min(), 37 | last_time_stamp=timeseries_dataframe.index.max(), 38 | ) 39 | 40 | # Testing if the missing timestamp has been filled 41 | assert len(result) == len(timeseries_dataframe) 42 | 43 | # Testing if indices are exactly the same after filling 44 | original_index = timeseries_dataframe.index 45 | new_index = result.index 46 | delta = original_index.difference(new_index) 47 | 48 | assert delta.empty is True 49 | 50 | 51 | @pytest.mark.timeseries 52 | def test__get_missing_timestamps(timeseries_dataframe): 53 | """Test utility
function for identifying the missing timestamps.""" 54 | from random import sample 55 | 56 | timeseries_dataframe.index.freq = None 57 | timestamps_to_drop = sample(timeseries_dataframe.index.tolist(), 3) 58 | df = timeseries_dataframe.drop(index=timestamps_to_drop) 59 | missing_timestamps = _get_missing_timestamps(df, "1h") 60 | assert set(missing_timestamps.index) == set(timestamps_to_drop) 61 | -------------------------------------------------------------------------------- /tests/timeseries/test_sort_timestamps_monotonically.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import janitor # noqa: F401 7 | import janitor.timeseries # noqa: F401 8 | 9 | 10 | @pytest.fixture 11 | def timeseries_dataframe() -> pd.DataFrame: 12 | """ 13 | Returns a time series dataframe 14 | """ 15 | ts_index = pd.date_range("1/1/2019", periods=1000, freq="1h") 16 | v1 = [randint(1, 2000) for i in range(1000)] 17 | test_df = pd.DataFrame({"v1": v1}, index=ts_index) 18 | return test_df 19 | 20 | 21 | # NOTE: The tests possibly can be merged back together later 22 | # if they are parametrized properly. 23 | # NOTE: These tests use `df.equals(other_df)`, 24 | # because the desired `pd.assert_frame_equal(df, other_df)` 25 | # constantly failed on the CI systems. 26 | # It's a task for later to fix. 27 | 28 | 29 | @pytest.mark.timeseries 30 | def test_sort_timestamps_monotonically(timeseries_dataframe): 31 | """Test sort_timestamps_monotonically for ascending order""" 32 | df = timeseries_dataframe.shuffle( 33 | reset_index=False 34 | ).sort_timestamps_monotonically() 35 | assert df.equals(timeseries_dataframe) 36 | 37 | 38 | @pytest.mark.timeseries 39 | def test_sort_timestamps_monotonically_decreasing(timeseries_dataframe): 40 | """Test sort_timestamps_monotonically for descending order""" 41 | df2 = timeseries_dataframe.sort_index(ascending=False) 42 | df3 = df2.sort_timestamps_monotonically("decreasing") 43 | assert df3.equals(df2) 44 | 45 | 46 | @pytest.mark.timeseries 47 | def test_sort_timestamps_monotonically_strict(timeseries_dataframe): 48 | """Test sort_timestamps_monotonically for index duplication handling""" 49 | df = timeseries_dataframe.shuffle(reset_index=False) 50 | random_number = df.index[randint(1, len(timeseries_dataframe) - 1)] 51 | df = pd.concat( 52 | [df, df.loc[[random_number], :]] 53 | ).sort_timestamps_monotonically(direction="increasing", strict=True) 54 | assert df.equals(timeseries_dataframe) 55 | -------------------------------------------------------------------------------- /tests/utils/test_check_column.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from janitor.utils import check_column 4 | 5 | 6 | @pytest.mark.utils 7 | def test_check_column(dataframe): 8 | """ 9 | check_column should return None if the column exists 10 | """ 11 | assert check_column(dataframe, ["a"]) is None 12 | 13 | 14 | @pytest.mark.utils 15 | def test_check_column_single(dataframe): 16 | """ 17 | Check that it works with a single input 18 | """ 19 | 20 | assert check_column(dataframe, "a") is None 21 | 22 | with pytest.raises(ValueError): 23 | check_column(dataframe, "b") 24 | 25 | # should also work with non-string inputs 26 | 27 | with pytest.raises(ValueError): 28 | check_column(dataframe, 2) 29 | 30 | dataframe[2] = "asdf" 31 | 32 | assert check_column(dataframe, 2) is None 33 | 34 | 35 | @pytest.mark.utils 36 | def
test_check_column_absent_column(dataframe): 37 | """ 38 | check_column should raise an error if the column is absent. 39 | """ 40 | with pytest.raises(ValueError): 41 | check_column(dataframe, ["b"]) 42 | 43 | 44 | @pytest.mark.utils 45 | def test_check_column_excludes(dataframe): 46 | """ 47 | check_column should return None if the column is absent and present is False 48 | """ 49 | assert check_column(dataframe, ["b"], present=False) is None 50 | 51 | 52 | @pytest.mark.utils 53 | def test_check_column_present_column_excludes(dataframe): 54 | """ 55 | check_column should raise an error if the column is present and present is 56 | False 57 | """ 58 | with pytest.raises(ValueError): 59 | check_column(dataframe, ["a"], present=False) 60 | -------------------------------------------------------------------------------- /tests/utils/test_deprecated_alias.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import pytest 4 | 5 | from janitor.utils import deprecated_alias 6 | 7 | 8 | @deprecated_alias(a="alpha", b="beta") 9 | def simple_sum(alpha, beta): 10 | gamma = alpha + beta 11 | return gamma 12 | 13 | 14 | @pytest.mark.utils 15 | def test_old_aliases(): 16 | """ 17 | Using old aliases should result in `DeprecationWarning` 18 | """ 19 | with pytest.warns(DeprecationWarning): 20 | simple_sum(a=2, b=6) 21 | 22 | 23 | @pytest.mark.utils 24 | def test_new_aliases(): 25 | """ 26 | Using new aliases should not result in errors or warnings 27 | """ 28 | # https://github.com/scikit-learn/scikit-learn/issues/22572#issuecomment-1047316960 29 | with warnings.catch_warnings(record=True) as record: 30 | simple_sum(alpha=2, beta=6) 31 | assert not record 32 | 33 | assert simple_sum(alpha=2, beta=6) 34 | 35 | 36 | @pytest.mark.utils 37 | def test_mixed_aliases(): 38 | """ 39 | Using mixed aliases should result in errors 40 | """ 41 | with pytest.raises(TypeError): 42 | assert simple_sum(alpha=2, beta=6, a=5) 43 | -------------------------------------------------------------------------------- /tests/utils/test_idempotent.py: -------------------------------------------------------------------------------- 1 | from math import fabs, floor 2 | 3 | import pytest 4 | 5 | from janitor.utils import idempotent 6 | 7 | 8 | @pytest.mark.utils 9 | @pytest.mark.parametrize("func,data", [(fabs, -5), (floor, 10.45)]) 10 | def test__idempotence(func, data): 11 | idempotent(func, data) 12 | -------------------------------------------------------------------------------- /tests/utils/test_import_message.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pytest 5 | 6 | from janitor.utils import import_message 7 | 8 | 9 | @pytest.mark.utils 10 | def test_import_message(capsys): 11 | is_conda = os.path.exists(os.path.join(sys.prefix, "conda-meta")) 12 | if is_conda: 13 | message = ( 14 | "To use the janitor submodule biology, you need to install " 15 | "biopython.\n\n" 16 | "To do so, use the following command:\n\n" 17 | "    conda install -c conda-forge biopython\n" 18 | ) 19 | else: 20 | message = ( 21 | "To use the janitor submodule biology, you need to install " 22 | "biopython.\n\n" 23 | "To do so, use the following command:\n\n" 24 | "    pip install biopython\n" 25 | ) 26 | import_message( 27 | submodule="biology", 28 | package="biopython", 29 | conda_channel="conda-forge", 30 | pip_install=True, 31 | ) 32 | captured = capsys.readouterr() 33 | assert captured.out == message 34 |
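# For context, a sketch (assumed usage, not code from this repo) of how an
# optional-dependency submodule would produce the message asserted above:
#
#     try:
#         from Bio import SeqIO  # hypothetical optional import
#     except ImportError:
#         import_message(
#             submodule="biology",
#             package="biopython",
#             conda_channel="conda-forge",
#             pip_install=True,
#         )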
-------------------------------------------------------------------------------- /tests/utils/test_is_connected.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | import pytest 4 | 5 | from janitor.utils import is_connected 6 | 7 | """ 8 | Tests the is_connected helper function, 9 | which checks whether the client 10 | is connected to the internet. 11 | 12 | Example: 13 | print(is_connected("www.google.com")) 14 | console >> True 15 | 16 | Test 1: happy path, ensures the function works 17 | 18 | Test 2: web addresses that are not recognized 19 | raise a socket.gaierror (comzzz is not a TLD). 20 | 21 | Test 3: web addresses that are not recognized 22 | raise a socket.gaierror (aadsfff.com does not exist 23 | at the time of testing). 24 | 25 | If test 3 fails, perhaps this is because 26 | the website now exists. If that is the case, 27 | alter or delete the test. 28 | """ 29 | 30 | 31 | def test_is_connected(): 32 | assert is_connected("www.google.com") 33 | with pytest.raises(socket.gaierror): 34 | assert is_connected("www.google.comzzz") is False 35 | with pytest.raises(socket.gaierror): 36 | assert is_connected("aadsfff.com") is False 37 | -------------------------------------------------------------------------------- /tests/utils/test_replace_empty_string_with_none.py: -------------------------------------------------------------------------------- 1 | """Tests for _replace_empty_string_with_none helper function.""" 2 | 3 | import pandas as pd 4 | import pytest 5 | from pandas.testing import assert_series_equal 6 | 7 | from janitor.functions.currency_column_to_numeric import ( 8 | _replace_empty_string_with_none, 9 | _replace_original_empty_string_with_none, 10 | ) 11 | 12 | 13 | @pytest.mark.utils 14 | def test_replace_empty_string_with_none(): 15 | """Example-based test for _replace_empty_string_with_none.""" 16 | df = pd.DataFrame({"a": ["", 1, 0.34, "6.5", ""]}) 17 | df_expected = pd.DataFrame({"a": [None, 1, 0.34, "6.5", None]}) 18 | 19 | df["a"] = _replace_empty_string_with_none(df["a"]) 20 | assert_series_equal(df["a"], df_expected["a"]) 21 | 22 | 23 | @pytest.mark.utils 24 | def test_replace_original_empty_string_with_none(): 25 | """ 26 | Example test for the "original" _replace_empty_string_with_none. 27 | 28 | NOTE: This should be deprecated, I think? 29 | TODO: Investigate whether this should be deprecated. 30 | """ 31 | df = pd.DataFrame({"a": [1, 0.34, "6.5", None, "ORIGINAL_NA", "foo"]}) 32 | df_expected = pd.DataFrame({"a": [1, 0.34, "6.5", None, None, "foo"]}) 33 | 34 | df["a"] = _replace_original_empty_string_with_none(df["a"]) 35 | assert_series_equal(df["a"], df_expected["a"]) 36 | -------------------------------------------------------------------------------- /tests/utils/test_skiperror.py: -------------------------------------------------------------------------------- 1 | """Tests for skiperror.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | from janitor.utils import skiperror 8 | 9 | 10 | @pytest.mark.functions 11 | def test_skiperror(): 12 | """ 13 | Overall test for skiperror. 14 | 15 | TODO: I believe this test should be refactored into smaller "unit" tests.
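One possible split (a sketch, not implemented): one test per mode, i.e.
the default NaN fill, return_x=True, and return_val=5, each asserting
against the same small frame defined below.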
16 | """ 17 | df = pd.DataFrame({"x": [1, 2, 3, "a"], "y": [1, 2, 3, "b"]}) 18 | 19 | def func(s): 20 | """Dummy helper function.""" 21 | return s + 1 22 | 23 | # Verify that applying function causes error 24 | with pytest.raises(Exception): 25 | df["x"].apply(func) 26 | 27 | result = df["x"].apply(skiperror(func)) 28 | assert (result.to_numpy()[:-1] == np.array([2, 3, 4])).all() and np.isnan( 29 | result.to_numpy()[-1] 30 | ) 31 | 32 | result = df["x"].apply(skiperror(func, return_x=True)) 33 | assert (result.to_numpy() == np.array([2, 3, 4, "a"], dtype=object)).all() 34 | 35 | result = df["x"].apply(skiperror(func, return_x=False, return_val=5)) 36 | assert (result.to_numpy() == np.array([2, 3, 4, 5])).all() 37 | -------------------------------------------------------------------------------- /tests/utils/test_skipna.py: -------------------------------------------------------------------------------- 1 | """Tests for skipna.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | from janitor.utils import skipna 8 | 9 | 10 | @pytest.mark.functions 11 | def test_skipna(): 12 | """ 13 | Overall test for skipna. 14 | 15 | TODO: Should be refactored into separate tests. 16 | """ 17 | df = pd.DataFrame({"x": ["a", "b", "c", np.nan], "y": [1, 2, 3, np.nan]}) 18 | 19 | def func(s): 20 | """Dummy helper func.""" 21 | return s + "1" 22 | 23 | # Verify that applying function causes error 24 | with pytest.raises(Exception): 25 | df["x"].apply(func) 26 | 27 | result = df["x"].apply(skipna(func)) 28 | assert ( 29 | result.to_numpy()[:-1] == np.array(["a1", "b1", "c1"]) 30 | ).all() and np.isnan(result.to_numpy()[-1]) 31 | -------------------------------------------------------------------------------- /tests/xarray/conftest.py: -------------------------------------------------------------------------------- 1 | """Fixtures for xarray tests.""" 2 | 3 | import numpy as np 4 | import pytest 5 | import xarray as xr 6 | 7 | 8 | @pytest.fixture 9 | def da(): 10 | """ 11 | Input testing DataArray for clone_using and convert_datetime_to_number. 12 | 13 | It creates a two-dimensional array of random integers adds axis coordinates 14 | that are either linearly or log-spaced increments. 15 | 16 | Included is a simple metadata dictionary passed as `attrs`. 17 | 18 | .. 
# noqa: DAR201 19 | """ 20 | da = xr.DataArray( 21 | np.random.randint(0, 100, size=(512, 1024)), 22 | dims=["random_ax_1", "random_ax_2"], 23 | coords=dict( 24 | random_ax_1=np.linspace(0, 1, 512), 25 | random_ax_2=np.logspace(-2, 2, 1024), 26 | ), 27 | name="blarg", 28 | attrs=dict(a=3, b=["asdf", "fdsa"]), 29 | ) 30 | return da 31 | -------------------------------------------------------------------------------- /tests/xarray/test_clone_using.py: -------------------------------------------------------------------------------- 1 | """Tests for clone_using.""" 2 | 3 | import numpy as np 4 | import pytest 5 | import xarray as xr 6 | 7 | import janitor # noqa: F401 8 | 9 | 10 | @pytest.mark.xarray 11 | def test_successful_cloning_coords(da): 12 | """Test that clone_using coordinates works correctly.""" 13 | 14 | # with copying coords 15 | new_da: xr.DataArray = da.clone_using(np.random.randn(*da.data.shape)) 16 | 17 | with pytest.raises(AssertionError): 18 | np.testing.assert_equal(new_da.data, da.data) 19 | 20 | assert all( 21 | ( 22 | new_coord == old_coord 23 | for new_coord, old_coord in zip(new_da.coords, da.coords) 24 | ) 25 | ) 26 | assert new_da.dims == da.dims 27 | 28 | 29 | @pytest.mark.xarray 30 | def test_successful_cloning_no_coords(da): 31 | """Test that cloning works without coordinates.""" 32 | 33 | new_da: xr.DataArray = da.clone_using( 34 | np.random.randn(*da.data.shape), use_coords=False 35 | ) 36 | 37 | with pytest.raises(AssertionError): 38 | np.testing.assert_equal(new_da.data, da.data) 39 | 40 | assert new_da.dims == da.dims 41 | 42 | 43 | @pytest.mark.xarray 44 | def test_metadata_cloning(da): 45 | """Test that metadata gets cloned over.""" 46 | new_da: xr.DataArray = da.clone_using( 47 | np.random.randn(*da.data.shape), use_attrs=True, new_name="new_name" 48 | ) 49 | 50 | assert new_da.name != da.name 51 | assert new_da.attrs == da.attrs 52 | 53 | 54 | @pytest.mark.xarray 55 | def test_no_coords_errors(da: xr.DataArray): 56 | """Test that errors are raised when dims do not match.""" 57 | # number of dims should match 58 | with pytest.raises(ValueError): 59 | da.clone_using(np.random.randn(10, 10, 10), use_coords=False) 60 | 61 | # shape of each axis does not need to match 62 | da.clone_using(np.random.randn(10, 10), use_coords=False) 63 | 64 | 65 | @pytest.mark.xarray 66 | def test_coords_errors(da: xr.DataArray): 67 | # number of dims should match 68 | with pytest.raises(ValueError): 69 | da.clone_using(np.random.randn(10, 10, 10), use_coords=False) 70 | 71 | # shape of each axis must match when using coords 72 | with pytest.raises(ValueError): 73 | da.clone_using(np.random.randn(10, 10), use_coords=True) 74 | -------------------------------------------------------------------------------- /tests/xarray/test_convert_datetime_to_number.py: -------------------------------------------------------------------------------- 1 | """Tests for datetime_conversion.""" 2 | 3 | import numpy as np 4 | import pytest 5 | import xarray as xr 6 | 7 | 8 | @pytest.mark.xarray 9 | def test_datetime_conversion(da): 10 | """Test that datetime conversion works on DataArrays.""" 11 | seconds_arr = np.arange(512) 12 | 13 | # dataarrays 14 | new_da = da.assign_coords( 15 | random_ax_1=1e9 * seconds_arr * np.timedelta64(1, "ns") 16 | ).convert_datetime_to_number("m", dim="random_ax_1") 17 | 18 | # account for rounding errors 19 | np.testing.assert_array_almost_equal( 20 | new_da.coords["random_ax_1"].data, 1 / 60 * seconds_arr 21 | ) 22 | 23 | # datasets 24 | new_ds = xr.Dataset( 25 | dict( 
26 | array=da.assign_coords( 27 | random_ax_1=1e9 * seconds_arr * np.timedelta64(1, "ns") 28 | ) 29 | ) 30 | ).convert_datetime_to_number("m", dim="random_ax_1") 31 | 32 | np.testing.assert_array_almost_equal( 33 | new_ds.coords["random_ax_1"].data, 1 / 60 * seconds_arr 34 | ) 35 | --------------------------------------------------------------------------------