├── .copier-answers.yml ├── .git_archival.txt ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── 0-general_issue.md │ ├── 1-bug_report.md │ ├── 2-feature_request.md │ └── README.md ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── README.md │ ├── asv-main.yml │ ├── asv-nightly.yml │ ├── asv-pr.yml │ ├── build-documentation.yml │ ├── pre-commit-ci.yml │ ├── publish-benchmarks-pr.yml │ ├── publish-to-pypi.yml │ ├── smoke-test.yml │ └── testing-and-coverage.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── .setup_dev.sh ├── LICENSE ├── README.md ├── benchmarks ├── README.md ├── __init__.py ├── asv.conf.json └── benchmarks.py ├── docs ├── Makefile ├── _static │ └── custom.css ├── _templates │ └── autosummary │ │ ├── base.rst │ │ ├── class.rst │ │ └── module.rst ├── about.rst ├── about │ ├── internals.rst │ └── npd_internals.png ├── conf.py ├── gettingstarted.rst ├── gettingstarted │ ├── contributing.rst │ ├── installation.rst │ └── quickstart.ipynb ├── index.rst ├── intro_images │ ├── loc_into_nested.png │ ├── nestedframe_example.png │ ├── pandas_dfs.png │ └── reduce.png ├── pre_executed │ └── performance.ipynb ├── reference.rst ├── reference │ ├── accessor.rst │ ├── ext_array.rst │ ├── nesteddtype.rst │ ├── nestedframe.rst │ ├── packer.rst │ └── utils.rst ├── requirements.txt ├── tutorials.rst └── tutorials │ ├── README.md │ ├── data_loading_notebook.ipynb │ ├── data_manipulation.ipynb │ ├── low_level.ipynb │ └── nested_spectra.ipynb ├── pyproject.toml ├── src └── nested_pandas │ ├── __init__.py │ ├── datasets │ ├── __init__.py │ └── generation.py │ ├── nestedframe │ ├── __init__.py │ ├── core.py │ ├── expr.py │ └── io.py │ ├── py.typed │ ├── series │ ├── __init__.py │ ├── _storage │ │ ├── __init__.py │ │ ├── list_struct_storage.py │ │ ├── struct_list_storage.py │ │ └── table_storage.py │ ├── accessor.py │ ├── dtype.py │ ├── ext_array.py │ ├── packer.py │ └── utils.py │ └── utils │ ├── __init__.py │ └── utils.py └── tests ├── nested_pandas ├── conftest.py ├── datasets │ └── test_generation.py ├── e2e_tests │ └── test_issue89.py ├── nestedframe │ ├── test_io.py │ └── test_nestedframe.py ├── series │ ├── test_accessor.py │ ├── test_dtype.py │ ├── test_ext_array.py │ ├── test_packer.py │ └── test_series_utils.py ├── test_packaging.py └── utils │ └── test_utils.py └── test_data ├── nested.parquet ├── not_nestable.parquet └── vsx-x-ztfdr22_lc-m31.parquet /.copier-answers.yml: -------------------------------------------------------------------------------- 1 | # Changes here will be overwritten by Copier 2 | _commit: v2.0.7 3 | _src_path: gh:lincc-frameworks/python-project-template 4 | author_email: brantd@uw.edu 5 | author_name: LINCC Frameworks 6 | create_example_module: false 7 | custom_install: true 8 | enforce_style: 9 | - ruff_lint 10 | - ruff_format 11 | failure_notification: [] 12 | include_benchmarks: true 13 | include_docs: true 14 | include_notebooks: true 15 | mypy_type_checking: basic 16 | package_name: nested_pandas 17 | project_license: MIT 18 | project_name: nested-pandas 19 | project_organization: lincc-frameworks 20 | python_versions: 21 | - '3.10' 22 | - '3.11' 23 | - '3.12' 24 | - '3.13' 25 | test_lowest_version: all 26 | -------------------------------------------------------------------------------- /.git_archival.txt: -------------------------------------------------------------------------------- 1 | node: 78deda7f896727baa7be7990ab159d9236d9f68c 2 | node-date: 2025-06-05T15:18:35-04:00 3 | describe-name: v0.4.4 4 | ref-names: 
HEAD -> main, tag: v0.4.4 -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # For explanation of this file and uses see 2 | # https://git-scm.com/docs/gitattributes 3 | # https://developer.lsst.io/git/git-lfs.html#using-git-lfs-enabled-repositories 4 | # https://lincc-ppt.readthedocs.io/en/latest/practices/git-lfs.html 5 | # 6 | # Used by https://github.com/lsst/afwdata.git 7 | # *.boost filter=lfs diff=lfs merge=lfs -text 8 | # *.dat filter=lfs diff=lfs merge=lfs -text 9 | # *.fits filter=lfs diff=lfs merge=lfs -text 10 | # *.gz filter=lfs diff=lfs merge=lfs -text 11 | # 12 | # apache parquet files 13 | # *.parq filter=lfs diff=lfs merge=lfs -text 14 | # 15 | # sqlite files 16 | # *.sqlite3 filter=lfs diff=lfs merge=lfs -text 17 | # 18 | # gzip files 19 | # *.gz filter=lfs diff=lfs merge=lfs -text 20 | # 21 | # png image files 22 | # *.png filter=lfs diff=lfs merge=lfs -text 23 | 24 | .git_archival.txt export-subst -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/0-general_issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: General issue 3 | about: Quickly create a general issue 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/1-bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Tell us about a problem to fix 4 | title: 'Short description' 5 | labels: 'bug' 6 | assignees: '' 7 | 8 | --- 9 | **Bug report** 10 | 11 | 12 | **Before submitting** 13 | Please check the following: 14 | 15 | - [ ] I have described the situation in which the bug arose, including what code was executed, information about my environment, and any applicable data others will need to reproduce the problem. 16 | - [ ] I have included available evidence of the unexpected behavior (including error messages, screenshots, and/or plots) as well as a description of what I expected instead. 17 | - [ ] If I have a solution in mind, I have provided an explanation and/or pseudocode and/or task list. 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2-feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: 'Short description' 5 | labels: 'enhancement' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Feature request** 11 | 12 | 13 | **Before submitting** 14 | Please check the following: 15 | 16 | - [ ] I have described the purpose of the suggested change, specifying what I need the enhancement to accomplish, i.e. what problem it solves. 17 | - [ ] I have included any relevant links, screenshots, environment information, and data relevant to implementing the requested feature, as well as pseudocode for how I want to access the new functionality. 18 | - [ ] If I have ideas for how the new feature could be implemented, I have provided explanations and/or pseudocode and/or task lists for the steps. 
19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/README.md: -------------------------------------------------------------------------------- 1 | # Configurations 2 | 3 | Templates for various different issue types are defined in this directory 4 | and a pull request template is defined as ``../pull_request_template.md``. Adding, 5 | removing, and modifying these templates to suit the needs of your project is encouraged. 6 | 7 | For more information about these templates, look here: https://lincc-ppt.readthedocs.io/en/latest/practices/issue_pr_templating.html 8 | 9 | Or if you still have questions contact us: https://lincc-ppt.readthedocs.io/en/latest/source/contact.html -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | - package-ecosystem: "pip" 8 | directory: "/" 9 | schedule: 10 | interval: "monthly" 11 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 12 | 13 | ## Change Description 14 | 19 | - [ ] My PR includes a link to the issue that I am addressing 20 | 21 | 22 | 23 | ## Solution Description 24 | 25 | 26 | 27 | 28 | ## Code Quality 29 | - [ ] I have read the Contribution Guide 30 | - [ ] My code follows the code style of this project 31 | - [ ] My code builds (or compiles) cleanly without any errors or warnings 32 | - [ ] My code contains relevant comments and necessary documentation 33 | 34 | ## Project-Specific Pull Request Checklists 35 | 36 | 37 | ### Bug Fix Checklist 38 | - [ ] My fix includes a new test that breaks as a result of the bug (if possible) 39 | - [ ] My change includes a breaking change 40 | - [ ] My change includes backwards compatibility and deprecation warnings (if possible) 41 | 42 | ### New Feature Checklist 43 | - [ ] I have added or updated the docstrings associated with my feature using the [NumPy docstring format](https://numpydoc.readthedocs.io/en/latest/format.html) 44 | - [ ] I have updated the tutorial to highlight my new feature (if appropriate) 45 | - [ ] I have added unit/End-to-End (E2E) test cases to cover my new feature 46 | - [ ] My change includes a breaking change 47 | - [ ] My change includes backwards compatibility and deprecation warnings (if possible) 48 | 49 | ### Documentation Change Checklist 50 | - [ ] Any updated docstrings use the [NumPy docstring format](https://numpydoc.readthedocs.io/en/latest/format.html) 51 | 52 | ### Build/CI Change Checklist 53 | - [ ] If required or optional dependencies have changed (including version numbers), I have updated the README to reflect this 54 | - [ ] If this is a new CI setup, I have added the associated badge to the README 55 | 56 | 57 | 58 | ### Other Change Checklist 59 | - [ ] Any new or updated docstrings use the [NumPy docstring format](https://numpydoc.readthedocs.io/en/latest/format.html). 
60 | - [ ] I have updated the tutorial to highlight my new feature (if appropriate) 61 | - [ ] I have added unit/End-to-End (E2E) test cases to cover any changes 62 | - [ ] My change includes a breaking change 63 | - [ ] My change includes backwards compatibility and deprecation warnings (if possible) 64 | -------------------------------------------------------------------------------- /.github/workflows/README.md: -------------------------------------------------------------------------------- 1 | # Workflows 2 | 3 | The .yml files in this directory are used to define the various continuous 4 | integration scripts that will be run on your behalf e.g. nightly as a smoke check, 5 | or when you create a new PR. 6 | 7 | For more information about CI and workflows, look here: https://lincc-ppt.readthedocs.io/en/latest/practices/ci.html 8 | 9 | Or if you still have questions contact us: https://lincc-ppt.readthedocs.io/en/latest/source/contact.html -------------------------------------------------------------------------------- /.github/workflows/asv-main.yml: -------------------------------------------------------------------------------- 1 | # This workflow will run benchmarks with airspeed velocity (asv), 2 | # store the new results in the "benchmarks" branch and publish them 3 | # to a dashboard on GH Pages. 4 | name: Run ASV benchmarks for main 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | 10 | env: 11 | PYTHON_VERSION: "3.11" 12 | ASV_VERSION: "0.6.4" 13 | WORKING_DIR: ${{github.workspace}}/benchmarks 14 | 15 | concurrency: 16 | group: ${{github.workflow}}-${{github.ref}} 17 | cancel-in-progress: true 18 | 19 | jobs: 20 | asv-main: 21 | runs-on: ubuntu-latest 22 | permissions: 23 | contents: write 24 | defaults: 25 | run: 26 | working-directory: ${{env.WORKING_DIR}} 27 | steps: 28 | - name: Set up Python ${{env.PYTHON_VERSION}} 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: ${{env.PYTHON_VERSION}} 32 | - name: Checkout main branch of the repository 33 | uses: actions/checkout@v4 34 | with: 35 | fetch-depth: 0 36 | - name: Install dependencies 37 | run: pip install "asv[virtualenv]==${{env.ASV_VERSION}}" 38 | - name: Configure git 39 | run: | 40 | git config user.name "github-actions[bot]" 41 | git config user.email "41898282+github-actions[bot]@users.noreply.github.com" 42 | - name: Create ASV machine config file 43 | run: asv machine --machine gh-runner --yes 44 | - name: Fetch previous results from the "benchmarks" branch 45 | run: | 46 | if git ls-remote --exit-code origin benchmarks > /dev/null 2>&1; then 47 | git merge origin/benchmarks \ 48 | --allow-unrelated-histories \ 49 | --no-commit 50 | mv ../_results . 51 | fi 52 | - name: Run ASV for the main branch 53 | run: asv run ALL --skip-existing --verbose || true 54 | - name: Submit new results to the "benchmarks" branch 55 | uses: JamesIves/github-pages-deploy-action@v4 56 | with: 57 | branch: benchmarks 58 | folder: ${{env.WORKING_DIR}}/_results 59 | target-folder: _results 60 | - name: Generate dashboard HTML 61 | run: | 62 | asv show 63 | asv publish 64 | - name: Deploy to Github pages 65 | uses: JamesIves/github-pages-deploy-action@v4 66 | with: 67 | branch: gh-pages 68 | folder: ${{env.WORKING_DIR}}/_html -------------------------------------------------------------------------------- /.github/workflows/asv-nightly.yml: -------------------------------------------------------------------------------- 1 | # This workflow will run daily at 06:45. 
2 | # It will run benchmarks with airspeed velocity (asv) 3 | # and compare performance with the previous nightly build. 4 | name: Run benchmarks nightly job 5 | 6 | on: 7 | schedule: 8 | - cron: 45 6 * * * 9 | workflow_dispatch: 10 | 11 | env: 12 | PYTHON_VERSION: "3.11" 13 | ASV_VERSION: "0.6.4" 14 | WORKING_DIR: ${{github.workspace}}/benchmarks 15 | NIGHTLY_HASH_FILE: nightly-hash 16 | 17 | jobs: 18 | asv-nightly: 19 | runs-on: ubuntu-latest 20 | defaults: 21 | run: 22 | working-directory: ${{env.WORKING_DIR}} 23 | steps: 24 | - name: Set up Python ${{env.PYTHON_VERSION}} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{env.PYTHON_VERSION}} 28 | - name: Checkout main branch of the repository 29 | uses: actions/checkout@v4 30 | with: 31 | fetch-depth: 0 32 | - name: Install dependencies 33 | run: pip install "asv[virtualenv]==${{env.ASV_VERSION}}" 34 | - name: Configure git 35 | run: | 36 | git config user.name "github-actions[bot]" 37 | git config user.email "41898282+github-actions[bot]@users.noreply.github.com" 38 | - name: Create ASV machine config file 39 | run: asv machine --machine gh-runner --yes 40 | - name: Fetch previous results from the "benchmarks" branch 41 | run: | 42 | if git ls-remote --exit-code origin benchmarks > /dev/null 2>&1; then 43 | git merge origin/benchmarks \ 44 | --allow-unrelated-histories \ 45 | --no-commit 46 | mv ../_results . 47 | fi 48 | - name: Get nightly dates under comparison 49 | id: nightly-dates 50 | run: | 51 | echo "yesterday=$(date -d yesterday +'%Y-%m-%d')" >> $GITHUB_OUTPUT 52 | echo "today=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT 53 | - name: Use last nightly commit hash from cache 54 | uses: actions/cache@v4 55 | with: 56 | path: ${{env.WORKING_DIR}} 57 | key: nightly-results-${{steps.nightly-dates.outputs.yesterday}} 58 | - name: Run comparison of main against last nightly build 59 | run: | 60 | HASH_FILE=${{env.NIGHTLY_HASH_FILE}} 61 | CURRENT_HASH=${{github.sha}} 62 | if [ -f $HASH_FILE ]; then 63 | PREV_HASH=$(cat $HASH_FILE) 64 | asv continuous $PREV_HASH $CURRENT_HASH --verbose || true 65 | asv compare $PREV_HASH $CURRENT_HASH --sort ratio --verbose 66 | fi 67 | echo $CURRENT_HASH > $HASH_FILE 68 | - name: Update last nightly hash in cache 69 | uses: actions/cache@v4 70 | with: 71 | path: ${{env.WORKING_DIR}} 72 | key: nightly-results-${{steps.nightly-dates.outputs.today}} -------------------------------------------------------------------------------- /.github/workflows/asv-pr.yml: -------------------------------------------------------------------------------- 1 | # This workflow will run benchmarks with airspeed velocity (asv) for pull requests. 2 | # It will compare the performance of the main branch with the performance of the merge 3 | # with the new changes. It then publishes a comment with this assessment by triggering 4 | # the publish-benchmarks-pr workflow. 5 | # Based on https://securitylab.github.com/research/github-actions-preventing-pwn-requests/. 
6 | name: Run benchmarks for PR 7 | 8 | on: 9 | pull_request: 10 | branches: [ main ] 11 | workflow_dispatch: 12 | 13 | concurrency: 14 | group: ${{github.workflow}}-${{github.ref}} 15 | cancel-in-progress: true 16 | 17 | env: 18 | PYTHON_VERSION: "3.11" 19 | ASV_VERSION: "0.6.4" 20 | WORKING_DIR: ${{github.workspace}}/benchmarks 21 | ARTIFACTS_DIR: ${{github.workspace}}/artifacts 22 | 23 | jobs: 24 | asv-pr: 25 | runs-on: ubuntu-latest 26 | defaults: 27 | run: 28 | working-directory: ${{env.WORKING_DIR}} 29 | steps: 30 | - name: Set up Python ${{env.PYTHON_VERSION}} 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: ${{env.PYTHON_VERSION}} 34 | - name: Checkout PR branch of the repository 35 | uses: actions/checkout@v4 36 | with: 37 | fetch-depth: 0 38 | - name: Display Workflow Run Information 39 | run: | 40 | echo "Workflow Run ID: ${{github.run_id}}" 41 | - name: Install dependencies 42 | run: pip install "asv[virtualenv]==${{env.ASV_VERSION}}" lf-asv-formatter 43 | - name: Make artifacts directory 44 | run: mkdir -p ${{env.ARTIFACTS_DIR}} 45 | - name: Save pull request number 46 | run: echo ${{github.event.pull_request.number}} > ${{env.ARTIFACTS_DIR}}/pr 47 | - name: Get current job logs URL 48 | uses: Tiryoh/gha-jobid-action@v1 49 | id: jobs 50 | with: 51 | github_token: ${{secrets.GITHUB_TOKEN}} 52 | job_name: ${{github.job}} 53 | - name: Create ASV machine config file 54 | run: asv machine --machine gh-runner --yes 55 | - name: Save comparison of PR against main branch 56 | run: | 57 | git remote add upstream https://github.com/${{github.repository}}.git 58 | git fetch upstream 59 | asv continuous upstream/main HEAD --verbose || true 60 | asv compare upstream/main HEAD --sort ratio --verbose | tee output 61 | python -m lf_asv_formatter --asv_version "$(asv --version | awk '{print $2}')" 62 | printf "\n\nClick [here]($STEP_URL) to view all benchmarks." >> output 63 | mv output ${{env.ARTIFACTS_DIR}} 64 | env: 65 | STEP_URL: ${{steps.jobs.outputs.html_url}}#step:10:1 66 | - name: Upload artifacts (PR number and benchmarks output) 67 | uses: actions/upload-artifact@v4 68 | with: 69 | name: benchmark-artifacts 70 | path: ${{env.ARTIFACTS_DIR}} -------------------------------------------------------------------------------- /.github/workflows/build-documentation.yml: -------------------------------------------------------------------------------- 1 | 2 | # This workflow will install Python dependencies, build the package and then build the documentation. 3 | 4 | name: Build documentation 5 | 6 | 7 | on: 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | concurrency: 14 | group: ${{ github.workflow }}-${{ github.ref }} 15 | cancel-in-progress: true 16 | 17 | jobs: 18 | build: 19 | 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Set up Python 3.11 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: '3.11' 28 | - name: Install dependencies 29 | run: | 30 | sudo apt-get update 31 | python -m pip install --upgrade pip 32 | if [ -f docs/requirements.txt ]; then pip install -r docs/requirements.txt; fi 33 | pip install . 
34 | - name: Install notebook requirements 35 | run: | 36 | sudo apt-get install pandoc 37 | - name: Build docs 38 | run: | 39 | sphinx-build -T -E -b html -d docs/build/doctrees ./docs docs/build/html 40 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit-ci.yml: -------------------------------------------------------------------------------- 1 | 2 | # This workflow runs pre-commit hooks on pushes and pull requests to main 3 | # to enforce coding style. To ensure correct configuration, please refer to: 4 | # https://lincc-ppt.readthedocs.io/en/latest/practices/ci_precommit.html 5 | name: Run pre-commit hooks 6 | 7 | on: 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | jobs: 14 | pre-commit-ci: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 20 | - name: Set up Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: '3.11' 24 | - name: Install dependencies 25 | run: | 26 | sudo apt-get update 27 | python -m pip install --upgrade pip 28 | pip install .[dev] 29 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 30 | - uses: pre-commit/action@v3.0.1 31 | with: 32 | extra_args: --all-files --verbose 33 | env: 34 | SKIP: "check-lincc-frameworks-template-version,no-commit-to-branch,check-added-large-files,validate-pyproject,sphinx-build,pytest-check" 35 | - uses: pre-commit-ci/lite-action@v1.1.0 36 | if: failure() && github.event_name == 'pull_request' && github.event.pull_request.draft == false -------------------------------------------------------------------------------- /.github/workflows/publish-benchmarks-pr.yml: -------------------------------------------------------------------------------- 1 | # This workflow publishes a benchmarks comment on a pull request. It is triggered after the 2 | # benchmarks are computed in the asv-pr workflow. This separation of concerns allows us to limit 3 | # access to the target repository's private tokens and secrets, increasing the level of security. 4 | # Based on https://securitylab.github.com/research/github-actions-preventing-pwn-requests/. 
5 | name: Publish benchmarks comment to PR 6 | 7 | on: 8 | workflow_run: 9 | workflows: ["Run benchmarks for PR"] 10 | types: [completed] 11 | 12 | jobs: 13 | upload-pr-comment: 14 | runs-on: ubuntu-latest 15 | if: > 16 | github.event.workflow_run.event == 'pull_request' && 17 | github.event.workflow_run.conclusion == 'success' 18 | permissions: 19 | issues: write 20 | pull-requests: write 21 | steps: 22 | - name: Display Workflow Run Information 23 | run: | 24 | echo "Workflow Run ID: ${{ github.event.workflow_run.id }}" 25 | echo "Head SHA: ${{ github.event.workflow_run.head_sha }}" 26 | echo "Head Branch: ${{ github.event.workflow_run.head_branch }}" 27 | echo "Conclusion: ${{ github.event.workflow_run.conclusion }}" 28 | echo "Event: ${{ github.event.workflow_run.event }}" 29 | - name: Download artifact 30 | uses: dawidd6/action-download-artifact@v7 31 | with: 32 | name: benchmark-artifacts 33 | run_id: ${{ github.event.workflow_run.id }} 34 | - name: Extract artifacts information 35 | id: pr-info 36 | run: | 37 | printf "PR number: $(cat pr)\n" 38 | printf "Output:\n$(cat output)" 39 | printf "pr=$(cat pr)" >> $GITHUB_OUTPUT 40 | - name: Find benchmarks comment 41 | uses: peter-evans/find-comment@v3 42 | id: find-comment 43 | with: 44 | issue-number: ${{ steps.pr-info.outputs.pr }} 45 | comment-author: 'github-actions[bot]' 46 | body-includes: view all benchmarks 47 | - name: Create or update benchmarks comment 48 | uses: peter-evans/create-or-update-comment@v4 49 | with: 50 | comment-id: ${{ steps.find-comment.outputs.comment-id }} 51 | issue-number: ${{ steps.pr-info.outputs.pr }} 52 | body-path: output 53 | edit-mode: replace -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | 2 | # This workflow will upload a Python Package using Twine when a release is created 3 | # For more information see: https://github.com/pypa/gh-action-pypi-publish#trusted-publishing 4 | 5 | # This workflow uses actions that are not certified by GitHub. 6 | # They are provided by a third-party and are governed by 7 | # separate terms of service, privacy policy, and support 8 | # documentation. 9 | 10 | name: Upload Python Package 11 | 12 | on: 13 | release: 14 | types: [published] 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | deploy: 21 | 22 | runs-on: ubuntu-latest 23 | permissions: 24 | id-token: write 25 | steps: 26 | - uses: actions/checkout@v4 27 | - name: Set up Python 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: '3.11' 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install build 35 | - name: Build package 36 | run: python -m build 37 | - name: Publish package 38 | uses: pypa/gh-action-pypi-publish@release/v1 39 | -------------------------------------------------------------------------------- /.github/workflows/smoke-test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will run daily at 06:45. 2 | # It will install Python dependencies and run tests with a variety of Python versions. 
3 | # See documentation for help debugging smoke test issues: 4 | # https://lincc-ppt.readthedocs.io/en/latest/practices/ci_testing.html#version-culprit 5 | 6 | name: Unit test smoke test 7 | 8 | on: 9 | 10 | # Runs this workflow automatically 11 | schedule: 12 | - cron: 45 6 * * * 13 | 14 | # Allows you to run this workflow manually from the Actions tab 15 | workflow_dispatch: 16 | 17 | jobs: 18 | build: 19 | 20 | runs-on: ubuntu-latest 21 | strategy: 22 | matrix: 23 | python-version: ['3.10', '3.11', '3.12', '3.13'] 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | - name: Set up Python ${{ matrix.python-version }} 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | - name: Install dependencies 32 | run: | 33 | sudo apt-get update 34 | python -m pip install --upgrade pip 35 | pip install -e .[dev] 36 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 37 | - name: List dependencies 38 | run: | 39 | pip list 40 | - name: Run unit tests with pytest 41 | run: | 42 | python -m pytest -------------------------------------------------------------------------------- /.github/workflows/testing-and-coverage.yml: -------------------------------------------------------------------------------- 1 | 2 | # This workflow will install Python dependencies, run tests and report code coverage with a variety of Python versions 3 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 4 | 5 | name: Unit test and code coverage 6 | 7 | on: 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | jobs: 14 | build: 15 | 16 | runs-on: ubuntu-latest 17 | strategy: 18 | matrix: 19 | python-version: ['3.10', '3.11', '3.12', '3.13'] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | sudo apt-get update 30 | python -m pip install --upgrade pip 31 | pip install -e .[dev] 32 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 33 | - name: Run unit tests with pytest 34 | run: | 35 | python -m pytest --cov=nested_pandas --cov-report=xml 36 | - name: Upload coverage report to codecov 37 | uses: codecov/codecov-action@v5 38 | with: 39 | token: ${{ secrets.CODECOV_TOKEN }} 40 | test-lowest-versions: 41 | runs-on: ubuntu-latest 42 | steps: 43 | - uses: actions/checkout@v4 44 | - name: Set up Python 3.10 45 | uses: actions/setup-python@v5 46 | with: 47 | python-version: '3.10' 48 | - name: Install dependencies 49 | run: | 50 | sudo apt-get update 51 | python -m pip install --upgrade uv 52 | uv venv venv 53 | source venv/bin/activate 54 | uv pip compile --resolution=lowest -o requirements_lowest.txt pyproject.toml 55 | uv pip install --constraint=requirements_lowest.txt -e .[dev] 56 | - name: Run unit tests with pytest 57 | run: | 58 | source venv/bin/activate 59 | python -m pytest 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | 
sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | _version.py 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | _readthedocs/ 75 | docs/reference/api/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # vscode 135 | .vscode/ 136 | 137 | # dask 138 | dask-worker-space/ 139 | 140 | # tmp directory 141 | tmp/ 142 | 143 | # Mac OS 144 | .DS_Store 145 | 146 | # Airspeed Velocity performance results 147 | _results/ 148 | _html/ 149 | 150 | # Project initialization script 151 | .initialize_new_project.sh 152 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | 2 | repos: 3 | # Compare the local template version to the latest remote template version 4 | # This hook should always pass. It will print a message if the local version 5 | # is out of date. 6 | - repo: https://github.com/lincc-frameworks/pre-commit-hooks 7 | rev: v0.1.2 8 | hooks: 9 | - id: check-lincc-frameworks-template-version 10 | name: Check template version 11 | description: Compare current template version against latest 12 | verbose: true 13 | # Clear output from jupyter notebooks so that only the input cells are committed. 14 | - repo: local 15 | hooks: 16 | - id: jupyter-nb-clear-output 17 | name: Clear output from Jupyter notebooks 18 | description: Clear output from Jupyter notebooks. 
19 | files: \.ipynb$ 20 | stages: [pre-commit] 21 | language: system 22 | entry: jupyter nbconvert --clear-output 23 | exclude: docs/pre_executed 24 | # Prevents committing directly to branches named 'main' and 'master'. 25 | - repo: https://github.com/pre-commit/pre-commit-hooks 26 | rev: v4.4.0 27 | hooks: 28 | - id: no-commit-to-branch 29 | name: Prevent main branch commits 30 | description: Prevent the user from committing directly to the primary branch. 31 | - id: check-added-large-files 32 | name: Check for large files 33 | description: Prevent the user from committing very large files. 34 | args: ['--maxkb=500'] 35 | # Verify that pyproject.toml is well formed 36 | - repo: https://github.com/abravalheri/validate-pyproject 37 | rev: v0.12.1 38 | hooks: 39 | - id: validate-pyproject 40 | name: Validate pyproject.toml 41 | description: Verify that pyproject.toml adheres to the established schema. 42 | # Verify that GitHub workflows are well formed 43 | - repo: https://github.com/python-jsonschema/check-jsonschema 44 | rev: 0.28.0 45 | hooks: 46 | - id: check-github-workflows 47 | args: ["--verbose"] 48 | - repo: https://github.com/astral-sh/ruff-pre-commit 49 | # Ruff version. 50 | rev: v0.2.1 51 | hooks: 52 | - id: ruff 53 | name: Lint code using ruff; sort and organize imports 54 | types_or: [ python, pyi ] 55 | args: ["--fix"] 56 | - repo: https://github.com/astral-sh/ruff-pre-commit 57 | # Ruff version. 58 | rev: v0.2.1 59 | hooks: 60 | - id: ruff-format 61 | name: Format code using ruff 62 | types_or: [ python, pyi, jupyter ] 63 | # Analyze type hints and report errors. 64 | - repo: local 65 | hooks: 66 | - id: mypy 67 | name: mypy (python files in src/ and tests/) 68 | entry: mypy 69 | language: system 70 | types: [python] 71 | files: ^(src|tests)/ 72 | args: 73 | [ 74 | "--ignore-missing-imports", # Ignore imports without type hints 75 | ] 76 | # Make sure Sphinx can build the documentation while explicitly omitting 77 | # notebooks from the docs, so users don't have to wait through the execution 78 | # of each notebook on each commit. By default, these will be checked in the 79 | # GitHub workflows. 80 | - repo: local 81 | hooks: 82 | - id: sphinx-build 83 | name: Build documentation with Sphinx 84 | entry: sphinx-build 85 | language: system 86 | always_run: true 87 | exclude_types: [file, symlink] 88 | args: 89 | [ 90 | "-M", # Run sphinx in make mode, so we can use -D flag later 91 | # Note: -M requires next 3 args to be builder, source, output 92 | "html", # Specify builder 93 | "./docs", # Source directory of documents 94 | "./_readthedocs", # Output directory for rendered documents 95 | "-T", # Show full traceback on exception 96 | "-E", # Don't use saved env; always read all files 97 | "-d", # Flag for cached environment and doctrees 98 | "./docs/_build/doctrees", # Directory 99 | "-D", # Flag to override settings in conf.py 100 | "exclude_patterns=notebooks/*,_build", # Exclude notebooks and build dir from pre-commit 101 | ] 102 | # Run unit tests, verify that they pass. Note that coverage is run against 103 | # the ./src directory here because that is what will be committed. In the 104 | # github workflow script, the coverage is run against the installed package 105 | # and uploaded to Codecov by calling pytest like so: 106 | # `python -m pytest --cov=nested_pandas --cov-report=xml` 107 | - repo: local 108 | hooks: 109 | - id: pytest-check 110 | name: Run unit tests 111 | description: Run unit tests with pytest. 
112 | entry: bash -c "if python -m pytest --co -qq; then python -m pytest --cov=./src --cov-report=html; fi" 113 | language: system 114 | pass_filenames: false 115 | always_run: true 116 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | 2 | # .readthedocs.yml 3 | # Read the Docs configuration file 4 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 5 | 6 | # Required 7 | version: 2 8 | 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements.txt 22 | - method: pip 23 | path: . 24 | -------------------------------------------------------------------------------- /.setup_dev.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Bash Unofficial strict mode (http://redsymbol.net/articles/unofficial-bash-strict-mode/) 4 | # and (https://disconnected.systems/blog/another-bash-strict-mode/) 5 | set -o nounset # Any uninitialized variable is an error 6 | set -o errexit # Exit the script on the failure of any command to execute without error 7 | set -o pipefail # Fail command pipelines on the failure of any individual step 8 | IFS=$'\n\t' # Set the internal field separator to avoid iteration errors 9 | # Trap all exits and output something helpful 10 | trap 's=$?; echo "$0: Error on line "$LINENO": $BASH_COMMAND"; exit $s' ERR 11 | 12 | # This script should be run by new developers to install this package in 13 | # editable mode and configure their local environment 14 | 15 | echo "Checking virtual environment" 16 | if [ "${VIRTUAL_ENV:-missing}" = "missing" ] && [ "${CONDA_PREFIX:-missing}" = "missing" ]; then 17 | echo 'No virtual environment detected: none of $VIRTUAL_ENV or $CONDA_PREFIX is set.' 18 | echo 19 | echo "=== This script is going to install the project in the system python environment ===" 20 | echo "Proceed? [y/N]" 21 | read -r RESPONSE 22 | if [ "${RESPONSE}" != "y" ]; then 23 | echo "See https://lincc-ppt.readthedocs.io/ for details." 24 | echo "Exiting." 25 | exit 1 26 | fi 27 | 28 | fi 29 | 30 | echo "Checking pip version" 31 | MINIMUM_PIP_VERSION=22 32 | pipversion=( $(python -m pip --version | awk '{print $2}' | sed 's/\./\n\t/g') ) 33 | if let "${pipversion[0]}<${MINIMUM_PIP_VERSION}"; then 34 | echo "Insufficient version of pip found. Requires at least version ${MINIMUM_PIP_VERSION}." 35 | echo "See https://lincc-ppt.readthedocs.io/ for details." 36 | exit 1 37 | fi 38 | 39 | echo "Installing package and runtime dependencies in local environment" 40 | python -m pip install -e . 
> /dev/null 41 | 42 | echo "Installing developer dependencies in local environment" 43 | python -m pip install -e .'[dev]' > /dev/null 44 | if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt > /dev/null; fi 45 | 46 | echo "Installing pre-commit" 47 | pre-commit install > /dev/null 48 | 49 | ####################################################### 50 | # Include any additional configurations below this line 51 | ####################################################### 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 LINCC Frameworks 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nested-pandas 2 | 3 | [![Template](https://img.shields.io/badge/Template-LINCC%20Frameworks%20Python%20Project%20Template-brightgreen)](https://lincc-ppt.readthedocs.io/en/latest/) 4 | 5 | [![PyPI](https://img.shields.io/pypi/v/nested-pandas?color=blue&logo=pypi&logoColor=white)](https://pypi.org/project/nested-pandas/) 6 | [![Conda](https://img.shields.io/conda/vn/conda-forge/nested-pandas.svg?color=blue&logo=condaforge&logoColor=white)](https://anaconda.org/conda-forge/nested-pandas) 7 | 8 | [![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/lincc-frameworks/nested-pandas/smoke-test.yml)](https://github.com/lincc-frameworks/nested-pandas/actions/workflows/smoke-test.yml) 9 | [![codecov](https://codecov.io/gh/lincc-frameworks/nested-pandas/branch/main/graph/badge.svg)](https://codecov.io/gh/lincc-frameworks/nested-pandas) 10 | [![Read the Docs](https://img.shields.io/readthedocs/nested-pandas)](https://nested-pandas.readthedocs.io/) 11 | [![benchmarks](https://img.shields.io/github/actions/workflow/status/lincc-frameworks/nested-pandas/asv-main.yml?label=benchmarks)](https://lincc-frameworks.github.io/nested-pandas/) 12 | 13 | An extension of pandas for efficient representation of nested 14 | associated datasets. 15 | 16 | Nested-Pandas extends the [pandas](https://pandas.pydata.org/) package with 17 | tooling and support for nested dataframes packed into values of top-level 18 | dataframe columns. 
[Pyarrow](https://arrow.apache.org/docs/python/index.html) 19 | is used internally to aid in scalability and performance. 20 | 21 | Nested-Pandas allows data like this: 22 | 23 |

24 | pandas dataframes 25 |

26 | 27 | To instead be represented like this: 28 | 29 |

30 | nestedframe 31 |
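A frame like this can be built by packing one dataframe into another. Here is a minimal sketch, where `object_df` and `source_df` are hypothetical stand-ins for the two flat dataframes pictured above, with `source_df` indexed by the object each measurement belongs to:

```python
from nested_pandas import NestedFrame

# Pack the rows of "source_df" into a nested column of "object_df",
# matching measurements to objects by index
object_nf = NestedFrame(object_df).add_nested(source_df, "nested_sources")
```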

32 | 33 | Where the nested data is represented as nested dataframes: 34 | 35 | ```python 36 | # Each row of "object_nf" now has its own sub-dataframe of matched rows from "source_df" 37 | object_nf.loc[0]["nested_sources"] 38 | ```

41 | sub-dataframe 42 |
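Sub-columns of a nested column can also be accessed hierarchically with dot notation. A minimal sketch, reusing the column names from the example above:

```python
# Pull a single sub-column out across all rows of "object_nf"
object_nf["nested_sources.flux"]
```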

43 | 44 | Allowing powerful and straightforward operations, like: 45 | 46 | ```python 47 | # Compute the mean flux for each row of "object_nf" 48 | import numpy as np 49 | object_nf.reduce(np.mean, "nested_sources.flux") 50 | ``` 51 | 52 |

53 | using reduce 54 |
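The same hierarchical names can be used in queries, which filter the rows of each sub-dataframe. A minimal sketch (the "band" sub-column is assumed here purely for illustration):

```python
# Keep only the nested rows in each sub-dataframe observed in the "g" band
object_nf.query("nested_sources.band == 'g'")
```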

55 | 56 | Nested-Pandas is motivated by time-domain astronomy use cases, where we 57 | typically see two levels of information: information about astronomical objects and 58 | an associated set of `N` measurements of those objects. Nested-Pandas offers 59 | a performant and memory-efficient package for working with these types of datasets. 60 | 61 | Its core advantages are: 62 | * hierarchical column access 63 | * efficient packing of nested information into inputs to custom user functions 64 | * avoiding costly groupby operations 65 | 66 | 67 | 68 | This is a LINCC Frameworks project - find more information about LINCC Frameworks [here](https://lsstdiscoveryalliance.org/programs/lincc-frameworks/). 69 | 70 | 71 | 72 | ## Acknowledgements 73 | 74 | This project is supported by Schmidt Sciences. 75 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | This directory contains files that will be run via continuous testing either 4 | nightly or after committing code to a pull request. 5 | 6 | The runtime and/or memory usage of the functions defined in these files will be 7 | tracked and reported to give you a sense of the overall performance of your code. 8 | 9 | You are encouraged to add, update, or remove benchmark functions to suit the needs 10 | of your project. 11 | 12 | For more information, see the documentation here: https://lincc-ppt.readthedocs.io/en/latest/practices/ci_benchmarking.html -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/asv.conf.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | // The version of the config file format. Do not change, unless 4 | // you know what you are doing. 5 | "version": 1, 6 | // The name of the project being benchmarked. 7 | "project": "nested-pandas", 8 | // The project's homepage. 9 | "project_url": "https://github.com/lincc-frameworks/nested-pandas", 10 | // The URL or local path of the source code repository for the 11 | // project being benchmarked. 12 | "repo": "..", 13 | // List of branches to benchmark. If not provided, defaults to "master" 14 | // (for git) or "tip" (for mercurial). 15 | "branches": [ 16 | "HEAD" 17 | ], 18 | "install_command": [ 19 | "python -m pip install {wheel_file}" 20 | ], 21 | "build_command": [ 22 | "python -m build --wheel -o {build_cache_dir} {build_dir}" 23 | ], 24 | // The DVCS being used. If not set, it will be automatically 25 | // determined from "repo" by looking at the protocol in the URL 26 | // (if remote), or by looking for special directories, such as 27 | // ".git" (if local). 28 | "dvcs": "git", 29 | // The tool to use to create environments. May be "conda", 30 | // "virtualenv" or other value depending on the plugins in use. 31 | // If missing or the empty string, the tool will be automatically 32 | // determined by looking for tools on the PATH environment 33 | // variable. 34 | "environment_type": "virtualenv", 35 | // The base URL to show a commit for the project. 
36 | "show_commit_url": "https://github.com/lincc-frameworks/nested-pandas/commit/", 37 | // The Pythons you'd like to test against. If not provided, defaults 38 | // to the current version of Python used to run `asv`. 39 | "pythons": [ 40 | "3.11" 41 | ], 42 | // The matrix of dependencies to test. Each key is the name of a 43 | // package (in PyPI) and the values are version numbers. An empty 44 | // list indicates to just test against the default (latest) 45 | // version. 46 | "matrix": { 47 | "Cython": [], 48 | "build": [], 49 | "packaging": [] 50 | }, 51 | // The directory (relative to the current directory) that benchmarks are 52 | // stored in. If not provided, defaults to "benchmarks". 53 | "benchmark_dir": ".", 54 | // The directory (relative to the current directory) to cache the Python 55 | // environments in. If not provided, defaults to "env". 56 | "env_dir": "env", 57 | // The directory (relative to the current directory) that raw benchmark 58 | // results are stored in. If not provided, defaults to "results". 59 | "results_dir": "_results", 60 | // The directory (relative to the current directory) that the html tree 61 | // should be written to. If not provided, defaults to "html". 62 | "html_dir": "_html", 63 | // The number of characters to retain in the commit hashes. 64 | // "hash_length": 8, 65 | // `asv` will cache wheels of the recent builds in each 66 | // environment, making them faster to install next time. This is the 67 | // number of builds to keep, per environment. 68 | "build_cache_size": 8 69 | // The commits after which the regression search in `asv publish` 70 | // should start looking for regressions. Dictionary whose keys are 71 | // regexps matching to benchmark names, and values corresponding to 72 | // the commit (exclusive) after which to start looking for 73 | // regressions. The default is to start from the first commit 74 | // with results. If the commit is `null`, regression detection is 75 | // skipped for the matching benchmark. 76 | // 77 | // "regressions_first_commits": { 78 | // "some_benchmark": "352cdf", // Consider regressions only after this commit 79 | // "another_benchmark": null, // Skip regression detection altogether 80 | // } 81 | } -------------------------------------------------------------------------------- /benchmarks/benchmarks.py: -------------------------------------------------------------------------------- 1 | """Benchmarks to compute runtime and memory usage. 
2 | 3 | For more information on writing benchmarks: 4 | https://asv.readthedocs.io/en/stable/writing_benchmarks.html.""" 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pyarrow as pa 9 | from nested_pandas import NestedDtype, NestedFrame, datasets 10 | 11 | 12 | class AssignSingleDfToNestedSeries: 13 | """Benchmark the performance of changing a single nested series element""" 14 | 15 | n_objects = 10_000 16 | n_sources = 100 17 | new_df: pd.DataFrame 18 | series: pd.Series 19 | 20 | def setup(self): 21 | """Set up the benchmark environment.""" 22 | self.new_df = pd.DataFrame( 23 | { 24 | "time": np.arange(self.n_sources, dtype=np.float64), 25 | "flux": np.linspace(0, 1, self.n_sources), 26 | "band": np.full(self.n_sources, "lsstg"), 27 | } 28 | ) 29 | original_df = pd.DataFrame( 30 | { 31 | "time": np.linspace(0, 1, self.n_sources), 32 | "flux": np.arange(self.n_sources, dtype=np.float64), 33 | "band": np.full(self.n_sources, "sdssu"), 34 | } 35 | ) 36 | self.series = pd.Series( 37 | [original_df] * self.n_objects, 38 | # When we had NestedExtensionArray inheriting ArrowExtensionArray, it sorted the fields, so we 39 | # need to order by field name here for backwards compatibility. 40 | dtype=NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()}), 41 | ) 42 | 43 | def run(self): 44 | """Run the benchmark.""" 45 | self.series[self.n_objects // 2] = self.new_df 46 | 47 | def time_run(self): 48 | """Benchmark the runtime of changing a single nested series element.""" 49 | self.run() 50 | 51 | def peakmem_run(self): 52 | """Benchmark the memory usage of changing a single nested series element.""" 53 | self.run() 54 | 55 | 56 | class ReassignHalfOfNestedSeries: 57 | """Benchmark the performance of changing a lot of nested series elements""" 58 | 59 | n_objects = 10_000 60 | n_sources = 100 61 | series: pd.Series 62 | new_series: pd.Series 63 | 64 | def setup(self): 65 | """Set up the benchmark environment.""" 66 | # When we had NestedExtensionArray inheriting ArrowExtensionArray, it sorted the fields, so we need to 67 | # order by field name here for backwards compatibility. 
68 | dtype = NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()}) 69 | original_df = pd.DataFrame( 70 | { 71 | "time": np.linspace(0, 1, self.n_sources), 72 | "flux": np.arange(self.n_sources, dtype=np.float64), 73 | "band": np.full(self.n_sources, "sdssu"), 74 | } 75 | ) 76 | self.series = pd.Series( 77 | [original_df] * self.n_objects, 78 | dtype=dtype, 79 | ) 80 | 81 | new_df = pd.DataFrame( 82 | { 83 | "time": np.arange(self.n_sources, dtype=np.float64), 84 | "flux": np.linspace(0, 1, self.n_sources), 85 | "band": np.full(self.n_sources, "lsstg"), 86 | } 87 | ) 88 | self.new_series = pd.Series([new_df] * (self.n_objects // 2), dtype=dtype) 89 | 90 | def run(self): 91 | """Run the benchmark.""" 92 | self.series[::2] = self.new_series 93 | 94 | def time_run(self): 95 | """Benchmark the runtime of changing half of the nested series elements.""" 96 | self.run() 97 | 98 | def peakmem_run(self): 99 | """Benchmark the memory usage of changing half of the nested series elements.""" 100 | self.run() 101 | 102 | 103 | class NestedFrameAddNested: 104 | """Benchmark the NestedFrame.add_nested function""" 105 | 106 | n_base = 100 107 | layer_size = 1000 108 | base_nf = NestedFrame 109 | layer_nf = NestedFrame 110 | 111 | def setup(self): 112 | """Set up the benchmark environment""" 113 | # use provided seed, "None" acts as if no seed is provided 114 | randomstate = np.random.RandomState(seed=1) 115 | 116 | # Generate base data 117 | base_data = {"a": randomstate.random(self.n_base), "b": randomstate.random(self.n_base) * 2} 118 | self.base_nf = NestedFrame(data=base_data) 119 | 120 | layer_data = { 121 | "t": randomstate.random(self.layer_size * self.n_base) * 20, 122 | "flux": randomstate.random(self.layer_size * self.n_base) * 100, 123 | "band": randomstate.choice(["r", "g"], size=self.layer_size * self.n_base), 124 | "index": np.arange(self.layer_size * self.n_base) % self.n_base, 125 | } 126 | self.layer_nf = NestedFrame(data=layer_data).set_index("index") 127 | 128 | def run(self): 129 | """Run the benchmark.""" 130 | self.base_nf.add_nested(self.layer_nf, "nested") 131 | 132 | def time_run(self): 133 | """Benchmark the runtime of adding a nested layer""" 134 | self.run() 135 | 136 | def peakmem_run(self): 137 | """Benchmark the memory usage of adding a nested layer""" 138 | self.run() 139 | 140 | 141 | class NestedFrameReduce: 142 | """Benchmark the NestedFrame.reduce function""" 143 | 144 | n_base = 100 145 | n_nested = 1000 146 | nf = NestedFrame 147 | 148 | def setup(self): 149 | """Set up the benchmark environment""" 150 | self.nf = datasets.generate_data(self.n_base, self.n_nested) 151 | 152 | def run(self): 153 | """Run the benchmark.""" 154 | self.nf.reduce(np.mean, "nested.flux") 155 | 156 | def time_run(self): 157 | """Benchmark the runtime of applying the reduce function""" 158 | self.run() 159 | 160 | def peakmem_run(self): 161 | """Benchmark the memory usage of applying the reduce function""" 162 | self.run() 163 | 164 | 165 | class NestedFrameQuery: 166 | """Benchmark the NestedFrame.query function""" 167 | 168 | n_base = 100 169 | n_nested = 1000 170 | nf = NestedFrame 171 | 172 | def setup(self): 173 | """Set up the benchmark environment""" 174 | self.nf = datasets.generate_data(self.n_base, self.n_nested) 175 | 176 | def run(self): 177 | """Run the benchmark.""" 178 | 179 | # Apply nested layer query 180 | self.nf = self.nf.query("nested.band == 'g'") 181 | 182 | def time_run(self): 183 | """Benchmark the runtime of applying the two 
queries""" 184 | self.run() 185 | 186 | def peakmem_run(self): 187 | """Benchmark the memory usage of applying the two queries""" 188 | self.run() 189 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= -T -E -d _build/doctrees -D language=en 7 | EXCLUDENB ?= -D exclude_patterns="notebooks/*","_build","**.ipynb_checkpoints" 8 | SPHINXBUILD ?= sphinx-build 9 | SOURCEDIR = . 10 | BUILDDIR = ../_readthedocs/ 11 | 12 | .PHONY: help clean Makefile no-nb no-notebooks 13 | 14 | # Put it first so that "make" without argument is like "make help". 15 | help: 16 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 17 | 18 | # Build all Sphinx docs locally, except the notebooks 19 | no-nb no-notebooks: 20 | @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(EXCLUDENB) $(O) 21 | 22 | # Cleans up files generated by the build process 23 | clean: 24 | rm -r "_build/doctrees" 25 | rm -r "$(BUILDDIR)" 26 | 27 | # Catch-all target: route all unknown targets to Sphinx using the new 28 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 29 | %: Makefile 30 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 31 | 32 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | .jupyter-widgets { 2 | color: var(--pst-color-text-base) !important; 3 | } -------------------------------------------------------------------------------- /docs/_templates/autosummary/base.rst: -------------------------------------------------------------------------------- 1 | {%- if objname.split('.')[-1] == objname %} 2 | {{ objname | escape | underline }} 3 | {%- else %} 4 | {{ objname.split('.')[-1] | escape | underline }} 5 | {%- endif %} 6 | 7 | .. currentmodule:: {{ module }} 8 | 9 | .. auto{{ objtype }}:: {{ objname }} 10 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | {%- if objname.split('.')[-1] == objname %} 2 | {{ objname | escape | underline }} 3 | {%- else %} 4 | {{ objname.split('.')[-1] | escape | underline }} 5 | {%- endif %} 6 | 7 | .. currentmodule:: {{ module }} 8 | 9 | .. autoclass:: {{ objname }} 10 | 11 | {% block methods %} 12 | .. automethod:: __init__ 13 | 14 | {% if methods %} 15 | .. rubric:: {{ _('Methods') }} 16 | 17 | .. autosummary:: 18 | {% for item in methods %} 19 | ~{{ name }}.{{ item }} 20 | {%- endfor %} 21 | {% endif %} 22 | {% endblock %} 23 | 24 | {% block attributes %} 25 | {% if attributes %} 26 | .. rubric:: {{ _('Attributes') }} 27 | 28 | .. autosummary:: 29 | {% for item in attributes %} 30 | ~{{ name }}.{{ item }} 31 | {%- endfor %} 32 | {% endif %} 33 | {% endblock %} 34 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/module.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. automodule:: {{ fullname }} 4 | 5 | {% block attributes %} 6 | {%- if attributes %} 7 | .. 
rubric:: {{ _('Module Attributes') }} 8 | 9 | .. autosummary:: 10 | {% for item in attributes %} 11 | {{ item }} 12 | {%- endfor %} 13 | {% endif %} 14 | {%- endblock %} 15 | 16 | {%- block functions %} 17 | {%- if functions %} 18 | .. rubric:: {{ _('Functions') }} 19 | 20 | .. autosummary:: 21 | {% for item in functions %} 22 | {{ item }} 23 | {%- endfor %} 24 | {% endif %} 25 | {%- endblock %} 26 | 27 | {%- block classes %} 28 | {%- if classes %} 29 | .. rubric:: {{ _('Classes') }} 30 | 31 | .. autosummary:: 32 | {% for item in classes %} 33 | {{ item }} 34 | {%- endfor %} 35 | {% endif %} 36 | {%- endblock %} 37 | 38 | {%- block exceptions %} 39 | {%- if exceptions %} 40 | .. rubric:: {{ _('Exceptions') }} 41 | 42 | .. autosummary:: 43 | {% for item in exceptions %} 44 | {{ item }} 45 | {%- endfor %} 46 | {% endif %} 47 | {%- endblock %} 48 | 49 | {%- block modules %} 50 | {%- if modules %} 51 | .. rubric:: Modules 52 | 53 | .. autosummary:: 54 | :toctree: 55 | :recursive: 56 | {% for item in modules %} 57 | {{ item }} 58 | {%- endfor %} 59 | {% endif %} 60 | {%- endblock %} 61 | -------------------------------------------------------------------------------- /docs/about.rst: -------------------------------------------------------------------------------- 1 | About Nested-Pandas 2 | =================== 3 | 4 | 5 | .. toctree:: 6 | 7 | Internal Representation of Nested Data <about/internals> 8 | Performance Impact of Nested-Pandas <pre_executed/performance> -------------------------------------------------------------------------------- /docs/about/internals.rst: -------------------------------------------------------------------------------- 1 | Internal Representation of Nested Data 2 | ====================================== 3 | "Dataframes within Dataframes" is a useful heuristic for understanding the 4 | API/workings of a NestedFrame. However, the actual storage representation 5 | leverages pyarrow and materializes the nested dataframes as a view of the 6 | data. The following diagram details the actual storage representation of 7 | nested-pandas: 8 | 9 | .. image:: ./npd_internals.png 10 | :width: 400 11 | :align: center 12 | :alt: Internal representation of nested-pandas 13 | 14 | 15 | The advantage of this approach is that each sub-column ("field" in pyarrow) is 16 | stored in a flat array, with an offset array used to slice the data into the 17 | respective sub-dataframes. This allows for efficient transformations to other 18 | data representations (dataframes, list-arrays, flat arrays, etc.) which are 19 | used internally to minimize the overhead of operations involving nested data. 20 | 21 | Nested Serialization to Parquet 22 | ------------------------------- 23 | The internal design of nested columns has valid pyarrow struct-list objects 24 | underneath. This allows for direct serialization of nested columns to the 25 | parquet format. nested-pandas will automatically write nested columns to 26 | parquet format as valid pyarrow dtypes, which allows for them to be read 27 | by other parquet readers that support complex types. Additionally, nested-pandas 28 | will attempt to cast pyarrow struct-list columns to nested columns directly 29 | when reading from parquet. 30 | 31 | 32 | Multi-level Nesting Support 33 | --------------------------- 34 | At this time, nested-pandas only supports a single level of nesting. We 35 | intend to support multiple levels of nesting in the future, and community use 36 | cases that would benefit from deeper nesting will help motivate that work. 
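37 | 38 | Serialization Example 39 | --------------------- 40 | As a minimal sketch of the parquet round-trip described above (assuming ``nf`` is a 41 | ``NestedFrame`` holding a nested column; the file name is a placeholder): 42 | 43 | .. code-block:: python 44 | 45 | import nested_pandas as npd 46 | 47 | # Nested columns are written to parquet as valid pyarrow struct-list columns 48 | nf.to_parquet("nested_data.parquet") 49 | 50 | # On read, pyarrow struct-list columns are cast back to nested columns 51 | nf_roundtrip = npd.read_parquet("nested_data.parquet") 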
-------------------------------------------------------------------------------- /docs/about/npd_internals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/docs/about/npd_internals.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | 7 | import os 8 | import sys 9 | from importlib.metadata import version 10 | 11 | # Define path to the code to be documented **relative to where conf.py (this file) is kept** 12 | sys.path.insert(0, os.path.abspath("../src/")) 13 | 14 | # -- Project information ----------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 16 | 17 | project = "nested-pandas" 18 | copyright = "2024, LINCC Frameworks" 19 | author = "LINCC Frameworks" 20 | release = version("nested-pandas") 21 | # For example, take major/minor 22 | version = ".".join(release.split(".")[:2]) 23 | 24 | # -- General configuration --------------------------------------------------- 25 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 26 | 27 | extensions = ["sphinx.ext.mathjax", "sphinx.ext.napoleon", "sphinx.ext.viewcode", "sphinx.ext.autosummary"] 28 | 29 | extensions.append("nbsphinx") 30 | 31 | # -- sphinx-copybutton configuration ---------------------------------------- 32 | extensions.append("sphinx_copybutton") 33 | ## sets up the expected prompt text for console blocks, and excludes it from 34 | ## the text that goes into the clipboard. 35 | copybutton_exclude = ".linenos, .gp" 36 | copybutton_prompt_text = ">> " 37 | 38 | ## lets us suppress the copy button on select code blocks. 39 | copybutton_selector = "div:not(.no-copybutton) > div.highlight > pre" 40 | 41 | templates_path = ["_templates"] 42 | exclude_patterns = ["_build", "**.ipynb_checkpoints"] 43 | 44 | # This assumes that sphinx-build is called from the root directory 45 | master_doc = "index" 46 | # Remove 'view source code' from top of page (for html, not python) 47 | html_show_sourcelink = False 48 | # Remove namespaces from class/method signatures 49 | add_module_names = False 50 | # Hide full module path in navigation 51 | modindex_common_prefix = ["nested_pandas."] 52 | # Customize display of autosummary entries 53 | autosummary_imported_members = True 54 | 55 | html_theme = "sphinx_book_theme" 56 | 57 | html_static_path = ["_static"] 58 | html_css_files = ["custom.css"] 59 | -------------------------------------------------------------------------------- /docs/gettingstarted.rst: -------------------------------------------------------------------------------- 1 | Getting Started 2 | ======================================================================================== 3 | 4 | These pages will help you install and learn the basics of using nested-pandas. If you encounter any issues, 5 | we encourage you to open an issue on the 6 | `nested-pandas github repository <https://github.com/lincc-frameworks/nested-pandas/issues>`_. 7 | 8 | .. 
toctree:: 9 | :maxdepth: 1 10 | 11 | Installing nested-pandas <gettingstarted/installation> 12 | Contribution Guide <gettingstarted/contributing> 13 | Quickstart Guide <gettingstarted/quickstart> -------------------------------------------------------------------------------- /docs/gettingstarted/contributing.rst: -------------------------------------------------------------------------------- 1 | Contribution Guide 2 | ================== 3 | 4 | Dev Guide - Getting Started 5 | --------------------------- 6 | 7 | Download code and install dependencies in a conda environment. Run unit tests at the end as a verification that the packages are properly installed. 8 | 9 | .. code-block:: bash 10 | 11 | conda create -n nested_pandas_env python=3.11 12 | conda activate nested_pandas_env 13 | 14 | git clone https://github.com/lincc-frameworks/nested-pandas.git 15 | cd nested-pandas/ 16 | bash ./.setup_dev.sh 17 | 18 | pip install pytest 19 | pytest 20 | -------------------------------------------------------------------------------- /docs/gettingstarted/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | nested-pandas is available to install with pip, using the "nested-pandas" package name: 5 | 6 | .. code-block:: bash 7 | 8 | % pip install nested-pandas 9 | 10 | 11 | This will grab the latest release version of nested-pandas from PyPI. 12 | 13 | Installation from Source 14 | ------------------------ 15 | 16 | In some cases, installation via pip may not be sufficient. In particular, if you're looking to grab the latest 17 | development version of nested-pandas, you should instead build 'nested-pandas' from source. The following process downloads the 18 | 'nested-pandas' source code and installs it and any needed dependencies in a fresh conda environment. 19 | 20 | .. code-block:: bash 21 | 22 | conda create -n nested_pandas_env python=3.11 23 | conda activate nested_pandas_env 24 | 25 | git clone https://github.com/lincc-frameworks/nested-pandas.git 26 | cd nested-pandas 27 | pip install . 28 | pip install .[dev] # it may be necessary to use `pip install .'[dev]'` (with single quotes) depending on your machine. 29 | 30 | The ``pip install .[dev]`` command is optional, and installs dependencies needed to run the unit tests and build 31 | the documentation. The latest source version of nested-pandas may be less stable than a release, and so we recommend 32 | running the unit test suite to verify that your local install is performing as expected. 33 | 34 | .. code-block:: bash 35 | 36 | pip install pytest 37 | pytest -------------------------------------------------------------------------------- /docs/gettingstarted/quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Quickstart\n", 8 | "\n", 9 | "This notebook provides a brief introduction to nested-pandas, including the motivation and basics for working with the data structure. For more in-depth descriptions, see the other tutorial notebooks." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Installation\n", 17 | "\n", 18 | "With a valid Python environment, nested-pandas and its dependencies are easy to install using the `pip` package manager. 
The following command can be used to install it:" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# % pip install nested-pandas" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Overview\n", 35 | "\n", 36 | "Nested-Pandas is tailored towards efficient analysis of nested data sets. This includes data that would normally be represented in a Pandas DataFrame with multiple rows needed to represent a single \"thing\", and therefore columns whose values will be identical for that item.\n", 37 | "\n", 38 | "As a concrete example, consider an astronomical data set storing information about observations of physical objects, such as stars and galaxies. One way to represent this in Pandas is to create one row per observation with an ID column indicating to which physical object the observation corresponds. However, this approach ends up repeating a lot of data over each observation of the same object, such as its location on the sky (RA, dec), its classification, etc. Further, any operations processing the data as time series require the user to first perform a (potentially expensive) group-by operation to aggregate all of the data for each object.\n", 39 | "\n", 40 | "Let's create a flat pandas dataframe with three objects: object 0 has three observations, object 1 has three observations, and object 2 has four observations." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import pandas as pd\n", 50 | "\n", 51 | "# Represent nested time series information as a classic pandas dataframe.\n", 52 | "my_data_frame = pd.DataFrame(\n", 53 | " {\n", 54 | " \"id\": [0, 0, 0, 1, 1, 1, 2, 2, 2, 2],\n", 55 | " \"ra\": [10.0, 10.0, 10.0, 15.0, 15.0, 15.0, 12.1, 12.1, 12.1, 12.1],\n", 56 | " \"dec\": [0.0, 0.0, 0.0, -1.0, -1.0, -1.0, 0.5, 0.5, 0.5, 0.5],\n", 57 | " \"time\": [60676.0, 60677.0, 60678.0, 60675.0, 60676.5, 60677.0, 60676.6, 60676.7, 60676.8, 60676.9],\n", 58 | " \"brightness\": [100.0, 101.0, 99.8, 5.0, 5.01, 4.98, 20.1, 20.5, 20.3, 20.2],\n", 59 | " }\n", 60 | ")\n", 61 | "my_data_frame" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "Note that we cannot cleanly compress this by adding more columns (such as t0, t1, and so forth), because the number of observations is not bounded and may vary from object to object.\n", 69 | "\n", 70 | "Beyond astronomical data, we might be interested in tracking patients' blood pressure over time, measuring the intensity of emitted light at different wavelengths, or storing a list of the type of rock found at different depths of core samples. In each case, it is possible to represent this data with multiple rows (such as one row for each patient + measurement pair) and associate them together by ids.\n", 71 | "\n", 72 | "Nested-pandas is designed for exactly this type of data by allowing columns to contain nested data. 
We can have regular columns with the (single) value for the objects’ unvarying characteristics (location on the sky, patient birth date, location of the core sample) and nested columns for the values of each observation.\n", 73 | "\n", 74 | "Let's see an example:" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "from nested_pandas.nestedframe import NestedFrame\n", 84 | "\n", 85 | "# Create a nested data set\n", 86 | "nf = NestedFrame.from_flat(\n", 87 | " my_data_frame,\n", 88 | " base_columns=[\"ra\", \"dec\"], # the columns not to nest\n", 89 | " nested_columns=[\"time\", \"brightness\"], # the columns to nest\n", 90 | " on=\"id\", # column used to associate rows\n", 91 | " name=\"lightcurve\", # name of the nested column\n", 92 | ")\n", 93 | "nf" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "The above dataframe is a `NestedFrame`, which extends the capabilities of the Pandas `DataFrame` to support columns with nested information. \n", 101 | "\n", 102 | "We now have the top level dataframe with 3 rows, each of which corresponds to a single object. The table has three columns beyond \"id\". Two columns, \"ra\" and \"dec\", have a single value for the object (in this case the position on the sky). The last column \"lightcurve\" contains a nested table with a series of observation times and observation brightnesses for the object. The first row of this nested table is shown along with its dimensions to give a sense of the contents of the nested data. As we will see below, this nested table allows the user to easily access all of the observations for a given object.\n", 103 | "\n", 104 | "## Accessing Nested Data\n", 105 | "\n", 106 | "We can inspect the contents of the \"lightcurve\" column using pandas API tooling like `loc`." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "nf.loc[0][\"lightcurve\"]" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "Here we see that within the \"lightcurve\" column there are tables with their own data. In this case, we have two columns (\"time\" and \"brightness\") that represent a time series of observations. \n", 123 | "\n", 124 | "Note that `loc` itself accesses the row, so the combination of `nf.loc[0][\"lightcurve\"]` means we are looking at the value in the \"lightcurve\" column for a single row (row 0). If we just use `nf.loc[0]` we would retrieve the entire row, including the nested \"lightcurve\" column and all other columns. Similarly, if we use `nf[\"lightcurve\"]` we retrieve the nested column for all rows. What makes the nesting useful is that once we access the nested entry for a specific row, we can treat the value as a table in its own right.\n", 125 | "\n", 126 | "As in Pandas, we can still access individual entries from a column based on the row index. Thus we can access the values (in a table) in row 0 of the nested column as `nf[\"lightcurve\"][0]` as well." 
127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "nf[\"lightcurve\"][0]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "We can also use dot notation to access all the values in a nested sub-column:" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "nf[\"lightcurve.time\"]" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "Note that \"lightcurve.time\" contains the time values for all rows, but also preserves the nesting information. The id column of the returned data maps the top-level row (in `nf`) to where this value resides.\n", 159 | "\n", 160 | "Similarly, we can access the values for a given top-level row by index. To get all the `time` values for row 0, we could specify:" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "nf[\"lightcurve.time\"][0]" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "Here the `[0]` is telling our nested frame to access the values of the series `nf[\"lightcurve.time\"]` where the id = 0. If we try `nf[\"lightcurve.time\"][0][0]` we again match id = 0 and return the same frame. \n", 177 | "\n", 178 | "To access a single element within the series, we need to use its location:" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "nf[\"lightcurve.time\"][0].iloc[0]" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "## Inspecting Nested Frames\n", 195 | "\n", 196 | "We can inspect the available columns using some custom properties of the `NestedFrame`." 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "# Shows which columns have nested data\n", 206 | "nf.nested_columns" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# Provides a dictionary of \"base\" (top-level) and nested column labels\n", 216 | "nf.all_columns" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "## Pandas Operations\n", 224 | "\n", 225 | "Nested-pandas extends the Pandas API, meaning any operation you could do in Pandas is available within nested-pandas. However, nested-pandas has additional functionality and tooling to better support working with nested datasets. For example, let's look at `query`:" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# Normal queries work as expected, rejecting rows from the dataframe that don't meet the criteria\n", 235 | "nf.query(\"ra > 11.2\")" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "The above query is native Pandas; however, with nested-pandas we can use hierarchical column names to extend `query` to nested layers." 
243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "# Applies the query to \"lightcurve\", filtering based on \"time > 60676.0\"\n", 252 | "nf_g = nf.query(\"lightcurve.time > 60676.0\")\n", 253 | "nf_g" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "This query does not affect the rows of the top-level dataframe, but rather applies the query to the nested \"lightcurve\" dataframes. If we look at one of them, we can see the effect of the query." 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "# All time <= 60676.0 values have been removed\n", 270 | "nf_g.loc[0][\"lightcurve\"]" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "A limited set of functions has been extended in this way so far, with the aim being to fully support this hierarchical access where applicable in the Pandas API." 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "## Reduce Function\n", 285 | "\n", 286 | "Finally, we'll end with the flexible `reduce` function. `reduce` functions similarly to Pandas' `apply` but flattens (reduces) the inputs from nested layers into array inputs to the given apply function. For example, let's find the mean brightness for each dataframe in \"lightcurve\":" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "import numpy as np\n", 296 | "\n", 297 | "# use hierarchical column names to access the brightness column\n", 298 | "# passed as an array to np.mean\n", 299 | "nf.reduce(np.mean, \"lightcurve.brightness\")" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "This can be used to apply any custom functions you need for your analysis, and just to illustrate that point further, let's define a custom function that simply returns its inputs." 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "def show_inputs(*args):\n", 316 | " return args" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "Applying some inputs via reduce, we see how it sends inputs to a given function. The output frame `nf_inputs` consists of two columns containing the values of the “ra” column and the “lightcurve.time” column." 
324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "nf_inputs = nf.reduce(show_inputs, \"ra\", \"lightcurve.time\")\n", 333 | "nf_inputs" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "nf_inputs.loc[0]" 343 | ] 344 | } 345 | ], 346 | "metadata": { 347 | "kernelspec": { 348 | "display_name": "Python 3 (ipykernel)", 349 | "language": "python", 350 | "name": "python3" 351 | }, 352 | "language_info": { 353 | "codemirror_mode": { 354 | "name": "ipython", 355 | "version": 3 356 | }, 357 | "file_extension": ".py", 358 | "mimetype": "text/x-python", 359 | "name": "python", 360 | "nbconvert_exporter": "python", 361 | "pygments_lexer": "ipython3", 362 | "version": "3.13.3" 363 | } 364 | }, 365 | "nbformat": 4, 366 | "nbformat_minor": 4 367 | } 368 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. nested_pandas documentation main file. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Nested-Pandas 7 | ============= 8 | 9 | An extension of pandas for efficient representation of nested, 10 | associated datasets. 11 | 12 | Nested-Pandas extends the `pandas <https://pandas.pydata.org>`_ package with 13 | tooling and support for nested dataframes packed into values of top-level 14 | dataframe columns. `Pyarrow <https://arrow.apache.org/docs/python/>`_ 15 | is used internally to aid in scalability and performance. 16 | 17 | Nested-Pandas allows data like this: 18 | 19 | .. image:: ./intro_images/pandas_dfs.png 20 | :width: 400 21 | :align: center 22 | :alt: pandas dataframes 23 | 24 | To instead be represented like this: 25 | 26 | .. image:: ./intro_images/nestedframe_example.png 27 | :width: 300 28 | :align: center 29 | :alt: a NestedFrame with a nested column 30 | 31 | Where the nested data is represented as nested dataframes: 32 | 33 | .. code-block:: python 34 | 35 | # Each row of "object_nf" now has its own sub-dataframe of matched rows from "source_df" 36 | object_nf.loc[0]["nested_sources"] 37 | 38 | .. image:: ./intro_images/loc_into_nested.png 39 | :width: 300 40 | :align: center 41 | :alt: a nested sub-dataframe accessed via loc 42 | 43 | Allowing powerful and straightforward operations, like: 44 | 45 | .. code-block:: python 46 | 47 | # Compute the mean flux for each row of "object_nf" 48 | import numpy as np 49 | object_nf.reduce(np.mean, "nested_sources.flux") 50 | 51 | .. image:: ./intro_images/reduce.png 52 | :width: 150 53 | :align: center 54 | :alt: the result of a reduce operation 55 | 56 | Nested-Pandas is motivated by time-domain astronomy use cases, where we 57 | typically see two levels of information: information about astronomical objects and 58 | then an associated set of `N` measurements of those objects. Nested-Pandas offers 59 | a performant and memory-efficient package for working with these types of datasets. 60 | 61 | Its core advantages are: 62 | 63 | * hierarchical column access 64 | * efficient packing of nested information into inputs to custom user functions 65 | * avoiding costly groupby operations 66 | 67 | 68 | How to Use This Guide 69 | ===================== 70 | 71 | Begin with the :doc:`Getting Started <gettingstarted>` 72 | guide to learn the basics of installation and walk through a simple example of 73 | using nested-pandas. 
74 | 75 | The :doc:`Tutorials <tutorials>` 76 | section showcases the fundamental features of nested-pandas. 77 | 78 | API-level information about nested-pandas is available in the 79 | :doc:`API Reference <reference>` 80 | section. 81 | 82 | The :doc:`About Nested-Pandas <about>` section provides information on the 83 | design and performance advantages of nested-pandas. 84 | 85 | Learn more about contributing to this repository in our :doc:`Contribution Guide <gettingstarted/contributing>`. 86 | 87 | .. toctree:: 88 | :hidden: 89 | 90 | Home page <self> 91 | Getting Started <gettingstarted> 92 | Tutorials <tutorials> 93 | API Reference <reference> 94 | About Nested-Pandas <about> 95 | -------------------------------------------------------------------------------- /docs/intro_images/loc_into_nested.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/docs/intro_images/loc_into_nested.png -------------------------------------------------------------------------------- /docs/intro_images/nestedframe_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/docs/intro_images/nestedframe_example.png -------------------------------------------------------------------------------- /docs/intro_images/pandas_dfs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/docs/intro_images/pandas_dfs.png -------------------------------------------------------------------------------- /docs/intro_images/reduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/docs/intro_images/reduce.png -------------------------------------------------------------------------------- /docs/pre_executed/performance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Performance Impact of `nested-pandas`\n", 8 | "\n", 9 | "For use cases involving nested data, `nested-pandas` can offer significant speedups compared to using the native `pandas` API. Below is a brief comparison between `pandas` and `nested-pandas` on an example workflow that calculates the amplitude of photometric fluxes after a few filtering steps." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import nested_pandas as npd\n", 19 | "import pandas as pd\n", 20 | "import light_curve as licu\n", 21 | "import numpy as np\n", 22 | "\n", 23 | "from nested_pandas.utils import count_nested" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Pandas" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 5, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "498 ms ± 3.13 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "%%timeit\n", 48 | "\n", 49 | "# Read data\n", 50 | "object_df = pd.read_parquet(\"objects.parquet\")\n", 51 | "source_df = pd.read_parquet(\"ztf_sources.parquet\")\n", 52 | "\n", 53 | "# Filter on object\n", 54 | "filtered_object = object_df.query(\"ra > 10.0\")\n", 55 | "# Sync object to source -- removes any index values of source not found in object\n", 56 | "filtered_source = filtered_object[[]].join(source_df, how=\"left\")\n", 57 | "\n", 58 | "# Count number of observations per photometric band and add it to the object table\n", 59 | "band_counts = (\n", 60 | " source_df.groupby(level=0)\n", 61 | " .apply(lambda x: x[[\"band\"]].value_counts().reset_index())\n", 62 | " .pivot_table(values=\"count\", index=\"index\", columns=\"band\", aggfunc=\"sum\")\n", 63 | ")\n", 64 | "filtered_object = filtered_object.join(band_counts[[\"g\", \"r\"]])\n", 65 | "\n", 66 | "# Filter on our number of observations\n", 67 | "filtered_object = filtered_object.query(\"g > 520\")\n", 68 | "filtered_source = filtered_object[[]].join(source_df, how=\"left\")\n", 69 | "\n", 70 | "# Calculate Amplitude\n", 71 | "amplitude = licu.Amplitude()\n", 72 | "filtered_source.groupby(level=0).apply(lambda x: amplitude(np.array(x.mjd), np.array(x.flux)))" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Nested-Pandas" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "228 ms ± 2.81 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "%%timeit\n", 97 | "\n", 98 | "# Read in parquet data\n", 99 | "# nesting sources into objects\n", 100 | "nf = npd.read_parquet(\"objects.parquet\")\n", 101 | "nf = nf.add_nested(npd.read_parquet(\"ztf_sources.parquet\"), \"ztf_sources\")\n", 102 | "\n", 103 | "# Filter on object\n", 104 | "nf = nf.query(\"ra > 10.0\")\n", 105 | "\n", 106 | "# Count number of observations per photometric band and add it as a column\n", 107 | "nf = count_nested(nf, \"ztf_sources\", by=\"band\", join=True) # use an existing utility\n", 108 | "\n", 109 | "# Filter on our number of observations\n", 110 | "nf = nf.query(\"n_ztf_sources_g > 520\")\n", 111 | "\n", 112 | "# Calculate Amplitude\n", 113 | "amplitude = licu.Amplitude()\n", 114 | "nf.reduce(amplitude, \"ztf_sources.mjd\", \"ztf_sources.flux\")" 115 | ] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "lsdb", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.11.11" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 2 139 | } 140 | -------------------------------------------------------------------------------- /docs/reference.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ======================================================================================== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | 7 | NestedFrame <reference/nestedframe> 8 | .nest Accessor <reference/accessor> 9 | Utility Functions <reference/utils> 10 | NestedDtype <reference/nesteddtype> 11 | Nested Extension Array <reference/ext_array> 12 | Packer Functions <reference/packer> -------------------------------------------------------------------------------- /docs/reference/accessor.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | .nest Series Accessor 3 | ===================== 4 | .. currentmodule:: nested_pandas 5 | 6 | Constructor 7 | ~~~~~~~~~~~ 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | NestSeriesAccessor 12 | 13 | Functions 14 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .. autosummary:: 16 | :toctree: api/ 17 | 18 | NestSeriesAccessor.to_lists 19 | NestSeriesAccessor.to_flat 20 | NestSeriesAccessor.to_flatten_inner 21 | NestSeriesAccessor.with_field 22 | NestSeriesAccessor.with_flat_field 23 | NestSeriesAccessor.with_list_field 24 | NestSeriesAccessor.with_filled_field 25 | NestSeriesAccessor.without_field 26 | NestSeriesAccessor.query_flat 27 | NestSeriesAccessor.get_flat_index 28 | NestSeriesAccessor.get_flat_series 29 | NestSeriesAccessor.get_list_series 30 | -------------------------------------------------------------------------------- /docs/reference/ext_array.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | NestedExtensionArray 3 | ==================== 4 | .. currentmodule:: nested_pandas 5 | 6 | Constructor 7 | ~~~~~~~~~~~ 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | series.ext_array.NestedExtensionArray 12 | 13 | Functions 14 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .. autosummary:: 16 | :toctree: api/ 17 | 18 | series.ext_array.NestedExtensionArray.dtype 19 | series.ext_array.NestedExtensionArray.nbytes 20 | series.ext_array.NestedExtensionArray.list_array 21 | series.ext_array.NestedExtensionArray.struct_array 22 | series.ext_array.NestedExtensionArray.py_table 23 | series.ext_array.NestedExtensionArray.list_offsets 24 | series.ext_array.NestedExtensionArray.field_names 25 | series.ext_array.NestedExtensionArray.list_lengths 26 | series.ext_array.NestedExtensionArray.flat_length 27 | series.ext_array.NestedExtensionArray.num_chunks 28 | series.ext_array.NestedExtensionArray.to_numpy 29 | series.ext_array.NestedExtensionArray.isna 30 | series.ext_array.NestedExtensionArray.take 31 | series.ext_array.NestedExtensionArray.copy 32 | series.ext_array.NestedExtensionArray.equals 33 | series.ext_array.NestedExtensionArray.dropna 34 | series.ext_array.NestedExtensionArray.from_sequence 35 | series.ext_array.NestedExtensionArray.to_arrow_ext_array 36 | series.ext_array.NestedExtensionArray.to_pyarrow_scalar 37 | series.ext_array.NestedExtensionArray.get_list_index 38 | series.ext_array.NestedExtensionArray.iter_field_lists 39 | series.ext_array.NestedExtensionArray.view_fields 40 | series.ext_array.NestedExtensionArray.set_flat_field 41 | series.ext_array.NestedExtensionArray.set_list_field 42 | series.ext_array.NestedExtensionArray.fill_field_lists 43 | series.ext_array.NestedExtensionArray.pop_fields -------------------------------------------------------------------------------- /docs/reference/nesteddtype.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | NestedDtype 3 | =========== 4 | .. currentmodule:: nested_pandas 5 | 6 | Constructor 7 | ~~~~~~~~~~~ 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | NestedDtype 12 | 13 | Functions 14 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .. 
autosummary:: 16 | :toctree: api/ 17 | 18 | NestedDtype.construct_array_type 19 | NestedDtype.construct_from_string 20 | NestedDtype.from_fields 21 | NestedDtype.from_pandas_arrow_dtype 22 | NestedDtype.to_pandas_arrow_dtype -------------------------------------------------------------------------------- /docs/reference/nestedframe.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | NestedFrame 3 | =========== 4 | .. currentmodule:: nested_pandas 5 | 6 | Constructor 7 | ~~~~~~~~~~~ 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | NestedFrame 12 | 13 | Nesting 14 | ~~~~~~~~~ 15 | .. autosummary:: 16 | :toctree: api/ 17 | 18 | NestedFrame.add_nested 19 | NestedFrame.nest_lists 20 | NestedFrame.from_flat 21 | NestedFrame.from_lists 22 | 23 | Extended Pandas.DataFrame Interface 24 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 25 | 26 | .. note:: 27 | The NestedFrame extends the Pandas.DataFrame interface, so all methods 28 | of Pandas.DataFrame are available. The following methods are extended 29 | to support NestedFrame functionality. Please reference the Pandas 30 | documentation for more information. 31 | https://pandas.pydata.org/docs/reference/frame.html 32 | 33 | .. autosummary:: 34 | :toctree: api/ 35 | 36 | NestedFrame.eval 37 | NestedFrame.query 38 | NestedFrame.dropna 39 | NestedFrame.sort_values 40 | NestedFrame.reduce 41 | NestedFrame.drop 42 | 43 | I/O 44 | ~~~~~~~~~ 45 | .. autosummary:: 46 | :toctree: api/ 47 | 48 | NestedFrame.to_parquet 49 | read_parquet -------------------------------------------------------------------------------- /docs/reference/packer.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Packer 3 | ========= 4 | .. currentmodule:: nested_pandas 5 | 6 | Functions 7 | ~~~~~~~~~ 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | series.packer.pack 12 | series.packer.pack_flat 13 | series.packer.pack_seq 14 | series.packer.pack_lists -------------------------------------------------------------------------------- /docs/reference/utils.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Utility Functions 3 | ================= 4 | .. currentmodule:: nested_pandas 5 | 6 | NestedFrame Utilities 7 | ~~~~~~~~~~~~~~~~~~~~~ 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | utils.count_nested 12 | 13 | Generating Toy Datasets 14 | ~~~~~~~~~~~~~~~~~~~~~~~ 15 | .. autosummary:: 16 | :toctree: api/ 17 | 18 | datasets.generation.generate_data 19 | datasets.generation.generate_parquet_file -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | ipykernel 3 | ipython 4 | jupytext 5 | nbconvert 6 | nbsphinx 7 | sphinx 8 | sphinx-autoapi 9 | sphinx-copybutton 10 | sphinx-book-theme 11 | astroquery 12 | astropy 13 | matplotlib 14 | light-curve -------------------------------------------------------------------------------- /docs/tutorials.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ======================================================================================== 3 | 4 | .. 
toctree:: 5 | 6 | Loading Data into Nested-Pandas <tutorials/data_loading_notebook> 7 | Fine Data Manipulation with Nested-Pandas <tutorials/data_manipulation> 8 | Lower-level interfaces <tutorials/low_level> 9 | Using Nested-Pandas with Astronomical Spectra <tutorials/nested_spectra> 10 | -------------------------------------------------------------------------------- /docs/tutorials/README.md: -------------------------------------------------------------------------------- 1 | Put your Jupyter notebooks here :) 2 | -------------------------------------------------------------------------------- /docs/tutorials/data_loading_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Loading Data into Nested-Pandas\n", 8 | "\n", 9 | "This notebook provides a brief introduction to loading data into nested-pandas or converting data into a nested structure. For an introduction to nested-pandas, see the quick start tutorial or the [readthedocs page](https://nested-pandas.readthedocs.io/en/latest/).\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Installation and Imports\n", 17 | "\n", 18 | "With a valid Python environment, nested-pandas and its dependencies are easy to install using the `pip` package manager. The following command can be used to install it:" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# % pip install nested-pandas" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import os\n", 37 | "import tempfile\n", 38 | "\n", 39 | "import pandas as pd\n", 40 | "\n", 41 | "from nested_pandas import NestedFrame, read_parquet\n", 42 | "from nested_pandas.datasets import generate_parquet_file" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# Overview\n", 50 | "\n", 51 | "Nested-pandas provides multiple mechanisms for loading data or converting data to the nested format. Below we walk through some of the common approaches.\n", 52 | "\n", 53 | "# Converting Flat Data\n", 54 | "\n", 55 | "Commonly, existing data sets will be provided in “flat” data structures such as dictionaries or Pandas DataFrames. In these cases, the data consists of a rectangular table where each row represents an instance or observation. Multiple instances of the same top-level item are linked together through an ID. All rows with the same ID correspond to the same object/item.\n", 56 | "\n", 57 | "We define one such flat dataframe consisting of 10 rows for 3 distinct items." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "flat_df = pd.DataFrame(\n", 67 | " data={\n", 68 | " \"a\": [1, 1, 1, 2, 2, 2, 3, 3, 3, 3],\n", 69 | " \"b\": [2, 2, 2, 4, 4, 4, 6, 6, 6, 6],\n", 70 | " \"c\": [0, 2, 4, 1, 4, 3, 1, 4, 1, 1],\n", 71 | " \"d\": [5, 4, 7, 5, 3, 1, 9, 3, 4, 1],\n", 72 | " },\n", 73 | " index=[0, 0, 0, 1, 1, 1, 2, 2, 2, 2],\n", 74 | ")\n", 75 | "flat_df" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "The index (leftmost column) provides the object id. As we can see, there are three rows with ID=0, three rows with ID=1, and four rows with ID=2. Some of the values are constant for each item. For example, both columns “a” and “b” take a single value for each object. 
We are wasting space by repeating them in every row. Other values are different per row (columns “c” and “d”).\n", 83 | "\n", 84 | "As a concrete example, consider patient records. Each patient is assigned a unique id and has static data such as a date of birth. They also have measurements that are new with every trip to the doctor, such as blood pressure or temperature." 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Converting from Flat Pandas\n", 92 | "\n", 93 | "The easiest approach to converting the flat table above into a nested structure is to use `NestedFrame.from_flat()`. This function takes\n", 94 | " * a list of columns that are not nested (base_columns)\n", 95 | " * a list of columns to nest (nested_columns)\n", 96 | " * the name of the nested column (name)\n\n", 97 | "Rows are associated using the index by default, but a column name on which to join can also be provided." 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "nf = NestedFrame.from_flat(\n", 107 | " flat_df,\n", 108 | " base_columns=[\"a\", \"b\"], # the columns not to nest\n", 109 | " nested_columns=[\"c\", \"d\"], # the columns to nest\n", 110 | " name=\"nested\", # name of the nested column\n", 111 | ")\n", 112 | "nf" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "## Inserting Nested Rows\n", 120 | "\n", 121 | "Alternatively, we can use the `NestedFrame` constructor to create our base frame from a dictionary of our columns (as we would do with a normal pandas DataFrame). This defines the top-level objects and the values that are constant across rows (\"a\" and \"b\")." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "nf = NestedFrame(\n", 131 | " data={\n", 132 | " \"a\": [1, 2, 3],\n", 133 | " \"b\": [2, 4, 6],\n", 134 | " },\n", 135 | " index=[0, 1, 2],\n", 136 | ")\n", 137 | "nf" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "We can then create additional pandas dataframes for the nested columns and pack them into our `NestedFrame` with the `NestedFrame.add_nested()` function. `add_nested` will align the nested data on the index by default (a column may be selected instead via the `on` kwarg); as we see, the `nested` `DataFrame` has a repeated index corresponding to the `nf` `NestedFrame`." 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "nested = pd.DataFrame(\n", 154 | " data={\n", 155 | " \"c\": [0, 2, 4, 1, 4, 3, 1, 4, 1, 1],\n", 156 | " \"d\": [5, 4, 7, 5, 3, 1, 9, 3, 4, 1],\n", 157 | " },\n", 158 | " index=[0, 0, 0, 1, 1, 1, 2, 2, 2, 2],\n", 159 | ")\n", 160 | "\n", 161 | "nf = nf.add_nested(nested, \"nested\")\n", 162 | "nf" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "The \"index\" parameter is used to perform the association. All of the values for index=0 are bundled together into a sub-table and stored in row 0's \"nested\" column." 
170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "nf.loc[0][\"nested\"]" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "We could add other nested columns by creating new sub-tables and adding them with `add_nested()`. Note that while the tables added with each `add_nested()` must be rectangular, they do not need to have the same dimensions between calls. We could add another nested column with a different number of observations per row." 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "nested = pd.DataFrame(\n", 195 | " data={\n", 196 | " \"c\": [0, 1, 0, 1, 2, 0],\n", 197 | " \"d\": [5, 4, 5, 4, 3, 5],\n", 198 | " },\n", 199 | " index=[0, 0, 1, 1, 1, 2],\n", 200 | ")\n", 201 | "\n", 202 | "nf = nf.add_nested(nested, \"nested2\")\n", 203 | "nf" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "# Loading Data from Parquet Files" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "For larger datasets, we support loading data from parquet files. In the following cell, we generate a temporary parquet file with random data and ingest it with the `read_parquet` function:" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "# Note that we use the `tempfile` module to create and then clean up a temporary directory.\n", 227 | "# You can of course remove this and use your own directory and real files on your system.\n", 228 | "with tempfile.TemporaryDirectory() as temp_path:\n", 229 | " # Generate a parquet file with random data within our temporary directory\n", 230 | " generate_parquet_file(10, {\"nested1\": 100, \"nested2\": 10}, os.path.join(temp_path, \"test.parquet\"))\n", 231 | "\n", 232 | " # Read the parquet file to a NestedFrame\n", 233 | " nf = read_parquet(os.path.join(temp_path, \"test.parquet\"))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "Nested-Pandas nested columns are compatible with the parquet format, meaning they can be written to and read from parquet natively." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "nf # nf contains nested columns" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "# Saving NestedFrames to Parquet Files\n", 257 | "\n", 258 | "Additionally, we can save an existing `NestedFrame` as a parquet file using `NestedFrame.to_parquet`.\n", 259 | "\n", 260 | ">Note: Nested-Pandas converts any nested columns to pyarrow datatypes when writing to parquet, meaning that parquet files with nested columns can be read by parquet readers from other packages, so long as they understand pyarrow dtypes." 
261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "# Note that we use the `tempfile` module to create and then clean up a temporary directory.\n", 270 | "# You can of course remove this and use your own directory and real files on your system.\n", 271 | "with tempfile.TemporaryDirectory() as temp_path:\n", 272 | " nf.to_parquet(\n", 273 | " os.path.join(temp_path, \"output.parquet\"), # The output file path\n", 274 | " )\n", 275 | "\n", 276 | " # List the files in temp_path to ensure they were saved correctly.\n", 277 | " print(\"The NestedFrame was saved to the following parquet files:\", os.listdir(temp_path))" 278 | ] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "lsdb", 284 | "language": "python", 285 | "name": "python3" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython3", 297 | "version": "3.12.8" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 2 302 | } 303 | -------------------------------------------------------------------------------- /docs/tutorials/data_manipulation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Fine Data Manipulation with Nested-Pandas" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This tutorial will briefly showcase how one would perform data manipulation operations from `pandas`, like adding columns, replacing values, etc., with `nested-pandas`." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "ExecuteTime": { 22 | "end_time": "2025-03-05T23:08:41.890895Z", 23 | "start_time": "2025-03-05T23:08:41.872743Z" 24 | } 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import nested_pandas as npd\n", 29 | "from nested_pandas.datasets import generate_data\n", 30 | "\n", 31 | "# Begin by generating an example dataset\n", 32 | "ndf = generate_data(5, 20, seed=1)\n", 33 | "ndf" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "ExecuteTime": { 41 | "end_time": "2025-03-05T23:08:41.907431Z", 42 | "start_time": "2025-03-05T23:08:41.902080Z" 43 | } 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# Show one of the nested dataframes\n", 48 | "ndf.iloc[0].nested" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## Nested Column Selection" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "First, we can directly fetch a column from our nested column (aptly called \"nested\"). For example, below we can fetch the time column, \"t\", by specifying `\"nested.t\"` as the column to retrieve. This returns a \"flat\" view of the nested `t` column, where all rows from all dataframes are present in one dataframe." 
63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "ExecuteTime": { 70 | "end_time": "2025-03-05T23:08:41.933782Z", 71 | "start_time": "2025-03-05T23:08:41.930296Z" 72 | } 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "# Directly Nested Column Selection\n", 77 | "ndf[\"nested.t\"]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "The advantage of the flat view is that it can be manipulated just like any `pandas.Series` object. " 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "ExecuteTime": { 92 | "end_time": "2025-03-05T23:08:41.956770Z", 93 | "start_time": "2025-03-05T23:08:41.953485Z" 94 | } 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "ndf[\"nested.t\"] + 100" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "## Adding or Replacing Nested Columns" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "> *A Note on Performance: These operations involve full reconstruction of the nested columns, so expect reduced performance when doing this at scale. It may be appropriate to do these operations within reduce functions directly (e.g. subtracting a value from a column) if performance is key.*" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "We can use the \"base_column.nested_sub_column\" syntax to also perform operations that add new columns or replace existing columns for a nested column. For example, we can directly replace the \"band\" column with a new column that prepends an additional string to the values." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "ExecuteTime": { 127 | "end_time": "2025-03-05T23:08:41.992618Z", 128 | "start_time": "2025-03-05T23:08:41.987910Z" 129 | } 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "# prepend lsst_ to the band column\n", 134 | "\n", 135 | "ndf[\"nested.band\"] = \"lsst_\" + ndf[\"nested.band\"]\n", 136 | "\n", 137 | "ndf[\"nested.band\"]" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "Next, we can create a new column in the \"nested\" column. For example, we can subtract a value from each time entry and return the result as a new column." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "ExecuteTime": { 152 | "end_time": "2025-03-05T23:08:42.016312Z", 153 | "start_time": "2025-03-05T23:08:42.012009Z" 154 | } 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "# create a new \"corrected_t\" column in \"nested\"\n", 159 | "\n", 160 | "ndf[\"nested.corrected_t\"] = ndf[\"nested.t\"] - 5\n", 161 | "\n", 162 | "ndf[\"nested.corrected_t\"]" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "ExecuteTime": { 170 | "end_time": "2025-03-05T23:08:42.037065Z", 171 | "start_time": "2025-03-05T23:08:42.032519Z" 172 | } 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "# Show the first dataframe again\n", 177 | "ndf.iloc[0].nested" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## Adding New Nested Structures" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "Finally, we can also add entirely new nested structures using the above syntax." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "ExecuteTime": { 199 | "end_time": "2025-03-05T23:08:42.075674Z", 200 | "start_time": "2025-03-05T23:08:42.061111Z" 201 | } 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "ndf[\"bands.band_label\"] = ndf[\"nested.band\"]\n", 206 | "ndf" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "This is functionally equivalent to using `add_nested`:" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "ExecuteTime": { 221 | "end_time": "2025-03-05T23:08:42.132918Z", 222 | "start_time": "2025-03-05T23:08:42.114796Z" 223 | } 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "ndf.add_nested(ndf[\"nested.band\"].to_frame(), \"bands_from_add_nested\")" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "## Embedding a \"base\" column into a nested column" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "We can also assign some \"base\" (non-nested) column to a nested column, which will be broadcast to all nested dataframes, with the values repeated." 
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "ExecuteTime": { 249 | "end_time": "2025-03-05T23:08:42.165933Z", 250 | "start_time": "2025-03-05T23:08:42.161684Z" 251 | } 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "ndf[\"nested.a\"] = ndf[\"a\"]\n", 256 | "ndf[\"nested.a\"]" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "Or we can perform some operations on the base columns first:" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "ExecuteTime": { 271 | "end_time": "2025-03-05T23:08:42.266923Z", 272 | "start_time": "2025-03-05T23:08:42.262281Z" 273 | } 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "ndf[\"nested.ab\"] = ndf[\"a\"] + ndf[\"b\"] * 2\n", 278 | "ndf[\"nested.ab\"]" 279 | ] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "Python 3", 285 | "language": "python", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 3 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython3", 298 | "version": "3.10.11" 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 2 303 | } 304 | -------------------------------------------------------------------------------- /docs/tutorials/nested_spectra.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using Nested-Pandas with Astronomical Spectra" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In astronomy, a spectrum is a measurement (or combination of measurements) of an object that shows the intensity of light emitted over a range of energies. In this tutorial, we'll walk through a simple example of working with spectra from the Sloan Digital Sky Survey (SDSS), in particular showing how they can be represented as a `NestedFrame`." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "First, we'll use `astroquery` and `astropy` to download a handful of spectra from SDSS:" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from astroquery.sdss import SDSS\n", 31 | "from astropy import coordinates as coords\n", 32 | "import astropy.units as u\n", 33 | "import nested_pandas as npd\n", 34 | "\n", 35 | "# Query SDSS for a set of objects with spectra\n", 36 | "pos = coords.SkyCoord(\"0h8m10.63s +14d50m23.3s\", frame=\"icrs\")\n", 37 | "xid = SDSS.query_region(pos, radius=3 * u.arcmin, spectro=True)\n", 38 | "xid_ndf = npd.NestedFrame(xid.to_pandas())\n", 39 | "xid_ndf" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "This initial query returns a set of objects with spectra (as specified by the `spectro=True` flag). 
To actually retrieve the spectra, we can do the following:" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# Query SDSS for the corresponding spectra\n", 56 | "SDSS.clear_cache()\n", 57 | "sp = SDSS.get_spectra(matches=xid)\n", 58 | "sp" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "The result is a list of FITS-formatted data. From this point there are a few ways that we could move towards a nested-pandas representation. The most straightforward approach is to build a \"flat\" spectra table from all the objects, where we gather the information from each spectrum into a single combined table." 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "import numpy as np\n", 75 | "\n", 76 | "# Build a flat spectrum dataframe\n", 77 | "\n", 78 | "# Initialize some empty arrays to hold the flat data\n", 79 | "wave = np.array([])\n", 80 | "flux = np.array([])\n", 81 | "err = np.array([])\n", 82 | "index = np.array([])\n", 83 | "# Loop over each spectrum, adding its data to the arrays\n", 84 | "for i, hdu in enumerate(sp):\n", 85 | " wave = np.append(wave, 10 ** hdu[\"COADD\"].data.loglam) # * u.angstrom\n", 86 | " flux = np.append(flux, hdu[\"COADD\"].data.flux * 1e-17) # * u.erg/u.second/u.centimeter**2/u.angstrom\n", 87 | " err = np.append(err, 1 / np.sqrt(hdu[\"COADD\"].data.ivar) * 1e-17) # ivar is inverse variance, so sigma = ivar**-0.5 # * flux.unit\n", 88 | "\n", 89 | " # We'll need to set an index to keep track of which rows correspond\n", 90 | " # to which object\n", 91 | " index = np.append(index, i * np.ones(len(hdu[\"COADD\"].data.loglam)))\n", 92 | "\n", 93 | "# Build a NestedFrame from the arrays\n", 94 | "flat_spec = npd.NestedFrame(dict(wave=wave, flux=flux, err=err), index=index.astype(np.int8))\n", 95 | "flat_spec" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "From here, we can simply nest our flat table within our original query result:" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "spec_ndf = xid_ndf.add_nested(flat_spec, \"coadd_spectrum\").set_index(\"objid\")\n", 112 | "spec_ndf" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "And we can see that each object now has the `coadd_spectrum` nested column with the full spectrum available."
120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# Look at one of the spectra\n", 129 | "spec_ndf.iloc[1].coadd_spectrum" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "We now have our spectra nested, and can proceed to do any filtering and analysis as normal within nested-pandas.\n" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "import matplotlib.pyplot as plt\n", 146 | "\n", 147 | "# Plot a spectrum\n", 148 | "spec = spec_ndf.iloc[1].coadd_spectrum\n", 149 | "\n", 150 | "plt.plot(spec[\"wave\"], spec[\"flux\"])\n", 151 | "plt.xlabel(\"Wavelength (Å)\")\n", 152 | "plt.ylabel(r\"Flux ($ergs/s/cm^2/Å$)\")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "lsdb", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.12.8" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 4 184 | } 185 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | [project] 3 | name = "nested-pandas" 4 | license = {file = "LICENSE"} 5 | description = "An extension of pandas for efficient representation of nested associated datasets." 6 | readme = "README.md" 7 | authors = [ 8 | { name = "LINCC Frameworks", email = "brantd@uw.edu" } 9 | ] 10 | classifiers = [ 11 | "Development Status :: 4 - Beta", 12 | "License :: OSI Approved :: MIT License", 13 | "Intended Audience :: Developers", 14 | "Intended Audience :: Science/Research", 15 | "Operating System :: OS Independent", 16 | "Programming Language :: Python", 17 | ] 18 | dynamic = ["version"] 19 | requires-python = ">=3.10" 20 | dependencies = [ 21 | "numpy>=2", 22 | # We use internal pd._libs.missing and experimental ArrowExtensionArray 23 | "pandas>=2.2.3,<2.4", 24 | "pyarrow>=18", 25 | "universal_pathlib>=0.2", 26 | ] 27 | 28 | [project.urls] 29 | "Source Code" = "https://github.com/lincc-frameworks/nested-pandas" 30 | 31 | # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes) 32 | [project.optional-dependencies] 33 | dev = [ 34 | "asv==0.6.4", # Used to compute performance benchmarks 35 | "jupyter", # Clears output from Jupyter notebooks 36 | "mypy", # Used for static type checking of files 37 | "pre-commit", # Used to run checks before finalizing a git commit 38 | "pytest", 39 | "pytest-cov", # Used to report total code coverage 40 | "ruff", # Used for static linting of files 41 | "aiohttp", 42 | "requests", 43 | ] 44 | 45 | [build-system] 46 | requires = [ 47 | "setuptools>=62", # Used to build and package the Python project 48 | "setuptools_scm>=6.2", # Gets release version from git. 
Makes it available programmatically 49 | ] 50 | build-backend = "setuptools.build_meta" 51 | 52 | [tool.setuptools_scm] 53 | write_to = "src/nested_pandas/_version.py" 54 | 55 | [tool.pytest.ini_options] 56 | testpaths = [ 57 | "tests", 58 | "src", 59 | "docs", 60 | ] 61 | addopts = "--doctest-modules --doctest-glob=*.rst" 62 | 63 | [tool.ruff] 64 | line-length = 110 65 | target-version = "py310" 66 | [tool.ruff.lint] 67 | select = [ 68 | # pycodestyle 69 | "E", 70 | "W", 71 | # Pyflakes 72 | "F", 73 | # pep8-naming 74 | "N", 75 | # pyupgrade 76 | "UP", 77 | # flake8-bugbear 78 | "B", 79 | # flake8-simplify 80 | "SIM", 81 | # isort 82 | "I", 83 | # docstrings 84 | "D101", 85 | "D102", 86 | "D103", 87 | "D106", 88 | "D206", 89 | "D207", 90 | "D208", 91 | "D300", 92 | "D417", 93 | "D419", 94 | # Numpy v2.0 compatibility 95 | "NPY201", 96 | ] 97 | ignore = [ 98 | "UP006", # Allow non standard library generics in type hints 99 | "UP007", # Allow Union in type hints 100 | "SIM114", # Allow if with same arms 101 | "B028", # Allow default warning level 102 | "SIM117", # Allow nested with 103 | "UP015", # Allow redundant open parameters 104 | "UP028", # Allow yield in for loop 105 | ] 106 | 107 | [tool.setuptools.package-data] 108 | nested_pandas = ["py.typed"] 109 | 110 | [tool.coverage.run] 111 | omit=["src/nested_pandas/_version.py"] 112 | -------------------------------------------------------------------------------- /src/nested_pandas/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import __version__ # noqa 2 | from .nestedframe import NestedFrame 3 | from .nestedframe.io import read_parquet 4 | 5 | # Import for registering 6 | from .series.accessor import NestSeriesAccessor # noqa: F401 7 | from .series.dtype import NestedDtype 8 | 9 | 10 | __all__ = ["NestedDtype", "NestedFrame", "read_parquet"] 11 | -------------------------------------------------------------------------------- /src/nested_pandas/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .generation import * # noqa 2 | -------------------------------------------------------------------------------- /src/nested_pandas/datasets/generation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from nested_pandas import NestedFrame 4 | 5 | 6 | def generate_data(n_base, n_layer, seed=None) -> NestedFrame: 7 | """Generates a toy dataset. 8 | 9 | Parameters 10 | ---------- 11 | n_base : int 12 | The number of rows to generate for the base layer 13 | n_layer : int, or dict 14 | The number of rows per n_base row to generate for a nested layer. 15 | Alternatively, a dictionary of layer label, layer_size pairs may be 16 | specified to create multiple nested columns with custom sizing. 17 | seed : int 18 | A seed to use for random generation of data 19 | 20 | Returns 21 | ------- 22 | NestedFrame 23 | The constructed NestedFrame. 
24 | 25 | Examples 26 | -------- 27 | >>> from nested_pandas.datasets import generate_data 28 | >>> nf1 = generate_data(10, 100) 29 | >>> nf2 = generate_data(10, {"nested_a": 100, "nested_b": 200}) 30 | """ 31 | # use provided seed, "None" acts as if no seed is provided 32 | randomstate = np.random.RandomState(seed=seed) 33 | 34 | # Generate base data 35 | base_data = {"a": randomstate.random(n_base), "b": randomstate.random(n_base) * 2} 36 | base_nf = NestedFrame(data=base_data) 37 | 38 | # In case of int, create a single nested layer called "nested" 39 | if isinstance(n_layer, int): 40 | n_layer = {"nested": n_layer} 41 | 42 | # It should be a dictionary 43 | if isinstance(n_layer, dict): 44 | for key in n_layer: 45 | layer_size = n_layer[key] 46 | layer_data = { 47 | "t": randomstate.random(layer_size * n_base) * 20, 48 | "flux": randomstate.random(layer_size * n_base) * 100, 49 | "band": randomstate.choice(["r", "g"], size=layer_size * n_base), 50 | "index": np.arange(layer_size * n_base) % n_base, 51 | } 52 | layer_nf = NestedFrame(data=layer_data).set_index("index") 53 | base_nf = base_nf.add_nested(layer_nf, key) 54 | return base_nf 55 | else: 56 | raise TypeError("Input to n_layer is not an int or dict.") 57 | 58 | 59 | def generate_parquet_file(n_base, n_layer, path, seed=None): 60 | """Generates a toy dataset and outputs it as a parquet file. 61 | 62 | Parameters 63 | ---------- 64 | n_base : int 65 | The number of rows to generate for the base layer 66 | n_layer : int, or dict 67 | The number of rows per n_base row to generate for a nested layer. 68 | Alternatively, a dictionary of layer label, layer_size pairs may be 69 | specified to create multiple nested columns with custom sizing. 70 | path : str 71 | The path to the parquet file to write. 72 | seed : int, default=None 73 | A seed to use for random generation of data 74 | 75 | Returns 76 | ------- 77 | None 78 | """ 79 | nf = generate_data(n_base, n_layer, seed) 80 | nf.to_parquet(path) 81 | -------------------------------------------------------------------------------- /src/nested_pandas/nestedframe/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import NestedFrame # noqa 2 | from .io import read_parquet # noqa 3 | -------------------------------------------------------------------------------- /src/nested_pandas/nestedframe/expr.py: -------------------------------------------------------------------------------- 1 | """Utilities used by NestedFrame.query() and .eval()""" 2 | 3 | # typing.Self and "|" union syntax don't exist in Python 3.9 4 | from __future__ import annotations 5 | 6 | import ast 7 | import re 8 | from typing import TYPE_CHECKING 9 | 10 | import pandas as pd 11 | from pandas.core.computation import ops 12 | from pandas.core.computation.expr import PARSERS, PandasExprVisitor 13 | from pandas.core.computation.parsing import clean_column_name 14 | 15 | # Avoid cyclic import 16 | if TYPE_CHECKING: 17 | from nested_pandas import NestedFrame 18 | 19 | # Used to identify backtick-protected names in the expressions 20 | # used in NestedFrame.eval() and NestedFrame.query(). 21 | _backtick_protected_names = re.compile(r"`[^`]+`", re.MULTILINE) 22 | 23 | 24 | class NestedPandasExprVisitor(PandasExprVisitor): 25 | """ 26 | Custom expression visitor for NestedFrame evaluations, which may assign to 27 | nested columns. 
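For example (a sketch; the column names are hypothetical), evaluating ``nested.c = nested.a + 1`` with this visitor assigns a new sub-column "c" within the nest "nested", rather than creating a top-level column.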
28 | """ 29 | 30 | def visit_Assign(self, node, **kwargs): # noqa: N802 31 | """ 32 | Visit an assignment node, which may assign to a nested column. 33 | """ 34 | if not isinstance(node.targets[0], ast.Attribute): 35 | # If the target is not an attribute, then it's a simple assignment as usual 36 | return super().visit_Assign(node) 37 | target = node.targets[0] 38 | if not isinstance(target.value, ast.Name): 39 | raise ValueError("Assignments to nested columns must be of the form `nested.col = ...`") 40 | # target.value.id will be the name of the nest, target.attr is the column name. 41 | # Describing the proper target for the assigner is enough for both overwrite and 42 | # creation of new columns. The assigner will be a string like "nested.col". 43 | # This works both for the creation of new nest members and new nests. 44 | self.assigner = f"{target.value.id}.{target.attr}" 45 | # Continue visiting. 46 | return self.visit(node.value, **kwargs) 47 | 48 | 49 | PARSERS["nested-pandas"] = NestedPandasExprVisitor 50 | 51 | 52 | class _SeriesFromNest(pd.Series): 53 | """ 54 | Series that were unpacked from a nest. 55 | """ 56 | 57 | _metadata = ["nest_name", "flat_nest"] 58 | 59 | @property 60 | def _constructor(self) -> Self: # type: ignore[name-defined] # noqa: F821 61 | return _SeriesFromNest 62 | 63 | @property 64 | def _constructor_expanddim(self) -> Self: # type: ignore[name-defined] # noqa: F821 65 | # Avoid cyclic import 66 | from nested_pandas import NestedFrame 67 | 68 | return NestedFrame 69 | 70 | # https://pandas.pydata.org/docs/development/extending.html#arithmetic-with-3rd-party-types 71 | # The __pandas_priority__ of Series is 3000, so give _SeriesFromNest a 72 | # higher priority, so that binary operations involving this class and 73 | # Series produce instances of this class, preserving the type and origin. 74 | __pandas_priority__ = 3500 75 | 76 | 77 | class _NestResolver(dict): 78 | """ 79 | Used by NestedFrame.eval to resolve the names of nests at the top level. 80 | While the resolver is normally a dictionary, with values that are fixed 81 | upon entering evaluation, this object needs to be dynamic so that it can 82 | support multi-line expressions, where new nests may be created during 83 | evaluation. 84 | """ 85 | 86 | def __init__(self, outer: NestedFrame): 87 | self._outer = outer 88 | super().__init__() 89 | # Pre-load the field resolvers for all columns which are known at present. 90 | for column in outer.nested_columns: 91 | self._initialize_column_resolver(column, outer) 92 | 93 | def _initialize_column_resolver(self, column: str, outer: NestedFrame): 94 | """ 95 | Initialize a resolver for the given nested column, and also an alias 96 | for it, in the case of column names that have spaces or are otherwise 97 | not identifier-like. 98 | """ 99 | super().__setitem__(column, _NestedFieldResolver(column, outer)) 100 | clean_id = clean_column_name(column) 101 | # And once more for the cleaned name, if it's different. 102 | # This allows us to capture references to it from the Pandas evaluator. 103 | if clean_id != column: 104 | super().__setitem__(clean_id, _NestedFieldResolver(column, outer)) 105 | 106 | def __contains__(self, item): 107 | top_nest = item if "." not in item else item.split(".")[0].strip() 108 | return top_nest in self._outer.nested_columns 109 | 110 | def __getitem__(self, item): 111 | top_nest = item if "." 
not in item else item.split(".")[0].strip() 112 | if not super().__contains__(top_nest): 113 | if top_nest not in self._outer.nested_columns: 114 | raise KeyError(f"Unknown nest {top_nest}") 115 | self._initialize_column_resolver(top_nest, self._outer) 116 | return super().__getitem__(top_nest) 117 | 118 | def __setitem__(self, item, _): 119 | # Called to update the resolver with intermediate values. 120 | # The important point is to intercept the call so that the evaluator 121 | # does not create any new resolvers on the fly. We do NOT want to 122 | # store the given value, since the resolver does lazy-loading. 123 | # What we DO want to do, however, is to invalidate the cache for 124 | # any field resolver for a given nest that is receiving an assignment. 125 | # Since the resolvers are created as-needed in __getitem__, all we need 126 | # to do is delete them from the local cache when this pattern is detected. 127 | if "." in item: 128 | top_nest = item.split(".")[0].strip() 129 | if top_nest in self._outer.nested_columns and super().__contains__(top_nest): 130 | del self[top_nest] # force re-creation in __getitem__ 131 | 132 | 133 | class _NestedFieldResolver: 134 | """ 135 | Used by NestedFrame.eval to resolve the names of fields in nested columns when 136 | encountered in expressions, interpreting __getattr__ in terms of a 137 | specific nest. 138 | """ 139 | 140 | def __init__(self, nest_name: str, outer: NestedFrame): 141 | self._nest_name = nest_name 142 | # Save the outer frame with an eye toward repacking. 143 | self._outer = outer 144 | # Flattened only once for every access of this particular nest 145 | # within the expression. 146 | self._flat_nest = outer[nest_name].nest.to_flat() 147 | # Save aliases to any columns that are not identifier-like. 148 | # If our given frame has aliases for identifiers, use these instead 149 | # of generating our own. 150 | self._aliases = getattr(outer, "_aliases", None) 151 | if self._aliases is None: 152 | self._aliases = {} 153 | for column in self._flat_nest.columns: 154 | clean_id = clean_column_name(column) 155 | if clean_id != column: 156 | self._aliases[clean_id] = column 157 | 158 | def __getattr__(self, item_name: str): 159 | if self._aliases: 160 | item_name = self._aliases.get(item_name, item_name) 161 | if item_name in self._flat_nest: 162 | result = _SeriesFromNest(self._flat_nest[item_name]) 163 | # Assigning these properties directly in order to avoid any complication 164 | # or interference with the inherited pd.Series constructor. 165 | result.nest_name = self._nest_name 166 | result.flat_nest = self._flat_nest 167 | return result 168 | raise AttributeError(f"No attribute {item_name}") 169 | 170 | 171 | def _subexprs_by_nest(parents: list, node) -> dict[str, list]: 172 | """ 173 | Given an expression which contains references to both base and nested 174 | columns, return a dictionary of the sub-expressions that should be 175 | evaluated independently, keyed by nesting context. 176 | 177 | The key of the dictionary is the name of the nested column, and will 178 | be a blank string in the case of base columns. The value is a list 179 | of the parent nodes that lead to sub-expressions that can be evaluated 180 | successfully. 181 | 182 | While this is not in use today for automatically splitting expressions, 183 | it can be used to detect whether an expression is suitably structured 184 | for evaluation: the returned dictionary should have a single key. 
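For example (hypothetical names): an expression like ``nested.a + b`` would yield two keys, "nested" and "" (for the base column ``b``), signalling that it straddles two nesting contexts, while ``nested.a + nested.b`` would yield the single key "nested".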
185 | """ 186 | if isinstance(node, ops.Term) and not isinstance(node, ops.Constant): 187 | if isinstance(node.value, _SeriesFromNest): 188 | return {node.value.nest_name: parents} 189 | return {getattr(node, "upper_name", ""): parents} 190 | if not isinstance(node, ops.Op): 191 | return {} 192 | sources = [getattr(node, "lhs", None), getattr(node, "rhs", None)] 193 | result: dict[str, list] = {} 194 | for source in sources: 195 | child = _subexprs_by_nest(parents, source) 196 | for k, v in child.items(): 197 | result.setdefault(k, []).append(v) 198 | # After a complete traversal across sources, check for any necessary splits. 199 | # If it's homogenous, move the split-node up the tree. 200 | if len(result) == 1: 201 | # Let the record of each parent node drift up the tree, 202 | # and merge the subtrees into a single node, since by definition, 203 | # this node is homogeneous over all of its children, and can 204 | # be evaluated in a single step. 205 | result = {k: [node] for k in result} 206 | # If the result is either empty or has more than one key, leave the result 207 | # alone. Each key represents a different nest (with a blank string for the base), 208 | # and the value is the highest point in the expression tree where the expression 209 | # was still within a single nest. 210 | return result 211 | 212 | 213 | def _identify_aliases(expr: str) -> tuple[str, dict[str, str]]: 214 | """ 215 | Given an expression string, identify backtick-quoted names 216 | and replace them with cleaned names, returning the cleaned 217 | expression and a dictionary of aliases, where the keys are 218 | clean aliases to the original names. 219 | """ 220 | aliases = {} 221 | 222 | def sub_and_alias(match): 223 | original = match.group(0)[1:-1] # remove backticks 224 | alias = clean_column_name(original) 225 | if alias != original: 226 | aliases[alias] = original 227 | return alias 228 | 229 | return _backtick_protected_names.sub(sub_and_alias, expr), aliases 230 | -------------------------------------------------------------------------------- /src/nested_pandas/nestedframe/io.py: -------------------------------------------------------------------------------- 1 | # typing.Self and "|" union syntax don't exist in Python 3.9 2 | from __future__ import annotations 3 | 4 | from collections.abc import Sequence 5 | 6 | import pandas as pd 7 | import pyarrow as pa 8 | import pyarrow.parquet as pq 9 | from upath import UPath 10 | 11 | from ..series.dtype import NestedDtype 12 | from ..series.utils import table_to_struct_array 13 | from .core import NestedFrame 14 | 15 | 16 | def read_parquet( 17 | data: str | UPath | bytes, 18 | columns: list[str] | None = None, 19 | reject_nesting: list[str] | str | None = None, 20 | **kwargs, 21 | ) -> NestedFrame: 22 | """ 23 | Load a parquet object from a file path into a NestedFrame. 24 | 25 | As a deviation from `pandas`, this function loads via 26 | `pyarrow.parquet.read_table`, and then converts to a NestedFrame. 27 | 28 | Parameters 29 | ---------- 30 | data: str, Upath, or file-like object 31 | Path to the data or a file-like object. If a string is passed, it can be a single file name, 32 | directory name, or a remote path (e.g., HTTP/HTTPS or S3). If a file-like object is passed, 33 | it must support the `read` method. 34 | columns : list, default=None 35 | If not None, only these columns will be read from the file. 36 | reject_nesting: list or str, default=None 37 | Column(s) to reject from being cast to a nested dtype. 
By default, 38 | nested-pandas assumes that any struct column with all fields being lists 39 | is castable to a nested column. However, this assumption is invalid if 40 | the lists within the struct have mismatched lengths for any given item. 41 | Columns specified here will be read using the corresponding pandas.ArrowDtype. 42 | kwargs: dict 43 | Keyword arguments passed to `pyarrow.parquet.read_table` 44 | 45 | Returns 46 | ------- 47 | NestedFrame 48 | 49 | Notes 50 | ----- 51 | pyarrow supports partial loading of nested structures from parquet, for 52 | example ```pd.read_parquet("data.parquet", columns=["nested.a"])``` will 53 | load the "a" column of the "nested" column. Standard pandas/pyarrow 54 | behavior will return "a" as a list-array base column with name "a". In 55 | nested-pandas, this behavior is changed to load the column as a sub-column 56 | of a nested column called "nested". Be aware that this will prohibit calls 57 | like ```pd.read_parquet("data.parquet", columns=["nested.a", "nested"])``` 58 | from working, as this implies both full and partial load of "nested". 59 | 60 | Furthermore, there are some cases where subcolumns will have the same name 61 | as a top-level column. For example, you might have a column "nested" with 62 | subcolumns "nested.a" and "nested.b", and also a top-level column "a". In 63 | these cases, keep in mind that if "nested" is in the reject_nesting list 64 | the operation will fail, as is consistent with the default pandas behavior 65 | (but nesting will still work normally). 66 | 67 | Examples 68 | -------- 69 | 70 | Simple loading example: 71 | 72 | >>> import nested_pandas as npd 73 | >>> nf = npd.read_parquet("path/to/file.parquet") # doctest: +SKIP 74 | 75 | Partial loading: 76 | 77 | >>> # Load only the "flux" sub-column of the "nested" column 78 | >>> nf = npd.read_parquet("path/to/file.parquet", columns=["a", "nested.flux"]) # doctest: +SKIP 79 | """ 80 | 81 | # Type convergence for reject_nesting 82 | if reject_nesting is None: 83 | reject_nesting = [] 84 | elif isinstance(reject_nesting, str): 85 | reject_nesting = [reject_nesting] 86 | 87 | # First load through pyarrow 88 | # Check if `data` is a file-like object or a sequence 89 | if hasattr(data, "read") or ( 90 | isinstance(data, Sequence) and not isinstance(data, str | bytes | bytearray) 91 | ): 92 | # If `data` is a file-like object or a sequence, pass it directly to pyarrow 93 | table = pq.read_table(data, columns=columns, **kwargs) 94 | else: 95 | # Otherwise, treat `data` as a file path and use UPath 96 | path = UPath(data) 97 | filesystem = kwargs.pop("filesystem", path.fs) 98 | table = pq.read_table(path.path, columns=columns, filesystem=filesystem, **kwargs) 99 | 100 | # Resolve partial loading of nested structures 101 | # Using pyarrow to avoid naming conflicts from partial loading ("flux" vs "lc.flux") 102 | # Use input column names and the table column names to determine if a column 103 | # was from a nested column. 
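# For example (hypothetical file): columns=["a", "nested.flux"] comes back from pyarrow as table columns ["a", "flux"]; the name mismatch on the second entry marks it as a partial load of the nested column "nested".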
104 | if columns is not None: 105 | nested_structures: dict[str, list[int]] = {} 106 | for i, (col_in, col_pa) in enumerate(zip(columns, table.column_names, strict=True)): 107 | # if the column name is not the same, it was a partial load 108 | if col_in != col_pa: 109 | # get the top-level column name 110 | nested_col = col_in.split(".")[0] 111 | 112 | # validate that the partial load columns are list type 113 | # if any of the columns are not list type, reject the cast 114 | # and remove the column from the list of nested structures if 115 | # it was added 116 | if not pa.types.is_list(table.schema[i].type): 117 | reject_nesting.append(nested_col) 118 | if nested_col in nested_structures: 119 | # remove the column from the list of nested structures 120 | nested_structures.pop(nested_col) 121 | # track nesting for columns not in the reject list 122 | elif nested_col not in reject_nesting: 123 | if nested_col not in nested_structures: 124 | nested_structures[nested_col] = [i] 125 | else: 126 | nested_structures[nested_col].append(i) 127 | 128 | # Check for full and partial load of the same column and error 129 | # Columns in the reject_nesting will not be checked 130 | for col in columns: 131 | if col in nested_structures: 132 | raise ValueError( 133 | f"The provided column list contains both a full and partial " 134 | f"load of the column '{col}'. This is not allowed as the partial " 135 | "load will be cast to a nested column that already exists. " 136 | "Please either remove the partial load or the full load." 137 | ) 138 | 139 | # Build structs and track column indices used 140 | structs = {} 141 | indices_to_remove = [] 142 | for col, indices in nested_structures.items(): 143 | # Build a struct column from the columns 144 | structs[col] = table_to_struct_array(table.select(indices)) 145 | indices_to_remove.extend(indices) 146 | 147 | # Remove the original columns in reverse order to avoid index shifting 148 | for i in sorted(indices_to_remove, reverse=True): 149 | table = table.remove_column(i) 150 | 151 | # Append the new struct columns 152 | for col, struct in structs.items(): 153 | table = table.append_column(col, struct) 154 | 155 | # Convert to NestedFrame 156 | # not zero-copy, but reduce memory pressure via the self_destruct kwarg 157 | # https://arrow.apache.org/docs/python/pandas.html#reducing-memory-use-in-table-to-pandas 158 | df = NestedFrame(table.to_pandas(types_mapper=pd.ArrowDtype, split_blocks=True, self_destruct=True)) 159 | del table 160 | # Attempt to cast struct columns to NestedDTypes 161 | df = _cast_struct_cols_to_nested(df, reject_nesting) 162 | 163 | return df 164 | 165 | 166 | def _cast_struct_cols_to_nested(df, reject_nesting): 167 | """cast struct columns to nested dtype""" 168 | # Attempt to cast struct columns to NestedDTypes 169 | for col, dtype in df.dtypes.items(): 170 | # First validate the dtype 171 | # will return valueerror when not a struct-list 172 | valid_dtype = True 173 | try: 174 | NestedDtype._validate_dtype(dtype.pyarrow_dtype) 175 | except ValueError: 176 | valid_dtype = False 177 | 178 | if valid_dtype and col not in reject_nesting: 179 | try: 180 | # Attempt to cast Struct to NestedDType 181 | df = df.astype({col: NestedDtype(dtype.pyarrow_dtype)}) 182 | except ValueError as err: 183 | # If cast fails, the struct likely does not fit nested-pandas 184 | # criteria for a valid nested column 185 | raise ValueError( 186 | f"Column '{col}' is a Struct, but an attempt to cast it to a NestedDType failed. 
" 187 | "This is likely due to the struct not meeting the requirements for a nested column " 188 | "(all fields should be equal length). To proceed, you may add the column to the " 189 | "`reject_nesting` argument of the read_parquet function to skip the cast attempt:" 190 | f" read_parquet(..., reject_nesting=['{col}'])" 191 | ) from err 192 | return df 193 | -------------------------------------------------------------------------------- /src/nested_pandas/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/src/nested_pandas/py.typed -------------------------------------------------------------------------------- /src/nested_pandas/series/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/src/nested_pandas/series/__init__.py -------------------------------------------------------------------------------- /src/nested_pandas/series/_storage/__init__.py: -------------------------------------------------------------------------------- 1 | from .list_struct_storage import ListStructStorage # noqa: F401 2 | from .struct_list_storage import StructListStorage # noqa: F401 3 | from .table_storage import TableStorage # noqa: F401 4 | -------------------------------------------------------------------------------- /src/nested_pandas/series/_storage/list_struct_storage.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations # Python 3.9 requires it for X | Y type hints 2 | 3 | from typing import TYPE_CHECKING, Any 4 | 5 | import pyarrow as pa 6 | 7 | from nested_pandas.series.utils import transpose_struct_list_chunked, validate_list_struct_type 8 | 9 | if TYPE_CHECKING: 10 | from nested_pandas.series._storage.struct_list_storage import StructListStorage 11 | from nested_pandas.series._storage.table_storage import TableStorage 12 | 13 | 14 | class ListStructStorage: 15 | """Store nested data as a PyArrow list-struct array. 16 | 17 | Parameters 18 | ---------- 19 | array : pa.ListArray or pa.ChunkedArray 20 | Pyarrow list-array with a struct value type. An array or a chunk-array 21 | """ 22 | 23 | _data: pa.ChunkedArray 24 | 25 | def __init__(self, array: pa.ListArray | pa.ChunkedArray) -> None: 26 | if isinstance(array, pa.ListArray): 27 | array = pa.chunked_array([array]) 28 | if not isinstance(array, pa.ChunkedArray): 29 | raise ValueError("array must be of type pa.ChunkedArray") 30 | validate_list_struct_type(array.type) 31 | self._data = array 32 | 33 | @property 34 | def data(self) -> pa.ChunkedArray: 35 | return self._data 36 | 37 | @classmethod 38 | def from_struct_list_storage(cls, struct_list_storage: StructListStorage) -> Self: # type: ignore # noqa: F821 39 | """Construct from a StructListStorage object. 40 | 41 | Parameters 42 | ---------- 43 | struct_list_storage : StructListStorage 44 | StructListStorage object. 45 | """ 46 | data = transpose_struct_list_chunked(struct_list_storage.data, validate=False) 47 | return cls(data) 48 | 49 | @classmethod 50 | def from_table_storage(cls, table_storage: TableStorage) -> Self: # type: ignore # noqa: F821 51 | """Construct from a TableStorage object. 52 | 53 | Parameters 54 | ---------- 55 | table_storage : TableStorage 56 | TableStorage object. 
57 | """ 58 | from nested_pandas.series._storage import StructListStorage 59 | 60 | struct_list_storage = StructListStorage.from_table_storage(table_storage) 61 | return cls.from_struct_list_storage(struct_list_storage) 62 | 63 | def __len__(self) -> int: 64 | return len(self._data) 65 | 66 | def __eq__(self, other: Any) -> bool: 67 | if not isinstance(other, type(self)): 68 | return False 69 | return self._data == other._data 70 | 71 | @property 72 | def nbytes(self) -> int: 73 | """Number of bytes consumed by the data in memory.""" 74 | return self._data.nbytes 75 | 76 | @property 77 | def type(self) -> pa.ListType: 78 | """Pyarrow type of the underlying array.""" 79 | return self._data.type 80 | 81 | @property 82 | def num_chunks(self) -> int: 83 | """Number of chunks in the underlying array.""" 84 | return self._data.num_chunks 85 | -------------------------------------------------------------------------------- /src/nested_pandas/series/_storage/struct_list_storage.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations # Python 3.9 requires it for X | Y type hints 2 | 3 | from collections.abc import Iterator 4 | from typing import TYPE_CHECKING 5 | 6 | import pyarrow as pa 7 | 8 | from nested_pandas.series.utils import ( 9 | table_to_struct_array, 10 | transpose_list_struct_chunked, 11 | validate_struct_list_array_for_equal_lengths, 12 | ) 13 | 14 | if TYPE_CHECKING: 15 | from nested_pandas.series._storage.list_struct_storage import ListStructStorage 16 | from nested_pandas.series._storage.table_storage import TableStorage 17 | 18 | 19 | class StructListStorage: 20 | """Store nested data as a PyArrow struct-list array. 21 | 22 | Parameters 23 | ---------- 24 | array : pa.StructArray or pa.ChunkedArray 25 | Pyarrow struct-array with all fields to be list-arrays. 26 | All list-values must be "aligned", e.g., have the same length. 27 | validate : bool (default True) 28 | Check that all the lists have the same lengths for each struct-value. 29 | """ 30 | 31 | _data: pa.ChunkedArray 32 | 33 | def __init__(self, array: pa.StructArray | pa.ChunkedArray, *, validate: bool = True) -> None: 34 | if isinstance(array, pa.StructArray): 35 | array = pa.chunked_array([array]) 36 | if not isinstance(array, pa.ChunkedArray): 37 | raise ValueError("array must be a StructArray or ChunkedArray") 38 | 39 | if validate: 40 | for chunk in array.chunks: 41 | validate_struct_list_array_for_equal_lengths(chunk) 42 | 43 | self._data = array 44 | 45 | @property 46 | def data(self) -> pa.ChunkedArray: 47 | return self._data 48 | 49 | @classmethod 50 | def from_list_struct_storage(cls, list_struct_storage: ListStructStorage) -> Self: # type: ignore # noqa: F821 51 | """Construct from a ListStructStorage object. 52 | 53 | Parameters 54 | ---------- 55 | list_struct_storage : ListStructStorage 56 | ListStructStorage object. 57 | """ 58 | data = transpose_list_struct_chunked(list_struct_storage.data) 59 | return cls(data, validate=False) 60 | 61 | @classmethod 62 | def from_table_storage(cls, table_storage: TableStorage) -> Self: # type: ignore # noqa: F821 63 | """Construct from a TableStorage object. 64 | 65 | Parameters 66 | ---------- 67 | table_storage : TableStorage 68 | TableStorage object. 
69 | """ 70 | data = table_to_struct_array(table_storage.data) 71 | return cls(data, validate=False) 72 | 73 | def __iter__(self) -> Iterator[pa.StructScalar]: 74 | return iter(self._data) 75 | -------------------------------------------------------------------------------- /src/nested_pandas/series/_storage/table_storage.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations # Python 3.9 requires it for X | Y type hints 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | import pyarrow as pa 6 | 7 | from nested_pandas.series.utils import ( 8 | table_from_struct_array, 9 | table_to_struct_array, 10 | validate_struct_list_array_for_equal_lengths, 11 | ) 12 | 13 | if TYPE_CHECKING: 14 | from nested_pandas.series._storage.list_struct_storage import ListStructStorage 15 | from nested_pandas.series._storage.struct_list_storage import StructListStorage 16 | 17 | 18 | class TableStorage: 19 | """Store nested data as a PyArrow table with list-columns. 20 | 21 | Parameters 22 | ---------- 23 | table : pa.Table 24 | PyArrow table, all columns must be list-columns. 25 | All list-values must be "aligned", e.g., have the same length. 26 | """ 27 | 28 | _data: pa.Table 29 | 30 | def __init__(self, table: pa.Table, validate: bool = True) -> None: 31 | if validate: 32 | struct_array = table_to_struct_array(table) 33 | for chunk in struct_array.iterchunks(): 34 | validate_struct_list_array_for_equal_lengths(chunk) 35 | 36 | self._data = table 37 | 38 | @property 39 | def data(self) -> pa.Table: 40 | return self._data 41 | 42 | @classmethod 43 | def from_list_struct_storage(cls, list_storage: ListStructStorage) -> Self: # type: ignore # noqa: F821 44 | """Construct from a StructListStorage object. 45 | 46 | Parameters 47 | ---------- 48 | list_storage : ListStructStorage 49 | StructListStorage object. 50 | """ 51 | from nested_pandas.series._storage import StructListStorage 52 | 53 | struct_list_storage = StructListStorage.from_list_struct_storage(list_storage) 54 | return cls.from_struct_list_storage(struct_list_storage) 55 | 56 | @classmethod 57 | def from_struct_list_storage(cls, struct_list_storage: StructListStorage) -> Self: # type: ignore # noqa: F821 58 | """Construct from a StructListStorage object. 59 | 60 | Parameters 61 | ---------- 62 | struct_list_storage : StructListStorage 63 | StructListStorage object. 
64 | """ 65 | table = table_from_struct_array(struct_list_storage.data) 66 | return cls(table, validate=False) 67 | -------------------------------------------------------------------------------- /src/nested_pandas/series/dtype.py: -------------------------------------------------------------------------------- 1 | # Use Self, which is not available until Python 3.11 2 | from __future__ import annotations 3 | 4 | from collections.abc import Mapping 5 | 6 | # We use Type, because we must use "type" as an attribute name 7 | from typing import Type, cast # noqa: UP035 8 | 9 | import pandas as pd 10 | import pyarrow as pa 11 | from pandas import ArrowDtype 12 | from pandas.api.extensions import register_extension_dtype 13 | from pandas.core.arrays import ExtensionArray 14 | from pandas.core.dtypes.base import ExtensionDtype 15 | 16 | from nested_pandas.series.utils import ( 17 | is_pa_type_is_list_struct, 18 | transpose_list_struct_type, 19 | transpose_struct_list_type, 20 | ) 21 | 22 | __all__ = ["NestedDtype"] 23 | 24 | 25 | @register_extension_dtype 26 | class NestedDtype(ExtensionDtype): 27 | """Data type to handle packed time series data 28 | 29 | Parameters 30 | ---------- 31 | pyarrow_dtype : pyarrow.StructType or pd.ArrowDtype 32 | The pyarrow data type to use for the nested type. It must be a struct 33 | type where all fields are list types. 34 | """ 35 | 36 | # ExtensionDtype overrides # 37 | 38 | _metadata = ("pyarrow_dtype",) 39 | """Attributes to use as metadata for __eq__ and __hash__""" 40 | 41 | @property 42 | def na_value(self) -> Type[pd.NA]: 43 | """The missing value for this dtype""" 44 | return pd.NA 45 | 46 | type = pd.DataFrame 47 | """The type of the array's elements, always pd.DataFrame""" 48 | 49 | @property 50 | def name(self) -> str: 51 | """The string representation of the nested type""" 52 | # Replace pd.ArrowDtype with pa.DataType, because it has nicer __str__ 53 | nice_dtypes = { 54 | field: dtype.pyarrow_dtype if isinstance(dtype, pd.ArrowDtype) else dtype 55 | for field, dtype in self.field_dtypes.items() 56 | } 57 | fields = ", ".join([f"{field}: [{dtype!s}]" for field, dtype in nice_dtypes.items()]) 58 | return f"nested<{fields}>" 59 | 60 | def __repr__(self) -> str: 61 | return self.name 62 | 63 | @classmethod 64 | def construct_array_type(cls) -> Type[ExtensionArray]: 65 | """Corresponded array type, always NestedExtensionArray""" 66 | from nested_pandas.series.ext_array import NestedExtensionArray 67 | 68 | return NestedExtensionArray 69 | 70 | @classmethod 71 | def construct_from_string(cls, string: str) -> Self: # type: ignore[name-defined] # noqa: F821 72 | """Construct NestedDtype from a string representation. 73 | 74 | This works only for simple types, i.e. non-parametric pyarrow types. 75 | 76 | Parameters 77 | ---------- 78 | string : str 79 | The string representation of the nested type. 
For example, 80 | 'nested<x: [int64], y: [float64]>'. Note that only plain 81 | (non-parametric) pyarrow types are supported as field types. 82 | 83 | Returns 84 | ------- 85 | NestedDtype 86 | The constructed NestedDtype. 87 | 88 | Raises 89 | ------ 90 | TypeError 91 | If the string is not a valid nested type string, or if the field 92 | types are parametric pyarrow types that cannot be parsed from 93 | their string representation. 94 | """ 95 | if not string.startswith("nested<") or not string.endswith(">"): 96 | raise TypeError("Not a valid nested type string, expected 'nested<...>'") 97 | fields_str = string.removeprefix("nested<").removesuffix(">") 98 | 99 | field_strings = fields_str.split(", ") 100 | 101 | fields = {} 102 | for field_string in field_strings: 103 | try: 104 | field_name, field_type = field_string.split(": ", maxsplit=1) 105 | except ValueError as e: 106 | raise TypeError( 107 | "Not a valid nested type string, expected 'nested<...>', got invalid field " 108 | f"string '{field_string}'" 109 | ) from e 110 | if not field_type.startswith("[") or not field_type.endswith("]"): 111 | raise TypeError( 112 | "Not a valid nested type string, expected 'nested<...>', got invalid field " 113 | f"type string '{field_type}'" 114 | ) 115 | 116 | value_type = field_type.removeprefix("[").removesuffix("]") 117 | # We follow ArrowDtype implementation here and do not try to parse complex types 118 | try: 119 | pa_value_type = pa.type_for_alias(value_type) 120 | except ValueError as e: 121 | raise TypeError( 122 | f"Parsing pyarrow specific parameters in the string is not supported yet: {value_type}. " 123 | "Please use NestedDtype() or NestedDtype.from_fields() instead." 124 | ) from e 125 | 126 | fields[field_name] = pa_value_type 127 | 128 | return cls.from_fields(fields) 129 | 130 | # ArrowDtype would return None here, so we do the same 131 | def _get_common_dtype(self, dtypes: list) -> None: 132 | return None 133 | 134 | # Optional methods # 135 | 136 | def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ExtensionArray: 137 | """Construct a NestedExtensionArray from a pyarrow array. 138 | 139 | Parameters 140 | ---------- 141 | array : pa.Array | pa.ChunkedArray 142 | The input pyarrow array. 143 | 144 | Returns 145 | ------- 146 | NestedExtensionArray 147 | The constructed NestedExtensionArray. 148 | """ 149 | from nested_pandas.series.ext_array import NestedExtensionArray 150 | 151 | return NestedExtensionArray(array) 152 | 153 | # Additional methods and attributes # 154 | 155 | pyarrow_dtype: pa.StructType 156 | 157 | def __init__(self, pyarrow_dtype: pa.DataType) -> None: 158 | self.pyarrow_dtype, self.list_struct_pa_dtype = self._validate_dtype(pyarrow_dtype) 159 | 160 | @property 161 | def struct_list_pa_dtype(self) -> pa.StructType: 162 | """Struct-list pyarrow type representing the nested type.""" 163 | return self.pyarrow_dtype 164 | 165 | @classmethod 166 | def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore[name-defined] # noqa: F821 167 | """Make NestedDtype from a mapping of field names and list item types. 168 | 169 | Parameters 170 | ---------- 171 | fields : Mapping[str, pa.DataType] 172 | A mapping of field names and their item types. Since all fields are lists, the item types are 173 | inner types of the lists, not the list types themselves. 174 | 175 | Returns 176 | ------- 177 | NestedDtype 178 | The constructed NestedDtype. 179 | 180 | Examples 181 | -------- 182 | >>> dtype = NestedDtype.from_fields({"a": pa.float64(), "b": pa.int64()}) 183 | >>> dtype 184 | nested<a: [double], b: [int64]> 185 | >>> assert ( 186 | ... dtype.pyarrow_dtype 187 | ... == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())}) ... 
) 188 | """ 189 | pyarrow_dtype = pa.struct({field: pa.list_(pa_type) for field, pa_type in fields.items()}) 190 | pyarrow_dtype = cast(pa.StructType, pyarrow_dtype) 191 | return cls(pyarrow_dtype=pyarrow_dtype) 192 | 193 | @staticmethod 194 | def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.ListType]: 195 | """Check that the given pyarrow type is castable to the nested type. 196 | 197 | Parameters 198 | ---------- 199 | pyarrow_dtype : pa.DataType 200 | The pyarrow type to check and cast. 201 | 202 | Returns 203 | ------- 204 | pa.StructType 205 | Struct-list pyarrow type representing the nested type. 206 | pa.ListType 207 | List-struct pyarrow type representing the nested type. 208 | """ 209 | if not isinstance(pyarrow_dtype, pa.DataType): 210 | raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}") 211 | if pa.types.is_struct(pyarrow_dtype): 212 | struct_type = cast(pa.StructType, pyarrow_dtype) 213 | return struct_type, transpose_struct_list_type(struct_type) 214 | # Currently, LargeList and others are not supported 215 | if pa.types.is_list(pyarrow_dtype): 216 | list_type = cast(pa.ListType, pyarrow_dtype) 217 | return transpose_list_struct_type(list_type), list_type 218 | raise ValueError( 219 | f"NestedDtype can only be constructed with pa.StructType or pa.ListType, got {pyarrow_dtype}" 220 | ) 221 | 222 | @property 223 | def fields(self) -> dict[str, pa.DataType]: 224 | """The mapping of field names and their item types.""" 225 | return {field.name: field.type.value_type for field in self.pyarrow_dtype} 226 | 227 | @property 228 | def field_names(self) -> list[str]: 229 | """The list of field names of the nested type""" 230 | return [field.name for field in self.pyarrow_dtype] 231 | 232 | @classmethod 233 | def from_pandas_arrow_dtype(cls, pandas_arrow_dtype: ArrowDtype) -> Self: # type: ignore[name-defined] # noqa: F821 234 | """Construct NestedDtype from a pandas.ArrowDtype. 235 | 236 | Parameters 237 | ---------- 238 | pandas_arrow_dtype : ArrowDtype 239 | The pandas.ArrowDtype to construct NestedDtype from. 240 | Must be struct-list or list-struct type. 241 | 242 | Returns 243 | ------- 244 | NestedDtype 245 | The constructed NestedDtype. 246 | 247 | Raises 248 | ------ 249 | ValueError 250 | If the given dtype is not a valid nested type. 251 | """ 252 | return cls(pyarrow_dtype=pandas_arrow_dtype.pyarrow_dtype) 253 | 254 | def to_pandas_arrow_dtype(self, list_struct: bool = False) -> ArrowDtype: 255 | """Convert NestedDtype to a pandas.ArrowDtype. 256 | 257 | Parameters 258 | ---------- 259 | list_struct : bool, default False 260 | If False (default) use pyarrow struct-list type, 261 | otherwise use pyarrow list-struct type. 262 | 263 | Returns 264 | ------- 265 | ArrowDtype 266 | The corresponding pandas.ArrowDtype. 267 | """ 268 | if list_struct: 269 | return ArrowDtype(self.list_struct_pa_dtype) 270 | return ArrowDtype(self.pyarrow_dtype) 271 | 272 | def field_dtype(self, field: str) -> pd.ArrowDtype | Self: # type: ignore[name-defined] # noqa: F821 273 | """Pandas dtype of a field, pd.ArrowDtype or NestedDtype. 274 | 275 | Parameters 276 | ---------- 277 | field : str 278 | Field name 279 | 280 | Returns 281 | ------- 282 | pd.ArrowDtype | NestedDtype 283 | If the field is a list-struct, return NestedDtype, else wrap it 284 | as a pd.ArrowDtype. 
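Examples -------- A quick sketch: >>> NestedDtype.from_fields({"a": pa.float64()}).field_dtype("a") double[pyarrow]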
286 | """ 287 | list_type = self.pyarrow_dtype.field(field).type 288 | value_type = list_type.value_type 289 | if is_pa_type_is_list_struct(value_type): 290 | return type(self)(value_type) 291 | return pd.ArrowDtype(value_type) 292 | 293 | @property 294 | def field_dtypes(self) -> dict[str, pd.ArrowDtype | Self]: # type: ignore[name-defined] # noqa: F821 295 | """Pandas dtypes of this dtype's fields.""" 296 | return {field: self.field_dtype(field) for field in self.field_names} 297 | -------------------------------------------------------------------------------- /src/nested_pandas/series/packer.py: -------------------------------------------------------------------------------- 1 | """Module for converting between "flat" and "list" and "nested" representations 2 | 3 | TODO: mask support 4 | TODO: multi-index support 5 | """ 6 | 7 | # "|" for python 3.9 8 | from __future__ import annotations 9 | 10 | from collections.abc import Sequence 11 | 12 | import numpy as np 13 | import pandas as pd 14 | import pyarrow as pa 15 | 16 | from nested_pandas.series.dtype import NestedDtype 17 | from nested_pandas.series.ext_array import NestedExtensionArray 18 | 19 | __all__ = ["pack", "pack_flat", "pack_lists", "pack_seq"] 20 | 21 | 22 | N_ROWS_INFER_DTYPE = 1000 23 | 24 | 25 | def pack( 26 | obj, 27 | name: str | None = None, 28 | *, 29 | index=None, 30 | on: None | str | list[str] = None, 31 | dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None, 32 | ) -> pd.Series: 33 | """Pack a "flat" dataframe or a sequence of dataframes into a "nested" series. 34 | 35 | Parameters 36 | ---------- 37 | obj : pd.DataFrame or Sequence of 38 | Input dataframe, with repeated indexes, or a sequence of dataframes or missed values. 39 | name : str, optional 40 | Name of the output series. 41 | index : convertable to pd.Index, optional 42 | Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index, 43 | and this value is used to override the index after the nesting. 44 | on: str or list of str, optional 45 | Column name(s) to join on. If None, the index is used. 46 | dtype : dtype or None 47 | NestedDtype of the output series, or other type to derive from. If None, 48 | the dtype is inferred from the first non-missing dataframe. 49 | 50 | Returns 51 | ------- 52 | pd.Series 53 | Output series. 54 | """ 55 | if isinstance(obj, pd.DataFrame): 56 | nested = pack_flat(obj, name=name, on=on) 57 | if index is not None: 58 | nested.index = index 59 | return nested 60 | return pack_seq(obj, name=name, index=index, dtype=dtype) 61 | 62 | 63 | def pack_flat(df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] = None) -> pd.Series: 64 | """Make a structure of lists representation of a "flat" dataframe. 65 | 66 | For the input dataframe with repeated indexes, make a pandas.Series, 67 | where each original column is replaced by a structure of lists. 68 | The dtype of the column is `nested_pandas.NestedDtype` with 69 | the corresponding pyarrow type. The index of the output series is 70 | the unique index of the input dataframe. The Series has `.nest` accessor, 71 | see `nested_pandas.series.accessor.NestSeriesAccessor` for details. 72 | 73 | Parameters 74 | ---------- 75 | df : pd.DataFrame 76 | Input dataframe, with repeated indexes. 77 | name : str, optional 78 | Name of the pd.Series. 79 | on : str or list of str, optional 80 | Column name(s) to join on. If None, the df's index is used. 
81 | 82 | Returns 83 | ------- 84 | pd.Series 85 | Output series, with unique indexes. 86 | 87 | See Also 88 | -------- 89 | nested_pandas.series.accessor.NestSeriesAccessor : .nest accessor for the output series. 90 | nested_pandas.series.dtype.NestedDtype : The dtype of the output series. 91 | nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays. 92 | """ 93 | 94 | if on is not None: 95 | df = df.set_index(on) 96 | # pandas knows when index is pre-sorted, so it would do nothing if it is already sorted 97 | sorted_flat = df.sort_index(kind="stable") 98 | return pack_sorted_df_into_struct(sorted_flat, name=name) 99 | 100 | 101 | def pack_seq( 102 | sequence: Sequence, 103 | name: str | None = None, 104 | *, 105 | index: object = None, 106 | dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None, 107 | ) -> pd.Series: 108 | """Pack a sequence of "flat" dataframes into a "nested" series. 109 | 110 | Parameters 111 | ---------- 112 | sequence : Sequence of pd.DataFrame or None or pd.NA or convertible to pa.StructScalar 113 | Input sequence of dataframes or missing values. 114 | name : str, optional 115 | Name of the output series. 116 | index : pd.Index, optional 117 | Index of the output series. 118 | dtype : dtype or None 119 | NestedDtype of the output series, or other type to derive from. If None, 120 | the dtype is inferred from the first non-missing dataframe. 121 | 122 | Returns 123 | ------- 124 | pd.Series 125 | Output series. 126 | """ 127 | if isinstance(sequence, pd.Series): 128 | if index is None: 129 | index = sequence.index 130 | if name is None: 131 | name = sequence.name 132 | 133 | ext_array = NestedExtensionArray.from_sequence(sequence, dtype=dtype) 134 | series = pd.Series(ext_array, index=index, name=name, copy=False) 135 | return series 136 | 137 | 138 | def pack_sorted_df_into_struct(df: pd.DataFrame, name: str | None = None) -> pd.Series: 139 | """Make a structure of lists representation of a "flat" dataframe. 140 | 141 | Input dataframe must be sorted and all the columns must have pyarrow dtypes. 142 | 143 | Parameters 144 | ---------- 145 | df : pd.DataFrame 146 | Input dataframe, with repeated indexes. It must be sorted and 147 | all the columns must have pyarrow dtypes. 148 | 149 | name : str, optional 150 | Name of the pd.Series. 151 | 152 | Returns 153 | ------- 154 | pd.Series 155 | Output series, with unique indexes. 156 | """ 157 | if not df.index.is_monotonic_increasing: 158 | raise ValueError("The index of the input dataframe must be sorted") 159 | 160 | packed_df = view_sorted_df_as_list_arrays(df) 161 | # No need to validate the dataframe, the length of the nested arrays is forced to be the same by 162 | # the view_sorted_df_as_list_arrays function. 163 | return pack_lists(packed_df, name=name, validate=False) 164 | 165 | 166 | def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = True) -> pd.Series: 167 | """Make a series of arrow structures from a dataframe with nested arrays. 168 | 169 | For the input dataframe with pyarrow list-array columns, make a 170 | pandas.Series, where the original columns are combined into a structure 171 | of lists. The dtype of the column is `nested_pandas.NestedDtype` with the corresponding 172 | pyarrow type. The index of the output series is the unique index of the 173 | input dataframe. The Series has `.nest` accessor, see 174 | `nested_pandas.series.accessor.NestSeriesAccessor` for details. 
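For example, a minimal sketch with a hypothetical one-column input: >>> lists = pd.DataFrame({"t": pd.Series([[1, 2], [3]], dtype=pd.ArrowDtype(pa.list_(pa.int64())))}) >>> packed = pack_lists(lists)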
166 | def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = True) -> pd.Series:
167 | """Make a series of arrow structures from a dataframe with nested arrays.
168 | 
169 | For the input dataframe of pyarrow list-array columns, make a pandas.Series,
170 | where each original column becomes a field of a structure of lists.
171 | The dtype of the column is `nested_pandas.NestedDtype` with the corresponding
172 | pyarrow type. The index of the output series is the index of the
173 | input dataframe. The Series has `.nest` accessor, see
174 | `nested_pandas.series.accessor.NestSeriesAccessor` for details.
175 | 
176 | For every row, all the nested arrays (aka pyarrow lists) must have
177 | the same length.
178 | 
179 | Parameters
180 | ----------
181 | df : pd.DataFrame
182 | Input dataframe, with pyarrow list-arrays.
183 | name : str, optional
184 | Name of the pd.Series.
185 | validate : bool, default True
186 | Whether to validate the input dataframe.
187 | 
188 | Returns
189 | -------
190 | pd.Series
191 | Output series, with unique indexes.
192 | 
193 | See Also
194 | --------
195 | nested_pandas.series.accessor.NestSeriesAccessor : The accessor for the output series.
196 | nested_pandas.series.dtype.NestedDtype : The dtype of the output series.
197 | nested_pandas.series.packer.pack_flat : Pack a "flat" dataframe with repeated indexes.
198 | """
199 | # When a series is converted to pa.array, the result may be either an Array or a ChunkedArray.
200 | # We convert it to chunked for the sake of consistency.
201 | pa_arrays_maybe_chunked = {column: pa.array(df[column]) for column in df.columns}
202 | pa_chunked_arrays = {
203 | column: arr if isinstance(arr, pa.ChunkedArray) else pa.chunked_array([arr])
204 | for column, arr in pa_arrays_maybe_chunked.items()
205 | }
206 | 
207 | # If all chunked arrays have the same chunk lengths, we can build a chunked struct array with no
208 | # data copying.
209 | chunk_lengths = pa.array([[len(chunk) for chunk in arr.chunks] for arr in pa_chunked_arrays.values()])
210 | if all(chunk_length == chunk_lengths[0] for chunk_length in chunk_lengths):
211 | chunks = []
212 | num_chunks = next(iter(pa_chunked_arrays.values())).num_chunks
213 | for i in range(num_chunks):
214 | chunks.append(
215 | pa.StructArray.from_arrays(
216 | [arr.chunk(i) for arr in pa_chunked_arrays.values()],
217 | names=pa_chunked_arrays.keys(),
218 | )
219 | )
220 | struct_array = pa.chunked_array(chunks)
221 | else: # "flatten" the chunked arrays
222 | struct_array = pa.StructArray.from_arrays(
223 | [arr.combine_chunks() for arr in pa_chunked_arrays.values()],
224 | names=pa_chunked_arrays.keys(),
225 | )
226 | 
227 | ext_array = NestedExtensionArray(struct_array, validate=validate)
228 | return pd.Series(
229 | ext_array,
230 | index=df.index,
231 | copy=False,
232 | name=name,
233 | )
234 | 
235 | 
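# A hedged usage sketch (not part of the module): `pack_lists` takes a
# dataframe whose columns are pyarrow list-arrays with matching per-row
# lengths. Names and values are invented for illustration.
import pandas as pd
import pyarrow as pa

from nested_pandas.series.packer import pack_lists

list_dtype = pd.ArrowDtype(pa.list_(pa.float64()))
lists_example = pd.DataFrame(
    {
        "t": pd.Series([[1.0, 2.0], [3.0]], dtype=list_dtype),
        "flux": pd.Series([[10.0, 20.0], [30.0]], dtype=list_dtype),
    }
)
packed_lists = pack_lists(lists_example)
assert packed_lists.dtype.field_names == ["t", "flux"]
assert packed_lists.nest.to_flat().shape == (3, 2)  # three observations, two fields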
236 | def view_sorted_df_as_list_arrays(df: pd.DataFrame) -> pd.DataFrame:
237 | """Make a nested array representation of a "flat" dataframe.
238 | 
239 | Parameters
240 | ----------
241 | df : pd.DataFrame
242 | Input dataframe, with repeated indexes. It must be sorted by its index.
243 | 
244 | Returns
245 | -------
246 | pd.DataFrame
247 | Output dataframe, with unique indexes. It is a view over the input
248 | dataframe, so it would mutate the input dataframe if modified.
249 | """
250 | if not df.index.is_monotonic_increasing:
251 | raise ValueError("The index of the input dataframe must be sorted")
252 | 
253 | offset_array = calculate_sorted_index_offsets(df.index)
254 | unique_index = df.index[offset_array[:-1]]
255 | 
256 | series_ = {
257 | column: view_sorted_series_as_list_array(df[column], offset_array, unique_index)
258 | for column in df.columns
259 | }
260 | 
261 | df = pd.DataFrame(series_)
262 | 
263 | return df
264 | 
265 | 
266 | def view_sorted_series_as_list_array(
267 | series: pd.Series, offset: np.ndarray | None = None, unique_index: np.ndarray | None = None
268 | ) -> pd.Series:
269 | """Make a nested array representation of a "flat" series.
270 | 
271 | Parameters
272 | ----------
273 | series : pd.Series
274 | Input series, with repeated indexes. It must be sorted by its index.
275 | 
276 | offset : np.ndarray or None, optional
277 | Pre-calculated offsets of the input series index.
278 | unique_index : np.ndarray or None, optional
279 | Pre-calculated unique index of the input series. If given it must be
280 | equal to `series.index.unique()` and `series.index.values[offset[:-1]]`.
281 | 
282 | Returns
283 | -------
284 | pd.Series
285 | Output series, with unique indexes. It is a view over the input series,
286 | so it would mutate the input series if modified.
287 | """
288 | if not series.index.is_monotonic_increasing:
289 | raise ValueError("The index of the input series must be sorted")
290 | 
291 | if offset is None:
292 | offset = calculate_sorted_index_offsets(series.index)
293 | if unique_index is None:
294 | unique_index = series.index[offset[:-1]]
295 | 
296 | # The input series may be backed by a pyarrow.ChunkedArray; in this case pa.array(series) would fail
297 | # with "TypeError: Cannot convert a 'ChunkedArray' to a 'ListArray'".
298 | # https://github.com/lincc-frameworks/nested-pandas/issues/189
299 | flat_array = pa.array(series, from_pandas=True)
300 | if isinstance(flat_array, pa.ChunkedArray):
301 | flat_array = flat_array.combine_chunks()
302 | list_array = pa.ListArray.from_arrays(
303 | offset,
304 | flat_array,
305 | )
306 | 
307 | return pd.Series(
308 | list_array,
309 | dtype=pd.ArrowDtype(list_array.type),
310 | index=unique_index,
311 | copy=False,
312 | name=series.name,
313 | )
314 | 
315 | 
316 | def calculate_sorted_index_offsets(index: pd.Index) -> np.ndarray:
317 | """Calculate the offsets of the pre-sorted index values.
318 | 
319 | Parameters
320 | ----------
321 | index : pd.Index
322 | Input index, must be sorted.
323 | 
324 | Returns
325 | -------
326 | np.ndarray
327 | Output array of offsets, one element more than the number of unique
328 | index values.
329 | """
330 | if not index.is_monotonic_increasing:
331 | raise ValueError("The index must be sorted")
332 | 
333 | # pd.Index.duplicated returns False for the first occurrence and True for all others.
334 | # So the offsets are the indexes of these False values, with the array length appended at the end.
335 | offset_but_last = np.nonzero(~index.duplicated(keep="first"))[0]
336 | offset = np.append(offset_but_last, len(index))
337 | 
338 | # Arrow uses int32 for offsets
339 | offset = offset.astype(np.int32)
340 | 
341 | return offset
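# A small worked example (not part of the module) of
# `calculate_sorted_index_offsets` on a pre-sorted index.
import pandas as pd

from nested_pandas.series.packer import calculate_sorted_index_offsets

sorted_index = pd.Index([0, 0, 0, 1, 1, 2])
offsets_example = calculate_sorted_index_offsets(sorted_index)
# First occurrences sit at positions 0, 3 and 5; the total length 6 closes
# the last list, so the list slices are [0:3], [3:5] and [5:6].
assert offsets_example.tolist() == [0, 3, 5, 6]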
342 | 
-------------------------------------------------------------------------------- /src/nested_pandas/series/utils.py: --------------------------------------------------------------------------------
1 | from __future__ import annotations # Python 3.9 requires it for X | Y type hints
2 | 
3 | from typing import TYPE_CHECKING, cast
4 | 
5 | import pandas as pd
6 | import pyarrow as pa
7 | 
8 | if TYPE_CHECKING:
9 | from nested_pandas.series.dtype import NestedDtype
10 | 
11 | 
12 | def is_pa_type_a_list(pa_type: pa.DataType) -> bool:
13 | """Check if the given pyarrow type is a list type.
14 | 
15 | I.e., one of the following types: ListArray, LargeListArray,
16 | FixedSizeListArray.
17 | 
18 | Parameters
19 | ----------
20 | pa_type : pa.DataType
21 | The pyarrow type to check.
22 | 
23 | Returns
24 | -------
25 | bool
26 | True if the given type is a list type, False otherwise.
27 | """
28 | return (
29 | pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type) or pa.types.is_fixed_size_list(pa_type)
30 | )
31 | 
32 | 
33 | def is_pa_type_is_list_struct(pa_type: pa.DataType) -> bool:
34 | """Check if the given pyarrow type is a list-struct type.
35 | 
36 | Parameters
37 | ----------
38 | pa_type : pa.DataType
39 | The pyarrow type to check.
40 | 
41 | Returns
42 | -------
43 | bool
44 | True if the given type is a list-type with struct values,
45 | False otherwise.
46 | """
47 | return is_pa_type_a_list(pa_type) and pa.types.is_struct(pa_type.value_type)
48 | 
49 | 
50 | def validate_struct_list_array_for_equal_lengths(array: pa.StructArray) -> None:
51 | """Check if the given struct array has lists of equal length.
52 | 
53 | Parameters
54 | ----------
55 | array : pa.StructArray
56 | Input struct array.
57 | 
58 | Raises
59 | ------
60 | ValueError
61 | If the struct array has lists of unequal length, if the input
62 | is not a StructArray, or if its fields are not ListArrays.
63 | """
64 | if not pa.types.is_struct(array.type):
65 | raise ValueError(f"Expected a StructArray, got {array.type}")
66 | 
67 | first_list_array: pa.ListArray | None = None
68 | for field in array.type:
69 | inner_array = array.field(field.name)
70 | if not is_pa_type_a_list(inner_array.type):
71 | raise ValueError(f"Expected a ListArray, got {inner_array.type}")
72 | list_array = cast(pa.ListArray, inner_array)
73 | 
74 | if first_list_array is None:
75 | first_list_array = list_array
76 | continue
77 | # compare offsets from the first list array with the current one
78 | if not first_list_array.offsets.equals(list_array.offsets):
79 | raise ValueError("Offsets of all ListArrays must be the same")
80 | 
81 | 
82 | def transpose_struct_list_type(t: pa.StructType) -> pa.ListType:
83 | """Converts a type of struct-list array into a type of list-struct array.
84 | 
85 | Parameters
86 | ----------
87 | t : pa.DataType
88 | Input type of struct-list array.
89 | 
90 | Returns
91 | -------
92 | pa.DataType
93 | Type of list-struct array.
94 | 
95 | Raises
96 | ------
97 | ValueError
98 | If the input type is not a struct-list type.
99 | """
100 | if not pa.types.is_struct(t):
101 | raise ValueError(f"Expected a StructType, got {t}")
102 | 
103 | fields = []
104 | for field in t:
105 | if not is_pa_type_a_list(field.type):
106 | raise ValueError(f"Expected a ListType, got {field.type}")
107 | list_type = cast(pa.ListType, field.type)
108 | fields.append(pa.field(field.name, list_type.value_type))
109 | 
110 | list_type = cast(pa.ListType, pa.list_(pa.struct(fields)))
111 | return list_type
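# A hedged sketch (not part of the module): the two type-level transposes
# are mutual inverses for valid struct-of-lists types.
import pyarrow as pa

from nested_pandas.series.utils import (
    transpose_list_struct_type,
    transpose_struct_list_type,
)

struct_list_t = pa.struct({"a": pa.list_(pa.int64()), "b": pa.list_(pa.string())})
list_struct_t = transpose_struct_list_type(struct_list_t)
assert list_struct_t == pa.list_(pa.struct({"a": pa.int64(), "b": pa.string()}))
assert transpose_list_struct_type(list_struct_t) == struct_list_t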
112 | 
113 | 
114 | def transpose_struct_list_array(array: pa.StructArray, validate: bool = True) -> pa.ListArray:
115 | """Converts a struct-array of lists into a list-array of structs.
116 | 
117 | Parameters
118 | ----------
119 | array : pa.StructArray
120 | Input struct array; each scalar must have lists of equal length.
121 | validate : bool, default True
122 | Whether to validate the input array for list lengths. Raises ValueError
123 | if the input is invalid.
124 | 
125 | Returns
126 | -------
127 | pa.ListArray
128 | List array of structs.
129 | """
130 | if validate:
131 | validate_struct_list_array_for_equal_lengths(array)
132 | 
133 | mask = array.is_null()
134 | if not pa.compute.any(mask).as_py():
135 | mask = None
136 | 
137 | # Since we know that all lists have the same length, we can use the first list to get offsets
138 | try:
139 | offsets = array.field(0).offsets
140 | except IndexError as e:
141 | raise ValueError("Nested arrays must have at least one field") from e
142 | else:
143 | # Shift offsets
144 | if offsets.offset != 0:
145 | offsets = pa.compute.subtract(offsets, offsets[0])
146 | 
147 | struct_flat_array = pa.StructArray.from_arrays(
148 | # Select values within the offsets
149 | [field.values[field.offsets[0].as_py() : field.offsets[-1].as_py()] for field in array.flatten()],
150 | names=array.type.names,
151 | )
152 | return pa.ListArray.from_arrays(
153 | offsets=offsets,
154 | values=struct_flat_array,
155 | mask=mask,
156 | )
157 | 
158 | 
159 | def transpose_struct_list_chunked(chunked_array: pa.ChunkedArray, validate: bool = True) -> pa.ChunkedArray:
160 | """Converts a chunked array of struct-list into a chunked array of list-struct.
161 | 
162 | Parameters
163 | ----------
164 | chunked_array : pa.ChunkedArray
165 | Input chunked array of struct-list.
166 | validate : bool, default True
167 | Whether to validate the input array for list lengths. Raises ValueError
168 | if the input is invalid.
169 | 
170 | Returns
171 | -------
172 | pa.ChunkedArray
173 | Chunked array of list-struct.
174 | """
175 | if chunked_array.num_chunks == 0:
176 | return pa.chunked_array([], type=transpose_struct_list_type(chunked_array.type))
177 | return pa.chunked_array(
178 | [transpose_struct_list_array(array, validate) for array in chunked_array.iterchunks()]
179 | )
180 | 
181 | 
182 | def transpose_list_struct_scalar(scalar: pa.ListScalar) -> pa.StructScalar:
183 | """Converts a list-scalar of structs into a struct-scalar of lists.
184 | 
185 | Parameters
186 | ----------
187 | scalar : pa.ListScalar
188 | Input list-struct scalar.
189 | 
190 | Returns
191 | -------
192 | pa.StructScalar
193 | Struct-list scalar.
194 | """
195 | struct_type = transpose_list_struct_type(scalar.type)
196 | struct_scalar = pa.scalar(
197 | {field: scalar.values.field(field) for field in struct_type.names},
198 | type=struct_type,
199 | )
200 | return cast(pa.StructScalar, struct_scalar)
201 | 
202 | 
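# A hedged sketch (not part of the module): transposing a struct-of-lists
# array into a list-of-structs array and back; the data is invented.
import pyarrow as pa

from nested_pandas.series.utils import (
    transpose_list_struct_array,
    transpose_struct_list_array,
)

struct_list_arr = pa.StructArray.from_arrays(
    [pa.array([[1, 2], [3]]), pa.array([["x", "y"], ["z"]])],
    names=["a", "b"],
)
list_struct_arr = transpose_struct_list_array(struct_list_arr)
assert list_struct_arr.to_pylist() == [
    [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}],
    [{"a": 3, "b": "z"}],
]
assert transpose_list_struct_array(list_struct_arr).equals(struct_list_arr)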
203 | def validate_list_struct_type(t: pa.ListType) -> None:
204 | """Raise a ValueError if not a list-struct type."""
205 | if not is_pa_type_a_list(t):
206 | raise ValueError(f"Expected a ListType, got {t}")
207 | 
208 | if not pa.types.is_struct(t.value_type):
209 | raise ValueError(f"Expected a StructType as a list value type, got {t.value_type}")
210 | 
211 | 
212 | def transpose_list_struct_type(t: pa.ListType) -> pa.StructType:
213 | """Converts a type of list-struct array into a type of struct-list array.
214 | 
215 | Parameters
216 | ----------
217 | t : pa.DataType
218 | Input type of list-struct array.
219 | 
220 | Returns
221 | -------
222 | pa.DataType
223 | Type of struct-list array.
224 | 
225 | Raises
226 | ------
227 | ValueError
228 | If the input type is not a list-struct type.
229 | """
230 | validate_list_struct_type(t)
231 | 
232 | struct_type = cast(pa.StructType, t.value_type)
233 | fields = []
234 | for field in struct_type:
235 | fields.append(pa.field(field.name, pa.list_(field.type)))
236 | 
237 | struct_type = cast(pa.StructType, pa.struct(fields))
238 | return struct_type
239 | 
240 | 
241 | def transpose_list_struct_array(array: pa.ListArray) -> pa.StructArray:
242 | """Converts a list-array of structs into a struct-array of lists.
243 | 
244 | Parameters
245 | ----------
246 | array : pa.ListArray
247 | Input list array of structs.
248 | 
249 | Returns
250 | -------
251 | pa.StructArray
252 | Struct array of lists.
253 | """
254 | offsets, values = array.offsets, array.values
255 | mask = array.is_null()
256 | if not pa.compute.any(mask).as_py():
257 | mask = None
258 | 
259 | fields = []
260 | for field_values in values.flatten():
261 | list_array = pa.ListArray.from_arrays(offsets, field_values)
262 | fields.append(list_array)
263 | 
264 | return pa.StructArray.from_arrays(
265 | arrays=fields,
266 | names=array.type.value_type.names,
267 | mask=mask,
268 | )
269 | 
270 | 
271 | def transpose_list_struct_chunked(chunked_array: pa.ChunkedArray) -> pa.ChunkedArray:
272 | """Converts a chunked array of list-struct into a chunked array of struct-list.
273 | 
274 | Parameters
275 | ----------
276 | chunked_array : pa.ChunkedArray
277 | Input chunked array of list-struct.
278 | 
279 | Returns
280 | -------
281 | pa.ChunkedArray
282 | Chunked array of struct-list.
283 | """
284 | if chunked_array.num_chunks == 0:
285 | return pa.chunked_array([], type=transpose_list_struct_type(chunked_array.type))
286 | return pa.chunked_array([transpose_list_struct_array(array) for array in chunked_array.iterchunks()])
287 | 
288 | 
289 | def nested_types_mapper(type: pa.DataType) -> pd.ArrowDtype | NestedDtype:
290 | """Type mapper for pyarrow .to_pandas(types_mapper) methods."""
291 | from nested_pandas.series.dtype import NestedDtype
292 | 
293 | if pa.types.is_list(type):
294 | try:
295 | return NestedDtype(type)
296 | except (ValueError, TypeError):
297 | return pd.ArrowDtype(type)
298 | return pd.ArrowDtype(type)
299 | 
300 | 
301 | def table_to_struct_array(table: pa.Table) -> pa.ChunkedArray:
302 | """pa.Table.to_struct_array
303 | 
304 | pyarrow has a bug for empty tables:
305 | https://github.com/apache/arrow/issues/46355
306 | """
307 | if len(table) == 0:
308 | return pa.chunked_array([], type=pa.struct(table.schema))
309 | return table.to_struct_array()
310 | 
311 | 
312 | def table_from_struct_array(array: pa.ChunkedArray | pa.Array) -> pa.Table:
313 | """pa.Table.from_struct_array, but working with chunkless input"""
314 | if isinstance(array, pa.ChunkedArray) and array.num_chunks == 0:
315 | array = pa.array([], type=array.type)
316 | return pa.Table.from_struct_array(array)
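# A hedged sketch (not part of the module): using `nested_types_mapper`
# with pyarrow's `Table.to_pandas`. The table below is invented for
# illustration.
import pyarrow as pa

from nested_pandas import NestedDtype
from nested_pandas.series.utils import nested_types_mapper

example_table = pa.table(
    {
        "x": [1.0, 2.0],
        "points": pa.array([[{"t": 1.0}], [{"t": 2.0}, {"t": 3.0}]]),
    }
)
example_df = example_table.to_pandas(types_mapper=nested_types_mapper)
assert isinstance(example_df.dtypes["points"], NestedDtype)  # list-of-struct
assert not isinstance(example_df.dtypes["x"], NestedDtype)  # plain ArrowDtype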
317 | 
-------------------------------------------------------------------------------- /src/nested_pandas/utils/__init__.py: --------------------------------------------------------------------------------
1 | from .utils import * # noqa
2 | 
-------------------------------------------------------------------------------- /src/nested_pandas/utils/utils.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | from nested_pandas import NestedFrame
4 | 
5 | 
6 | def count_nested(df, nested, by=None, join=True) -> NestedFrame:
7 | """Counts the number of nested rows for each row of a nested dataframe.
8 | 
9 | Parameters
10 | ----------
11 | df : NestedFrame
12 | A NestedFrame that contains the desired `nested` series
13 | to count.
14 | nested : str
15 | The label of the nested series to count.
16 | by : str, optional
17 | Specifies a column within nested to count by, returning
18 | a count for each unique value in `by`.
19 | join : bool, optional
20 | Join the output count columns to df and return df, otherwise
21 | just return a NestedFrame containing only the count columns.
22 | 
23 | Returns
24 | -------
25 | NestedFrame
26 | 
27 | Examples
28 | --------
29 | 
30 | >>> import pandas as pd
31 | >>> # Show all columns
32 | >>> pd.set_option("display.width", 200)
33 | >>> pd.set_option("display.max_columns", None)
34 | >>> from nested_pandas.datasets.generation import generate_data
35 | >>> nf = generate_data(5, 10, seed=1)
36 | 
37 | >>> from nested_pandas.utils import count_nested
38 | >>> count_nested(nf, "nested")
39 | a b nested n_nested
40 | 0 0.417022 0.184677 [{t: 8.38389, flux: 10.233443, band: 'g'}; …] ... 10
41 | 1 0.720324 0.372520 [{t: 13.70439, flux: 41.405599, band: 'g'}; …]... 10
42 | 2 0.000114 0.691121 [{t: 4.089045, flux: 69.440016, band: 'g'}; …]... 10
43 | 3 0.302333 0.793535 [{t: 17.562349, flux: 41.417927, band: 'g'}; …... 10
44 | 4 0.146756 1.077633 [{t: 0.547752, flux: 4.995346, band: 'r'}; …] ... 10
45 | 
46 | `count_nested` also allows counting by a given subcolumn, for example we
47 | can count by "band" label:
48 | 
49 | >>> # join=False allows the result to be kept separate from the original nf
50 | >>> count_nested(nf, "nested", by="band", join=False)
51 | band n_nested_g n_nested_r
52 | 0 8 2
53 | 1 5 5
54 | 2 5 5
55 | 3 6 4
56 | 4 6 4
57 | """
58 | 
59 | if by is None:
60 | field_to_len = df[nested].nest.fields[0]
61 | counts = df[nested].nest.to_lists().apply(lambda x: len(x[field_to_len]), axis=1)
62 | counts.name = f"n_{nested}" # update name directly (rename causes issues downstream)
63 | else:
64 | # this may be able to be sped up using to_lists() as well
65 | counts = df[nested].apply(lambda x: x[by].value_counts(sort=False))
66 | counts = counts.rename(columns={colname: f"n_{nested}_{colname}" for colname in counts.columns})
67 | counts = counts.reindex(sorted(counts.columns), axis=1)
68 | if join:
69 | return df.join(counts)
70 | # else just return the counts NestedFrame
71 | if isinstance(counts, pd.Series): # for by=None, which returns a Series
72 | counts = NestedFrame(counts.to_frame())
73 | return counts
74 | 
-------------------------------------------------------------------------------- /tests/nested_pandas/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/tests/nested_pandas/conftest.py
-------------------------------------------------------------------------------- /tests/nested_pandas/datasets/test_generation.py: --------------------------------------------------------------------------------
1 | import pytest
2 | from nested_pandas.datasets import generate_data
3 | 
4 | 
5 | @pytest.mark.parametrize("n_layers", [10, {"nested_a": 10, "nested_b": 20}])
6 | def test_generate_data(n_layers):
7 | """test the data generator function"""
8 | nf = generate_data(10, n_layers, seed=1)
9 | 
10 | if isinstance(n_layers, int):
11 | assert len(nf.nested.nest.to_flat()) == 100
12 | 
13 | elif isinstance(n_layers, dict):
14 | assert "nested_a" in nf.columns
15 | assert "nested_b" in nf.columns
16 | 
17 | assert len(nf.nested_a.nest.to_flat()) == 100
18 | assert len(nf.nested_b.nest.to_flat()) == 200
19 | 
20 | 
21 | def test_generate_data_bad_input():
22 | """test a poor n_layer input to generate_data"""
23 | with pytest.raises(TypeError):
24 | generate_data(10, "nested", seed=1)
25 | 
-------------------------------------------------------------------------------- /tests/nested_pandas/e2e_tests/test_issue89.py: --------------------------------------------------------------------------------
1 | """Based on https://github.com/lincc-frameworks/nested-pandas/issues/89"""
2 | 
3 | import nested_pandas as npd
4 | import numpy as np
5 | 
6 | 
7 | def test_issue89():
8 | """Check that code snippet from issue 89 works as expected
9 | 
10 | https://github.com/lincc-frameworks/nested-pandas/issues/89
11 | """
12 | 
13 | # Load some ZTF data
14 | catalogs_dir = "https://epyc.astro.washington.edu/~lincc-frameworks/half_degree_surveys/ztf/"
15 | 
16 | object_ndf = npd.read_parquet(
17 | f"{catalogs_dir}/ztf_object/Norder=3/Dir=0/Npix=432.parquet",
18 | columns=["ra", "dec", "ps1_objid"],
19 | partitioning=None,
20 | ).set_index("ps1_objid")
21 | 
22 | source_ndf = npd.read_parquet(
23 | f"{catalogs_dir}/ztf_source/Norder=6/Dir=20000/Npix=27711.parquet",
24 | columns=["mjd", "mag", "magerr", "band", "ps1_objid", "catflags"],
25 | partitioning=None,
26 | ).set_index("ps1_objid")
27 | 
28 | object_ndf = object_ndf.add_nested(source_ndf, "ztf_source")
29 | 
30 | nf = object_ndf
31 | nf.reduce(np.mean, "ztf_source.mjd")
-------------------------------------------------------------------------------- /tests/nested_pandas/nestedframe/test_io.py: --------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | 
4 | import pandas as pd
5 | import pyarrow as pa
6 | import pyarrow.parquet as pq
7 | import pytest
8 | from nested_pandas import read_parquet
9 | from nested_pandas.datasets import generate_data
10 | from pandas.testing import assert_frame_equal
11 | from upath import UPath
12 | 
13 | 
14 | def test_read_parquet():
15 | """Test reading a parquet file with no columns specified"""
16 | # Load in the example file
17 | nf = read_parquet("tests/test_data/nested.parquet")
18 | 
19 | # Check the columns
20 | assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
21 | 
22 | # Make sure nested columns were recognized
23 | assert nf.nested_columns == ["nested", "lincc"]
24 | 
25 | # Check the nested columns
26 | assert nf.nested.nest.fields == ["t", "flux", "band"]
27 | assert nf.lincc.nest.fields == ["band", "frameworks"]
28 | 
29 | 
30 | def test_read_parquet_list():
31 | """Test reading a list of parquet files with no columns specified"""
32 | # Load in the example files
33 | single_file_nf = read_parquet("tests/test_data/nested.parquet")
34 | nf = read_parquet(["tests/test_data/nested.parquet", "tests/test_data/nested.parquet"])
35 | 
36 | # Check the columns
37 | assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
38 | 
39 | # Make sure nested columns were recognized
40 | assert nf.nested_columns == ["nested", "lincc"]
41 | 
42 | # Check the nested columns
43 | assert nf.nested.nest.fields == ["t", "flux", "band"]
44 | assert nf.lincc.nest.fields == ["band", "frameworks"]
45 | 
46 | # Check that loading a list works correctly
47 | assert len(nf) == 2 * len(single_file_nf)
48 | 
49 | 
50 | def test_read_parquet_directory():
51 | """Test reading a directory of parquet files with no columns specified"""
52 | # Load in the example file
53 | nf = read_parquet("tests/test_data")
54 | 
55 | # Check the columns
56 | assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
57 | 
58 | # Make sure nested columns were recognized
59 | assert nf.nested_columns == ["nested", "lincc"]
60 | 
61 | # Check the nested columns
62 | assert nf.nested.nest.fields == ["t", "flux", "band"]
63 | assert nf.lincc.nest.fields == ["band", "frameworks"]
64 | 
65 | 
66 | def test_read_parquet_directory_with_filesystem():
67 | """Test reading a parquet directory with an explicit filesystem"""
68 | # Load in the example file
69 | path = UPath("tests/test_data")
70 | nf = read_parquet(path.path, filesystem=path.fs)
71 | 
72 | # Check the columns
73 | assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
74 | 
75 | # Make sure nested columns were recognized
76 | assert nf.nested_columns == ["nested", "lincc"]
77 | 
78 | # Check the nested columns
79 | assert nf.nested.nest.fields == ["t", "flux", "band"]
80 | assert nf.lincc.nest.fields == ["band", "frameworks"]
81 | 
82 | 
83 | def test_file_object_read_parquet():
84 | """Test reading parquet from a file-object"""
85 | with open("tests/test_data/nested.parquet", "rb") as f:
86 | nf = read_parquet(f)
87 | # Check the columns
88 | assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
89 | # Make sure nested columns were recognized
90 | assert nf.nested_columns == ["nested", "lincc"]
91 | # Check the nested columns
92 | assert nf.nested.nest.fields == ["t", "flux", "band"]
93 | assert nf.lincc.nest.fields == ["band", "frameworks"]
94 | 
95 | 
96 | @pytest.mark.parametrize(
97 | "columns",
98 | [
99 | ["a", "flux"],
100 | ["flux", "nested", "lincc"],
101 | ["nested.flux", "nested.band"],
102 | ["flux", "nested.flux"],
103 | ["nested.band", "lincc.band"],
104 | ],
105 | )
106 | def test_read_parquet_column_selection(columns):
107 | """Test reading a parquet file with column selection"""
108 | # Load in the example file
109 | nf = read_parquet("tests/test_data/nested.parquet", columns=columns)
110 | 
111 | # Output expectations
112 | if columns == ["a", "flux"]:
113 | expected_columns = ["a", "flux"]
114 | elif columns == ["flux", "nested", "lincc"]:
115 | expected_columns = ["flux", "nested", "lincc"]
116 | elif columns == ["nested.flux", "nested.band"]:
117 | expected_columns = ["nested"]
118 | elif columns == ["flux", "nested.flux"]:
119 | expected_columns = ["flux", "nested"]
120 | elif columns == ["nested.band", "lincc.band"]:
121 | expected_columns = ["nested", "lincc"]
122 | 
123 | # Check the columns
124 | assert nf.columns.tolist() == expected_columns
125 | 
126 | # Check nested columns
127 | if columns == ["nested.flux", "nested.t"]:
128 | assert nf.nested.nest.fields == ["flux", "t"]
129 | elif columns == ["nested.band", "lincc.band"]:
130 | assert nf.nested.nest.fields == ["band"]
131 | assert nf.lincc.nest.fields == ["band"]
132 | 
133 | 
134 | @pytest.mark.parametrize("reject", [["nested"], "nested"])
135 | def test_read_parquet_reject_nesting(reject):
136 | """Test reading a parquet file while rejecting nesting for a column"""
137 | # Load in the example file
138 | nf = read_parquet("tests/test_data/nested.parquet", columns=["a", "nested"], reject_nesting=reject)
139 | 
140 | # Check the columns
141 | assert nf.columns.tolist() == ["a", "nested"]
142 | 
143 | # Make sure "nested" was not recognized as a nested column
144 | assert nf.nested_columns == []
145 | 
146 | assert pa.types.is_struct(nf["nested"].dtype.pyarrow_dtype)
147 | 
148 | 
149 | def test_read_parquet_reject_nesting_partial_loading():
150 | """Test reading a parquet file with partial loading and rejected nesting"""
151 | # Load in the example file
152 | nf = read_parquet("tests/test_data/nested.parquet", columns=["a", "nested.t"], reject_nesting=["nested"])
153 | 
154 | # Check the columns
155 | assert nf.columns.tolist() == ["a", "t"]
156 | 
157 | 
158 | def test_read_parquet_catch_full_and_partial():
159 | """Test that selecting a full column and one of its subcolumns raises"""
160 | # Load in the example file
161 | with pytest.raises(ValueError):
162 | read_parquet("tests/test_data/nested.parquet", columns=["a", "nested.t", "nested"])
163 | 
164 | 
165 | def test_read_parquet_catch_failed_cast():
166 | """Test that a failed cast to a nested column raises"""
167 | # Load in the example file
168 | with pytest.raises(ValueError):
169 | read_parquet("tests/test_data/not_nestable.parquet")
170 | 
171 | 
172 | def test_read_parquet_test_mixed_struct():
173 | """Test reading a parquet file with mixed struct types"""
174 | # Create the pure-list StructArray
175 | field1 = pa.array([[1, 2], [3, 4], [5, 6]])
176 | field2 = pa.array([["a", "b"], ["b", "c"], ["c", "d"]])
177 | field3 = pa.array([[True, False], [True, False], [True, False]])
178 | struct_array_list = pa.StructArray.from_arrays([field1, field2, field3], ["list1", "list2", "list3"])
179 | 
180 | # Create the value StructArray
181 | field1 = pa.array([1, 2, 3])
182 | field2 = pa.array(["a", "b", "c"])
183 | field3 = pa.array([True, False, True])
184 | struct_array_val = pa.StructArray.from_arrays([field1, field2, field3], ["val1", "val2", "val3"])
185 | 
186 | # Create the mixed-list StructArray
187 | field1 = pa.array([1, 2, 3])
188 | field2 = pa.array(["a", "b", "c"])
189 | field3 = pa.array([[True, False], [True, False], [True, False]])
190 | struct_array_mix = pa.StructArray.from_arrays([field1, field2, field3], ["val1", "val2", "list3"])
191 | 
192 | # Create a PyArrow Table with the StructArray as one of the columns
193 | table = pa.table(
194 | {
195 | "id": pa.array([100, 101, 102]), # Another column
196 | "struct_list": struct_array_list, # Struct column
197 | "struct_value": struct_array_val,
198 | "struct_mix": struct_array_mix,
199 | }
200 | )
201 | 
202 | # Write to a temporary file
203 | with tempfile.TemporaryDirectory() as tmpdir:
204 | pq.write_table(table, os.path.join(tmpdir, "structs.parquet"))
205 | 
206 | # Test full read
207 | nf = read_parquet(os.path.join(tmpdir, "structs.parquet"))
208 | assert nf.columns.tolist() == ["id", "struct_list", "struct_value", "struct_mix"]
209 | assert nf.nested_columns == ["struct_list"]
210 | 
211 | # Test partial read
212 | nf = read_parquet(os.path.join(tmpdir, "structs.parquet"), columns=["id", "struct_mix.list3"])
213 | assert nf.columns.tolist() == ["id", "struct_mix"]
214 | assert nf.nested_columns == ["struct_mix"]
215 | 
216 | # Test partial read with ordering to force reject pops
217 | nf = read_parquet(
218 | os.path.join(tmpdir, "structs.parquet"), columns=["id", "struct_mix.list3", "struct_mix.val1"]
219 | )
220 | assert nf.columns.tolist() == ["id", "list3", "val1"]
221 | assert len(nf.nested_columns) == 0
222 | 
223 | 
224 | def test_to_parquet():
225 | """Test writing a parquet file and reading it back"""
226 | # Load in the example file
227 | nf = read_parquet("tests/test_data/nested.parquet")
228 | 
229 | # Write to a temporary file
230 | with tempfile.TemporaryDirectory() as tmpdir:
231 | nf.to_parquet(os.path.join(tmpdir, "nested.parquet"))
232 | 
233 | # Read the file back in
234 | nf2 = read_parquet(os.path.join(tmpdir, "nested.parquet"))
235 | 
236 | # Check the columns
237 | assert nf.columns.tolist() ==
nf2.columns.tolist() 238 | 239 | # Check the nested columns 240 | assert nf.nested_columns == nf2.nested_columns 241 | 242 | # Check the data 243 | assert_frame_equal(nf, nf2) 244 | 245 | 246 | def test_pandas_read_parquet(): 247 | """Test that pandas can read our serialized files""" 248 | 249 | nf = generate_data(10, 100, seed=1) 250 | with tempfile.TemporaryDirectory() as tmpdir: 251 | nf.to_parquet(os.path.join(tmpdir, "nested_for_pd.parquet")) 252 | # Load in the example file 253 | df = pd.read_parquet(os.path.join(tmpdir, "nested_for_pd.parquet")) 254 | 255 | # Check the columns 256 | assert df.columns.tolist() == ["a", "b", "nested"] 257 | 258 | 259 | def test_read_empty_parquet(): 260 | """Test that we can read empty parquet files""" 261 | orig_nf = generate_data(1, 2).iloc[:0] 262 | 263 | with tempfile.NamedTemporaryFile("wb", suffix="parquet") as tmpfile: 264 | orig_nf.to_parquet(tmpfile.name) 265 | # All columns 266 | # Do not check dtype because of: 267 | # https://github.com/lincc-frameworks/nested-pandas/issues/252 268 | assert_frame_equal(read_parquet(tmpfile.name), orig_nf, check_dtype=False) 269 | # Few columns 270 | assert_frame_equal( 271 | read_parquet( 272 | tmpfile.name, 273 | columns=[ 274 | "a", 275 | "nested.flux", 276 | "nested.band", 277 | ], 278 | ), 279 | orig_nf.drop(["b", "nested.t"], axis=1), 280 | check_dtype=False, 281 | ) 282 | -------------------------------------------------------------------------------- /tests/nested_pandas/series/test_dtype.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pyarrow as pa 3 | import pytest 4 | from nested_pandas.datasets import generate_data 5 | from nested_pandas.nestedframe import NestedFrame 6 | from nested_pandas.series.dtype import NestedDtype 7 | from nested_pandas.series.ext_array import NestedExtensionArray 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "pyarrow_dtype", 12 | [ 13 | pa.struct([pa.field("a", pa.list_(pa.int64()))]), 14 | pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]), 15 | pa.struct( 16 | [ 17 | pa.field("a", pa.list_(pa.int64())), 18 | pa.field("b", pa.list_(pa.struct([pa.field("c", pa.int64())]))), 19 | ] 20 | ), 21 | ], 22 | ) 23 | def test_from_pyarrow_dtype_struct_list(pyarrow_dtype): 24 | """Test that we can construct NestedDtype from pyarrow struct type.""" 25 | dtype = NestedDtype(pyarrow_dtype) 26 | assert dtype.pyarrow_dtype == pyarrow_dtype 27 | 28 | 29 | @pytest.mark.parametrize( 30 | "pyarrow_dtype", 31 | [ 32 | pa.list_(pa.struct([pa.field("a", pa.int64())])), 33 | pa.list_(pa.struct([pa.field("a", pa.int64()), pa.field("b", pa.float64())])), 34 | pa.list_( 35 | pa.struct( 36 | [ 37 | pa.field("a", pa.list_(pa.int64())), 38 | pa.field("b", pa.list_(pa.float64())), 39 | ] 40 | ) 41 | ), 42 | ], 43 | ) 44 | def test_from_pyarrow_dtype_list_struct(pyarrow_dtype): 45 | """Test that we can construct NestedDtype from pyarrow list type.""" 46 | dtype = NestedDtype(pyarrow_dtype) 47 | assert dtype.list_struct_pa_dtype == pyarrow_dtype 48 | 49 | 50 | @pytest.mark.parametrize( 51 | "pyarrow_dtype", 52 | [ 53 | pa.int64(), 54 | pa.list_(pa.int64()), 55 | pa.struct([pa.field("a", pa.int64())]), 56 | pa.struct([pa.field("a", pa.int64()), pa.field("b", pa.float64())]), 57 | pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.float64())]), 58 | ], 59 | ) 60 | def test_from_pyarrow_dtype_raises(pyarrow_dtype): 61 | """Test that we raise an error when constructing NestedDtype from 
invalid pyarrow type.""" 62 | with pytest.raises(ValueError): 63 | NestedDtype(pyarrow_dtype) 64 | 65 | 66 | def test_to_pandas_arrow_dtype(): 67 | """Test that NestedDtype.to_pandas_arrow_dtype() returns the correct pyarrow struct type.""" 68 | dtype = NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()}) 69 | assert dtype.to_pandas_arrow_dtype() == pd.ArrowDtype( 70 | pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) 71 | ) 72 | 73 | 74 | def test_from_pandas_arrow_dtype(): 75 | """Test that we can construct NestedDtype from pandas.ArrowDtype.""" 76 | dtype_from_struct = NestedDtype.from_pandas_arrow_dtype( 77 | pd.ArrowDtype(pa.struct([pa.field("a", pa.list_(pa.int64()))])) 78 | ) 79 | assert dtype_from_struct.pyarrow_dtype == pa.struct([pa.field("a", pa.list_(pa.int64()))]) 80 | dtype_from_list = NestedDtype.from_pandas_arrow_dtype( 81 | pd.ArrowDtype(pa.list_(pa.struct([pa.field("a", pa.int64())]))) 82 | ) 83 | assert dtype_from_list.pyarrow_dtype == pa.struct([pa.field("a", pa.list_(pa.int64()))]) 84 | 85 | 86 | def test_to_pandas_list_struct_arrow_dtype(): 87 | """Test that NestedDtype.to_pandas_arrow_dtype(list_struct=True) returns the correct pyarrow type.""" 88 | dtype = NestedDtype.from_fields({"a": pa.list_(pa.int64()), "b": pa.float64()}) 89 | assert dtype.to_pandas_arrow_dtype(list_struct=True) == pd.ArrowDtype( 90 | pa.list_(pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.float64())])) 91 | ) 92 | 93 | 94 | def test_from_fields(): 95 | """Test NestedDtype.from_fields().""" 96 | fields = {"a": pa.int64(), "b": pa.float64()} 97 | dtype = NestedDtype.from_fields(fields) 98 | assert dtype.pyarrow_dtype == pa.struct( 99 | [pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))] 100 | ) 101 | 102 | 103 | def test_na_value(): 104 | """Test that NestedDtype.na_value is a singleton instance of NAType.""" 105 | dtype = NestedDtype(pa.struct([pa.field("a", pa.list_(pa.int64()))])) 106 | assert dtype.na_value is pd.NA 107 | 108 | 109 | def test_fields(): 110 | """Test NestedDtype.fields property""" 111 | dtype = NestedDtype( 112 | pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) 113 | ) 114 | assert dtype.fields == {"a": pa.int64(), "b": pa.float64()} 115 | 116 | 117 | def test_field_names(): 118 | """Test NestedDtype.field_names property""" 119 | dtype = NestedDtype( 120 | pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) 121 | ) 122 | assert dtype.field_names == ["a", "b"] 123 | 124 | 125 | @pytest.mark.parametrize( 126 | "fields", 127 | [ 128 | {"a": pa.int64(), "b": pa.float64()}, 129 | {"a": pa.int64(), "b": pa.float64(), "c": pa.int64()}, 130 | {"a": pa.string(), "b": pa.float64()}, 131 | # Nested / parametric types are not implemented. 
132 | # {"a": pa.list_(pa.int64()), "b": pa.float64()},
133 | # {"a": pa.list_(pa.int64()), "b": pa.list_(pa.string())},
134 | # {"a": pa.struct([pa.field("a", pa.int64())]), "b": pa.list_(pa.int64())},
135 | ],
136 | )
137 | def test_name_vs_construct_from_string(fields):
138 | """Test that dtype.name is consistent with dtype.construct_from_string(dtype.name)."""
139 | dtype = NestedDtype.from_fields(fields)
140 | assert dtype == NestedDtype.construct_from_string(dtype.name)
141 | 
142 | 
143 | def test_name_multiple_nested():
144 | """Check string representation of a multiple-nested dtype."""
145 | nf = generate_data(10, 2)
146 | # Add a column to nest on
147 | nf = nf.assign(id=[0, 0, 1, 1, 2, 2, 3, 3, 4, 4])
148 | nf = nf.rename(columns={"nested": "inner"})
149 | nnf = NestedFrame.from_flat(nf, base_columns=[], on="id", name="outer")
150 | assert (
151 | nnf["outer"].dtype.name
152 | == "nested<a: [double], b: [double], inner: [nested<t: [double], flux: [double], band: [string]>]>"
153 | )
154 | 
155 | 
156 | @pytest.mark.parametrize(
157 | "s",
158 | [
159 | "float", # not a nested type
160 | "nested(f: [int64])", # must be <> instead
161 | "ts<f: [int64]>", # 'ts' was a previous name, now we use 'nested'
162 | "nested<f>", # no type specified
163 | "nested<>", # no field specified
165 | "nested<int64>", # no field name specified
166 | "nested<[int64]>", # no field name specified
167 | "nested<f:[int64]>", # separator must be ": " with space
168 | "nested<f: [int64],g: [float64]>", # separator must be ", " with space
169 | "nested<f: int64>", # missed [] - nested list
170 | "nested<f: [not_a_type]>", # not an arrow type
171 | "nested<f: [list<item: double>]>", # complex arrow types are not supported
172 | ],
173 | )
174 | def test_construct_from_string_raises(s):
175 | """Test that we raise an error when constructing NestedDtype from invalid string."""
176 | with pytest.raises(TypeError):
177 | NestedDtype.construct_from_string(s)
178 | 
179 | 
180 | def test_construct_array_type():
181 | """Test that NestedDtype.construct_array_type() returns NestedExtensionArray."""
182 | assert NestedDtype.construct_array_type() is NestedExtensionArray
-------------------------------------------------------------------------------- /tests/nested_pandas/series/test_series_utils.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pyarrow as pa
3 | import pytest
4 | from nested_pandas import NestedDtype
5 | from nested_pandas.series.utils import (
6 | nested_types_mapper,
7 | transpose_list_struct_array,
8 | transpose_list_struct_scalar,
9 | transpose_list_struct_type,
10 | transpose_struct_list_array,
11 | transpose_struct_list_type,
12 | validate_struct_list_array_for_equal_lengths,
13 | )
14 | 
15 | 
16 | def test_validate_struct_list_array_for_equal_lengths():
17 | """Test validate_struct_list_array_for_equal_lengths function."""
18 | # Raises for wrong types
19 | with pytest.raises(ValueError):
20 | validate_struct_list_array_for_equal_lengths(pa.array([], type=pa.int64()))
21 | with pytest.raises(ValueError):
22 | validate_struct_list_array_for_equal_lengths(pa.array([], type=pa.list_(pa.int64())))
23 | 
24 | # Raises if one of the fields is not a ListArray
25 | with pytest.raises(ValueError):
26 | validate_struct_list_array_for_equal_lengths(
27 | pa.StructArray.from_arrays([pa.array([[1, 2], [3, 4, 5]]), pa.array([1, 2])], ["a", "b"])
28 | )
29 | 
30 | # Raises for mismatched lengths
31 | with pytest.raises(ValueError):
32 | validate_struct_list_array_for_equal_lengths(
33 | pa.StructArray.from_arrays(
34 | [pa.array([[1, 2], [3, 4, 5]]), pa.array([[1, 2, 3], [4, 5]])], ["a", "b"]
35 | )
36 | )
37 | 
38 | input_array =
pa.StructArray.from_arrays( 39 | arrays=[ 40 | pa.array([[1, 2], [3, 4], [], [5, 6, 7]]), 41 | pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]]), 42 | ], 43 | names=["a", "b"], 44 | ) 45 | assert validate_struct_list_array_for_equal_lengths(input_array) is None 46 | 47 | 48 | def test_transpose_struct_list_type(): 49 | """Test transpose_struct_list_type function.""" 50 | # Raises for wrong types 51 | with pytest.raises(ValueError): 52 | transpose_struct_list_type(pa.int64()) 53 | with pytest.raises(ValueError): 54 | transpose_struct_list_type(pa.list_(pa.int64())) 55 | 56 | # Raises if one of the fields is not a ListType 57 | with pytest.raises(ValueError): 58 | transpose_struct_list_type(pa.struct([("a", pa.int64()), ("b", pa.int64())])) 59 | 60 | input_type = pa.struct([("a", pa.list_(pa.int64())), ("b", pa.list_(pa.string()))]) 61 | expected_output = pa.list_(pa.struct([("a", pa.int64()), ("b", pa.string())])) 62 | assert transpose_struct_list_type(input_type) == expected_output 63 | 64 | 65 | def test_transpose_list_struct_type(): 66 | """Test transpose_list_struct_type function.""" 67 | # Raises for wrong types 68 | with pytest.raises(ValueError): 69 | transpose_list_struct_type(pa.int64()) 70 | with pytest.raises(ValueError): 71 | transpose_list_struct_type(pa.struct([("a", pa.int64()), ("b", pa.int64())])) 72 | 73 | input_type = pa.list_(pa.struct([("a", pa.int64()), ("b", pa.string())])) 74 | expected_output = pa.struct([("a", pa.list_(pa.int64())), ("b", pa.list_(pa.string()))]) 75 | assert transpose_list_struct_type(input_type) == expected_output 76 | 77 | 78 | def test_transpose_struct_list_array(): 79 | """Test transpose_struct_list_array function.""" 80 | input_array = pa.StructArray.from_arrays( 81 | arrays=[ 82 | pa.array([[1, 2], [3, 4], [], [5, 6, 7]]), 83 | pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]]), 84 | ], 85 | names=["a", "b"], 86 | ) 87 | desired = pa.array( 88 | [ 89 | [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}], 90 | [{"a": 3, "b": "y"}, {"a": 4, "b": "x"}], 91 | [], 92 | [{"a": 5, "b": "d"}, {"a": 6, "b": "e"}, {"a": 7, "b": "f"}], 93 | ] 94 | ) 95 | actual = transpose_struct_list_array(input_array) 96 | assert actual == desired 97 | 98 | 99 | def test_transpose_list_struct_array(): 100 | """Test transpose_list_struct_array function.""" 101 | input_array = pa.array( 102 | [ 103 | [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}], 104 | [{"a": 3, "b": "y"}, {"a": 4, "b": "x"}], 105 | [], 106 | [{"a": 5, "b": "d"}, {"a": 6, "b": "e"}, {"a": 7, "b": "f"}], 107 | ] 108 | ) 109 | desired = pa.StructArray.from_arrays( 110 | arrays=[ 111 | pa.array([[1, 2], [3, 4], [], [5, 6, 7]]), 112 | pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]]), 113 | ], 114 | names=["a", "b"], 115 | ) 116 | actual = transpose_list_struct_array(input_array) 117 | assert actual == desired 118 | 119 | 120 | def test_transpose_list_struct_scalar(): 121 | """Test transpose_list_struct_scalar function.""" 122 | input_scalar = pa.scalar([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]) 123 | desired = pa.scalar({"a": [1, 2], "b": ["x", "y"]}) 124 | actual = transpose_list_struct_scalar(input_scalar) 125 | assert actual == desired 126 | 127 | 128 | @pytest.mark.parametrize( 129 | "pa_type,is_nested", 130 | [ 131 | (pa.float64(), False), 132 | (pa.list_(pa.float64()), False), 133 | (pa.list_(pa.struct([("a", pa.float64()), ("b", pa.float64())])), True), 134 | ], 135 | ) 136 | def test_nested_types_mapper(pa_type, is_nested): 137 | """Test nested_types_mapper function.""" 138 | dtype = 
nested_types_mapper(pa_type) 139 | if is_nested: 140 | assert isinstance(dtype, NestedDtype) 141 | assert dtype.list_struct_pa_dtype == pa_type 142 | else: 143 | assert isinstance(dtype, pd.ArrowDtype) 144 | assert dtype.pyarrow_dtype == pa_type 145 | -------------------------------------------------------------------------------- /tests/nested_pandas/test_packaging.py: -------------------------------------------------------------------------------- 1 | import nested_pandas 2 | 3 | 4 | def test_version(): 5 | """Check to see that the version property returns something""" 6 | assert nested_pandas.__version__ is not None 7 | -------------------------------------------------------------------------------- /tests/nested_pandas/utils/test_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from nested_pandas import NestedFrame 5 | from nested_pandas.utils import count_nested 6 | 7 | 8 | @pytest.mark.parametrize("join", [True, False]) 9 | def test_count_nested(join): 10 | """Test the functionality of count nested""" 11 | 12 | # Initialize test data 13 | base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2]) 14 | nested = pd.DataFrame( 15 | data={ 16 | "c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], 17 | "d": [5, 4, 7, 5, 3, 1, 9, 3, 4], 18 | "label": ["b", "a", "b", "b", "a", "a", "b", "a", "b"], 19 | }, 20 | index=[0, 0, 0, 1, 1, 1, 2, 2, 2], 21 | ) 22 | base = base.add_nested(nested, "nested") 23 | 24 | # Test general count 25 | total_counts = count_nested(base, "nested", join=join) 26 | assert all(total_counts["n_nested"].values == 3) 27 | 28 | # Test count by 29 | label_counts = count_nested(base, "nested", by="label", join=join) 30 | 31 | assert all(label_counts["n_nested_a"].values == [1, 2, 1]) 32 | assert all(label_counts["n_nested_b"].values == [2, 1, 2]) 33 | 34 | # Make sure the ordering is alphabetical 35 | # https://github.com/lincc-frameworks/nested-pandas/issues/109 36 | assert label_counts.columns[-1] == "n_nested_b" 37 | assert label_counts.columns[-2] == "n_nested_a" 38 | 39 | # Test join behavior 40 | if join: 41 | assert total_counts.columns.tolist() == base.columns.tolist() + ["n_nested"] 42 | assert label_counts.columns.tolist() == base.columns.tolist() + ["n_nested_a", "n_nested_b"] 43 | else: 44 | assert total_counts.columns.tolist() == ["n_nested"] 45 | assert label_counts.columns.tolist() == ["n_nested_a", "n_nested_b"] 46 | 47 | 48 | def test_check_expr_nesting(): 49 | """ 50 | Test the correctness of the evaluation expression pre-flight checks, which are 51 | used to ensure that an expression-based query does not try to combine base and nested 52 | sub-expressions. 
53 | """ 54 | base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2]) 55 | nested = pd.DataFrame( 56 | data={ 57 | "c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], 58 | "d": [5, 4, 7, 5, 3, 1, 9, 3, 4], 59 | "label": ["b", "a", "b", "b", "a", "a", "b", "a", "b"], 60 | }, 61 | index=[0, 0, 0, 1, 1, 1, 2, 2, 2], 62 | ) 63 | b1 = base.add_nested(nested, "nested") 64 | assert b1.extract_nest_names("a > 2 & nested.c > 1") == {"", "nested"} 65 | assert b1.extract_nest_names("(nested.c > 1) and (nested.d>2)") == {"nested"} 66 | assert b1.extract_nest_names("-1.52e-5 < b < 35.2e2") == {""} 67 | 68 | b2 = base.add_nested(nested.copy(), "n") 69 | assert b2.extract_nest_names("(n.c > 1) and ((b + a) > (b - 1e-8)) or n.d > a") == {"n", ""} 70 | 71 | abc = pd.DataFrame( 72 | data={ 73 | "c": [3, 1, 4, 1, 5, 9, 2, 6, 5], 74 | "d": [1, 4, 1, 2, 1, 3, 5, 6, 2], 75 | "g": ["a", "b", "c", "d", "e", "f", "g", "h", "i"], 76 | }, 77 | index=[0, 0, 0, 1, 1, 1, 2, 2, 2], 78 | ) 79 | b3 = base.add_nested(abc, "abc").add_nested(abc, "c") 80 | assert b3.extract_nest_names("abc.c > 2 & c.d < 5") == {"abc", "c"} 81 | 82 | assert b3.extract_nest_names("(abc.d > 3) & (abc.c == [2, 5])") == {"abc"} 83 | assert b3.extract_nest_names("(abc.d > 3)&(abc.g == 'f')") == {"abc"} 84 | assert b3.extract_nest_names("(abc.d > 3) & (abc.g == 'f')") == {"abc"} 85 | 86 | assert b1.extract_nest_names("a>3") == {""} 87 | assert b1.extract_nest_names("a > 3") == {""} 88 | 89 | b4 = base.add_nested(nested, "test") 90 | assert b4.extract_nest_names("test.c>5&b==2") == {"test", ""} 91 | assert b4.extract_nest_names("test.c > 5 & b == 2") == {"test", ""} 92 | -------------------------------------------------------------------------------- /tests/test_data/nested.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/tests/test_data/nested.parquet -------------------------------------------------------------------------------- /tests/test_data/not_nestable.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/tests/test_data/not_nestable.parquet -------------------------------------------------------------------------------- /tests/test_data/vsx-x-ztfdr22_lc-m31.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/tests/test_data/vsx-x-ztfdr22_lc-m31.parquet --------------------------------------------------------------------------------