├── .copier-answers.yml ├── .git_archival.txt ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── 0-general_issue.md │ ├── 1-bug_report.md │ ├── 2-feature_request.md │ └── README.md ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── README.md │ ├── asv-main.yml │ ├── asv-nightly.yml │ ├── asv-pr.yml │ ├── build-documentation.yml │ ├── pre-commit-ci.yml │ ├── publish-benchmarks-pr.yml │ ├── publish-to-pypi.yml │ ├── smoke-test.yml │ └── testing-and-coverage.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── .setup_dev.sh ├── LICENSE ├── README.md ├── benchmarks ├── README.md ├── __init__.py ├── asv.conf.json └── benchmarks.py ├── docs ├── Makefile ├── _static │ └── custom.css ├── _templates │ └── autosummary │ │ ├── base.rst │ │ ├── class.rst │ │ └── module.rst ├── about.rst ├── about │ ├── internals.rst │ └── npd_internals.png ├── conf.py ├── gettingstarted.rst ├── gettingstarted │ ├── contributing.rst │ ├── installation.rst │ └── quickstart.ipynb ├── index.rst ├── intro_images │ ├── loc_into_nested.png │ ├── nestedframe_example.png │ ├── pandas_dfs.png │ └── reduce.png ├── pre_executed │ └── performance.ipynb ├── reference.rst ├── reference │ ├── accessor.rst │ ├── ext_array.rst │ ├── nesteddtype.rst │ ├── nestedframe.rst │ ├── packer.rst │ └── utils.rst ├── requirements.txt ├── tutorials.rst └── tutorials │ ├── README.md │ ├── data_loading_notebook.ipynb │ ├── data_manipulation.ipynb │ ├── low_level.ipynb │ └── nested_spectra.ipynb ├── pyproject.toml ├── src └── nested_pandas │ ├── __init__.py │ ├── datasets │ ├── __init__.py │ └── generation.py │ ├── nestedframe │ ├── __init__.py │ ├── core.py │ ├── expr.py │ └── io.py │ ├── py.typed │ ├── series │ ├── __init__.py │ ├── _storage │ │ ├── __init__.py │ │ ├── list_struct_storage.py │ │ ├── struct_list_storage.py │ │ └── table_storage.py │ ├── accessor.py │ ├── dtype.py │ ├── ext_array.py │ ├── packer.py │ └── utils.py │ └── utils │ ├── __init__.py │ └── utils.py └── tests ├── nested_pandas ├── conftest.py ├── datasets │ └── test_generation.py ├── e2e_tests │ └── test_issue89.py ├── nestedframe │ ├── test_io.py │ └── test_nestedframe.py ├── series │ ├── test_accessor.py │ ├── test_dtype.py │ ├── test_ext_array.py │ ├── test_packer.py │ └── test_series_utils.py ├── test_packaging.py └── utils │ └── test_utils.py └── test_data ├── nested.parquet ├── not_nestable.parquet └── vsx-x-ztfdr22_lc-m31.parquet /.copier-answers.yml: -------------------------------------------------------------------------------- 1 | # Changes here will be overwritten by Copier 2 | _commit: v2.0.7 3 | _src_path: gh:lincc-frameworks/python-project-template 4 | author_email: brantd@uw.edu 5 | author_name: LINCC Frameworks 6 | create_example_module: false 7 | custom_install: true 8 | enforce_style: 9 | - ruff_lint 10 | - ruff_format 11 | failure_notification: [] 12 | include_benchmarks: true 13 | include_docs: true 14 | include_notebooks: true 15 | mypy_type_checking: basic 16 | package_name: nested_pandas 17 | project_license: MIT 18 | project_name: nested-pandas 19 | project_organization: lincc-frameworks 20 | python_versions: 21 | - '3.10' 22 | - '3.11' 23 | - '3.12' 24 | - '3.13' 25 | test_lowest_version: all 26 | -------------------------------------------------------------------------------- /.git_archival.txt: -------------------------------------------------------------------------------- 1 | node: 78deda7f896727baa7be7990ab159d9236d9f68c 2 | node-date: 2025-06-05T15:18:35-04:00 3 | describe-name: v0.4.4 4 | ref-names: 
HEAD -> main, tag: v0.4.4 -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # For explanation of this file and uses see 2 | # https://git-scm.com/docs/gitattributes 3 | # https://developer.lsst.io/git/git-lfs.html#using-git-lfs-enabled-repositories 4 | # https://lincc-ppt.readthedocs.io/en/latest/practices/git-lfs.html 5 | # 6 | # Used by https://github.com/lsst/afwdata.git 7 | # *.boost filter=lfs diff=lfs merge=lfs -text 8 | # *.dat filter=lfs diff=lfs merge=lfs -text 9 | # *.fits filter=lfs diff=lfs merge=lfs -text 10 | # *.gz filter=lfs diff=lfs merge=lfs -text 11 | # 12 | # apache parquet files 13 | # *.parq filter=lfs diff=lfs merge=lfs -text 14 | # 15 | # sqlite files 16 | # *.sqlite3 filter=lfs diff=lfs merge=lfs -text 17 | # 18 | # gzip files 19 | # *.gz filter=lfs diff=lfs merge=lfs -text 20 | # 21 | # png image files 22 | # *.png filter=lfs diff=lfs merge=lfs -text 23 | 24 | .git_archival.txt export-subst -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/0-general_issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: General issue 3 | about: Quickly create a general issue 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/1-bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Tell us about a problem to fix 4 | title: 'Short description' 5 | labels: 'bug' 6 | assignees: '' 7 | 8 | --- 9 | **Bug report** 10 | 11 | 12 | **Before submitting** 13 | Please check the following: 14 | 15 | - [ ] I have described the situation in which the bug arose, including what code was executed, information about my environment, and any applicable data others will need to reproduce the problem. 16 | - [ ] I have included available evidence of the unexpected behavior (including error messages, screenshots, and/or plots) as well as a description of what I expected instead. 17 | - [ ] If I have a solution in mind, I have provided an explanation and/or pseudocode and/or task list. 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2-feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: 'Short description' 5 | labels: 'enhancement' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Feature request** 11 | 12 | 13 | **Before submitting** 14 | Please check the following: 15 | 16 | - [ ] I have described the purpose of the suggested change, specifying what I need the enhancement to accomplish, i.e. what problem it solves. 17 | - [ ] I have included any relevant links, screenshots, environment information, and data relevant to implementing the requested feature, as well as pseudocode for how I want to access the new functionality. 18 | - [ ] If I have ideas for how the new feature could be implemented, I have provided explanations and/or pseudocode and/or task lists for the steps. 
19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/README.md: -------------------------------------------------------------------------------- 1 | # Configurations 2 | 3 | Templates for various different issue types are defined in this directory 4 | and a pull request template is defined as ``../pull_request_template.md``. Adding, 5 | removing, and modifying these templates to suit the needs of your project is encouraged. 6 | 7 | For more information about these templates, look here: https://lincc-ppt.readthedocs.io/en/latest/practices/issue_pr_templating.html 8 | 9 | Or if you still have questions contact us: https://lincc-ppt.readthedocs.io/en/latest/source/contact.html -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | - package-ecosystem: "pip" 8 | directory: "/" 9 | schedule: 10 | interval: "monthly" 11 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 12 | 13 | ## Change Description 14 | 19 | - [ ] My PR includes a link to the issue that I am addressing 20 | 21 | 22 | 23 | ## Solution Description 24 | 25 | 26 | 27 | 28 | ## Code Quality 29 | - [ ] I have read the Contribution Guide 30 | - [ ] My code follows the code style of this project 31 | - [ ] My code builds (or compiles) cleanly without any errors or warnings 32 | - [ ] My code contains relevant comments and necessary documentation 33 | 34 | ## Project-Specific Pull Request Checklists 35 | 36 | 37 | ### Bug Fix Checklist 38 | - [ ] My fix includes a new test that breaks as a result of the bug (if possible) 39 | - [ ] My change includes a breaking change 40 | - [ ] My change includes backwards compatibility and deprecation warnings (if possible) 41 | 42 | ### New Feature Checklist 43 | - [ ] I have added or updated the docstrings associated with my feature using the [NumPy docstring format](https://numpydoc.readthedocs.io/en/latest/format.html) 44 | - [ ] I have updated the tutorial to highlight my new feature (if appropriate) 45 | - [ ] I have added unit/End-to-End (E2E) test cases to cover my new feature 46 | - [ ] My change includes a breaking change 47 | - [ ] My change includes backwards compatibility and deprecation warnings (if possible) 48 | 49 | ### Documentation Change Checklist 50 | - [ ] Any updated docstrings use the [NumPy docstring format](https://numpydoc.readthedocs.io/en/latest/format.html) 51 | 52 | ### Build/CI Change Checklist 53 | - [ ] If required or optional dependencies have changed (including version numbers), I have updated the README to reflect this 54 | - [ ] If this is a new CI setup, I have added the associated badge to the README 55 | 56 | 57 | 58 | ### Other Change Checklist 59 | - [ ] Any new or updated docstrings use the [NumPy docstring format](https://numpydoc.readthedocs.io/en/latest/format.html). 
60 | - [ ] I have updated the tutorial to highlight my new feature (if appropriate) 61 | - [ ] I have added unit/End-to-End (E2E) test cases to cover any changes 62 | - [ ] My change includes a breaking change 63 | - [ ] My change includes backwards compatibility and deprecation warnings (if possible) 64 | -------------------------------------------------------------------------------- /.github/workflows/README.md: -------------------------------------------------------------------------------- 1 | # Workflows 2 | 3 | The .yml files in this directory are used to define the various continuous 4 | integration scripts that will be run on your behalf e.g. nightly as a smoke check, 5 | or when you create a new PR. 6 | 7 | For more information about CI and workflows, look here: https://lincc-ppt.readthedocs.io/en/latest/practices/ci.html 8 | 9 | Or if you still have questions contact us: https://lincc-ppt.readthedocs.io/en/latest/source/contact.html -------------------------------------------------------------------------------- /.github/workflows/asv-main.yml: -------------------------------------------------------------------------------- 1 | # This workflow will run benchmarks with airspeed velocity (asv), 2 | # store the new results in the "benchmarks" branch and publish them 3 | # to a dashboard on GH Pages. 4 | name: Run ASV benchmarks for main 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | 10 | env: 11 | PYTHON_VERSION: "3.11" 12 | ASV_VERSION: "0.6.4" 13 | WORKING_DIR: ${{github.workspace}}/benchmarks 14 | 15 | concurrency: 16 | group: ${{github.workflow}}-${{github.ref}} 17 | cancel-in-progress: true 18 | 19 | jobs: 20 | asv-main: 21 | runs-on: ubuntu-latest 22 | permissions: 23 | contents: write 24 | defaults: 25 | run: 26 | working-directory: ${{env.WORKING_DIR}} 27 | steps: 28 | - name: Set up Python ${{env.PYTHON_VERSION}} 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: ${{env.PYTHON_VERSION}} 32 | - name: Checkout main branch of the repository 33 | uses: actions/checkout@v4 34 | with: 35 | fetch-depth: 0 36 | - name: Install dependencies 37 | run: pip install "asv[virtualenv]==${{env.ASV_VERSION}}" 38 | - name: Configure git 39 | run: | 40 | git config user.name "github-actions[bot]" 41 | git config user.email "41898282+github-actions[bot]@users.noreply.github.com" 42 | - name: Create ASV machine config file 43 | run: asv machine --machine gh-runner --yes 44 | - name: Fetch previous results from the "benchmarks" branch 45 | run: | 46 | if git ls-remote --exit-code origin benchmarks > /dev/null 2>&1; then 47 | git merge origin/benchmarks \ 48 | --allow-unrelated-histories \ 49 | --no-commit 50 | mv ../_results . 51 | fi 52 | - name: Run ASV for the main branch 53 | run: asv run ALL --skip-existing --verbose || true 54 | - name: Submit new results to the "benchmarks" branch 55 | uses: JamesIves/github-pages-deploy-action@v4 56 | with: 57 | branch: benchmarks 58 | folder: ${{env.WORKING_DIR}}/_results 59 | target-folder: _results 60 | - name: Generate dashboard HTML 61 | run: | 62 | asv show 63 | asv publish 64 | - name: Deploy to Github pages 65 | uses: JamesIves/github-pages-deploy-action@v4 66 | with: 67 | branch: gh-pages 68 | folder: ${{env.WORKING_DIR}}/_html -------------------------------------------------------------------------------- /.github/workflows/asv-nightly.yml: -------------------------------------------------------------------------------- 1 | # This workflow will run daily at 06:45. 
2 | # It will run benchmarks with airspeed velocity (asv) 3 | # and compare performance with the previous nightly build. 4 | name: Run benchmarks nightly job 5 | 6 | on: 7 | schedule: 8 | - cron: 45 6 * * * 9 | workflow_dispatch: 10 | 11 | env: 12 | PYTHON_VERSION: "3.11" 13 | ASV_VERSION: "0.6.4" 14 | WORKING_DIR: ${{github.workspace}}/benchmarks 15 | NIGHTLY_HASH_FILE: nightly-hash 16 | 17 | jobs: 18 | asv-nightly: 19 | runs-on: ubuntu-latest 20 | defaults: 21 | run: 22 | working-directory: ${{env.WORKING_DIR}} 23 | steps: 24 | - name: Set up Python ${{env.PYTHON_VERSION}} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{env.PYTHON_VERSION}} 28 | - name: Checkout main branch of the repository 29 | uses: actions/checkout@v4 30 | with: 31 | fetch-depth: 0 32 | - name: Install dependencies 33 | run: pip install "asv[virtualenv]==${{env.ASV_VERSION}}" 34 | - name: Configure git 35 | run: | 36 | git config user.name "github-actions[bot]" 37 | git config user.email "41898282+github-actions[bot]@users.noreply.github.com" 38 | - name: Create ASV machine config file 39 | run: asv machine --machine gh-runner --yes 40 | - name: Fetch previous results from the "benchmarks" branch 41 | run: | 42 | if git ls-remote --exit-code origin benchmarks > /dev/null 2>&1; then 43 | git merge origin/benchmarks \ 44 | --allow-unrelated-histories \ 45 | --no-commit 46 | mv ../_results . 47 | fi 48 | - name: Get nightly dates under comparison 49 | id: nightly-dates 50 | run: | 51 | echo "yesterday=$(date -d yesterday +'%Y-%m-%d')" >> $GITHUB_OUTPUT 52 | echo "today=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT 53 | - name: Use last nightly commit hash from cache 54 | uses: actions/cache@v4 55 | with: 56 | path: ${{env.WORKING_DIR}} 57 | key: nightly-results-${{steps.nightly-dates.outputs.yesterday}} 58 | - name: Run comparison of main against last nightly build 59 | run: | 60 | HASH_FILE=${{env.NIGHTLY_HASH_FILE}} 61 | CURRENT_HASH=${{github.sha}} 62 | if [ -f $HASH_FILE ]; then 63 | PREV_HASH=$(cat $HASH_FILE) 64 | asv continuous $PREV_HASH $CURRENT_HASH --verbose || true 65 | asv compare $PREV_HASH $CURRENT_HASH --sort ratio --verbose 66 | fi 67 | echo $CURRENT_HASH > $HASH_FILE 68 | - name: Update last nightly hash in cache 69 | uses: actions/cache@v4 70 | with: 71 | path: ${{env.WORKING_DIR}} 72 | key: nightly-results-${{steps.nightly-dates.outputs.today}} -------------------------------------------------------------------------------- /.github/workflows/asv-pr.yml: -------------------------------------------------------------------------------- 1 | # This workflow will run benchmarks with airspeed velocity (asv) for pull requests. 2 | # It will compare the performance of the main branch with the performance of the merge 3 | # with the new changes. It then publishes a comment with this assessment by triggering 4 | # the publish-benchmarks-pr workflow. 5 | # Based on https://securitylab.github.com/research/github-actions-preventing-pwn-requests/. 
6 | name: Run benchmarks for PR 7 | 8 | on: 9 | pull_request: 10 | branches: [ main ] 11 | workflow_dispatch: 12 | 13 | concurrency: 14 | group: ${{github.workflow}}-${{github.ref}} 15 | cancel-in-progress: true 16 | 17 | env: 18 | PYTHON_VERSION: "3.11" 19 | ASV_VERSION: "0.6.4" 20 | WORKING_DIR: ${{github.workspace}}/benchmarks 21 | ARTIFACTS_DIR: ${{github.workspace}}/artifacts 22 | 23 | jobs: 24 | asv-pr: 25 | runs-on: ubuntu-latest 26 | defaults: 27 | run: 28 | working-directory: ${{env.WORKING_DIR}} 29 | steps: 30 | - name: Set up Python ${{env.PYTHON_VERSION}} 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: ${{env.PYTHON_VERSION}} 34 | - name: Checkout PR branch of the repository 35 | uses: actions/checkout@v4 36 | with: 37 | fetch-depth: 0 38 | - name: Display Workflow Run Information 39 | run: | 40 | echo "Workflow Run ID: ${{github.run_id}}" 41 | - name: Install dependencies 42 | run: pip install "asv[virtualenv]==${{env.ASV_VERSION}}" lf-asv-formatter 43 | - name: Make artifacts directory 44 | run: mkdir -p ${{env.ARTIFACTS_DIR}} 45 | - name: Save pull request number 46 | run: echo ${{github.event.pull_request.number}} > ${{env.ARTIFACTS_DIR}}/pr 47 | - name: Get current job logs URL 48 | uses: Tiryoh/gha-jobid-action@v1 49 | id: jobs 50 | with: 51 | github_token: ${{secrets.GITHUB_TOKEN}} 52 | job_name: ${{github.job}} 53 | - name: Create ASV machine config file 54 | run: asv machine --machine gh-runner --yes 55 | - name: Save comparison of PR against main branch 56 | run: | 57 | git remote add upstream https://github.com/${{github.repository}}.git 58 | git fetch upstream 59 | asv continuous upstream/main HEAD --verbose || true 60 | asv compare upstream/main HEAD --sort ratio --verbose | tee output 61 | python -m lf_asv_formatter --asv_version "$(asv --version | awk '{print $2}')" 62 | printf "\n\nClick [here]($STEP_URL) to view all benchmarks." >> output 63 | mv output ${{env.ARTIFACTS_DIR}} 64 | env: 65 | STEP_URL: ${{steps.jobs.outputs.html_url}}#step:10:1 66 | - name: Upload artifacts (PR number and benchmarks output) 67 | uses: actions/upload-artifact@v4 68 | with: 69 | name: benchmark-artifacts 70 | path: ${{env.ARTIFACTS_DIR}} -------------------------------------------------------------------------------- /.github/workflows/build-documentation.yml: -------------------------------------------------------------------------------- 1 | 2 | # This workflow will install Python dependencies, build the package and then build the documentation. 3 | 4 | name: Build documentation 5 | 6 | 7 | on: 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | concurrency: 14 | group: ${{ github.workflow }}-${{ github.ref }} 15 | cancel-in-progress: true 16 | 17 | jobs: 18 | build: 19 | 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Set up Python 3.11 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: '3.11' 28 | - name: Install dependencies 29 | run: | 30 | sudo apt-get update 31 | python -m pip install --upgrade pip 32 | if [ -f docs/requirements.txt ]; then pip install -r docs/requirements.txt; fi 33 | pip install . 
34 | - name: Install notebook requirements 35 | run: | 36 | sudo apt-get install pandoc 37 | - name: Build docs 38 | run: | 39 | sphinx-build -T -E -b html -d docs/build/doctrees ./docs docs/build/html 40 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit-ci.yml: -------------------------------------------------------------------------------- 1 | 2 | # This workflow runs pre-commit hooks on pushes and pull requests to main 3 | # to enforce coding style. To ensure correct configuration, please refer to: 4 | # https://lincc-ppt.readthedocs.io/en/latest/practices/ci_precommit.html 5 | name: Run pre-commit hooks 6 | 7 | on: 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | jobs: 14 | pre-commit-ci: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 20 | - name: Set up Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: '3.11' 24 | - name: Install dependencies 25 | run: | 26 | sudo apt-get update 27 | python -m pip install --upgrade pip 28 | pip install .[dev] 29 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 30 | - uses: pre-commit/action@v3.0.1 31 | with: 32 | extra_args: --all-files --verbose 33 | env: 34 | SKIP: "check-lincc-frameworks-template-version,no-commit-to-branch,check-added-large-files,validate-pyproject,sphinx-build,pytest-check" 35 | - uses: pre-commit-ci/lite-action@v1.1.0 36 | if: failure() && github.event_name == 'pull_request' && github.event.pull_request.draft == false -------------------------------------------------------------------------------- /.github/workflows/publish-benchmarks-pr.yml: -------------------------------------------------------------------------------- 1 | # This workflow publishes a benchmarks comment on a pull request. It is triggered after the 2 | # benchmarks are computed in the asv-pr workflow. This separation of concerns allows us to limit 3 | # access to the target repository's private tokens and secrets, increasing the level of security. 4 | # Based on https://securitylab.github.com/research/github-actions-preventing-pwn-requests/. 
5 | name: Publish benchmarks comment to PR 6 | 7 | on: 8 | workflow_run: 9 | workflows: ["Run benchmarks for PR"] 10 | types: [completed] 11 | 12 | jobs: 13 | upload-pr-comment: 14 | runs-on: ubuntu-latest 15 | if: > 16 | github.event.workflow_run.event == 'pull_request' && 17 | github.event.workflow_run.conclusion == 'success' 18 | permissions: 19 | issues: write 20 | pull-requests: write 21 | steps: 22 | - name: Display Workflow Run Information 23 | run: | 24 | echo "Workflow Run ID: ${{ github.event.workflow_run.id }}" 25 | echo "Head SHA: ${{ github.event.workflow_run.head_sha }}" 26 | echo "Head Branch: ${{ github.event.workflow_run.head_branch }}" 27 | echo "Conclusion: ${{ github.event.workflow_run.conclusion }}" 28 | echo "Event: ${{ github.event.workflow_run.event }}" 29 | - name: Download artifact 30 | uses: dawidd6/action-download-artifact@v7 31 | with: 32 | name: benchmark-artifacts 33 | run_id: ${{ github.event.workflow_run.id }} 34 | - name: Extract artifacts information 35 | id: pr-info 36 | run: | 37 | printf "PR number: $(cat pr)\n" 38 | printf "Output:\n$(cat output)" 39 | printf "pr=$(cat pr)" >> $GITHUB_OUTPUT 40 | - name: Find benchmarks comment 41 | uses: peter-evans/find-comment@v3 42 | id: find-comment 43 | with: 44 | issue-number: ${{ steps.pr-info.outputs.pr }} 45 | comment-author: 'github-actions[bot]' 46 | body-includes: view all benchmarks 47 | - name: Create or update benchmarks comment 48 | uses: peter-evans/create-or-update-comment@v4 49 | with: 50 | comment-id: ${{ steps.find-comment.outputs.comment-id }} 51 | issue-number: ${{ steps.pr-info.outputs.pr }} 52 | body-path: output 53 | edit-mode: replace -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | 2 | # This workflow will upload a Python Package using Twine when a release is created 3 | # For more information see: https://github.com/pypa/gh-action-pypi-publish#trusted-publishing 4 | 5 | # This workflow uses actions that are not certified by GitHub. 6 | # They are provided by a third-party and are governed by 7 | # separate terms of service, privacy policy, and support 8 | # documentation. 9 | 10 | name: Upload Python Package 11 | 12 | on: 13 | release: 14 | types: [published] 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | deploy: 21 | 22 | runs-on: ubuntu-latest 23 | permissions: 24 | id-token: write 25 | steps: 26 | - uses: actions/checkout@v4 27 | - name: Set up Python 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: '3.11' 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install build 35 | - name: Build package 36 | run: python -m build 37 | - name: Publish package 38 | uses: pypa/gh-action-pypi-publish@release/v1 39 | -------------------------------------------------------------------------------- /.github/workflows/smoke-test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will run daily at 06:45. 2 | # It will install Python dependencies and run tests with a variety of Python versions. 
3 | # See documentation for help debugging smoke test issues: 4 | # https://lincc-ppt.readthedocs.io/en/latest/practices/ci_testing.html#version-culprit 5 | 6 | name: Unit test smoke test 7 | 8 | on: 9 | 10 | # Runs this workflow automatically 11 | schedule: 12 | - cron: 45 6 * * * 13 | 14 | # Allows you to run this workflow manually from the Actions tab 15 | workflow_dispatch: 16 | 17 | jobs: 18 | build: 19 | 20 | runs-on: ubuntu-latest 21 | strategy: 22 | matrix: 23 | python-version: ['3.10', '3.11', '3.12', '3.13'] 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | - name: Set up Python ${{ matrix.python-version }} 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | - name: Install dependencies 32 | run: | 33 | sudo apt-get update 34 | python -m pip install --upgrade pip 35 | pip install -e .[dev] 36 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 37 | - name: List dependencies 38 | run: | 39 | pip list 40 | - name: Run unit tests with pytest 41 | run: | 42 | python -m pytest -------------------------------------------------------------------------------- /.github/workflows/testing-and-coverage.yml: -------------------------------------------------------------------------------- 1 | 2 | # This workflow will install Python dependencies, run tests and report code coverage with a variety of Python versions 3 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 4 | 5 | name: Unit test and code coverage 6 | 7 | on: 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | jobs: 14 | build: 15 | 16 | runs-on: ubuntu-latest 17 | strategy: 18 | matrix: 19 | python-version: ['3.10', '3.11', '3.12', '3.13'] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | sudo apt-get update 30 | python -m pip install --upgrade pip 31 | pip install -e .[dev] 32 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 33 | - name: Run unit tests with pytest 34 | run: | 35 | python -m pytest --cov=nested_pandas --cov-report=xml 36 | - name: Upload coverage report to codecov 37 | uses: codecov/codecov-action@v5 38 | with: 39 | token: ${{ secrets.CODECOV_TOKEN }} 40 | test-lowest-versions: 41 | runs-on: ubuntu-latest 42 | steps: 43 | - uses: actions/checkout@v4 44 | - name: Set up Python 3.10 45 | uses: actions/setup-python@v5 46 | with: 47 | python-version: '3.10' 48 | - name: Install dependencies 49 | run: | 50 | sudo apt-get update 51 | python -m pip install --upgrade uv 52 | uv venv venv 53 | source venv/bin/activate 54 | uv pip compile --resolution=lowest -o requirements_lowest.txt pyproject.toml 55 | uv pip install --constraint=requirements_lowest.txt -e .[dev] 56 | - name: Run unit tests with pytest 57 | run: | 58 | source venv/bin/activate 59 | python -m pytest 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | 
sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | _version.py 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | _readthedocs/ 75 | docs/reference/api/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # vscode 135 | .vscode/ 136 | 137 | # dask 138 | dask-worker-space/ 139 | 140 | # tmp directory 141 | tmp/ 142 | 143 | # Mac OS 144 | .DS_Store 145 | 146 | # Airspeed Velocity performance results 147 | _results/ 148 | _html/ 149 | 150 | # Project initialization script 151 | .initialize_new_project.sh 152 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | 2 | repos: 3 | # Compare the local template version to the latest remote template version 4 | # This hook should always pass. It will print a message if the local version 5 | # is out of date. 6 | - repo: https://github.com/lincc-frameworks/pre-commit-hooks 7 | rev: v0.1.2 8 | hooks: 9 | - id: check-lincc-frameworks-template-version 10 | name: Check template version 11 | description: Compare current template version against latest 12 | verbose: true 13 | # Clear output from jupyter notebooks so that only the input cells are committed. 14 | - repo: local 15 | hooks: 16 | - id: jupyter-nb-clear-output 17 | name: Clear output from Jupyter notebooks 18 | description: Clear output from Jupyter notebooks. 
19 | files: \.ipynb$ 20 | stages: [pre-commit] 21 | language: system 22 | entry: jupyter nbconvert --clear-output 23 | exclude: docs/pre_executed 24 | # Prevents committing directly to branches named 'main' and 'master'. 25 | - repo: https://github.com/pre-commit/pre-commit-hooks 26 | rev: v4.4.0 27 | hooks: 28 | - id: no-commit-to-branch 29 | name: Prevent main branch commits 30 | description: Prevent the user from committing directly to the primary branch. 31 | - id: check-added-large-files 32 | name: Check for large files 33 | description: Prevent the user from committing very large files. 34 | args: ['--maxkb=500'] 35 | # Verify that pyproject.toml is well formed 36 | - repo: https://github.com/abravalheri/validate-pyproject 37 | rev: v0.12.1 38 | hooks: 39 | - id: validate-pyproject 40 | name: Validate pyproject.toml 41 | description: Verify that pyproject.toml adheres to the established schema. 42 | # Verify that GitHub workflows are well formed 43 | - repo: https://github.com/python-jsonschema/check-jsonschema 44 | rev: 0.28.0 45 | hooks: 46 | - id: check-github-workflows 47 | args: ["--verbose"] 48 | - repo: https://github.com/astral-sh/ruff-pre-commit 49 | # Ruff version. 50 | rev: v0.2.1 51 | hooks: 52 | - id: ruff 53 | name: Lint code using ruff; sort and organize imports 54 | types_or: [ python, pyi ] 55 | args: ["--fix"] 56 | - repo: https://github.com/astral-sh/ruff-pre-commit 57 | # Ruff version. 58 | rev: v0.2.1 59 | hooks: 60 | - id: ruff-format 61 | name: Format code using ruff 62 | types_or: [ python, pyi, jupyter ] 63 | # Analyze type hints and report errors. 64 | - repo: local 65 | hooks: 66 | - id: mypy 67 | name: mypy (python files in src/ and tests/) 68 | entry: mypy 69 | language: system 70 | types: [python] 71 | files: ^(src|tests)/ 72 | args: 73 | [ 74 | "--ignore-missing-imports", # Ignore imports without type hints 75 | ] 76 | # Make sure Sphinx can build the documentation while explicitly omitting 77 | # notebooks from the docs, so users don't have to wait through the execution 78 | # of each notebook on each commit. By default, these will be checked in the 79 | # GitHub workflows. 80 | - repo: local 81 | hooks: 82 | - id: sphinx-build 83 | name: Build documentation with Sphinx 84 | entry: sphinx-build 85 | language: system 86 | always_run: true 87 | exclude_types: [file, symlink] 88 | args: 89 | [ 90 | "-M", # Run sphinx in make mode, so we can use -D flag later 91 | # Note: -M requires next 3 args to be builder, source, output 92 | "html", # Specify builder 93 | "./docs", # Source directory of documents 94 | "./_readthedocs", # Output directory for rendered documents 95 | "-T", # Show full traceback on exception 96 | "-E", # Don't use saved env; always read all files 97 | "-d", # Flag for cached environment and doctrees 98 | "./docs/_build/doctrees", # Directory 99 | "-D", # Flag to override settings in conf.py 100 | "exclude_patterns=notebooks/*,_build", # Exclude notebooks and build dir from pre-commit 101 | ] 102 | # Run unit tests, verify that they pass. Note that coverage is run against 103 | # the ./src directory here because that is what will be committed. In the 104 | # github workflow script, the coverage is run against the installed package 105 | # and uploaded to Codecov by calling pytest like so: 106 | # `python -m pytest --cov=nested_pandas --cov-report=xml` 107 | - repo: local 108 | hooks: 109 | - id: pytest-check 110 | name: Run unit tests 111 | description: Run unit tests with pytest. 
112 | entry: bash -c "if python -m pytest --co -qq; then python -m pytest --cov=./src --cov-report=html; fi" 113 | language: system 114 | pass_filenames: false 115 | always_run: true 116 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | 2 | # .readthedocs.yml 3 | # Read the Docs configuration file 4 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 5 | 6 | # Required 7 | version: 2 8 | 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements.txt 22 | - method: pip 23 | path: . 24 | -------------------------------------------------------------------------------- /.setup_dev.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Bash Unofficial strict mode (http://redsymbol.net/articles/unofficial-bash-strict-mode/) 4 | # and (https://disconnected.systems/blog/another-bash-strict-mode/) 5 | set -o nounset # Any uninitialized variable is an error 6 | set -o errexit # Exit the script on the failure of any command to execute without error 7 | set -o pipefail # Fail command pipelines on the failure of any individual step 8 | IFS=$'\n\t' # Set the internal field separator to avoid iteration errors 9 | # Trap all exits and output something helpful 10 | trap 's=$?; echo "$0: Error on line "$LINENO": $BASH_COMMAND"; exit $s' ERR 11 | 12 | # This script should be run by new developers to install this package in 13 | # editable mode and configure their local environment 14 | 15 | echo "Checking virtual environment" 16 | if [ "${VIRTUAL_ENV:-missing}" = "missing" ] && [ "${CONDA_PREFIX:-missing}" = "missing" ]; then 17 | echo 'No virtual environment detected: none of $VIRTUAL_ENV or $CONDA_PREFIX is set.' 18 | echo 19 | echo "=== This script is going to install the project in the system python environment ===" 20 | echo "Proceed? [y/N]" 21 | read -r RESPONSE 22 | if [ "${RESPONSE}" != "y" ]; then 23 | echo "See https://lincc-ppt.readthedocs.io/ for details." 24 | echo "Exiting." 25 | exit 1 26 | fi 27 | 28 | fi 29 | 30 | echo "Checking pip version" 31 | MINIMUM_PIP_VERSION=22 32 | pipversion=( $(python -m pip --version | awk '{print $2}' | sed 's/\./\n\t/g') ) 33 | if let "${pipversion[0]}<${MINIMUM_PIP_VERSION}"; then 34 | echo "Insufficient version of pip found. Requires at least version ${MINIMUM_PIP_VERSION}." 35 | echo "See https://lincc-ppt.readthedocs.io/ for details." 36 | exit 1 37 | fi 38 | 39 | echo "Installing package and runtime dependencies in local environment" 40 | python -m pip install -e . 
> /dev/null 41 | 42 | echo "Installing developer dependencies in local environment" 43 | python -m pip install -e .'[dev]' > /dev/null 44 | if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt > /dev/null; fi 45 | 46 | echo "Installing pre-commit" 47 | pre-commit install > /dev/null 48 | 49 | ####################################################### 50 | # Include any additional configurations below this line 51 | ####################################################### 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 LINCC Frameworks 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nested-pandas 2 | 3 | [![Template](https://img.shields.io/badge/Template-LINCC%20Frameworks%20Python%20Project%20Template-brightgreen)](https://lincc-ppt.readthedocs.io/en/latest/) 4 | 5 | [![PyPI](https://img.shields.io/pypi/v/nested-pandas?color=blue&logo=pypi&logoColor=white)](https://pypi.org/project/nested-pandas/) 6 | [![Conda](https://img.shields.io/conda/vn/conda-forge/nested-pandas.svg?color=blue&logo=condaforge&logoColor=white)](https://anaconda.org/conda-forge/nested-pandas) 7 | 8 | [![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/lincc-frameworks/nested-pandas/smoke-test.yml)](https://github.com/lincc-frameworks/nested-pandas/actions/workflows/smoke-test.yml) 9 | [![codecov](https://codecov.io/gh/lincc-frameworks/nested-pandas/branch/main/graph/badge.svg)](https://codecov.io/gh/lincc-frameworks/nested-pandas) 10 | [![Read the Docs](https://img.shields.io/readthedocs/nested-pandas)](https://nested-pandas.readthedocs.io/) 11 | [![benchmarks](https://img.shields.io/github/actions/workflow/status/lincc-frameworks/nested-pandas/asv-main.yml?label=benchmarks)](https://lincc-frameworks.github.io/nested-pandas/) 12 | 13 | An extension of pandas for efficient representation of nested 14 | associated datasets. 15 | 16 | Nested-Pandas extends the [pandas](https://pandas.pydata.org/) package with 17 | tooling and support for nested dataframes packed into values of top-level 18 | dataframe columns. 
[Pyarrow](https://arrow.apache.org/docs/python/index.html) 19 | is used internally to aid in scalability and performance. 20 | 21 | Nested-Pandas allows data like this: 22 | 23 |

24 | pandas dataframes 25 |

26 | 27 | To instead be represented like this: 28 | 29 |

30 | nestedframe 31 |
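A frame like this can be built by packing one dataframe into another. Here is a minimal sketch, where `object_df` and `source_df` are hypothetical stand-ins for the two flat dataframes pictured above, with `source_df` indexed by the object each measurement belongs to:

```python
from nested_pandas import NestedFrame

# Pack the rows of "source_df" into a nested column of "object_df",
# matching measurements to objects by index
object_nf = NestedFrame(object_df).add_nested(source_df, "nested_sources")
```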

32 | 33 | Where the nested data is represented as nested dataframes: 34 | 35 | ```python 36 | # Each row of "object_nf" now has its own sub-dataframe of matched rows from "source_df" 37 | object_nf.loc[0]["nested_sources"] 38 | ```

41 | sub-dataframe 42 |
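Sub-columns of a nested column can also be accessed hierarchically with dot notation. A minimal sketch, reusing the column names from the example above:

```python
# Pull a single sub-column out across all rows of "object_nf"
object_nf["nested_sources.flux"]
```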

43 | 44 | Allowing powerful and straightforward operations, like: 45 | 46 | ```python 47 | # Compute the mean flux for each row of "object_nf" 48 | import numpy as np 49 | object_nf.reduce(np.mean, "nested_sources.flux") 50 | ``` 51 | 52 |

53 | using reduce 54 |
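The same hierarchical names can be used in queries, which filter the rows of each sub-dataframe. A minimal sketch (the "band" sub-column is assumed here purely for illustration):

```python
# Keep only the nested rows in each sub-dataframe observed in the "g" band
object_nf.query("nested_sources.band == 'g'")
```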

55 | 56 | Nested-Pandas is motivated by time-domain astronomy use cases, where we 57 | typically see two levels of information: information about astronomical objects and 58 | an associated set of `N` measurements of those objects. Nested-Pandas offers 59 | a performant and memory-efficient package for working with these types of datasets. 60 | 61 | Its core advantages are: 62 | * hierarchical column access 63 | * efficient packing of nested information into inputs to custom user functions 64 | * avoiding costly groupby operations 65 | 66 | 67 | 68 | This is a LINCC Frameworks project - find more information about LINCC Frameworks [here](https://lsstdiscoveryalliance.org/programs/lincc-frameworks/). 69 | 70 | 71 | 72 | ## Acknowledgements 73 | 74 | This project is supported by Schmidt Sciences. 75 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | This directory contains files that will be run via continuous testing either 4 | nightly or after committing code to a pull request. 5 | 6 | The runtime and/or memory usage of the functions defined in these files will be 7 | tracked and reported to give you a sense of the overall performance of your code. 8 | 9 | You are encouraged to add, update, or remove benchmark functions to suit the needs 10 | of your project. 11 | 12 | For more information, see the documentation here: https://lincc-ppt.readthedocs.io/en/latest/practices/ci_benchmarking.html -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/asv.conf.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | // The version of the config file format. Do not change, unless 4 | // you know what you are doing. 5 | "version": 1, 6 | // The name of the project being benchmarked. 7 | "project": "nested-pandas", 8 | // The project's homepage. 9 | "project_url": "https://github.com/lincc-frameworks/nested-pandas", 10 | // The URL or local path of the source code repository for the 11 | // project being benchmarked. 12 | "repo": "..", 13 | // List of branches to benchmark. If not provided, defaults to "master" 14 | // (for git) or "tip" (for mercurial). 15 | "branches": [ 16 | "HEAD" 17 | ], 18 | "install_command": [ 19 | "python -m pip install {wheel_file}" 20 | ], 21 | "build_command": [ 22 | "python -m build --wheel -o {build_cache_dir} {build_dir}" 23 | ], 24 | // The DVCS being used. If not set, it will be automatically 25 | // determined from "repo" by looking at the protocol in the URL 26 | // (if remote), or by looking for special directories, such as 27 | // ".git" (if local). 28 | "dvcs": "git", 29 | // The tool to use to create environments. May be "conda", 30 | // "virtualenv" or other value depending on the plugins in use. 31 | // If missing or the empty string, the tool will be automatically 32 | // determined by looking for tools on the PATH environment 33 | // variable. 34 | "environment_type": "virtualenv", 35 | // The base URL to show a commit for the project. 
36 | "show_commit_url": "https://github.com/lincc-frameworks/nested-pandas/commit/", 37 | // The Pythons you'd like to test against. If not provided, defaults 38 | // to the current version of Python used to run `asv`. 39 | "pythons": [ 40 | "3.11" 41 | ], 42 | // The matrix of dependencies to test. Each key is the name of a 43 | // package (in PyPI) and the values are version numbers. An empty 44 | // list indicates to just test against the default (latest) 45 | // version. 46 | "matrix": { 47 | "Cython": [], 48 | "build": [], 49 | "packaging": [] 50 | }, 51 | // The directory (relative to the current directory) that benchmarks are 52 | // stored in. If not provided, defaults to "benchmarks". 53 | "benchmark_dir": ".", 54 | // The directory (relative to the current directory) to cache the Python 55 | // environments in. If not provided, defaults to "env". 56 | "env_dir": "env", 57 | // The directory (relative to the current directory) that raw benchmark 58 | // results are stored in. If not provided, defaults to "results". 59 | "results_dir": "_results", 60 | // The directory (relative to the current directory) that the html tree 61 | // should be written to. If not provided, defaults to "html". 62 | "html_dir": "_html", 63 | // The number of characters to retain in the commit hashes. 64 | // "hash_length": 8, 65 | // `asv` will cache wheels of the recent builds in each 66 | // environment, making them faster to install next time. This is the 67 | // number of builds to keep, per environment. 68 | "build_cache_size": 8 69 | // The commits after which the regression search in `asv publish` 70 | // should start looking for regressions. Dictionary whose keys are 71 | // regexps matching to benchmark names, and values corresponding to 72 | // the commit (exclusive) after which to start looking for 73 | // regressions. The default is to start from the first commit 74 | // with results. If the commit is `null`, regression detection is 75 | // skipped for the matching benchmark. 76 | // 77 | // "regressions_first_commits": { 78 | // "some_benchmark": "352cdf", // Consider regressions only after this commit 79 | // "another_benchmark": null, // Skip regression detection altogether 80 | // } 81 | } -------------------------------------------------------------------------------- /benchmarks/benchmarks.py: -------------------------------------------------------------------------------- 1 | """Benchmarks to compute runtime and memory usage. 
2 | 3 | For more information on writing benchmarks: 4 | https://asv.readthedocs.io/en/stable/writing_benchmarks.html.""" 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pyarrow as pa 9 | from nested_pandas import NestedDtype, NestedFrame, datasets 10 | 11 | 12 | class AssignSingleDfToNestedSeries: 13 | """Benchmark the performance of changing a single nested series element""" 14 | 15 | n_objects = 10_000 16 | n_sources = 100 17 | new_df: pd.DataFrame 18 | series: pd.Series 19 | 20 | def setup(self): 21 | """Set up the benchmark environment.""" 22 | self.new_df = pd.DataFrame( 23 | { 24 | "time": np.arange(self.n_sources, dtype=np.float64), 25 | "flux": np.linspace(0, 1, self.n_sources), 26 | "band": np.full(self.n_sources, "lsstg"), 27 | } 28 | ) 29 | original_df = pd.DataFrame( 30 | { 31 | "time": np.linspace(0, 1, self.n_sources), 32 | "flux": np.arange(self.n_sources, dtype=np.float64), 33 | "band": np.full(self.n_sources, "sdssu"), 34 | } 35 | ) 36 | self.series = pd.Series( 37 | [original_df] * self.n_objects, 38 | # When we had NestedExtensionArray inheriting ArrowExtensionArray, it sorted the fields, so we 39 | # need to order by field name here for backwards compatibility. 40 | dtype=NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()}), 41 | ) 42 | 43 | def run(self): 44 | """Run the benchmark.""" 45 | self.series[self.n_objects // 2] = self.new_df 46 | 47 | def time_run(self): 48 | """Benchmark the runtime of changing a single nested series element.""" 49 | self.run() 50 | 51 | def peakmem_run(self): 52 | """Benchmark the memory usage of changing a single nested series element.""" 53 | self.run() 54 | 55 | 56 | class ReassignHalfOfNestedSeries: 57 | """Benchmark the performance of changing a lot of nested series elements""" 58 | 59 | n_objects = 10_000 60 | n_sources = 100 61 | series: pd.Series 62 | new_series: pd.Series 63 | 64 | def setup(self): 65 | """Set up the benchmark environment.""" 66 | # When we had NestedExtensionArray inheriting ArrowExtensionArray, it sorted the fields, so we need to 67 | # order by field name here for backwards compatibility. 
68 | dtype = NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()}) 69 | original_df = pd.DataFrame( 70 | { 71 | "time": np.linspace(0, 1, self.n_sources), 72 | "flux": np.arange(self.n_sources, dtype=np.float64), 73 | "band": np.full(self.n_sources, "sdssu"), 74 | } 75 | ) 76 | self.series = pd.Series( 77 | [original_df] * self.n_objects, 78 | dtype=dtype, 79 | ) 80 | 81 | new_df = pd.DataFrame( 82 | { 83 | "time": np.arange(self.n_sources, dtype=np.float64), 84 | "flux": np.linspace(0, 1, self.n_sources), 85 | "band": np.full(self.n_sources, "lsstg"), 86 | } 87 | ) 88 | self.new_series = pd.Series([new_df] * (self.n_objects // 2), dtype=dtype) 89 | 90 | def run(self): 91 | """Run the benchmark.""" 92 | self.series[::2] = self.new_series 93 | 94 | def time_run(self): 95 | """Benchmark the runtime of changing half of the nested series elements.""" 96 | self.run() 97 | 98 | def peakmem_run(self): 99 | """Benchmark the memory usage of changing half of the nested series elements.""" 100 | self.run() 101 | 102 | 103 | class NestedFrameAddNested: 104 | """Benchmark the NestedFrame.add_nested function""" 105 | 106 | n_base = 100 107 | layer_size = 1000 108 | base_nf = NestedFrame 109 | layer_nf = NestedFrame 110 | 111 | def setup(self): 112 | """Set up the benchmark environment""" 113 | # use provided seed, "None" acts as if no seed is provided 114 | randomstate = np.random.RandomState(seed=1) 115 | 116 | # Generate base data 117 | base_data = {"a": randomstate.random(self.n_base), "b": randomstate.random(self.n_base) * 2} 118 | self.base_nf = NestedFrame(data=base_data) 119 | 120 | layer_data = { 121 | "t": randomstate.random(self.layer_size * self.n_base) * 20, 122 | "flux": randomstate.random(self.layer_size * self.n_base) * 100, 123 | "band": randomstate.choice(["r", "g"], size=self.layer_size * self.n_base), 124 | "index": np.arange(self.layer_size * self.n_base) % self.n_base, 125 | } 126 | self.layer_nf = NestedFrame(data=layer_data).set_index("index") 127 | 128 | def run(self): 129 | """Run the benchmark.""" 130 | self.base_nf.add_nested(self.layer_nf, "nested") 131 | 132 | def time_run(self): 133 | """Benchmark the runtime of adding a nested layer""" 134 | self.run() 135 | 136 | def peakmem_run(self): 137 | """Benchmark the memory usage of adding a nested layer""" 138 | self.run() 139 | 140 | 141 | class NestedFrameReduce: 142 | """Benchmark the NestedFrame.reduce function""" 143 | 144 | n_base = 100 145 | n_nested = 1000 146 | nf = NestedFrame 147 | 148 | def setup(self): 149 | """Set up the benchmark environment""" 150 | self.nf = datasets.generate_data(self.n_base, self.n_nested) 151 | 152 | def run(self): 153 | """Run the benchmark.""" 154 | self.nf.reduce(np.mean, "nested.flux") 155 | 156 | def time_run(self): 157 | """Benchmark the runtime of applying the reduce function""" 158 | self.run() 159 | 160 | def peakmem_run(self): 161 | """Benchmark the memory usage of applying the reduce function""" 162 | self.run() 163 | 164 | 165 | class NestedFrameQuery: 166 | """Benchmark the NestedFrame.query function""" 167 | 168 | n_base = 100 169 | n_nested = 1000 170 | nf = NestedFrame 171 | 172 | def setup(self): 173 | """Set up the benchmark environment""" 174 | self.nf = datasets.generate_data(self.n_base, self.n_nested) 175 | 176 | def run(self): 177 | """Run the benchmark.""" 178 | 179 | # Apply nested layer query 180 | self.nf = self.nf.query("nested.band == 'g'") 181 | 182 | def time_run(self): 183 | """Benchmark the runtime of applying the two 
queries""" 184 | self.run() 185 | 186 | def peakmem_run(self): 187 | """Benchmark the memory usage of applying the two queries""" 188 | self.run() 189 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= -T -E -d _build/doctrees -D language=en 7 | EXCLUDENB ?= -D exclude_patterns="notebooks/*","_build","**.ipynb_checkpoints" 8 | SPHINXBUILD ?= sphinx-build 9 | SOURCEDIR = . 10 | BUILDDIR = ../_readthedocs/ 11 | 12 | .PHONY: help clean Makefile no-nb no-notebooks 13 | 14 | # Put it first so that "make" without argument is like "make help". 15 | help: 16 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 17 | 18 | # Build all Sphinx docs locally, except the notebooks 19 | no-nb no-notebooks: 20 | @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(EXCLUDENB) $(O) 21 | 22 | # Cleans up files generated by the build process 23 | clean: 24 | rm -r "_build/doctrees" 25 | rm -r "$(BUILDDIR)" 26 | 27 | # Catch-all target: route all unknown targets to Sphinx using the new 28 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 29 | %: Makefile 30 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 31 | 32 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | .jupyter-widgets { 2 | color: var(--pst-color-text-base) !important; 3 | } -------------------------------------------------------------------------------- /docs/_templates/autosummary/base.rst: -------------------------------------------------------------------------------- 1 | {%- if objname.split('.')[-1] == objname %} 2 | {{ objname | escape | underline }} 3 | {%- else %} 4 | {{ objname.split('.')[-1] | escape | underline }} 5 | {%- endif %} 6 | 7 | .. currentmodule:: {{ module }} 8 | 9 | .. auto{{ objtype }}:: {{ objname }} 10 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | {%- if objname.split('.')[-1] == objname %} 2 | {{ objname | escape | underline }} 3 | {%- else %} 4 | {{ objname.split('.')[-1] | escape | underline }} 5 | {%- endif %} 6 | 7 | .. currentmodule:: {{ module }} 8 | 9 | .. autoclass:: {{ objname }} 10 | 11 | {% block methods %} 12 | .. automethod:: __init__ 13 | 14 | {% if methods %} 15 | .. rubric:: {{ _('Methods') }} 16 | 17 | .. autosummary:: 18 | {% for item in methods %} 19 | ~{{ name }}.{{ item }} 20 | {%- endfor %} 21 | {% endif %} 22 | {% endblock %} 23 | 24 | {% block attributes %} 25 | {% if attributes %} 26 | .. rubric:: {{ _('Attributes') }} 27 | 28 | .. autosummary:: 29 | {% for item in attributes %} 30 | ~{{ name }}.{{ item }} 31 | {%- endfor %} 32 | {% endif %} 33 | {% endblock %} 34 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/module.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. automodule:: {{ fullname }} 4 | 5 | {% block attributes %} 6 | {%- if attributes %} 7 | .. 
rubric:: {{ _('Module Attributes') }} 8 | 9 | .. autosummary:: 10 | {% for item in attributes %} 11 | {{ item }} 12 | {%- endfor %} 13 | {% endif %} 14 | {%- endblock %} 15 | 16 | {%- block functions %} 17 | {%- if functions %} 18 | .. rubric:: {{ _('Functions') }} 19 | 20 | .. autosummary:: 21 | {% for item in functions %} 22 | {{ item }} 23 | {%- endfor %} 24 | {% endif %} 25 | {%- endblock %} 26 | 27 | {%- block classes %} 28 | {%- if classes %} 29 | .. rubric:: {{ _('Classes') }} 30 | 31 | .. autosummary:: 32 | {% for item in classes %} 33 | {{ item }} 34 | {%- endfor %} 35 | {% endif %} 36 | {%- endblock %} 37 | 38 | {%- block exceptions %} 39 | {%- if exceptions %} 40 | .. rubric:: {{ _('Exceptions') }} 41 | 42 | .. autosummary:: 43 | {% for item in exceptions %} 44 | {{ item }} 45 | {%- endfor %} 46 | {% endif %} 47 | {%- endblock %} 48 | 49 | {%- block modules %} 50 | {%- if modules %} 51 | .. rubric:: Modules 52 | 53 | .. autosummary:: 54 | :toctree: 55 | :recursive: 56 | {% for item in modules %} 57 | {{ item }} 58 | {%- endfor %} 59 | {% endif %} 60 | {%- endblock %} 61 | -------------------------------------------------------------------------------- /docs/about.rst: -------------------------------------------------------------------------------- 1 | About Nested-Pandas 2 | =================== 3 | 4 | 5 | .. toctree:: 6 | 7 | Internal Representation of Nested Data <about/internals> 8 | Performance Impact of Nested-Pandas <pre_executed/performance> -------------------------------------------------------------------------------- /docs/about/internals.rst: -------------------------------------------------------------------------------- 1 | Internal Representation of Nested Data 2 | ====================================== 3 | "Dataframes within Dataframes" is a useful heuristic for understanding the 4 | API/workings of a NestedFrame. However, the actual storage representation 5 | leverages pyarrow and materializes the nested dataframes as a view of the 6 | data. The following diagram details the actual storage representation of 7 | nested-pandas: 8 | 9 | .. image:: ./npd_internals.png 10 | :width: 400 11 | :align: center 12 | :alt: Internal representation of nested-pandas 13 | 14 | 15 | The advantage of this approach is that each sub-column ("field" in pyarrow) is 16 | stored in a flat array, with an offset array used to slice the data into the 17 | respective sub-dataframes. This allows for efficient transformations to other 18 | data representations (dataframes, list-arrays, flat arrays, etc.) which are 19 | used internally to minimize the overhead of operations involving nested data. 20 | 21 | Nested Serialization to Parquet 22 | ------------------------------- 23 | The internal design of nested columns has valid pyarrow struct-list objects 24 | underneath. This allows for direct serialization of nested columns to the 25 | parquet format. nested-pandas will automatically write nested columns to 26 | parquet format as valid pyarrow dtypes, which allows for them to be read 27 | by other parquet readers that support complex types. Additionally, nested-pandas 28 | will attempt to cast pyarrow struct-list columns to nested columns directly 29 | when reading from parquet. 30 | 31 | 32 | Multi-level Nesting Support 33 | --------------------------- 34 | At this time, nested-pandas only supports a single level of nesting. We 35 | intend to support multiple levels of nesting in the future, and community use 36 | cases that would benefit from deeper nesting will help motivate that work. 
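37 | 38 | Serialization Example 39 | --------------------- 40 | As a minimal sketch of the parquet round-trip described above (assuming ``nf`` is a 41 | ``NestedFrame`` holding a nested column; the file name is a placeholder): 42 | 43 | .. code-block:: python 44 | 45 | import nested_pandas as npd 46 | 47 | # Nested columns are written to parquet as valid pyarrow struct-list columns 48 | nf.to_parquet("nested_data.parquet") 49 | 50 | # On read, pyarrow struct-list columns are cast back to nested columns 51 | nf_roundtrip = npd.read_parquet("nested_data.parquet") 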
-------------------------------------------------------------------------------- /docs/about/npd_internals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/docs/about/npd_internals.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | 7 | import os 8 | import sys 9 | from importlib.metadata import version 10 | 11 | # Define path to the code to be documented **relative to where conf.py (this file) is kept** 12 | sys.path.insert(0, os.path.abspath("../src/")) 13 | 14 | # -- Project information ----------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 16 | 17 | project = "nested-pandas" 18 | copyright = "2024, LINCC Frameworks" 19 | author = "LINCC Frameworks" 20 | release = version("nested-pandas") 21 | # For example, take major/minor 22 | version = ".".join(release.split(".")[:2]) 23 | 24 | # -- General configuration --------------------------------------------------- 25 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 26 | 27 | extensions = ["sphinx.ext.mathjax", "sphinx.ext.napoleon", "sphinx.ext.viewcode", "sphinx.ext.autosummary"] 28 | 29 | extensions.append("nbsphinx") 30 | 31 | # -- sphinx-copybutton configuration ---------------------------------------- 32 | extensions.append("sphinx_copybutton") 33 | ## sets up the expected prompt text for console blocks, and excludes it from 34 | ## the text that goes into the clipboard. 35 | copybutton_exclude = ".linenos, .gp" 36 | copybutton_prompt_text = ">> " 37 | 38 | ## lets us suppress the copy button on select code blocks. 39 | copybutton_selector = "div:not(.no-copybutton) > div.highlight > pre" 40 | 41 | templates_path = ["_templates"] 42 | exclude_patterns = ["_build", "**.ipynb_checkpoints"] 43 | 44 | # This assumes that sphinx-build is called from the root directory 45 | master_doc = "index" 46 | # Remove 'view source code' from top of page (for html, not python) 47 | html_show_sourcelink = False 48 | # Remove namespaces from class/method signatures 49 | add_module_names = False 50 | # Hide full module path in navigation 51 | modindex_common_prefix = ["nested_pandas."] 52 | # Customize display of autosummary entries 53 | autosummary_imported_members = True 54 | 55 | html_theme = "sphinx_book_theme" 56 | 57 | html_static_path = ["_static"] 58 | html_css_files = ["custom.css"] 59 | -------------------------------------------------------------------------------- /docs/gettingstarted.rst: -------------------------------------------------------------------------------- 1 | Getting Started 2 | ======================================================================================== 3 | 4 | These pages will help you install and learn the basics of using nested-pandas. If you encounter any issues, 5 | we encourage you to open an issue on the 6 | `nested-pandas github repository <https://github.com/lincc-frameworks/nested-pandas/issues>`_. 7 | 8 | .. 
toctree:: 9 | :maxdepth: 1 10 | 11 | Installing nested-pandas <gettingstarted/installation> 12 | Contribution Guide <gettingstarted/contributing> 13 | Quickstart Guide <gettingstarted/quickstart> -------------------------------------------------------------------------------- /docs/gettingstarted/contributing.rst: -------------------------------------------------------------------------------- 1 | Contribution Guide 2 | ================== 3 | 4 | Dev Guide - Getting Started 5 | --------------------------- 6 | 7 | Download code and install dependencies in a conda environment. Run unit tests at the end as a verification that the packages are properly installed. 8 | 9 | .. code-block:: bash 10 | 11 | conda create -n nested_pandas_env python=3.11 12 | conda activate nested_pandas_env 13 | 14 | git clone https://github.com/lincc-frameworks/nested-pandas.git 15 | cd nested-pandas/ 16 | bash ./.setup_dev.sh 17 | 18 | pip install pytest 19 | pytest 20 | -------------------------------------------------------------------------------- /docs/gettingstarted/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | nested-pandas is available to install with pip, using the "nested-pandas" package name: 5 | 6 | .. code-block:: bash 7 | 8 | % pip install nested-pandas 9 | 10 | 11 | This will grab the latest release version of nested-pandas from PyPI. 12 | 13 | Installation from Source 14 | ------------------------ 15 | 16 | In some cases, installation via pip may not be sufficient. In particular, if you're looking to grab the latest 17 | development version of nested-pandas, you should instead build 'nested-pandas' from source. The following process downloads the 18 | 'nested-pandas' source code and installs it and any needed dependencies in a fresh conda environment. 19 | 20 | .. code-block:: bash 21 | 22 | conda create -n nested_pandas_env python=3.11 23 | conda activate nested_pandas_env 24 | 25 | git clone https://github.com/lincc-frameworks/nested-pandas.git 26 | cd nested-pandas 27 | pip install . 28 | pip install .[dev] # it may be necessary to use `pip install .'[dev]'` (with single quotes) depending on your machine. 29 | 30 | The ``pip install .[dev]`` command is optional, and installs dependencies needed to run the unit tests and build 31 | the documentation. The latest source version of nested-pandas may be less stable than a release, and so we recommend 32 | running the unit test suite to verify that your local install is performing as expected. 33 | 34 | .. code-block:: bash 35 | 36 | pip install pytest 37 | pytest -------------------------------------------------------------------------------- /docs/gettingstarted/quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Quickstart\n", 8 | "\n", 9 | "This notebook provides a brief introduction to nested-pandas, including the motivation and basics for working with the data structure. For more in-depth descriptions, see the other tutorial notebooks." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Installation\n", 17 | "\n", 18 | "With a valid Python environment, nested-pandas and its dependencies are easy to install using the `pip` package manager. 
The following command can be used to install it:" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# % pip install nested-pandas" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Overview\n", 35 | "\n", 36 | "Nested-Pandas is tailored towards efficient analysis of nested data sets. This includes data that would normally be represented in a Pandas DataFrame with multiple rows needed to represent a single \"thing\", and therefore columns whose values will be identical for that item.\n", 37 | "\n", 38 | "As a concrete example, consider an astronomical data set storing information about observations of physical objects, such as stars and galaxies. One way to represent this in Pandas is to create one row per observation with an ID column indicating to which physical object the observation corresponds. However, this approach ends up repeating a lot of data over each observation of the same object, such as its location on the sky (RA, dec), its classification, etc. Further, any operations processing the data as time series require the user to first perform a (potentially expensive) group-by operation to aggregate all of the data for each object.\n", 39 | "\n", 40 | "Let's create a flat pandas dataframe with three objects: object 0 has three observations, object 1 has three observations, and object 2 has four observations." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import pandas as pd\n", 50 | "\n", 51 | "# Represent nested time series information as a classic pandas dataframe.\n", 52 | "my_data_frame = pd.DataFrame(\n", 53 | " {\n", 54 | " \"id\": [0, 0, 0, 1, 1, 1, 2, 2, 2, 2],\n", 55 | " \"ra\": [10.0, 10.0, 10.0, 15.0, 15.0, 15.0, 12.1, 12.1, 12.1, 12.1],\n", 56 | " \"dec\": [0.0, 0.0, 0.0, -1.0, -1.0, -1.0, 0.5, 0.5, 0.5, 0.5],\n", 57 | " \"time\": [60676.0, 60677.0, 60678.0, 60675.0, 60676.5, 60677.0, 60676.6, 60676.7, 60676.8, 60676.9],\n", 58 | " \"brightness\": [100.0, 101.0, 99.8, 5.0, 5.01, 4.98, 20.1, 20.5, 20.3, 20.2],\n", 59 | " }\n", 60 | ")\n", 61 | "my_data_frame" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "Note that we cannot cleanly compress this by adding more columns (such as t0, t1, and so forth), because the number of observations is not bounded and may vary from object to object.\n", 69 | "\n", 70 | "Beyond astronomical data, we might be interested in tracking patients' blood pressure over time, measuring the intensity of emitted light at different wavelengths, or storing a list of the type of rock found at different depths of core samples. In each case, it is possible to represent this data with multiple rows (such as one row for each patient + measurement pair) and associate them together by ids.\n", 71 | "\n", 72 | "Nested-pandas is designed for exactly this type of data by allowing columns to contain nested data. 
We can have regular columns with the (single) value for the objects’ unvarying characteristics (location on the sky, patient birth date, location of the core sample) and nested columns for the values of each observation.\n", 73 | "\n", 74 | "Let's see an example:" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "from nested_pandas.nestedframe import NestedFrame\n", 84 | "\n", 85 | "# Create a nested data set\n", 86 | "nf = NestedFrame.from_flat(\n", 87 | " my_data_frame,\n", 88 | " base_columns=[\"ra\", \"dec\"], # the columns not to nest\n", 89 | " nested_columns=[\"time\", \"brightness\"], # the columns to nest\n", 90 | " on=\"id\", # column used to associate rows\n", 91 | " name=\"lightcurve\", # name of the nested column\n", 92 | ")\n", 93 | "nf" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "The above dataframe is a `NestedFrame`, which extends the capabilities of the Pandas `DataFrame` to support columns with nested information. \n", 101 | "\n", 102 | "We now have the top level dataframe with 3 rows, each of which corresponds to a single object. The table has three columns beyond \"id\". Two columns, \"ra\" and \"dec\", have a single value for the object (in this case the position on the sky). The last column \"lightcurve\" contains a nested table with a series of observation times and observation brightnesses for the object. The first row of this nested table is shown along with its dimensions to give a sense of the contents of the nested data. As we will see below, this nested table allows the user to easily access all of the observations for a given object.\n", 103 | "\n", 104 | "## Accessing Nested Data\n", 105 | "\n", 106 | "We can inspect the contents of the \"lightcurve\" column using pandas API tooling like `loc`." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "nf.loc[0][\"lightcurve\"]" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "Here we see that within the \"lightcurve\" column there are tables with their own data. In this case, we have two columns (\"time\" and \"brightness\") that represent a time series of observations. \n", 123 | "\n", 124 | "Note that `loc` itself accesses the row, so the combination of `nf.loc[0][\"lightcurve\"]` means we are looking at the value in the \"lightcurve\" column for a single row (row 0). If we just use `nf.loc[0]` we would retrieve the entire row, including the nested \"lightcurve\" column and all other columns. Similarly, if we use `nf[\"lightcurve\"]` we retrieve the nested column for all rows. What makes the nesting useful is that once we access the nested entry for a specific row, we can treat the value as a table in its own right.\n", 125 | "\n", 126 | "As in Pandas, we can still access individual entries from a column based on the row index. Thus we can access the values (in a table) in row 0 of the nested column as `nf[\"lightcurve\"][0]` as well." 
127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "nf[\"lightcurve\"][0]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "We can also use dot notation to access all the values in a nested sub-column:" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "nf[\"lightcurve.time\"]" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "Note that \"lightcurve.time\" contains the time values for all rows, but also preserves the nesting information. The id column of the returned data maps the top-level row (in `nf`) to where this value resides.\n", 159 | "\n", 160 | "Similarly, we can access the values for a given top-level row by index. To get all the `time` values for row 0, we could specify:" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "nf[\"lightcurve.time\"][0]" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "Here the `[0]` is telling our nested frame to access the values of the series `nf[\"lightcurve.time\"]` where the id = 0. If we try `nf[\"lightcurve.time\"][0][0]` we again match id = 0 and return the same frame. \n", 177 | "\n", 178 | "To access a single element within the series, we need to use its location:" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "nf[\"lightcurve.time\"][0].iloc[0]" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "## Inspecting Nested Frames\n", 195 | "\n", 196 | "We can inspect the available columns using some custom properties of the `NestedFrame`." 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "# Shows which columns have nested data\n", 206 | "nf.nested_columns" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# Provides a dictionary of \"base\" (top-level) and nested column labels\n", 216 | "nf.all_columns" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "## Pandas Operations\n", 224 | "\n", 225 | "Nested-pandas extends the Pandas API, meaning any operation you could do in Pandas is available within nested-pandas. However, nested-pandas has additional functionality and tooling to better support working with nested datasets. For example, let's look at `query`:" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# Normal queries work as expected, rejecting rows from the dataframe that don't meet the criteria\n", 235 | "nf.query(\"ra > 11.2\")" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "The above query is native Pandas; however, with nested-pandas we can use hierarchical column names to extend `query` to nested layers." 
243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "# Applies the query to \"lightcurve\", filtering based on \"time > 60676.0\"\n", 252 | "nf_g = nf.query(\"lightcurve.time > 60676.0\")\n", 253 | "nf_g" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "This query does not affect the rows of the top-level dataframe, but rather applies the query to the nested \"lightcurve\" dataframes. If we look at one of them, we can see the effect of the query." 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "# All time <= 60676.0 values have been removed\n", 270 | "nf_g.loc[0][\"lightcurve\"]" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "A limited set of functions has been extended in this way so far, with the aim being to fully support this hierarchical access where applicable in the Pandas API." 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "## Reduce Function\n", 285 | "\n", 286 | "Finally, we'll end with the flexible `reduce` function. `reduce` functions similarly to Pandas' `apply` but flattens (reduces) the inputs from nested layers into array inputs to the given apply function. For example, let's find the mean brightness for each dataframe in \"lightcurve\":" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "import numpy as np\n", 296 | "\n", 297 | "# use hierarchical column names to access the brightness column\n", 298 | "# passed as an array to np.mean\n", 299 | "nf.reduce(np.mean, \"lightcurve.brightness\")" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "This can be used to apply any custom functions you need for your analysis, and just to illustrate that point further, let's define a custom function that simply returns its inputs." 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "def show_inputs(*args):\n", 316 | " return args" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "Applying some inputs via reduce, we see how it sends inputs to a given function. The output frame `nf_inputs` consists of two columns containing the values of the “ra” column and the “lightcurve.time” column." 
324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "nf_inputs = nf.reduce(show_inputs, \"ra\", \"lightcurve.time\")\n", 333 | "nf_inputs" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "nf_inputs.loc[0]" 343 | ] 344 | } 345 | ], 346 | "metadata": { 347 | "kernelspec": { 348 | "display_name": "Python 3 (ipykernel)", 349 | "language": "python", 350 | "name": "python3" 351 | }, 352 | "language_info": { 353 | "codemirror_mode": { 354 | "name": "ipython", 355 | "version": 3 356 | }, 357 | "file_extension": ".py", 358 | "mimetype": "text/x-python", 359 | "name": "python", 360 | "nbconvert_exporter": "python", 361 | "pygments_lexer": "ipython3", 362 | "version": "3.13.3" 363 | } 364 | }, 365 | "nbformat": 4, 366 | "nbformat_minor": 4 367 | } 368 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. nested_pandas documentation main file. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Nested-Pandas 7 | ============= 8 | 9 | An extension of pandas for efficient representation of nested, 10 | associated datasets. 11 | 12 | Nested-Pandas extends the `pandas <https://pandas.pydata.org>`_ package with 13 | tooling and support for nested dataframes packed into values of top-level 14 | dataframe columns. `Pyarrow <https://arrow.apache.org/docs/python/>`_ 15 | is used internally to aid in scalability and performance. 16 | 17 | Nested-Pandas allows data like this: 18 | 19 | .. image:: ./intro_images/pandas_dfs.png 20 | :width: 400 21 | :align: center 22 | :alt: pandas dataframes 23 | 24 | To instead be represented like this: 25 | 26 | .. image:: ./intro_images/nestedframe_example.png 27 | :width: 300 28 | :align: center 29 | :alt: a NestedFrame with a nested column 30 | 31 | Where the nested data is represented as nested dataframes: 32 | 33 | .. code-block:: python 34 | 35 | # Each row of "object_nf" now has its own sub-dataframe of matched rows from "source_df" 36 | object_nf.loc[0]["nested_sources"] 37 | 38 | .. image:: ./intro_images/loc_into_nested.png 39 | :width: 300 40 | :align: center 41 | :alt: a nested sub-dataframe accessed via loc 42 | 43 | Allowing powerful and straightforward operations, like: 44 | 45 | .. code-block:: python 46 | 47 | # Compute the mean flux for each row of "object_nf" 48 | import numpy as np 49 | object_nf.reduce(np.mean, "nested_sources.flux") 50 | 51 | .. image:: ./intro_images/reduce.png 52 | :width: 150 53 | :align: center 54 | :alt: the result of a reduce operation 55 | 56 | Nested-Pandas is motivated by time-domain astronomy use cases, where we 57 | typically see two levels of information: information about astronomical objects and 58 | then an associated set of `N` measurements of those objects. Nested-Pandas offers 59 | a performant and memory-efficient package for working with these types of datasets. 60 | 61 | Its core advantages are: 62 | 63 | * hierarchical column access 64 | * efficient packing of nested information into inputs to custom user functions 65 | * avoiding costly groupby operations 66 | 67 | 68 | How to Use This Guide 69 | ===================== 70 | 71 | Begin with the :doc:`Getting Started <gettingstarted>` 72 | guide to learn the basics of installation and walk through a simple example of 73 | using nested-pandas. 
74 | 75 | The :doc:`Tutorials <tutorials>` 76 | section showcases the fundamental features of nested-pandas. 77 | 78 | API-level information about nested-pandas is available in the 79 | :doc:`API Reference <reference>` 80 | section. 81 | 82 | The :doc:`About Nested-Pandas <about>` section provides information on the 83 | design and performance advantages of nested-pandas. 84 | 85 | Learn more about contributing to this repository in our :doc:`Contribution Guide <gettingstarted/contributing>`. 86 | 87 | .. toctree:: 88 | :hidden: 89 | 90 | Home page <self> 91 | Getting Started <gettingstarted> 92 | Tutorials <tutorials> 93 | API Reference <reference> 94 | About Nested-Pandas <about> 95 | -------------------------------------------------------------------------------- /docs/intro_images/loc_into_nested.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/docs/intro_images/loc_into_nested.png -------------------------------------------------------------------------------- /docs/intro_images/nestedframe_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/docs/intro_images/nestedframe_example.png -------------------------------------------------------------------------------- /docs/intro_images/pandas_dfs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/docs/intro_images/pandas_dfs.png -------------------------------------------------------------------------------- /docs/intro_images/reduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/docs/intro_images/reduce.png -------------------------------------------------------------------------------- /docs/pre_executed/performance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Performance Impact of `nested-pandas`\n", 8 | "\n", 9 | "For use cases involving nested data, `nested-pandas` can offer significant speedups compared to using the native `pandas` API. Below is a brief comparison between `pandas` and `nested-pandas` on an example workflow that calculates the amplitude of photometric fluxes after a few filtering steps." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import nested_pandas as npd\n", 19 | "import pandas as pd\n", 20 | "import light_curve as licu\n", 21 | "import numpy as np\n", 22 | "\n", 23 | "from nested_pandas.utils import count_nested" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Pandas" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 5, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "498 ms ± 3.13 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "%%timeit\n", 48 | "\n", 49 | "# Read data\n", 50 | "object_df = pd.read_parquet(\"objects.parquet\")\n", 51 | "source_df = pd.read_parquet(\"ztf_sources.parquet\")\n", 52 | "\n", 53 | "# Filter on object\n", 54 | "filtered_object = object_df.query(\"ra > 10.0\")\n", 55 | "# Sync object to source -- removes any index values of source not found in object\n", 56 | "filtered_source = filtered_object[[]].join(source_df, how=\"left\")\n", 57 | "\n", 58 | "# Count number of observations per photometric band and add it to the object table\n", 59 | "band_counts = (\n", 60 | " source_df.groupby(level=0)\n", 61 | " .apply(lambda x: x[[\"band\"]].value_counts().reset_index())\n", 62 | " .pivot_table(values=\"count\", index=\"index\", columns=\"band\", aggfunc=\"sum\")\n", 63 | ")\n", 64 | "filtered_object = filtered_object.join(band_counts[[\"g\", \"r\"]])\n", 65 | "\n", 66 | "# Filter on our number of observations\n", 67 | "filtered_object = filtered_object.query(\"g > 520\")\n", 68 | "filtered_source = filtered_object[[]].join(source_df, how=\"left\")\n", 69 | "\n", 70 | "# Calculate Amplitude\n", 71 | "amplitude = licu.Amplitude()\n", 72 | "filtered_source.groupby(level=0).apply(lambda x: amplitude(np.array(x.mjd), np.array(x.flux)))" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Nested-Pandas" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "228 ms ± 2.81 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "%%timeit\n", 97 | "\n", 98 | "# Read in parquet data\n", 99 | "# nesting sources into objects\n", 100 | "nf = npd.read_parquet(\"objects.parquet\")\n", 101 | "nf = nf.add_nested(npd.read_parquet(\"ztf_sources.parquet\"), \"ztf_sources\")\n", 102 | "\n", 103 | "# Filter on object\n", 104 | "nf = nf.query(\"ra > 10.0\")\n", 105 | "\n", 106 | "# Count number of observations per photometric band and add it as a column\n", 107 | "nf = count_nested(nf, \"ztf_sources\", by=\"band\", join=True) # use an existing utility\n", 108 | "\n", 109 | "# Filter on our number of observations\n", 110 | "nf = nf.query(\"n_ztf_sources_g > 520\")\n", 111 | "\n", 112 | "# Calculate Amplitude\n", 113 | "amplitude = licu.Amplitude()\n", 114 | "nf.reduce(amplitude, \"ztf_sources.mjd\", \"ztf_sources.flux\")" 115 | ] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "lsdb", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.11.11" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 2 139 | } 140 | -------------------------------------------------------------------------------- /docs/reference.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ======================================================================================== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | 7 | NestedFrame <reference/nestedframe> 8 | .nest Accessor <reference/accessor> 9 | Utility Functions <reference/utils> 10 | NestedDtype <reference/nesteddtype> 11 | Nested Extension Array <reference/ext_array> 12 | Packer Functions <reference/packer> -------------------------------------------------------------------------------- /docs/reference/accessor.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | .nest Series Accessor 3 | ===================== 4 | .. currentmodule:: nested_pandas 5 | 6 | Constructor 7 | ~~~~~~~~~~~ 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | NestSeriesAccessor 12 | 13 | Functions 14 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .. autosummary:: 16 | :toctree: api/ 17 | 18 | NestSeriesAccessor.to_lists 19 | NestSeriesAccessor.to_flat 20 | NestSeriesAccessor.to_flatten_inner 21 | NestSeriesAccessor.with_field 22 | NestSeriesAccessor.with_flat_field 23 | NestSeriesAccessor.with_list_field 24 | NestSeriesAccessor.with_filled_field 25 | NestSeriesAccessor.without_field 26 | NestSeriesAccessor.query_flat 27 | NestSeriesAccessor.get_flat_index 28 | NestSeriesAccessor.get_flat_series 29 | NestSeriesAccessor.get_list_series 30 | -------------------------------------------------------------------------------- /docs/reference/ext_array.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | NestedExtensionArray 3 | ==================== 4 | .. currentmodule:: nested_pandas 5 | 6 | Constructor 7 | ~~~~~~~~~~~ 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | series.ext_array.NestedExtensionArray 12 | 13 | Functions 14 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .. autosummary:: 16 | :toctree: api/ 17 | 18 | series.ext_array.NestedExtensionArray.dtype 19 | series.ext_array.NestedExtensionArray.nbytes 20 | series.ext_array.NestedExtensionArray.list_array 21 | series.ext_array.NestedExtensionArray.struct_array 22 | series.ext_array.NestedExtensionArray.py_table 23 | series.ext_array.NestedExtensionArray.list_offsets 24 | series.ext_array.NestedExtensionArray.field_names 25 | series.ext_array.NestedExtensionArray.list_lengths 26 | series.ext_array.NestedExtensionArray.flat_length 27 | series.ext_array.NestedExtensionArray.num_chunks 28 | series.ext_array.NestedExtensionArray.to_numpy 29 | series.ext_array.NestedExtensionArray.isna 30 | series.ext_array.NestedExtensionArray.take 31 | series.ext_array.NestedExtensionArray.copy 32 | series.ext_array.NestedExtensionArray.equals 33 | series.ext_array.NestedExtensionArray.dropna 34 | series.ext_array.NestedExtensionArray.from_sequence 35 | series.ext_array.NestedExtensionArray.to_arrow_ext_array 36 | series.ext_array.NestedExtensionArray.to_pyarrow_scalar 37 | series.ext_array.NestedExtensionArray.get_list_index 38 | series.ext_array.NestedExtensionArray.iter_field_lists 39 | series.ext_array.NestedExtensionArray.view_fields 40 | series.ext_array.NestedExtensionArray.set_flat_field 41 | series.ext_array.NestedExtensionArray.set_list_field 42 | series.ext_array.NestedExtensionArray.fill_field_lists 43 | series.ext_array.NestedExtensionArray.pop_fields -------------------------------------------------------------------------------- /docs/reference/nesteddtype.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | NestedDtype 3 | =========== 4 | .. currentmodule:: nested_pandas 5 | 6 | Constructor 7 | ~~~~~~~~~~~ 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | NestedDtype 12 | 13 | Functions 14 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .. 
autosummary:: 16 | :toctree: api/ 17 | 18 | NestedDtype.construct_array_type 19 | NestedDtype.construct_from_string 20 | NestedDtype.from_fields 21 | NestedDtype.from_pandas_arrow_dtype 22 | NestedDtype.to_pandas_arrow_dtype -------------------------------------------------------------------------------- /docs/reference/nestedframe.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | NestedFrame 3 | =========== 4 | .. currentmodule:: nested_pandas 5 | 6 | Constructor 7 | ~~~~~~~~~~~ 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | NestedFrame 12 | 13 | Nesting 14 | ~~~~~~~~~ 15 | .. autosummary:: 16 | :toctree: api/ 17 | 18 | NestedFrame.add_nested 19 | NestedFrame.nest_lists 20 | NestedFrame.from_flat 21 | NestedFrame.from_lists 22 | 23 | Extended Pandas.DataFrame Interface 24 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 25 | 26 | .. note:: 27 | The NestedFrame extends the Pandas.DataFrame interface, so all methods 28 | of Pandas.DataFrame are available. The following methods are extended 29 | to support NestedFrame functionality. Please reference the Pandas 30 | documentation for more information. 31 | https://pandas.pydata.org/docs/reference/frame.html 32 | 33 | .. autosummary:: 34 | :toctree: api/ 35 | 36 | NestedFrame.eval 37 | NestedFrame.query 38 | NestedFrame.dropna 39 | NestedFrame.sort_values 40 | NestedFrame.reduce 41 | NestedFrame.drop 42 | 43 | I/O 44 | ~~~~~~~~~ 45 | .. autosummary:: 46 | :toctree: api/ 47 | 48 | NestedFrame.to_parquet 49 | read_parquet -------------------------------------------------------------------------------- /docs/reference/packer.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Packer 3 | ========= 4 | .. currentmodule:: nested_pandas 5 | 6 | Functions 7 | ~~~~~~~~~ 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | series.packer.pack 12 | series.packer.pack_flat 13 | series.packer.pack_seq 14 | series.packer.pack_lists -------------------------------------------------------------------------------- /docs/reference/utils.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Utility Functions 3 | ================= 4 | .. currentmodule:: nested_pandas 5 | 6 | NestedFrame Utilities 7 | ~~~~~~~~~~~~~~~~~~~~~ 8 | .. autosummary:: 9 | :toctree: api/ 10 | 11 | utils.count_nested 12 | 13 | Generating Toy Datasets 14 | ~~~~~~~~~~~~~~~~~~~~~~~ 15 | .. autosummary:: 16 | :toctree: api/ 17 | 18 | datasets.generation.generate_data 19 | datasets.generation.generate_parquet_file -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | ipykernel 3 | ipython 4 | jupytext 5 | nbconvert 6 | nbsphinx 7 | sphinx 8 | sphinx-autoapi 9 | sphinx-copybutton 10 | sphinx-book-theme 11 | astroquery 12 | astropy 13 | matplotlib 14 | light-curve -------------------------------------------------------------------------------- /docs/tutorials.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ======================================================================================== 3 | 4 | .. 
toctree:: 5 | 6 | Loading Data into Nested-Pandas <tutorials/data_loading_notebook> 7 | Fine Data Manipulation with Nested-Pandas <tutorials/data_manipulation> 8 | Lower-level interfaces <tutorials/low_level> 9 | Using Nested-Pandas with Astronomical Spectra <tutorials/nested_spectra> 10 | -------------------------------------------------------------------------------- /docs/tutorials/README.md: -------------------------------------------------------------------------------- 1 | Put your Jupyter notebooks here :) 2 | -------------------------------------------------------------------------------- /docs/tutorials/data_loading_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Loading Data into Nested-Pandas\n", 8 | "\n", 9 | "This notebook provides a brief introduction to loading data into nested-pandas or converting data into a nested structure. For an introduction to nested-pandas, see the quick start tutorial or the [readthedocs page](https://nested-pandas.readthedocs.io/en/latest/).\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Installation and Imports\n", 17 | "\n", 18 | "With a valid Python environment, nested-pandas and its dependencies are easy to install using the `pip` package manager. The following command can be used to install it:" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# % pip install nested-pandas" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import os\n", 37 | "import tempfile\n", 38 | "\n", 39 | "import pandas as pd\n", 40 | "\n", 41 | "from nested_pandas import NestedFrame, read_parquet\n", 42 | "from nested_pandas.datasets import generate_parquet_file" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# Overview\n", 50 | "\n", 51 | "Nested-pandas provides multiple mechanisms for loading data or converting data to the nested format. Below we walk through some of the common approaches.\n", 52 | "\n", 53 | "# Converting Flat Data\n", 54 | "\n", 55 | "Commonly, existing data sets will be provided in “flat” data structures such as dictionaries or Pandas DataFrames. In these cases, the data consists of a rectangular table where each row represents an instance or observation. Multiple instances of the same top-level item are linked together through an ID. All rows with the same ID correspond to the same object/item.\n", 56 | "\n", 57 | "We define one such flat dataframe consisting of 10 rows for 3 distinct items." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "flat_df = pd.DataFrame(\n", 67 | " data={\n", 68 | " \"a\": [1, 1, 1, 2, 2, 2, 3, 3, 3, 3],\n", 69 | " \"b\": [2, 2, 2, 4, 4, 4, 6, 6, 6, 6],\n", 70 | " \"c\": [0, 2, 4, 1, 4, 3, 1, 4, 1, 1],\n", 71 | " \"d\": [5, 4, 7, 5, 3, 1, 9, 3, 4, 1],\n", 72 | " },\n", 73 | " index=[0, 0, 0, 1, 1, 1, 2, 2, 2, 2],\n", 74 | ")\n", 75 | "flat_df" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "The index (leftmost column) provides the object id. As we can see, there are three rows with ID=0, three rows with ID=1, and four rows with ID=2. Some of the values are constant for each item. For example, both columns “a” and “b” take a single value for each object. 
We are wasting space by repeating them in every row. Other values are different per row (columns “c” and “d”).\n", 83 | "\n", 84 | "As a concrete example, consider patient records. Each patient is assigned a unique id and has static data such as a date of birth. They also have measurements that are new with every trip to the doctor, such as blood pressure or temperature." 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Converting from Flat Pandas\n", 92 | "\n", 93 | "The easiest approach to converting the flat table above into a nested structure is to use `NestedFrame.from_flat()`. This function takes\n", 94 | " * a list of columns that are not nested (base_columns)\n", 95 | " * a list of columns to nest (nested_columns)\n", 96 | " * the name of the nested column (name)\n\n", 97 | "Rows are associated using the index by default, but a column name on which to join can also be provided." 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "nf = NestedFrame.from_flat(\n", 107 | " flat_df,\n", 108 | " base_columns=[\"a\", \"b\"], # the columns not to nest\n", 109 | " nested_columns=[\"c\", \"d\"], # the columns to nest\n", 110 | " name=\"nested\", # name of the nested column\n", 111 | ")\n", 112 | "nf" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "## Inserting Nested Rows\n", 120 | "\n", 121 | "Alternatively, we can use the `NestedFrame` constructor to create our base frame from a dictionary of our columns (as we would do with a normal pandas DataFrame). This defines the top-level objects and the values that are constant across rows (\"a\" and \"b\")." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "nf = NestedFrame(\n", 131 | " data={\n", 132 | " \"a\": [1, 2, 3],\n", 133 | " \"b\": [2, 4, 6],\n", 134 | " },\n", 135 | " index=[0, 1, 2],\n", 136 | ")\n", 137 | "nf" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "We can then create additional pandas dataframes for the nested columns and pack them into our `NestedFrame` with the `NestedFrame.add_nested()` function. `add_nested` will align the nested data on the index by default (a column may be selected instead via the `on` kwarg); as we see, the `nested` `DataFrame` has a repeated index corresponding to the `nf` `NestedFrame`." 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "nested = pd.DataFrame(\n", 154 | " data={\n", 155 | " \"c\": [0, 2, 4, 1, 4, 3, 1, 4, 1, 1],\n", 156 | " \"d\": [5, 4, 7, 5, 3, 1, 9, 3, 4, 1],\n", 157 | " },\n", 158 | " index=[0, 0, 0, 1, 1, 1, 2, 2, 2, 2],\n", 159 | ")\n", 160 | "\n", 161 | "nf = nf.add_nested(nested, \"nested\")\n", 162 | "nf" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "The \"index\" parameter is used to perform the association. All of the values for index=0 are bundled together into a sub-table and stored in row 0's \"nested\" column." 
170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "nf.loc[0][\"nested\"]" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "We could add other nested columns by creating new sub-tables and adding them with `add_nested()`. Note that while the tables added with each `add_nested()` must be rectangular, they do not need to have the same dimensions between calls. We could add another nested column with a different number of observations per row." 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "nested = pd.DataFrame(\n", 195 | " data={\n", 196 | " \"c\": [0, 1, 0, 1, 2, 0],\n", 197 | " \"d\": [5, 4, 5, 4, 3, 5],\n", 198 | " },\n", 199 | " index=[0, 0, 1, 1, 1, 2],\n", 200 | ")\n", 201 | "\n", 202 | "nf = nf.add_nested(nested, \"nested2\")\n", 203 | "nf" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "# Loading Data from Parquet Files" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "For larger datasets, we support loading data from parquet files. In the following cell, we generate a temporary parquet file with random data and ingest it with the `read_parquet` function:" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "# Note that we use the `tempfile` module to create and then clean up a temporary directory.\n", 227 | "# You can of course remove this and use your own directory and real files on your system.\n", 228 | "with tempfile.TemporaryDirectory() as temp_path:\n", 229 | " # Generate a parquet file with random data within our temporary directory\n", 230 | " generate_parquet_file(10, {\"nested1\": 100, \"nested2\": 10}, os.path.join(temp_path, \"test.parquet\"))\n", 231 | "\n", 232 | " # Read the parquet file to a NestedFrame\n", 233 | " nf = read_parquet(os.path.join(temp_path, \"test.parquet\"))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "Nested-Pandas nested columns are compatible with the parquet format, meaning they can be written to and read from parquet natively." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "nf # nf contains nested columns" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "# Saving NestedFrames to Parquet Files\n", 257 | "\n", 258 | "Additionally, we can save an existing `NestedFrame` as a parquet file using `NestedFrame.to_parquet`.\n", 259 | "\n", 260 | ">Note: Nested-Pandas converts any nested columns to pyarrow datatypes when writing to parquet, meaning that parquet files with nested columns can be read by parquet readers from other packages, so long as they understand pyarrow dtypes." 
261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "# Note that we use the `tempfile` module to create and then clean up a temporary directory.\n", 270 | "# You can of course remove this and use your own directory and real files on your system.\n", 271 | "with tempfile.TemporaryDirectory() as temp_path:\n", 272 | " nf.to_parquet(\n", 273 | " os.path.join(temp_path, \"output.parquet\"), # The output file path\n", 274 | " )\n", 275 | "\n", 276 | " # List the files in temp_path to ensure they were saved correctly.\n", 277 | " print(\"The NestedFrame was saved to the following parquet files:\", os.listdir(temp_path))" 278 | ] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "lsdb", 284 | "language": "python", 285 | "name": "python3" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython3", 297 | "version": "3.12.8" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 2 302 | } 303 | -------------------------------------------------------------------------------- /docs/tutorials/data_manipulation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Fine Data Manipulation with Nested-Pandas" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This tutorial will briefly showcase how one would perform data manipulation operations from `pandas`, like adding columns, replacing values, etc., with `nested-pandas`." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "ExecuteTime": { 22 | "end_time": "2025-03-05T23:08:41.890895Z", 23 | "start_time": "2025-03-05T23:08:41.872743Z" 24 | } 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import nested_pandas as npd\n", 29 | "from nested_pandas.datasets import generate_data\n", 30 | "\n", 31 | "# Begin by generating an example dataset\n", 32 | "ndf = generate_data(5, 20, seed=1)\n", 33 | "ndf" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "ExecuteTime": { 41 | "end_time": "2025-03-05T23:08:41.907431Z", 42 | "start_time": "2025-03-05T23:08:41.902080Z" 43 | } 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# Show one of the nested dataframes\n", 48 | "ndf.iloc[0].nested" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## Nested Column Selection" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "First, we can directly fetch a column from our nested column (aptly called \"nested\"). For example, below we can fetch the time column, \"t\", by specifying `\"nested.t\"` as the column to retrieve. This returns a \"flat\" view of the nested `t` column, where all rows from all dataframes are present in one dataframe." 
63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "ExecuteTime": { 70 | "end_time": "2025-03-05T23:08:41.933782Z", 71 | "start_time": "2025-03-05T23:08:41.930296Z" 72 | } 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "# Directly Nested Column Selection\n", 77 | "ndf[\"nested.t\"]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "The advantage of the flat view is that it can be manipulated just like any `pandas.Series` object. " 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "ExecuteTime": { 92 | "end_time": "2025-03-05T23:08:41.956770Z", 93 | "start_time": "2025-03-05T23:08:41.953485Z" 94 | } 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "ndf[\"nested.t\"] + 100" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "## Adding or Replacing Nested Columns" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "> *A Note on Performance: These operations involve full reconstruction of the nested columns, so expect reduced performance when doing this at scale. It may be appropriate to do these operations within reduce functions directly (e.g. subtracting a value from a column) if performance is key.*" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "We can use the \"base_column.nested_sub_column\" syntax to also perform operations that add new columns or replace existing columns for a nested column. For example, we can directly replace the \"band\" column with a new column that prepends an additional string to the values." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "ExecuteTime": { 127 | "end_time": "2025-03-05T23:08:41.992618Z", 128 | "start_time": "2025-03-05T23:08:41.987910Z" 129 | } 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "# prepend lsst_ to the band column\n", 134 | "\n", 135 | "ndf[\"nested.band\"] = \"lsst_\" + ndf[\"nested.band\"]\n", 136 | "\n", 137 | "ndf[\"nested.band\"]" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "Next, we can create a new column in the \"nested\" column. For example, we can subtract a value from each time entry and return the result as a new column." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "ExecuteTime": { 152 | "end_time": "2025-03-05T23:08:42.016312Z", 153 | "start_time": "2025-03-05T23:08:42.012009Z" 154 | } 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "# create a new \"corrected_t\" column in \"nested\"\n", 159 | "\n", 160 | "ndf[\"nested.corrected_t\"] = ndf[\"nested.t\"] - 5\n", 161 | "\n", 162 | "ndf[\"nested.corrected_t\"]" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "ExecuteTime": { 170 | "end_time": "2025-03-05T23:08:42.037065Z", 171 | "start_time": "2025-03-05T23:08:42.032519Z" 172 | } 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "# Show the first dataframe again\n", 177 | "ndf.iloc[0].nested" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## Adding New Nested Structures" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "Finally, we can also add entirely new nested structures using the above syntax." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "ExecuteTime": { 199 | "end_time": "2025-03-05T23:08:42.075674Z", 200 | "start_time": "2025-03-05T23:08:42.061111Z" 201 | } 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "ndf[\"bands.band_label\"] = ndf[\"nested.band\"]\n", 206 | "ndf" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "This is functionally equivalent to using `add_nested`:" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "ExecuteTime": { 221 | "end_time": "2025-03-05T23:08:42.132918Z", 222 | "start_time": "2025-03-05T23:08:42.114796Z" 223 | } 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "ndf.add_nested(ndf[\"nested.band\"].to_frame(), \"bands_from_add_nested\")" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "## Embedding a \"base\" column into a nested column" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "We can also assign some \"base\" (non-nested) column to a nested column, which will be broadcast to all nested dataframes, with the values repeated." 
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "ExecuteTime": { 249 | "end_time": "2025-03-05T23:08:42.165933Z", 250 | "start_time": "2025-03-05T23:08:42.161684Z" 251 | } 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "ndf[\"nested.a\"] = ndf[\"a\"]\n", 256 | "ndf[\"nested.a\"]" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "Or we can perform some operations on the base columns first:" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "ExecuteTime": { 271 | "end_time": "2025-03-05T23:08:42.266923Z", 272 | "start_time": "2025-03-05T23:08:42.262281Z" 273 | } 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "ndf[\"nested.ab\"] = ndf[\"a\"] + ndf[\"b\"] * 2\n", 278 | "ndf[\"nested.ab\"]" 279 | ] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "Python 3", 285 | "language": "python", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 3 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython3", 298 | "version": "3.10.11" 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 2 303 | } 304 | -------------------------------------------------------------------------------- /docs/tutorials/nested_spectra.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using Nested-Pandas with Astronomical Spectra" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In astronomy, a spectrum is a measurement (or combination of measurements) of an object that shows the intensity of light emitted over a range of energies. In this tutorial, we'll walk through a simple example of working with spectra from the Sloan Digital Sky Survey (SDSS), in particular showing how they can be represented as a `NestedFrame`." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "First, we'll use `astroquery` and `astropy` to download a handful of spectra from SDSS:" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from astroquery.sdss import SDSS\n", 31 | "from astropy import coordinates as coords\n", 32 | "import astropy.units as u\n", 33 | "import nested_pandas as npd\n", 34 | "\n", 35 | "# Query SDSS for a set of objects with spectra\n", 36 | "pos = coords.SkyCoord(\"0h8m10.63s +14d50m23.3s\", frame=\"icrs\")\n", 37 | "xid = SDSS.query_region(pos, radius=3 * u.arcmin, spectro=True)\n", 38 | "xid_ndf = npd.NestedFrame(xid.to_pandas())\n", 39 | "xid_ndf" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "This initial query returns a set of objects with spectra (as specified by the `spectro=True` flag). 
To actually retrieve the spectra, we can do the following:" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# Query SDSS for the corresponding spectra\n", 56 | "SDSS.clear_cache()\n", 57 | "sp = SDSS.get_spectra(matches=xid)\n", 58 | "sp" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "The result is a list of FITS-formatted data. From this point there are a few ways that we could move towards a nested-pandas representation. The most straightforward approach is to build a \"flat\" spectra table from all the objects, where we gather the information from each spectrum into a single combined table." 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "import numpy as np\n", 75 | "\n", 76 | "# Build a flat spectrum dataframe\n", 77 | "\n", 78 | "# Initialize some empty arrays to hold the flat data\n", 79 | "wave = np.array([])\n", 80 | "flux = np.array([])\n", 81 | "err = np.array([])\n", 82 | "index = np.array([])\n", 83 | "# Loop over each spectrum, adding its data to the arrays\n", 84 | "for i, hdu in enumerate(sp):\n", 85 | " wave = np.append(wave, 10 ** hdu[\"COADD\"].data.loglam) # * u.angstrom\n", 86 | " flux = np.append(flux, hdu[\"COADD\"].data.flux * 1e-17) # * u.erg/u.second/u.centimeter**2/u.angstrom\n", 87 | " err = np.append(err, 1 / np.sqrt(hdu[\"COADD\"].data.ivar) * 1e-17) # ivar is inverse variance, so sigma = ivar**-0.5 # * flux.unit\n", 88 | "\n", 89 | " # We'll need to set an index to keep track of which rows correspond\n", 90 | " # to which object\n", 91 | " index = np.append(index, i * np.ones(len(hdu[\"COADD\"].data.loglam)))\n", 92 | "\n", 93 | "# Build a NestedFrame from the arrays\n", 94 | "flat_spec = npd.NestedFrame(dict(wave=wave, flux=flux, err=err), index=index.astype(np.int8))\n", 95 | "flat_spec" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "From here, we can simply nest our flat table within our original query result:" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "spec_ndf = xid_ndf.add_nested(flat_spec, \"coadd_spectrum\").set_index(\"objid\")\n", 112 | "spec_ndf" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "And we can see that each object now has the `coadd_spectrum` nested column with the full spectrum available."
120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# Look at one of the spectra\n", 129 | "spec_ndf.iloc[1].coadd_spectrum" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "We now have our spectra nested, and can proceed to do any filtering and analysis as normal within nested-pandas.\n" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "import matplotlib.pyplot as plt\n", 146 | "\n", 147 | "# Plot a spectrum\n", 148 | "spec = spec_ndf.iloc[1].coadd_spectrum\n", 149 | "\n", 150 | "plt.plot(spec[\"wave\"], spec[\"flux\"])\n", 151 | "plt.xlabel(\"Wavelength (Å)\")\n", 152 | "plt.ylabel(r\"Flux ($ergs/s/cm^2/Å$)\")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "lsdb", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.12.8" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 4 184 | } 185 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | [project] 3 | name = "nested-pandas" 4 | license = {file = "LICENSE"} 5 | description = "An extension of pandas for efficient representation of nested associated datasets." 6 | readme = "README.md" 7 | authors = [ 8 | { name = "LINCC Frameworks", email = "brantd@uw.edu" } 9 | ] 10 | classifiers = [ 11 | "Development Status :: 4 - Beta", 12 | "License :: OSI Approved :: MIT License", 13 | "Intended Audience :: Developers", 14 | "Intended Audience :: Science/Research", 15 | "Operating System :: OS Independent", 16 | "Programming Language :: Python", 17 | ] 18 | dynamic = ["version"] 19 | requires-python = ">=3.10" 20 | dependencies = [ 21 | "numpy>=2", 22 | # We use internal pd._libs.missing and experimental ArrowExtensionArray 23 | "pandas>=2.2.3,<2.4", 24 | "pyarrow>=18", 25 | "universal_pathlib>=0.2", 26 | ] 27 | 28 | [project.urls] 29 | "Source Code" = "https://github.com/lincc-frameworks/nested-pandas" 30 | 31 | # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes) 32 | [project.optional-dependencies] 33 | dev = [ 34 | "asv==0.6.4", # Used to compute performance benchmarks 35 | "jupyter", # Clears output from Jupyter notebooks 36 | "mypy", # Used for static type checking of files 37 | "pre-commit", # Used to run checks before finalizing a git commit 38 | "pytest", 39 | "pytest-cov", # Used to report total code coverage 40 | "ruff", # Used for static linting of files 41 | "aiohttp", 42 | "requests", 43 | ] 44 | 45 | [build-system] 46 | requires = [ 47 | "setuptools>=62", # Used to build and package the Python project 48 | "setuptools_scm>=6.2", # Gets release version from git. 
Makes it available programmatically 49 | ] 50 | build-backend = "setuptools.build_meta" 51 | 52 | [tool.setuptools_scm] 53 | write_to = "src/nested_pandas/_version.py" 54 | 55 | [tool.pytest.ini_options] 56 | testpaths = [ 57 | "tests", 58 | "src", 59 | "docs", 60 | ] 61 | addopts = "--doctest-modules --doctest-glob=*.rst" 62 | 63 | [tool.ruff] 64 | line-length = 110 65 | target-version = "py310" 66 | [tool.ruff.lint] 67 | select = [ 68 | # pycodestyle 69 | "E", 70 | "W", 71 | # Pyflakes 72 | "F", 73 | # pep8-naming 74 | "N", 75 | # pyupgrade 76 | "UP", 77 | # flake8-bugbear 78 | "B", 79 | # flake8-simplify 80 | "SIM", 81 | # isort 82 | "I", 83 | # docstrings 84 | "D101", 85 | "D102", 86 | "D103", 87 | "D106", 88 | "D206", 89 | "D207", 90 | "D208", 91 | "D300", 92 | "D417", 93 | "D419", 94 | # Numpy v2.0 compatibility 95 | "NPY201", 96 | ] 97 | ignore = [ 98 | "UP006", # Allow non standard library generics in type hints 99 | "UP007", # Allow Union in type hints 100 | "SIM114", # Allow if with same arms 101 | "B028", # Allow default warning level 102 | "SIM117", # Allow nested with 103 | "UP015", # Allow redundant open parameters 104 | "UP028", # Allow yield in for loop 105 | ] 106 | 107 | [tool.setuptools.package-data] 108 | nested_pandas = ["py.typed"] 109 | 110 | [tool.coverage.run] 111 | omit=["src/nested_pandas/_version.py"] 112 | -------------------------------------------------------------------------------- /src/nested_pandas/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import __version__ # noqa 2 | from .nestedframe import NestedFrame 3 | from .nestedframe.io import read_parquet 4 | 5 | # Import for registering 6 | from .series.accessor import NestSeriesAccessor # noqa: F401 7 | from .series.dtype import NestedDtype 8 | 9 | 10 | __all__ = ["NestedDtype", "NestedFrame", "read_parquet"] 11 | -------------------------------------------------------------------------------- /src/nested_pandas/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .generation import * # noqa 2 | -------------------------------------------------------------------------------- /src/nested_pandas/datasets/generation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from nested_pandas import NestedFrame 4 | 5 | 6 | def generate_data(n_base, n_layer, seed=None) -> NestedFrame: 7 | """Generates a toy dataset. 8 | 9 | Parameters 10 | ---------- 11 | n_base : int 12 | The number of rows to generate for the base layer 13 | n_layer : int, or dict 14 | The number of rows per n_base row to generate for a nested layer. 15 | Alternatively, a dictionary of layer label, layer_size pairs may be 16 | specified to create multiple nested columns with custom sizing. 17 | seed : int 18 | A seed to use for random generation of data 19 | 20 | Returns 21 | ------- 22 | NestedFrame 23 | The constructed NestedFrame. 
24 | 25 | Examples 26 | -------- 27 | >>> from nested_pandas.datasets import generate_data 28 | >>> nf1 = generate_data(10, 100) 29 | >>> nf2 = generate_data(10, {"nested_a": 100, "nested_b": 200}) 30 | """ 31 | # use provided seed, "None" acts as if no seed is provided 32 | randomstate = np.random.RandomState(seed=seed) 33 | 34 | # Generate base data 35 | base_data = {"a": randomstate.random(n_base), "b": randomstate.random(n_base) * 2} 36 | base_nf = NestedFrame(data=base_data) 37 | 38 | # In case of int, create a single nested layer called "nested" 39 | if isinstance(n_layer, int): 40 | n_layer = {"nested": n_layer} 41 | 42 | # It should be a dictionary 43 | if isinstance(n_layer, dict): 44 | for key in n_layer: 45 | layer_size = n_layer[key] 46 | layer_data = { 47 | "t": randomstate.random(layer_size * n_base) * 20, 48 | "flux": randomstate.random(layer_size * n_base) * 100, 49 | "band": randomstate.choice(["r", "g"], size=layer_size * n_base), 50 | "index": np.arange(layer_size * n_base) % n_base, 51 | } 52 | layer_nf = NestedFrame(data=layer_data).set_index("index") 53 | base_nf = base_nf.add_nested(layer_nf, key) 54 | return base_nf 55 | else: 56 | raise TypeError("Input to n_layer is not an int or dict.") 57 | 58 | 59 | def generate_parquet_file(n_base, n_layer, path, seed=None): 60 | """Generates a toy dataset and outputs it as a parquet file. 61 | 62 | Parameters 63 | ---------- 64 | n_base : int 65 | The number of rows to generate for the base layer 66 | n_layer : int, or dict 67 | The number of rows per n_base row to generate for a nested layer. 68 | Alternatively, a dictionary of layer label, layer_size pairs may be 69 | specified to create multiple nested columns with custom sizing. 70 | path : str 71 | The path to the parquet file to write. 72 | seed : int, default=None 73 | A seed to use for random generation of data 74 | 75 | Returns 76 | ------- 77 | None 78 | """ 79 | nf = generate_data(n_base, n_layer, seed) 80 | nf.to_parquet(path) 81 | -------------------------------------------------------------------------------- /src/nested_pandas/nestedframe/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import NestedFrame # noqa 2 | from .io import read_parquet # noqa 3 | -------------------------------------------------------------------------------- /src/nested_pandas/nestedframe/expr.py: -------------------------------------------------------------------------------- 1 | """Utilities used by NestedFrame.query() and .eval()""" 2 | 3 | # typing.Self and "|" union syntax don't exist in Python 3.9 4 | from __future__ import annotations 5 | 6 | import ast 7 | import re 8 | from typing import TYPE_CHECKING 9 | 10 | import pandas as pd 11 | from pandas.core.computation import ops 12 | from pandas.core.computation.expr import PARSERS, PandasExprVisitor 13 | from pandas.core.computation.parsing import clean_column_name 14 | 15 | # Avoid cyclic import 16 | if TYPE_CHECKING: 17 | from nested_pandas import NestedFrame 18 | 19 | # Used to identify backtick-protected names in the expressions 20 | # used in NestedFrame.eval() and NestedFrame.query(). 21 | _backtick_protected_names = re.compile(r"`[^`]+`", re.MULTILINE) 22 | 23 | 24 | class NestedPandasExprVisitor(PandasExprVisitor): 25 | """ 26 | Custom expression visitor for NestedFrame evaluations, which may assign to 27 | nested columns. 
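For example (a sketch; the column names are hypothetical), evaluating ``nested.c = nested.a + 1`` with this visitor assigns a new sub-column "c" within the nest "nested", rather than creating a top-level column.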
28 | """ 29 | 30 | def visit_Assign(self, node, **kwargs): # noqa: N802 31 | """ 32 | Visit an assignment node, which may assign to a nested column. 33 | """ 34 | if not isinstance(node.targets[0], ast.Attribute): 35 | # If the target is not an attribute, then it's a simple assignment as usual 36 | return super().visit_Assign(node) 37 | target = node.targets[0] 38 | if not isinstance(target.value, ast.Name): 39 | raise ValueError("Assignments to nested columns must be of the form `nested.col = ...`") 40 | # target.value.id will be the name of the nest, target.attr is the column name. 41 | # Describing the proper target for the assigner is enough for both overwrite and 42 | # creation of new columns. The assigner will be a string like "nested.col". 43 | # This works both for the creation of new nest members and new nests. 44 | self.assigner = f"{target.value.id}.{target.attr}" 45 | # Continue visiting. 46 | return self.visit(node.value, **kwargs) 47 | 48 | 49 | PARSERS["nested-pandas"] = NestedPandasExprVisitor 50 | 51 | 52 | class _SeriesFromNest(pd.Series): 53 | """ 54 | Series that were unpacked from a nest. 55 | """ 56 | 57 | _metadata = ["nest_name", "flat_nest"] 58 | 59 | @property 60 | def _constructor(self) -> Self: # type: ignore[name-defined] # noqa: F821 61 | return _SeriesFromNest 62 | 63 | @property 64 | def _constructor_expanddim(self) -> Self: # type: ignore[name-defined] # noqa: F821 65 | # Avoid cyclic import 66 | from nested_pandas import NestedFrame 67 | 68 | return NestedFrame 69 | 70 | # https://pandas.pydata.org/docs/development/extending.html#arithmetic-with-3rd-party-types 71 | # The __pandas_priority__ of Series is 3000, so give _SeriesFromNest a 72 | # higher priority, so that binary operations involving this class and 73 | # Series produce instances of this class, preserving the type and origin. 74 | __pandas_priority__ = 3500 75 | 76 | 77 | class _NestResolver(dict): 78 | """ 79 | Used by NestedFrame.eval to resolve the names of nests at the top level. 80 | While the resolver is normally a dictionary, with values that are fixed 81 | upon entering evaluation, this object needs to be dynamic so that it can 82 | support multi-line expressions, where new nests may be created during 83 | evaluation. 84 | """ 85 | 86 | def __init__(self, outer: NestedFrame): 87 | self._outer = outer 88 | super().__init__() 89 | # Pre-load the field resolvers for all columns which are known at present. 90 | for column in outer.nested_columns: 91 | self._initialize_column_resolver(column, outer) 92 | 93 | def _initialize_column_resolver(self, column: str, outer: NestedFrame): 94 | """ 95 | Initialize a resolver for the given nested column, and also an alias 96 | for it, in the case of column names that have spaces or are otherwise 97 | not identifier-like. 98 | """ 99 | super().__setitem__(column, _NestedFieldResolver(column, outer)) 100 | clean_id = clean_column_name(column) 101 | # And once more for the cleaned name, if it's different. 102 | # This allows us to capture references to it from the Pandas evaluator. 103 | if clean_id != column: 104 | super().__setitem__(clean_id, _NestedFieldResolver(column, outer)) 105 | 106 | def __contains__(self, item): 107 | top_nest = item if "." not in item else item.split(".")[0].strip() 108 | return top_nest in self._outer.nested_columns 109 | 110 | def __getitem__(self, item): 111 | top_nest = item if "." 
not in item else item.split(".")[0].strip() 112 | if not super().__contains__(top_nest): 113 | if top_nest not in self._outer.nested_columns: 114 | raise KeyError(f"Unknown nest {top_nest}") 115 | self._initialize_column_resolver(top_nest, self._outer) 116 | return super().__getitem__(top_nest) 117 | 118 | def __setitem__(self, item, _): 119 | # Called to update the resolver with intermediate values. 120 | # The important point is to intercept the call so that the evaluator 121 | # does not create any new resolvers on the fly. We do NOT want to 122 | # store the given value, since the resolver does lazy-loading. 123 | # What we DO want to do, however, is to invalidate the cache for 124 | # any field resolver for a given nest that is receiving an assignment. 125 | # Since the resolvers are created as-needed in __getitem__, all we need 126 | # to do is delete them from the local cache when this pattern is detected. 127 | if "." in item: 128 | top_nest = item.split(".")[0].strip() 129 | if top_nest in self._outer.nested_columns and super().__contains__(top_nest): 130 | del self[top_nest] # force re-creation in __getitem__ 131 | 132 | 133 | class _NestedFieldResolver: 134 | """ 135 | Used by NestedFrame.eval to resolve the names of fields in nested columns when 136 | encountered in expressions, interpreting __getattr__ in terms of a 137 | specific nest. 138 | """ 139 | 140 | def __init__(self, nest_name: str, outer: NestedFrame): 141 | self._nest_name = nest_name 142 | # Save the outer frame with an eye toward repacking. 143 | self._outer = outer 144 | # Flattened only once for every access of this particular nest 145 | # within the expression. 146 | self._flat_nest = outer[nest_name].nest.to_flat() 147 | # Save aliases to any columns that are not identifier-like. 148 | # If our given frame has aliases for identifiers, use these instead 149 | # of generating our own. 150 | self._aliases = getattr(outer, "_aliases", None) 151 | if self._aliases is None: 152 | self._aliases = {} 153 | for column in self._flat_nest.columns: 154 | clean_id = clean_column_name(column) 155 | if clean_id != column: 156 | self._aliases[clean_id] = column 157 | 158 | def __getattr__(self, item_name: str): 159 | if self._aliases: 160 | item_name = self._aliases.get(item_name, item_name) 161 | if item_name in self._flat_nest: 162 | result = _SeriesFromNest(self._flat_nest[item_name]) 163 | # Assigning these properties directly in order to avoid any complication 164 | # or interference with the inherited pd.Series constructor. 165 | result.nest_name = self._nest_name 166 | result.flat_nest = self._flat_nest 167 | return result 168 | raise AttributeError(f"No attribute {item_name}") 169 | 170 | 171 | def _subexprs_by_nest(parents: list, node) -> dict[str, list]: 172 | """ 173 | Given an expression which contains references to both base and nested 174 | columns, return a dictionary of the sub-expressions that should be 175 | evaluated independently, keyed by nesting context. 176 | 177 | The key of the dictionary is the name of the nested column, and will 178 | be a blank string in the case of base columns. The value is a list 179 | of the parent nodes that lead to sub-expressions that can be evaluated 180 | successfully. 181 | 182 | While this is not in use today for automatically splitting expressions, 183 | it can be used to detect whether an expression is suitably structured 184 | for evaluation: the returned dictionary should have a single key. 
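For example (hypothetical names): an expression like ``nested.a + b`` would yield two keys, "nested" and "" (for the base column ``b``), signalling that it straddles two nesting contexts, while ``nested.a + nested.b`` would yield the single key "nested".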
185 | """ 186 | if isinstance(node, ops.Term) and not isinstance(node, ops.Constant): 187 | if isinstance(node.value, _SeriesFromNest): 188 | return {node.value.nest_name: parents} 189 | return {getattr(node, "upper_name", ""): parents} 190 | if not isinstance(node, ops.Op): 191 | return {} 192 | sources = [getattr(node, "lhs", None), getattr(node, "rhs", None)] 193 | result: dict[str, list] = {} 194 | for source in sources: 195 | child = _subexprs_by_nest(parents, source) 196 | for k, v in child.items(): 197 | result.setdefault(k, []).append(v) 198 | # After a complete traversal across sources, check for any necessary splits. 199 | # If it's homogenous, move the split-node up the tree. 200 | if len(result) == 1: 201 | # Let the record of each parent node drift up the tree, 202 | # and merge the subtrees into a single node, since by definition, 203 | # this node is homogeneous over all of its children, and can 204 | # be evaluated in a single step. 205 | result = {k: [node] for k in result} 206 | # If the result is either empty or has more than one key, leave the result 207 | # alone. Each key represents a different nest (with a blank string for the base), 208 | # and the value is the highest point in the expression tree where the expression 209 | # was still within a single nest. 210 | return result 211 | 212 | 213 | def _identify_aliases(expr: str) -> tuple[str, dict[str, str]]: 214 | """ 215 | Given an expression string, identify backtick-quoted names 216 | and replace them with cleaned names, returning the cleaned 217 | expression and a dictionary of aliases, where the keys are 218 | clean aliases to the original names. 219 | """ 220 | aliases = {} 221 | 222 | def sub_and_alias(match): 223 | original = match.group(0)[1:-1] # remove backticks 224 | alias = clean_column_name(original) 225 | if alias != original: 226 | aliases[alias] = original 227 | return alias 228 | 229 | return _backtick_protected_names.sub(sub_and_alias, expr), aliases 230 | -------------------------------------------------------------------------------- /src/nested_pandas/nestedframe/io.py: -------------------------------------------------------------------------------- 1 | # typing.Self and "|" union syntax don't exist in Python 3.9 2 | from __future__ import annotations 3 | 4 | from collections.abc import Sequence 5 | 6 | import pandas as pd 7 | import pyarrow as pa 8 | import pyarrow.parquet as pq 9 | from upath import UPath 10 | 11 | from ..series.dtype import NestedDtype 12 | from ..series.utils import table_to_struct_array 13 | from .core import NestedFrame 14 | 15 | 16 | def read_parquet( 17 | data: str | UPath | bytes, 18 | columns: list[str] | None = None, 19 | reject_nesting: list[str] | str | None = None, 20 | **kwargs, 21 | ) -> NestedFrame: 22 | """ 23 | Load a parquet object from a file path into a NestedFrame. 24 | 25 | As a deviation from `pandas`, this function loads via 26 | `pyarrow.parquet.read_table`, and then converts to a NestedFrame. 27 | 28 | Parameters 29 | ---------- 30 | data: str, Upath, or file-like object 31 | Path to the data or a file-like object. If a string is passed, it can be a single file name, 32 | directory name, or a remote path (e.g., HTTP/HTTPS or S3). If a file-like object is passed, 33 | it must support the `read` method. 34 | columns : list, default=None 35 | If not None, only these columns will be read from the file. 36 | reject_nesting: list or str, default=None 37 | Column(s) to reject from being cast to a nested dtype. 
By default, 38 | nested-pandas assumes that any struct column with all fields being lists 39 | is castable to a nested column. However, this assumption is invalid if 40 | the lists within the struct have mismatched lengths for any given item. 41 | Columns specified here will be read using the corresponding pandas.ArrowDtype. 42 | kwargs: dict 43 | Keyword arguments passed to `pyarrow.parquet.read_table` 44 | 45 | Returns 46 | ------- 47 | NestedFrame 48 | 49 | Notes 50 | ----- 51 | pyarrow supports partial loading of nested structures from parquet, for 52 | example ```pd.read_parquet("data.parquet", columns=["nested.a"])``` will 53 | load the "a" column of the "nested" column. Standard pandas/pyarrow 54 | behavior will return "a" as a list-array base column with name "a". In 55 | nested-pandas, this behavior is changed to load the column as a sub-column 56 | of a nested column called "nested". Be aware that this will prohibit calls 57 | like ```pd.read_parquet("data.parquet", columns=["nested.a", "nested"])``` 58 | from working, as this implies both full and partial load of "nested". 59 | 60 | Furthermore, there are some cases where subcolumns will have the same name 61 | as a top-level column. For example, you might have a column "nested" with 62 | subcolumns "nested.a" and "nested.b", and also a top-level column "a". In 63 | these cases, keep in mind that if "nested" is in the reject_nesting list 64 | the operation will fail, as is consistent with the default pandas behavior 65 | (but nesting will still work normally). 66 | 67 | Examples 68 | -------- 69 | 70 | Simple loading example: 71 | 72 | >>> import nested_pandas as npd 73 | >>> nf = npd.read_parquet("path/to/file.parquet") # doctest: +SKIP 74 | 75 | Partial loading: 76 | 77 | >>> # Load only the "flux" sub-column of the "nested" column 78 | >>> nf = npd.read_parquet("path/to/file.parquet", columns=["a", "nested.flux"]) # doctest: +SKIP 79 | """ 80 | 81 | # Type convergence for reject_nesting 82 | if reject_nesting is None: 83 | reject_nesting = [] 84 | elif isinstance(reject_nesting, str): 85 | reject_nesting = [reject_nesting] 86 | 87 | # First load through pyarrow 88 | # Check if `data` is a file-like object or a sequence 89 | if hasattr(data, "read") or ( 90 | isinstance(data, Sequence) and not isinstance(data, str | bytes | bytearray) 91 | ): 92 | # If `data` is a file-like object or a sequence, pass it directly to pyarrow 93 | table = pq.read_table(data, columns=columns, **kwargs) 94 | else: 95 | # Otherwise, treat `data` as a file path and use UPath 96 | path = UPath(data) 97 | filesystem = kwargs.pop("filesystem", path.fs) 98 | table = pq.read_table(path.path, columns=columns, filesystem=filesystem, **kwargs) 99 | 100 | # Resolve partial loading of nested structures 101 | # Using pyarrow to avoid naming conflicts from partial loading ("flux" vs "lc.flux") 102 | # Use input column names and the table column names to determine if a column 103 | # was from a nested column. 
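# For example (hypothetical file): columns=["a", "nested.flux"] comes back from pyarrow as table columns ["a", "flux"]; the name mismatch on the second entry marks it as a partial load of the nested column "nested".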
104 | if columns is not None: 105 | nested_structures: dict[str, list[int]] = {} 106 | for i, (col_in, col_pa) in enumerate(zip(columns, table.column_names, strict=True)): 107 | # if the column name is not the same, it was a partial load 108 | if col_in != col_pa: 109 | # get the top-level column name 110 | nested_col = col_in.split(".")[0] 111 | 112 | # validate that the partial load columns are list type 113 | # if any of the columns are not list type, reject the cast 114 | # and remove the column from the list of nested structures if 115 | # it was added 116 | if not pa.types.is_list(table.schema[i].type): 117 | reject_nesting.append(nested_col) 118 | if nested_col in nested_structures: 119 | # remove the column from the list of nested structures 120 | nested_structures.pop(nested_col) 121 | # track nesting for columns not in the reject list 122 | elif nested_col not in reject_nesting: 123 | if nested_col not in nested_structures: 124 | nested_structures[nested_col] = [i] 125 | else: 126 | nested_structures[nested_col].append(i) 127 | 128 | # Check for full and partial load of the same column and error 129 | # Columns in the reject_nesting will not be checked 130 | for col in columns: 131 | if col in nested_structures: 132 | raise ValueError( 133 | f"The provided column list contains both a full and partial " 134 | f"load of the column '{col}'. This is not allowed as the partial " 135 | "load will be cast to a nested column that already exists. " 136 | "Please either remove the partial load or the full load." 137 | ) 138 | 139 | # Build structs and track column indices used 140 | structs = {} 141 | indices_to_remove = [] 142 | for col, indices in nested_structures.items(): 143 | # Build a struct column from the columns 144 | structs[col] = table_to_struct_array(table.select(indices)) 145 | indices_to_remove.extend(indices) 146 | 147 | # Remove the original columns in reverse order to avoid index shifting 148 | for i in sorted(indices_to_remove, reverse=True): 149 | table = table.remove_column(i) 150 | 151 | # Append the new struct columns 152 | for col, struct in structs.items(): 153 | table = table.append_column(col, struct) 154 | 155 | # Convert to NestedFrame 156 | # not zero-copy, but reduce memory pressure via the self_destruct kwarg 157 | # https://arrow.apache.org/docs/python/pandas.html#reducing-memory-use-in-table-to-pandas 158 | df = NestedFrame(table.to_pandas(types_mapper=pd.ArrowDtype, split_blocks=True, self_destruct=True)) 159 | del table 160 | # Attempt to cast struct columns to NestedDTypes 161 | df = _cast_struct_cols_to_nested(df, reject_nesting) 162 | 163 | return df 164 | 165 | 166 | def _cast_struct_cols_to_nested(df, reject_nesting): 167 | """cast struct columns to nested dtype""" 168 | # Attempt to cast struct columns to NestedDTypes 169 | for col, dtype in df.dtypes.items(): 170 | # First validate the dtype 171 | # will return valueerror when not a struct-list 172 | valid_dtype = True 173 | try: 174 | NestedDtype._validate_dtype(dtype.pyarrow_dtype) 175 | except ValueError: 176 | valid_dtype = False 177 | 178 | if valid_dtype and col not in reject_nesting: 179 | try: 180 | # Attempt to cast Struct to NestedDType 181 | df = df.astype({col: NestedDtype(dtype.pyarrow_dtype)}) 182 | except ValueError as err: 183 | # If cast fails, the struct likely does not fit nested-pandas 184 | # criteria for a valid nested column 185 | raise ValueError( 186 | f"Column '{col}' is a Struct, but an attempt to cast it to a NestedDType failed. 
" 187 | "This is likely due to the struct not meeting the requirements for a nested column " 188 | "(all fields should be equal length). To proceed, you may add the column to the " 189 | "`reject_nesting` argument of the read_parquet function to skip the cast attempt:" 190 | f" read_parquet(..., reject_nesting=['{col}'])" 191 | ) from err 192 | return df 193 | -------------------------------------------------------------------------------- /src/nested_pandas/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/src/nested_pandas/py.typed -------------------------------------------------------------------------------- /src/nested_pandas/series/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/src/nested_pandas/series/__init__.py -------------------------------------------------------------------------------- /src/nested_pandas/series/_storage/__init__.py: -------------------------------------------------------------------------------- 1 | from .list_struct_storage import ListStructStorage # noqa: F401 2 | from .struct_list_storage import StructListStorage # noqa: F401 3 | from .table_storage import TableStorage # noqa: F401 4 | -------------------------------------------------------------------------------- /src/nested_pandas/series/_storage/list_struct_storage.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations # Python 3.9 requires it for X | Y type hints 2 | 3 | from typing import TYPE_CHECKING, Any 4 | 5 | import pyarrow as pa 6 | 7 | from nested_pandas.series.utils import transpose_struct_list_chunked, validate_list_struct_type 8 | 9 | if TYPE_CHECKING: 10 | from nested_pandas.series._storage.struct_list_storage import StructListStorage 11 | from nested_pandas.series._storage.table_storage import TableStorage 12 | 13 | 14 | class ListStructStorage: 15 | """Store nested data as a PyArrow list-struct array. 16 | 17 | Parameters 18 | ---------- 19 | array : pa.ListArray or pa.ChunkedArray 20 | Pyarrow list-array with a struct value type. An array or a chunk-array 21 | """ 22 | 23 | _data: pa.ChunkedArray 24 | 25 | def __init__(self, array: pa.ListArray | pa.ChunkedArray) -> None: 26 | if isinstance(array, pa.ListArray): 27 | array = pa.chunked_array([array]) 28 | if not isinstance(array, pa.ChunkedArray): 29 | raise ValueError("array must be of type pa.ChunkedArray") 30 | validate_list_struct_type(array.type) 31 | self._data = array 32 | 33 | @property 34 | def data(self) -> pa.ChunkedArray: 35 | return self._data 36 | 37 | @classmethod 38 | def from_struct_list_storage(cls, struct_list_storage: StructListStorage) -> Self: # type: ignore # noqa: F821 39 | """Construct from a StructListStorage object. 40 | 41 | Parameters 42 | ---------- 43 | struct_list_storage : StructListStorage 44 | StructListStorage object. 45 | """ 46 | data = transpose_struct_list_chunked(struct_list_storage.data, validate=False) 47 | return cls(data) 48 | 49 | @classmethod 50 | def from_table_storage(cls, table_storage: TableStorage) -> Self: # type: ignore # noqa: F821 51 | """Construct from a TableStorage object. 52 | 53 | Parameters 54 | ---------- 55 | table_storage : TableStorage 56 | TableStorage object. 
57 | """ 58 | from nested_pandas.series._storage import StructListStorage 59 | 60 | struct_list_storage = StructListStorage.from_table_storage(table_storage) 61 | return cls.from_struct_list_storage(struct_list_storage) 62 | 63 | def __len__(self) -> int: 64 | return len(self._data) 65 | 66 | def __eq__(self, other: Any) -> bool: 67 | if not isinstance(other, type(self)): 68 | return False 69 | return self._data == other._data 70 | 71 | @property 72 | def nbytes(self) -> int: 73 | """Number of bytes consumed by the data in memory.""" 74 | return self._data.nbytes 75 | 76 | @property 77 | def type(self) -> pa.ListType: 78 | """Pyarrow type of the underlying array.""" 79 | return self._data.type 80 | 81 | @property 82 | def num_chunks(self) -> int: 83 | """Number of chunks in the underlying array.""" 84 | return self._data.num_chunks 85 | -------------------------------------------------------------------------------- /src/nested_pandas/series/_storage/struct_list_storage.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations # Python 3.9 requires it for X | Y type hints 2 | 3 | from collections.abc import Iterator 4 | from typing import TYPE_CHECKING 5 | 6 | import pyarrow as pa 7 | 8 | from nested_pandas.series.utils import ( 9 | table_to_struct_array, 10 | transpose_list_struct_chunked, 11 | validate_struct_list_array_for_equal_lengths, 12 | ) 13 | 14 | if TYPE_CHECKING: 15 | from nested_pandas.series._storage.list_struct_storage import ListStructStorage 16 | from nested_pandas.series._storage.table_storage import TableStorage 17 | 18 | 19 | class StructListStorage: 20 | """Store nested data as a PyArrow struct-list array. 21 | 22 | Parameters 23 | ---------- 24 | array : pa.StructArray or pa.ChunkedArray 25 | Pyarrow struct-array with all fields to be list-arrays. 26 | All list-values must be "aligned", e.g., have the same length. 27 | validate : bool (default True) 28 | Check that all the lists have the same lengths for each struct-value. 29 | """ 30 | 31 | _data: pa.ChunkedArray 32 | 33 | def __init__(self, array: pa.StructArray | pa.ChunkedArray, *, validate: bool = True) -> None: 34 | if isinstance(array, pa.StructArray): 35 | array = pa.chunked_array([array]) 36 | if not isinstance(array, pa.ChunkedArray): 37 | raise ValueError("array must be a StructArray or ChunkedArray") 38 | 39 | if validate: 40 | for chunk in array.chunks: 41 | validate_struct_list_array_for_equal_lengths(chunk) 42 | 43 | self._data = array 44 | 45 | @property 46 | def data(self) -> pa.ChunkedArray: 47 | return self._data 48 | 49 | @classmethod 50 | def from_list_struct_storage(cls, list_struct_storage: ListStructStorage) -> Self: # type: ignore # noqa: F821 51 | """Construct from a ListStructStorage object. 52 | 53 | Parameters 54 | ---------- 55 | list_struct_storage : ListStructStorage 56 | ListStructStorage object. 57 | """ 58 | data = transpose_list_struct_chunked(list_struct_storage.data) 59 | return cls(data, validate=False) 60 | 61 | @classmethod 62 | def from_table_storage(cls, table_storage: TableStorage) -> Self: # type: ignore # noqa: F821 63 | """Construct from a TableStorage object. 64 | 65 | Parameters 66 | ---------- 67 | table_storage : TableStorage 68 | TableStorage object. 
69 | """ 70 | data = table_to_struct_array(table_storage.data) 71 | return cls(data, validate=False) 72 | 73 | def __iter__(self) -> Iterator[pa.StructScalar]: 74 | return iter(self._data) 75 | -------------------------------------------------------------------------------- /src/nested_pandas/series/_storage/table_storage.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations # Python 3.9 requires it for X | Y type hints 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | import pyarrow as pa 6 | 7 | from nested_pandas.series.utils import ( 8 | table_from_struct_array, 9 | table_to_struct_array, 10 | validate_struct_list_array_for_equal_lengths, 11 | ) 12 | 13 | if TYPE_CHECKING: 14 | from nested_pandas.series._storage.list_struct_storage import ListStructStorage 15 | from nested_pandas.series._storage.struct_list_storage import StructListStorage 16 | 17 | 18 | class TableStorage: 19 | """Store nested data as a PyArrow table with list-columns. 20 | 21 | Parameters 22 | ---------- 23 | table : pa.Table 24 | PyArrow table, all columns must be list-columns. 25 | All list-values must be "aligned", e.g., have the same length. 26 | """ 27 | 28 | _data: pa.Table 29 | 30 | def __init__(self, table: pa.Table, validate: bool = True) -> None: 31 | if validate: 32 | struct_array = table_to_struct_array(table) 33 | for chunk in struct_array.iterchunks(): 34 | validate_struct_list_array_for_equal_lengths(chunk) 35 | 36 | self._data = table 37 | 38 | @property 39 | def data(self) -> pa.Table: 40 | return self._data 41 | 42 | @classmethod 43 | def from_list_struct_storage(cls, list_storage: ListStructStorage) -> Self: # type: ignore # noqa: F821 44 | """Construct from a StructListStorage object. 45 | 46 | Parameters 47 | ---------- 48 | list_storage : ListStructStorage 49 | StructListStorage object. 50 | """ 51 | from nested_pandas.series._storage import StructListStorage 52 | 53 | struct_list_storage = StructListStorage.from_list_struct_storage(list_storage) 54 | return cls.from_struct_list_storage(struct_list_storage) 55 | 56 | @classmethod 57 | def from_struct_list_storage(cls, struct_list_storage: StructListStorage) -> Self: # type: ignore # noqa: F821 58 | """Construct from a StructListStorage object. 59 | 60 | Parameters 61 | ---------- 62 | struct_list_storage : StructListStorage 63 | StructListStorage object. 
64 | """ 65 | table = table_from_struct_array(struct_list_storage.data) 66 | return cls(table, validate=False) 67 | -------------------------------------------------------------------------------- /src/nested_pandas/series/dtype.py: -------------------------------------------------------------------------------- 1 | # Use Self, which is not available until Python 3.11 2 | from __future__ import annotations 3 | 4 | from collections.abc import Mapping 5 | 6 | # We use Type, because we must use "type" as an attribute name 7 | from typing import Type, cast # noqa: UP035 8 | 9 | import pandas as pd 10 | import pyarrow as pa 11 | from pandas import ArrowDtype 12 | from pandas.api.extensions import register_extension_dtype 13 | from pandas.core.arrays import ExtensionArray 14 | from pandas.core.dtypes.base import ExtensionDtype 15 | 16 | from nested_pandas.series.utils import ( 17 | is_pa_type_is_list_struct, 18 | transpose_list_struct_type, 19 | transpose_struct_list_type, 20 | ) 21 | 22 | __all__ = ["NestedDtype"] 23 | 24 | 25 | @register_extension_dtype 26 | class NestedDtype(ExtensionDtype): 27 | """Data type to handle packed time series data 28 | 29 | Parameters 30 | ---------- 31 | pyarrow_dtype : pyarrow.StructType or pd.ArrowDtype 32 | The pyarrow data type to use for the nested type. It must be a struct 33 | type where all fields are list types. 34 | """ 35 | 36 | # ExtensionDtype overrides # 37 | 38 | _metadata = ("pyarrow_dtype",) 39 | """Attributes to use as metadata for __eq__ and __hash__""" 40 | 41 | @property 42 | def na_value(self) -> Type[pd.NA]: 43 | """The missing value for this dtype""" 44 | return pd.NA 45 | 46 | type = pd.DataFrame 47 | """The type of the array's elements, always pd.DataFrame""" 48 | 49 | @property 50 | def name(self) -> str: 51 | """The string representation of the nested type""" 52 | # Replace pd.ArrowDtype with pa.DataType, because it has nicer __str__ 53 | nice_dtypes = { 54 | field: dtype.pyarrow_dtype if isinstance(dtype, pd.ArrowDtype) else dtype 55 | for field, dtype in self.field_dtypes.items() 56 | } 57 | fields = ", ".join([f"{field}: [{dtype!s}]" for field, dtype in nice_dtypes.items()]) 58 | return f"nested<{fields}>" 59 | 60 | def __repr__(self) -> str: 61 | return self.name 62 | 63 | @classmethod 64 | def construct_array_type(cls) -> Type[ExtensionArray]: 65 | """Corresponded array type, always NestedExtensionArray""" 66 | from nested_pandas.series.ext_array import NestedExtensionArray 67 | 68 | return NestedExtensionArray 69 | 70 | @classmethod 71 | def construct_from_string(cls, string: str) -> Self: # type: ignore[name-defined] # noqa: F821 72 | """Construct NestedDtype from a string representation. 73 | 74 | This works only for simple types, i.e. non-parametric pyarrow types. 75 | 76 | Parameters 77 | ---------- 78 | string : str 79 | The string representation of the nested type. 
For example, 80 | 'nested<x: [int64], y: [float64]>'. Note that only plain 81 | (non-parametric) pyarrow types are supported as field types. 82 | 83 | Returns 84 | ------- 85 | NestedDtype 86 | The constructed NestedDtype. 87 | 88 | Raises 89 | ------ 90 | TypeError 91 | If the string is not a valid nested type string, or if the field 92 | types are parametric pyarrow types that cannot be parsed from 93 | their string representation. 94 | """ 95 | if not string.startswith("nested<") or not string.endswith(">"): 96 | raise TypeError("Not a valid nested type string, expected 'nested<...>'") 97 | fields_str = string.removeprefix("nested<").removesuffix(">") 98 | 99 | field_strings = fields_str.split(", ") 100 | 101 | fields = {} 102 | for field_string in field_strings: 103 | try: 104 | field_name, field_type = field_string.split(": ", maxsplit=1) 105 | except ValueError as e: 106 | raise TypeError( 107 | "Not a valid nested type string, expected 'nested<...>', got invalid field " 108 | f"string '{field_string}'" 109 | ) from e 110 | if not field_type.startswith("[") or not field_type.endswith("]"): 111 | raise TypeError( 112 | "Not a valid nested type string, expected 'nested<...>', got invalid field " 113 | f"type string '{field_type}'" 114 | ) 115 | 116 | value_type = field_type.removeprefix("[").removesuffix("]") 117 | # We follow ArrowDtype implementation here and do not try to parse complex types 118 | try: 119 | pa_value_type = pa.type_for_alias(value_type) 120 | except ValueError as e: 121 | raise TypeError( 122 | f"Parsing pyarrow specific parameters in the string is not supported yet: {value_type}. " 123 | "Please use NestedDtype() or NestedDtype.from_fields() instead." 124 | ) from e 125 | 126 | fields[field_name] = pa_value_type 127 | 128 | return cls.from_fields(fields) 129 | 130 | # ArrowDtype would return None here, so we do the same 131 | def _get_common_dtype(self, dtypes: list) -> None: 132 | return None 133 | 134 | # Optional methods # 135 | 136 | def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ExtensionArray: 137 | """Construct a NestedExtensionArray from a pyarrow array. 138 | 139 | Parameters 140 | ---------- 141 | array : pa.Array | pa.ChunkedArray 142 | The input pyarrow array. 143 | 144 | Returns 145 | ------- 146 | NestedExtensionArray 147 | The constructed NestedExtensionArray. 148 | """ 149 | from nested_pandas.series.ext_array import NestedExtensionArray 150 | 151 | return NestedExtensionArray(array) 152 | 153 | # Additional methods and attributes # 154 | 155 | pyarrow_dtype: pa.StructType 156 | 157 | def __init__(self, pyarrow_dtype: pa.DataType) -> None: 158 | self.pyarrow_dtype, self.list_struct_pa_dtype = self._validate_dtype(pyarrow_dtype) 159 | 160 | @property 161 | def struct_list_pa_dtype(self) -> pa.StructType: 162 | """Struct-list pyarrow type representing the nested type.""" 163 | return self.pyarrow_dtype 164 | 165 | @classmethod 166 | def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore[name-defined] # noqa: F821 167 | """Make NestedDtype from a mapping of field names and list item types. 168 | 169 | Parameters 170 | ---------- 171 | fields : Mapping[str, pa.DataType] 172 | A mapping of field names and their item types. Since all fields are lists, the item types are 173 | inner types of the lists, not the list types themselves. 174 | 175 | Returns 176 | ------- 177 | NestedDtype 178 | The constructed NestedDtype. 179 | 180 | Examples 181 | -------- 182 | >>> dtype = NestedDtype.from_fields({"a": pa.float64(), "b": pa.int64()}) 183 | >>> dtype 184 | nested<a: [double], b: [int64]> 185 | >>> assert ( 186 | ... dtype.pyarrow_dtype 187 | ... == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())}) ... 
) 188 | """ 189 | pyarrow_dtype = pa.struct({field: pa.list_(pa_type) for field, pa_type in fields.items()}) 190 | pyarrow_dtype = cast(pa.StructType, pyarrow_dtype) 191 | return cls(pyarrow_dtype=pyarrow_dtype) 192 | 193 | @staticmethod 194 | def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.ListType]: 195 | """Check that the given pyarrow type is castable to the nested type. 196 | 197 | Parameters 198 | ---------- 199 | pyarrow_dtype : pa.DataType 200 | The pyarrow type to check and cast. 201 | 202 | Returns 203 | ------- 204 | pa.StructType 205 | Struct-list pyarrow type representing the nested type. 206 | pa.ListType 207 | List-struct pyarrow type representing the nested type. 208 | """ 209 | if not isinstance(pyarrow_dtype, pa.DataType): 210 | raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}") 211 | if pa.types.is_struct(pyarrow_dtype): 212 | struct_type = cast(pa.StructType, pyarrow_dtype) 213 | return struct_type, transpose_struct_list_type(struct_type) 214 | # Currently, LargeList and others are not supported 215 | if pa.types.is_list(pyarrow_dtype): 216 | list_type = cast(pa.ListType, pyarrow_dtype) 217 | return transpose_list_struct_type(list_type), list_type 218 | raise ValueError( 219 | f"NestedDtype can only be constructed with pa.StructType or pa.ListType, got {pyarrow_dtype}" 220 | ) 221 | 222 | @property 223 | def fields(self) -> dict[str, pa.DataType]: 224 | """The mapping of field names and their item types.""" 225 | return {field.name: field.type.value_type for field in self.pyarrow_dtype} 226 | 227 | @property 228 | def field_names(self) -> list[str]: 229 | """The list of field names of the nested type""" 230 | return [field.name for field in self.pyarrow_dtype] 231 | 232 | @classmethod 233 | def from_pandas_arrow_dtype(cls, pandas_arrow_dtype: ArrowDtype) -> Self: # type: ignore[name-defined] # noqa: F821 234 | """Construct NestedDtype from a pandas.ArrowDtype. 235 | 236 | Parameters 237 | ---------- 238 | pandas_arrow_dtype : ArrowDtype 239 | The pandas.ArrowDtype to construct NestedDtype from. 240 | Must be struct-list or list-struct type. 241 | 242 | Returns 243 | ------- 244 | NestedDtype 245 | The constructed NestedDtype. 246 | 247 | Raises 248 | ------ 249 | ValueError 250 | If the given dtype is not a valid nested type. 251 | """ 252 | return cls(pyarrow_dtype=pandas_arrow_dtype.pyarrow_dtype) 253 | 254 | def to_pandas_arrow_dtype(self, list_struct: bool = False) -> ArrowDtype: 255 | """Convert NestedDtype to a pandas.ArrowDtype. 256 | 257 | Parameters 258 | ---------- 259 | list_struct : bool, default False 260 | If False (default) use pyarrow struct-list type, 261 | otherwise use pyarrow list-struct type. 262 | 263 | Returns 264 | ------- 265 | ArrowDtype 266 | The corresponding pandas.ArrowDtype. 267 | """ 268 | if list_struct: 269 | return ArrowDtype(self.list_struct_pa_dtype) 270 | return ArrowDtype(self.pyarrow_dtype) 271 | 272 | def field_dtype(self, field: str) -> pd.ArrowDtype | Self: # type: ignore[name-defined] # noqa: F821 273 | """Pandas dtype of a field, pd.ArrowDtype or NestedDtype. 274 | 275 | Parameters 276 | ---------- 277 | field : str 278 | Field name 279 | 280 | Returns 281 | ------- 282 | pd.ArrowDtype | NestedDtype 283 | If the field is a list-struct, return NestedDtype, else wrap it 284 | as a pd.ArrowDtype. 
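Examples -------- A quick sketch: >>> NestedDtype.from_fields({"a": pa.float64()}).field_dtype("a") double[pyarrow]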
286 | """ 287 | list_type = self.pyarrow_dtype.field(field).type 288 | value_type = list_type.value_type 289 | if is_pa_type_is_list_struct(value_type): 290 | return type(self)(value_type) 291 | return pd.ArrowDtype(value_type) 292 | 293 | @property 294 | def field_dtypes(self) -> dict[str, pd.ArrowDtype | Self]: # type: ignore[name-defined] # noqa: F821 295 | """Pandas dtypes of this dtype's fields.""" 296 | return {field: self.field_dtype(field) for field in self.field_names} 297 | -------------------------------------------------------------------------------- /src/nested_pandas/series/packer.py: -------------------------------------------------------------------------------- 1 | """Module for converting between "flat" and "list" and "nested" representations 2 | 3 | TODO: mask support 4 | TODO: multi-index support 5 | """ 6 | 7 | # "|" for python 3.9 8 | from __future__ import annotations 9 | 10 | from collections.abc import Sequence 11 | 12 | import numpy as np 13 | import pandas as pd 14 | import pyarrow as pa 15 | 16 | from nested_pandas.series.dtype import NestedDtype 17 | from nested_pandas.series.ext_array import NestedExtensionArray 18 | 19 | __all__ = ["pack", "pack_flat", "pack_lists", "pack_seq"] 20 | 21 | 22 | N_ROWS_INFER_DTYPE = 1000 23 | 24 | 25 | def pack( 26 | obj, 27 | name: str | None = None, 28 | *, 29 | index=None, 30 | on: None | str | list[str] = None, 31 | dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None, 32 | ) -> pd.Series: 33 | """Pack a "flat" dataframe or a sequence of dataframes into a "nested" series. 34 | 35 | Parameters 36 | ---------- 37 | obj : pd.DataFrame or Sequence of 38 | Input dataframe, with repeated indexes, or a sequence of dataframes or missed values. 39 | name : str, optional 40 | Name of the output series. 41 | index : convertable to pd.Index, optional 42 | Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index, 43 | and this value is used to override the index after the nesting. 44 | on: str or list of str, optional 45 | Column name(s) to join on. If None, the index is used. 46 | dtype : dtype or None 47 | NestedDtype of the output series, or other type to derive from. If None, 48 | the dtype is inferred from the first non-missing dataframe. 49 | 50 | Returns 51 | ------- 52 | pd.Series 53 | Output series. 54 | """ 55 | if isinstance(obj, pd.DataFrame): 56 | nested = pack_flat(obj, name=name, on=on) 57 | if index is not None: 58 | nested.index = index 59 | return nested 60 | return pack_seq(obj, name=name, index=index, dtype=dtype) 61 | 62 | 63 | def pack_flat(df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] = None) -> pd.Series: 64 | """Make a structure of lists representation of a "flat" dataframe. 65 | 66 | For the input dataframe with repeated indexes, make a pandas.Series, 67 | where each original column is replaced by a structure of lists. 68 | The dtype of the column is `nested_pandas.NestedDtype` with 69 | the corresponding pyarrow type. The index of the output series is 70 | the unique index of the input dataframe. The Series has `.nest` accessor, 71 | see `nested_pandas.series.accessor.NestSeriesAccessor` for details. 72 | 73 | Parameters 74 | ---------- 75 | df : pd.DataFrame 76 | Input dataframe, with repeated indexes. 77 | name : str, optional 78 | Name of the pd.Series. 79 | on : str or list of str, optional 80 | Column name(s) to join on. If None, the df's index is used. 
81 | 82 | Returns 83 | ------- 84 | pd.Series 85 | Output series, with unique indexes. 86 | 87 | See Also 88 | -------- 89 | nested_pandas.series.accessor.NestSeriesAccessor : .nest accessor for the output series. 90 | nested_pandas.series.dtype.NestedDtype : The dtype of the output series. 91 | nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays. 92 | """ 93 | 94 | if on is not None: 95 | df = df.set_index(on) 96 | # pandas knows when index is pre-sorted, so it would do nothing if it is already sorted 97 | sorted_flat = df.sort_index(kind="stable") 98 | return pack_sorted_df_into_struct(sorted_flat, name=name) 99 | 100 | 101 | def pack_seq( 102 | sequence: Sequence, 103 | name: str | None = None, 104 | *, 105 | index: object = None, 106 | dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None, 107 | ) -> pd.Series: 108 | """Pack a sequence of "flat" dataframes into a "nested" series. 109 | 110 | Parameters 111 | ---------- 112 | sequence : Sequence of pd.DataFrame or None or pd.NA or convertible to pa.StructScalar 113 | Input sequence of dataframes or missing values. 114 | name : str, optional 115 | Name of the output series. 116 | index : pd.Index, optional 117 | Index of the output series. 118 | dtype : dtype or None 119 | NestedDtype of the output series, or other type to derive from. If None, 120 | the dtype is inferred from the first non-missing dataframe. 121 | 122 | Returns 123 | ------- 124 | pd.Series 125 | Output series. 126 | """ 127 | if isinstance(sequence, pd.Series): 128 | if index is None: 129 | index = sequence.index 130 | if name is None: 131 | name = sequence.name 132 | 133 | ext_array = NestedExtensionArray.from_sequence(sequence, dtype=dtype) 134 | series = pd.Series(ext_array, index=index, name=name, copy=False) 135 | return series 136 | 137 | 138 | def pack_sorted_df_into_struct(df: pd.DataFrame, name: str | None = None) -> pd.Series: 139 | """Make a structure of lists representation of a "flat" dataframe. 140 | 141 | Input dataframe must be sorted and all the columns must have pyarrow dtypes. 142 | 143 | Parameters 144 | ---------- 145 | df : pd.DataFrame 146 | Input dataframe, with repeated indexes. It must be sorted and 147 | all the columns must have pyarrow dtypes. 148 | 149 | name : str, optional 150 | Name of the pd.Series. 151 | 152 | Returns 153 | ------- 154 | pd.Series 155 | Output series, with unique indexes. 156 | """ 157 | if not df.index.is_monotonic_increasing: 158 | raise ValueError("The index of the input dataframe must be sorted") 159 | 160 | packed_df = view_sorted_df_as_list_arrays(df) 161 | # No need to validate the dataframe, the length of the nested arrays is forced to be the same by 162 | # the view_sorted_df_as_list_arrays function. 163 | return pack_lists(packed_df, name=name, validate=False) 164 | 165 | 166 | def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = True) -> pd.Series: 167 | """Make a series of arrow structures from a dataframe with nested arrays. 168 | 169 | For the input dataframe with pyarrow list-array columns, make a 170 | pandas.Series, where the original columns are combined into a structure 171 | of lists. The dtype of the column is `nested_pandas.NestedDtype` with the corresponding 172 | pyarrow type. The index of the output series is the unique index of the 173 | input dataframe. The Series has `.nest` accessor, see 174 | `nested_pandas.series.accessor.NestSeriesAccessor` for details. 
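For example, a minimal sketch with a hypothetical one-column input: >>> lists = pd.DataFrame({"t": pd.Series([[1, 2], [3]], dtype=pd.ArrowDtype(pa.list_(pa.int64())))}) >>> packed = pack_lists(lists)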
166 | def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = True) -> pd.Series:
167 | """Make a series of arrow structures from a dataframe with nested arrays.
168 | 
169 | For the input dataframe of pyarrow list-array columns, make a pandas.Series,
170 | where each original column becomes a field of a structure of lists.
171 | The dtype of the column is `nested_pandas.NestedDtype` with the corresponding
172 | pyarrow type. The index of the output series is the index of the
173 | input dataframe. The Series has `.nest` accessor, see
174 | `nested_pandas.series.accessor.NestSeriesAccessor` for details.
175 | 
176 | For every row, all the nested arrays (aka pyarrow lists) must have
177 | the same length.
178 | 
179 | Parameters
180 | ----------
181 | df : pd.DataFrame
182 | Input dataframe, with pyarrow list-arrays.
183 | name : str, optional
184 | Name of the pd.Series.
185 | validate : bool, default True
186 | Whether to validate the input dataframe.
187 | 
188 | Returns
189 | -------
190 | pd.Series
191 | Output series, with unique indexes.
192 | 
193 | See Also
194 | --------
195 | nested_pandas.series.accessor.NestSeriesAccessor : The accessor for the output series.
196 | nested_pandas.series.dtype.NestedDtype : The dtype of the output series.
197 | nested_pandas.series.packer.pack_flat : Pack a "flat" dataframe with repeated indexes.
198 | """
199 | # When a series is converted to pa.array, the result may be either an Array or a ChunkedArray.
200 | # We convert it to chunked for the sake of consistency.
201 | pa_arrays_maybe_chunked = {column: pa.array(df[column]) for column in df.columns}
202 | pa_chunked_arrays = {
203 | column: arr if isinstance(arr, pa.ChunkedArray) else pa.chunked_array([arr])
204 | for column, arr in pa_arrays_maybe_chunked.items()
205 | }
206 | 
207 | # If all chunked arrays have the same chunk lengths, we can build a chunked struct array with no
208 | # data copying.
209 | chunk_lengths = pa.array([[len(chunk) for chunk in arr.chunks] for arr in pa_chunked_arrays.values()])
210 | if all(chunk_length == chunk_lengths[0] for chunk_length in chunk_lengths):
211 | chunks = []
212 | num_chunks = next(iter(pa_chunked_arrays.values())).num_chunks
213 | for i in range(num_chunks):
214 | chunks.append(
215 | pa.StructArray.from_arrays(
216 | [arr.chunk(i) for arr in pa_chunked_arrays.values()],
217 | names=pa_chunked_arrays.keys(),
218 | )
219 | )
220 | struct_array = pa.chunked_array(chunks)
221 | else: # "flatten" the chunked arrays
222 | struct_array = pa.StructArray.from_arrays(
223 | [arr.combine_chunks() for arr in pa_chunked_arrays.values()],
224 | names=pa_chunked_arrays.keys(),
225 | )
226 | 
227 | ext_array = NestedExtensionArray(struct_array, validate=validate)
228 | return pd.Series(
229 | ext_array,
230 | index=df.index,
231 | copy=False,
232 | name=name,
233 | )
234 | 
235 | 
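# A hedged usage sketch (not part of the module): `pack_lists` takes a
# dataframe whose columns are pyarrow list-arrays with matching per-row
# lengths. Names and values are invented for illustration.
import pandas as pd
import pyarrow as pa

from nested_pandas.series.packer import pack_lists

list_dtype = pd.ArrowDtype(pa.list_(pa.float64()))
lists_example = pd.DataFrame(
    {
        "t": pd.Series([[1.0, 2.0], [3.0]], dtype=list_dtype),
        "flux": pd.Series([[10.0, 20.0], [30.0]], dtype=list_dtype),
    }
)
packed_lists = pack_lists(lists_example)
assert packed_lists.dtype.field_names == ["t", "flux"]
assert packed_lists.nest.to_flat().shape == (3, 2)  # three observations, two fields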
236 | def view_sorted_df_as_list_arrays(df: pd.DataFrame) -> pd.DataFrame:
237 | """Make a nested array representation of a "flat" dataframe.
238 | 
239 | Parameters
240 | ----------
241 | df : pd.DataFrame
242 | Input dataframe, with repeated indexes. It must be sorted by its index.
243 | 
244 | Returns
245 | -------
246 | pd.DataFrame
247 | Output dataframe, with unique indexes. It is a view over the input
248 | dataframe, so it would mutate the input dataframe if modified.
249 | """
250 | if not df.index.is_monotonic_increasing:
251 | raise ValueError("The index of the input dataframe must be sorted")
252 | 
253 | offset_array = calculate_sorted_index_offsets(df.index)
254 | unique_index = df.index[offset_array[:-1]]
255 | 
256 | series_ = {
257 | column: view_sorted_series_as_list_array(df[column], offset_array, unique_index)
258 | for column in df.columns
259 | }
260 | 
261 | df = pd.DataFrame(series_)
262 | 
263 | return df
264 | 
265 | 
266 | def view_sorted_series_as_list_array(
267 | series: pd.Series, offset: np.ndarray | None = None, unique_index: np.ndarray | None = None
268 | ) -> pd.Series:
269 | """Make a nested array representation of a "flat" series.
270 | 
271 | Parameters
272 | ----------
273 | series : pd.Series
274 | Input series, with repeated indexes. It must be sorted by its index.
275 | 
276 | offset : np.ndarray or None, optional
277 | Pre-calculated offsets of the input series index.
278 | unique_index : np.ndarray or None, optional
279 | Pre-calculated unique index of the input series. If given it must be
280 | equal to `series.index.unique()` and `series.index.values[offset[:-1]]`.
281 | 
282 | Returns
283 | -------
284 | pd.Series
285 | Output series, with unique indexes. It is a view over the input series,
286 | so it would mutate the input series if modified.
287 | """
288 | if not series.index.is_monotonic_increasing:
289 | raise ValueError("The index of the input series must be sorted")
290 | 
291 | if offset is None:
292 | offset = calculate_sorted_index_offsets(series.index)
293 | if unique_index is None:
294 | unique_index = series.index[offset[:-1]]
295 | 
296 | # The input series may be backed by a pyarrow.ChunkedArray; in this case pa.array(series) would fail
297 | # with "TypeError: Cannot convert a 'ChunkedArray' to a 'ListArray'".
298 | # https://github.com/lincc-frameworks/nested-pandas/issues/189
299 | flat_array = pa.array(series, from_pandas=True)
300 | if isinstance(flat_array, pa.ChunkedArray):
301 | flat_array = flat_array.combine_chunks()
302 | list_array = pa.ListArray.from_arrays(
303 | offset,
304 | flat_array,
305 | )
306 | 
307 | return pd.Series(
308 | list_array,
309 | dtype=pd.ArrowDtype(list_array.type),
310 | index=unique_index,
311 | copy=False,
312 | name=series.name,
313 | )
314 | 
315 | 
316 | def calculate_sorted_index_offsets(index: pd.Index) -> np.ndarray:
317 | """Calculate the offsets of the pre-sorted index values.
318 | 
319 | Parameters
320 | ----------
321 | index : pd.Index
322 | Input index, must be sorted.
323 | 
324 | Returns
325 | -------
326 | np.ndarray
327 | Output array of offsets, one element more than the number of unique
328 | index values.
329 | """
330 | if not index.is_monotonic_increasing:
331 | raise ValueError("The index must be sorted")
332 | 
333 | # pd.Index.duplicated returns False for the first occurrence and True for all others.
334 | # So the offsets are the indexes of these False values, with the array length appended at the end.
335 | offset_but_last = np.nonzero(~index.duplicated(keep="first"))[0]
336 | offset = np.append(offset_but_last, len(index))
337 | 
338 | # Arrow uses int32 for offsets
339 | offset = offset.astype(np.int32)
340 | 
341 | return offset
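# A small worked example (not part of the module) of
# `calculate_sorted_index_offsets` on a pre-sorted index.
import pandas as pd

from nested_pandas.series.packer import calculate_sorted_index_offsets

sorted_index = pd.Index([0, 0, 0, 1, 1, 2])
offsets_example = calculate_sorted_index_offsets(sorted_index)
# First occurrences sit at positions 0, 3 and 5; the total length 6 closes
# the last list, so the list slices are [0:3], [3:5] and [5:6].
assert offsets_example.tolist() == [0, 3, 5, 6]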
342 | 
-------------------------------------------------------------------------------- /src/nested_pandas/series/utils.py: --------------------------------------------------------------------------------
1 | from __future__ import annotations # Python 3.9 requires it for X | Y type hints
2 | 
3 | from typing import TYPE_CHECKING, cast
4 | 
5 | import pandas as pd
6 | import pyarrow as pa
7 | 
8 | if TYPE_CHECKING:
9 | from nested_pandas.series.dtype import NestedDtype
10 | 
11 | 
12 | def is_pa_type_a_list(pa_type: pa.DataType) -> bool:
13 | """Check if the given pyarrow type is a list type.
14 | 
15 | I.e., one of the following types: ListArray, LargeListArray,
16 | FixedSizeListArray.
17 | 
18 | Parameters
19 | ----------
20 | pa_type : pa.DataType
21 | The pyarrow type to check.
22 | 
23 | Returns
24 | -------
25 | bool
26 | True if the given type is a list type, False otherwise.
27 | """
28 | return (
29 | pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type) or pa.types.is_fixed_size_list(pa_type)
30 | )
31 | 
32 | 
33 | def is_pa_type_is_list_struct(pa_type: pa.DataType) -> bool:
34 | """Check if the given pyarrow type is a list-struct type.
35 | 
36 | Parameters
37 | ----------
38 | pa_type : pa.DataType
39 | The pyarrow type to check.
40 | 
41 | Returns
42 | -------
43 | bool
44 | True if the given type is a list-type with struct values,
45 | False otherwise.
46 | """
47 | return is_pa_type_a_list(pa_type) and pa.types.is_struct(pa_type.value_type)
48 | 
49 | 
50 | def validate_struct_list_array_for_equal_lengths(array: pa.StructArray) -> None:
51 | """Check if the given struct array has lists of equal length.
52 | 
53 | Parameters
54 | ----------
55 | array : pa.StructArray
56 | Input struct array.
57 | 
58 | Raises
59 | ------
60 | ValueError
61 | If the struct array has lists of unequal length, if the input
62 | is not a StructArray, or if its fields are not ListArrays.
63 | """
64 | if not pa.types.is_struct(array.type):
65 | raise ValueError(f"Expected a StructArray, got {array.type}")
66 | 
67 | first_list_array: pa.ListArray | None = None
68 | for field in array.type:
69 | inner_array = array.field(field.name)
70 | if not is_pa_type_a_list(inner_array.type):
71 | raise ValueError(f"Expected a ListArray, got {inner_array.type}")
72 | list_array = cast(pa.ListArray, inner_array)
73 | 
74 | if first_list_array is None:
75 | first_list_array = list_array
76 | continue
77 | # compare offsets from the first list array with the current one
78 | if not first_list_array.offsets.equals(list_array.offsets):
79 | raise ValueError("Offsets of all ListArrays must be the same")
80 | 
81 | 
82 | def transpose_struct_list_type(t: pa.StructType) -> pa.ListType:
83 | """Converts a type of struct-list array into a type of list-struct array.
84 | 
85 | Parameters
86 | ----------
87 | t : pa.DataType
88 | Input type of struct-list array.
89 | 
90 | Returns
91 | -------
92 | pa.DataType
93 | Type of list-struct array.
94 | 
95 | Raises
96 | ------
97 | ValueError
98 | If the input type is not a struct-list type.
99 | """
100 | if not pa.types.is_struct(t):
101 | raise ValueError(f"Expected a StructType, got {t}")
102 | 
103 | fields = []
104 | for field in t:
105 | if not is_pa_type_a_list(field.type):
106 | raise ValueError(f"Expected a ListType, got {field.type}")
107 | list_type = cast(pa.ListType, field.type)
108 | fields.append(pa.field(field.name, list_type.value_type))
109 | 
110 | list_type = cast(pa.ListType, pa.list_(pa.struct(fields)))
111 | return list_type
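# A hedged sketch (not part of the module): the two type-level transposes
# are mutual inverses for valid struct-of-lists types.
import pyarrow as pa

from nested_pandas.series.utils import (
    transpose_list_struct_type,
    transpose_struct_list_type,
)

struct_list_t = pa.struct({"a": pa.list_(pa.int64()), "b": pa.list_(pa.string())})
list_struct_t = transpose_struct_list_type(struct_list_t)
assert list_struct_t == pa.list_(pa.struct({"a": pa.int64(), "b": pa.string()}))
assert transpose_list_struct_type(list_struct_t) == struct_list_t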
112 | 
113 | 
114 | def transpose_struct_list_array(array: pa.StructArray, validate: bool = True) -> pa.ListArray:
115 | """Converts a struct-array of lists into a list-array of structs.
116 | 
117 | Parameters
118 | ----------
119 | array : pa.StructArray
120 | Input struct array; each scalar must have lists of equal length.
121 | validate : bool, default True
122 | Whether to validate the input array for list lengths. Raises ValueError
123 | if the input is invalid.
124 | 
125 | Returns
126 | -------
127 | pa.ListArray
128 | List array of structs.
129 | """
130 | if validate:
131 | validate_struct_list_array_for_equal_lengths(array)
132 | 
133 | mask = array.is_null()
134 | if not pa.compute.any(mask).as_py():
135 | mask = None
136 | 
137 | # Since we know that all lists have the same length, we can use the first list to get offsets
138 | try:
139 | offsets = array.field(0).offsets
140 | except IndexError as e:
141 | raise ValueError("Nested arrays must have at least one field") from e
142 | else:
143 | # Shift offsets
144 | if offsets.offset != 0:
145 | offsets = pa.compute.subtract(offsets, offsets[0])
146 | 
147 | struct_flat_array = pa.StructArray.from_arrays(
148 | # Select values within the offsets
149 | [field.values[field.offsets[0].as_py() : field.offsets[-1].as_py()] for field in array.flatten()],
150 | names=array.type.names,
151 | )
152 | return pa.ListArray.from_arrays(
153 | offsets=offsets,
154 | values=struct_flat_array,
155 | mask=mask,
156 | )
157 | 
158 | 
159 | def transpose_struct_list_chunked(chunked_array: pa.ChunkedArray, validate: bool = True) -> pa.ChunkedArray:
160 | """Converts a chunked array of struct-list into a chunked array of list-struct.
161 | 
162 | Parameters
163 | ----------
164 | chunked_array : pa.ChunkedArray
165 | Input chunked array of struct-list.
166 | validate : bool, default True
167 | Whether to validate the input array for list lengths. Raises ValueError
168 | if the input is invalid.
169 | 
170 | Returns
171 | -------
172 | pa.ChunkedArray
173 | Chunked array of list-struct.
174 | """
175 | if chunked_array.num_chunks == 0:
176 | return pa.chunked_array([], type=transpose_struct_list_type(chunked_array.type))
177 | return pa.chunked_array(
178 | [transpose_struct_list_array(array, validate) for array in chunked_array.iterchunks()]
179 | )
180 | 
181 | 
182 | def transpose_list_struct_scalar(scalar: pa.ListScalar) -> pa.StructScalar:
183 | """Converts a list-scalar of structs into a struct-scalar of lists.
184 | 
185 | Parameters
186 | ----------
187 | scalar : pa.ListScalar
188 | Input list-struct scalar.
189 | 
190 | Returns
191 | -------
192 | pa.StructScalar
193 | Struct-list scalar.
194 | """
195 | struct_type = transpose_list_struct_type(scalar.type)
196 | struct_scalar = pa.scalar(
197 | {field: scalar.values.field(field) for field in struct_type.names},
198 | type=struct_type,
199 | )
200 | return cast(pa.StructScalar, struct_scalar)
201 | 
202 | 
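# A hedged sketch (not part of the module): transposing a struct-of-lists
# array into a list-of-structs array and back; the data is invented.
import pyarrow as pa

from nested_pandas.series.utils import (
    transpose_list_struct_array,
    transpose_struct_list_array,
)

struct_list_arr = pa.StructArray.from_arrays(
    [pa.array([[1, 2], [3]]), pa.array([["x", "y"], ["z"]])],
    names=["a", "b"],
)
list_struct_arr = transpose_struct_list_array(struct_list_arr)
assert list_struct_arr.to_pylist() == [
    [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}],
    [{"a": 3, "b": "z"}],
]
assert transpose_list_struct_array(list_struct_arr).equals(struct_list_arr)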
203 | def validate_list_struct_type(t: pa.ListType) -> None:
204 | """Raise a ValueError if not a list-struct type."""
205 | if not is_pa_type_a_list(t):
206 | raise ValueError(f"Expected a ListType, got {t}")
207 | 
208 | if not pa.types.is_struct(t.value_type):
209 | raise ValueError(f"Expected a StructType as a list value type, got {t.value_type}")
210 | 
211 | 
212 | def transpose_list_struct_type(t: pa.ListType) -> pa.StructType:
213 | """Converts a type of list-struct array into a type of struct-list array.
214 | 
215 | Parameters
216 | ----------
217 | t : pa.DataType
218 | Input type of list-struct array.
219 | 
220 | Returns
221 | -------
222 | pa.DataType
223 | Type of struct-list array.
224 | 
225 | Raises
226 | ------
227 | ValueError
228 | If the input type is not a list-struct type.
229 | """
230 | validate_list_struct_type(t)
231 | 
232 | struct_type = cast(pa.StructType, t.value_type)
233 | fields = []
234 | for field in struct_type:
235 | fields.append(pa.field(field.name, pa.list_(field.type)))
236 | 
237 | struct_type = cast(pa.StructType, pa.struct(fields))
238 | return struct_type
239 | 
240 | 
241 | def transpose_list_struct_array(array: pa.ListArray) -> pa.StructArray:
242 | """Converts a list-array of structs into a struct-array of lists.
243 | 
244 | Parameters
245 | ----------
246 | array : pa.ListArray
247 | Input list array of structs.
248 | 
249 | Returns
250 | -------
251 | pa.StructArray
252 | Struct array of lists.
253 | """
254 | offsets, values = array.offsets, array.values
255 | mask = array.is_null()
256 | if not pa.compute.any(mask).as_py():
257 | mask = None
258 | 
259 | fields = []
260 | for field_values in values.flatten():
261 | list_array = pa.ListArray.from_arrays(offsets, field_values)
262 | fields.append(list_array)
263 | 
264 | return pa.StructArray.from_arrays(
265 | arrays=fields,
266 | names=array.type.value_type.names,
267 | mask=mask,
268 | )
269 | 
270 | 
271 | def transpose_list_struct_chunked(chunked_array: pa.ChunkedArray) -> pa.ChunkedArray:
272 | """Converts a chunked array of list-struct into a chunked array of struct-list.
273 | 
274 | Parameters
275 | ----------
276 | chunked_array : pa.ChunkedArray
277 | Input chunked array of list-struct.
278 | 
279 | Returns
280 | -------
281 | pa.ChunkedArray
282 | Chunked array of struct-list.
283 | """
284 | if chunked_array.num_chunks == 0:
285 | return pa.chunked_array([], type=transpose_list_struct_type(chunked_array.type))
286 | return pa.chunked_array([transpose_list_struct_array(array) for array in chunked_array.iterchunks()])
287 | 
288 | 
289 | def nested_types_mapper(type: pa.DataType) -> pd.ArrowDtype | NestedDtype:
290 | """Type mapper for pyarrow .to_pandas(types_mapper) methods."""
291 | from nested_pandas.series.dtype import NestedDtype
292 | 
293 | if pa.types.is_list(type):
294 | try:
295 | return NestedDtype(type)
296 | except (ValueError, TypeError):
297 | return pd.ArrowDtype(type)
298 | return pd.ArrowDtype(type)
299 | 
300 | 
301 | def table_to_struct_array(table: pa.Table) -> pa.ChunkedArray:
302 | """pa.Table.to_struct_array
303 | 
304 | pyarrow has a bug for empty tables:
305 | https://github.com/apache/arrow/issues/46355
306 | """
307 | if len(table) == 0:
308 | return pa.chunked_array([], type=pa.struct(table.schema))
309 | return table.to_struct_array()
310 | 
311 | 
312 | def table_from_struct_array(array: pa.ChunkedArray | pa.Array) -> pa.Table:
313 | """pa.Table.from_struct_array, but working with chunkless input"""
314 | if isinstance(array, pa.ChunkedArray) and array.num_chunks == 0:
315 | array = pa.array([], type=array.type)
316 | return pa.Table.from_struct_array(array)
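# A hedged sketch (not part of the module): using `nested_types_mapper`
# with pyarrow's `Table.to_pandas`. The table below is invented for
# illustration.
import pyarrow as pa

from nested_pandas import NestedDtype
from nested_pandas.series.utils import nested_types_mapper

example_table = pa.table(
    {
        "x": [1.0, 2.0],
        "points": pa.array([[{"t": 1.0}], [{"t": 2.0}, {"t": 3.0}]]),
    }
)
example_df = example_table.to_pandas(types_mapper=nested_types_mapper)
assert isinstance(example_df.dtypes["points"], NestedDtype)  # list-of-struct
assert not isinstance(example_df.dtypes["x"], NestedDtype)  # plain ArrowDtype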
317 | 
-------------------------------------------------------------------------------- /src/nested_pandas/utils/__init__.py: --------------------------------------------------------------------------------
1 | from .utils import * # noqa
2 | 
-------------------------------------------------------------------------------- /src/nested_pandas/utils/utils.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | from nested_pandas import NestedFrame
4 | 
5 | 
6 | def count_nested(df, nested, by=None, join=True) -> NestedFrame:
7 | """Counts the number of nested rows for each row of a nested dataframe.
8 | 
9 | Parameters
10 | ----------
11 | df : NestedFrame
12 | A NestedFrame that contains the desired `nested` series
13 | to count.
14 | nested : str
15 | The label of the nested series to count.
16 | by : str, optional
17 | Specifies a column within nested to count by, returning
18 | a count for each unique value in `by`.
19 | join : bool, optional
20 | Join the output count columns to df and return df, otherwise
21 | just return a NestedFrame containing only the count columns.
22 | 
23 | Returns
24 | -------
25 | NestedFrame
26 | 
27 | Examples
28 | --------
29 | 
30 | >>> import pandas as pd
31 | >>> # Show all columns
32 | >>> pd.set_option("display.width", 200)
33 | >>> pd.set_option("display.max_columns", None)
34 | >>> from nested_pandas.datasets.generation import generate_data
35 | >>> nf = generate_data(5, 10, seed=1)
36 | 
37 | >>> from nested_pandas.utils import count_nested
38 | >>> count_nested(nf, "nested")
39 | a b nested n_nested
40 | 0 0.417022 0.184677 [{t: 8.38389, flux: 10.233443, band: 'g'}; …] ... 10
41 | 1 0.720324 0.372520 [{t: 13.70439, flux: 41.405599, band: 'g'}; …]... 10
42 | 2 0.000114 0.691121 [{t: 4.089045, flux: 69.440016, band: 'g'}; …]... 10
43 | 3 0.302333 0.793535 [{t: 17.562349, flux: 41.417927, band: 'g'}; …... 10
44 | 4 0.146756 1.077633 [{t: 0.547752, flux: 4.995346, band: 'r'}; …] ... 10
45 | 
46 | `count_nested` also allows counting by a given subcolumn, for example we
47 | can count by "band" label:
48 | 
49 | >>> # join=False allows the result to be kept separate from the original nf
50 | >>> count_nested(nf, "nested", by="band", join=False)
51 | band n_nested_g n_nested_r
52 | 0 8 2
53 | 1 5 5
54 | 2 5 5
55 | 3 6 4
56 | 4 6 4
57 | """
58 | 
59 | if by is None:
60 | field_to_len = df[nested].nest.fields[0]
61 | counts = df[nested].nest.to_lists().apply(lambda x: len(x[field_to_len]), axis=1)
62 | counts.name = f"n_{nested}" # update name directly (rename causes issues downstream)
63 | else:
64 | # this may be able to be sped up using to_lists() as well
65 | counts = df[nested].apply(lambda x: x[by].value_counts(sort=False))
66 | counts = counts.rename(columns={colname: f"n_{nested}_{colname}" for colname in counts.columns})
67 | counts = counts.reindex(sorted(counts.columns), axis=1)
68 | if join:
69 | return df.join(counts)
70 | # else just return the counts NestedFrame
71 | if isinstance(counts, pd.Series): # for by=None, which returns a Series
72 | counts = NestedFrame(counts.to_frame())
73 | return counts
74 | 
-------------------------------------------------------------------------------- /tests/nested_pandas/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/tests/nested_pandas/conftest.py
-------------------------------------------------------------------------------- /tests/nested_pandas/datasets/test_generation.py: --------------------------------------------------------------------------------
1 | import pytest
2 | from nested_pandas.datasets import generate_data
3 | 
4 | 
5 | @pytest.mark.parametrize("n_layers", [10, {"nested_a": 10, "nested_b": 20}])
6 | def test_generate_data(n_layers):
7 | """test the data generator function"""
8 | nf = generate_data(10, n_layers, seed=1)
9 | 
10 | if isinstance(n_layers, int):
11 | assert len(nf.nested.nest.to_flat()) == 100
12 | 
13 | elif isinstance(n_layers, dict):
14 | assert "nested_a" in nf.columns
15 | assert "nested_b" in nf.columns
16 | 
17 | assert len(nf.nested_a.nest.to_flat()) == 100
18 | assert len(nf.nested_b.nest.to_flat()) == 200
19 | 
20 | 
21 | def test_generate_data_bad_input():
22 | """test a poor n_layer input to generate_data"""
23 | with pytest.raises(TypeError):
24 | generate_data(10, "nested", seed=1)
25 | 
-------------------------------------------------------------------------------- /tests/nested_pandas/e2e_tests/test_issue89.py: --------------------------------------------------------------------------------
1 | """Based on https://github.com/lincc-frameworks/nested-pandas/issues/89"""
2 | 
3 | import nested_pandas as npd
4 | import numpy as np
5 | 
6 | 
7 | def test_issue89():
8 | """Check that code snippet from issue 89 works as expected
9 | 
10 | https://github.com/lincc-frameworks/nested-pandas/issues/89
11 | """
12 | 
13 | # Load some ZTF data
14 | catalogs_dir = "https://epyc.astro.washington.edu/~lincc-frameworks/half_degree_surveys/ztf/"
15 | 
16 | object_ndf = npd.read_parquet(
17 | f"{catalogs_dir}/ztf_object/Norder=3/Dir=0/Npix=432.parquet",
18 | columns=["ra", "dec", "ps1_objid"],
19 | partitioning=None,
20 | ).set_index("ps1_objid")
21 | 
22 | source_ndf = npd.read_parquet(
23 | f"{catalogs_dir}/ztf_source/Norder=6/Dir=20000/Npix=27711.parquet",
24 | columns=["mjd", "mag", "magerr", "band", "ps1_objid", "catflags"],
25 | partitioning=None,
26 | ).set_index("ps1_objid")
27 | 
28 | object_ndf = object_ndf.add_nested(source_ndf, "ztf_source")
29 | 
30 | nf = object_ndf
31 | nf.reduce(np.mean, "ztf_source.mjd")
-------------------------------------------------------------------------------- /tests/nested_pandas/nestedframe/test_io.py: --------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | 
4 | import pandas as pd
5 | import pyarrow as pa
6 | import pyarrow.parquet as pq
7 | import pytest
8 | from nested_pandas import read_parquet
9 | from nested_pandas.datasets import generate_data
10 | from pandas.testing import assert_frame_equal
11 | from upath import UPath
12 | 
13 | 
14 | def test_read_parquet():
15 | """Test reading a parquet file with no columns specified"""
16 | # Load in the example file
17 | nf = read_parquet("tests/test_data/nested.parquet")
18 | 
19 | # Check the columns
20 | assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
21 | 
22 | # Make sure nested columns were recognized
23 | assert nf.nested_columns == ["nested", "lincc"]
24 | 
25 | # Check the nested columns
26 | assert nf.nested.nest.fields == ["t", "flux", "band"]
27 | assert nf.lincc.nest.fields == ["band", "frameworks"]
28 | 
29 | 
30 | def test_read_parquet_list():
31 | """Test reading a list of parquet files with no columns specified"""
32 | # Load in the example files
33 | single_file_nf = read_parquet("tests/test_data/nested.parquet")
34 | nf = read_parquet(["tests/test_data/nested.parquet", "tests/test_data/nested.parquet"])
35 | 
36 | # Check the columns
37 | assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
38 | 
39 | # Make sure nested columns were recognized
40 | assert nf.nested_columns == ["nested", "lincc"]
41 | 
42 | # Check the nested columns
43 | assert nf.nested.nest.fields == ["t", "flux", "band"]
44 | assert nf.lincc.nest.fields == ["band", "frameworks"]
45 | 
46 | # Check that loading a list works correctly
47 | assert len(nf) == 2 * len(single_file_nf)
48 | 
49 | 
50 | def test_read_parquet_directory():
51 | """Test reading a directory of parquet files with no columns specified"""
52 | # Load in the example file
53 | nf = read_parquet("tests/test_data")
54 | 
55 | # Check the columns
56 | assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
57 | 
58 | # Make sure nested columns were recognized
59 | assert nf.nested_columns == ["nested", "lincc"]
60 | 
61 | # Check the nested columns
62 | assert nf.nested.nest.fields == ["t", "flux", "band"]
63 | assert nf.lincc.nest.fields == ["band", "frameworks"]
64 | 
65 | 
66 | def test_read_parquet_directory_with_filesystem():
67 | """Test reading a parquet directory with an explicit filesystem"""
68 | # Load in the example file
69 | path = UPath("tests/test_data")
70 | nf = read_parquet(path.path, filesystem=path.fs)
71 | 
72 | # Check the columns
73 | assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
74 | 
75 | # Make sure nested columns were recognized
76 | assert nf.nested_columns == ["nested", "lincc"]
77 | 
78 | # Check the nested columns
79 | assert nf.nested.nest.fields == ["t", "flux", "band"]
80 | assert nf.lincc.nest.fields == ["band", "frameworks"]
81 | 
82 | 
83 | def test_file_object_read_parquet():
84 | """Test reading parquet from a file-object"""
85 | with open("tests/test_data/nested.parquet", "rb") as f:
86 | nf = read_parquet(f)
87 | # Check the columns
88 | assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
89 | # Make sure nested columns were recognized
90 | assert nf.nested_columns == ["nested", "lincc"]
91 | # Check the nested columns
92 | assert nf.nested.nest.fields == ["t", "flux", "band"]
93 | assert nf.lincc.nest.fields == ["band", "frameworks"]
94 | 
95 | 
96 | @pytest.mark.parametrize(
97 | "columns",
98 | [
99 | ["a", "flux"],
100 | ["flux", "nested", "lincc"],
101 | ["nested.flux", "nested.band"],
102 | ["flux", "nested.flux"],
103 | ["nested.band", "lincc.band"],
104 | ],
105 | )
106 | def test_read_parquet_column_selection(columns):
107 | """Test reading a parquet file with column selection"""
108 | # Load in the example file
109 | nf = read_parquet("tests/test_data/nested.parquet", columns=columns)
110 | 
111 | # Output expectations
112 | if columns == ["a", "flux"]:
113 | expected_columns = ["a", "flux"]
114 | elif columns == ["flux", "nested", "lincc"]:
115 | expected_columns = ["flux", "nested", "lincc"]
116 | elif columns == ["nested.flux", "nested.band"]:
117 | expected_columns = ["nested"]
118 | elif columns == ["flux", "nested.flux"]:
119 | expected_columns = ["flux", "nested"]
120 | elif columns == ["nested.band", "lincc.band"]:
121 | expected_columns = ["nested", "lincc"]
122 | 
123 | # Check the columns
124 | assert nf.columns.tolist() == expected_columns
125 | 
126 | # Check nested columns
127 | if columns == ["nested.flux", "nested.t"]:
128 | assert nf.nested.nest.fields == ["flux", "t"]
129 | elif columns == ["nested.band", "lincc.band"]:
130 | assert nf.nested.nest.fields == ["band"]
131 | assert nf.lincc.nest.fields == ["band"]
132 | 
133 | 
134 | @pytest.mark.parametrize("reject", [["nested"], "nested"])
135 | def test_read_parquet_reject_nesting(reject):
136 | """Test reading a parquet file while rejecting nesting for a column"""
137 | # Load in the example file
138 | nf = read_parquet("tests/test_data/nested.parquet", columns=["a", "nested"], reject_nesting=reject)
139 | 
140 | # Check the columns
141 | assert nf.columns.tolist() == ["a", "nested"]
142 | 
143 | # Make sure "nested" was not recognized as a nested column
144 | assert nf.nested_columns == []
145 | 
146 | assert pa.types.is_struct(nf["nested"].dtype.pyarrow_dtype)
147 | 
148 | 
149 | def test_read_parquet_reject_nesting_partial_loading():
150 | """Test reading a parquet file with partial loading and rejected nesting"""
151 | # Load in the example file
152 | nf = read_parquet("tests/test_data/nested.parquet", columns=["a", "nested.t"], reject_nesting=["nested"])
153 | 
154 | # Check the columns
155 | assert nf.columns.tolist() == ["a", "t"]
156 | 
157 | 
158 | def test_read_parquet_catch_full_and_partial():
159 | """Test that selecting a full column and one of its subcolumns raises"""
160 | # Load in the example file
161 | with pytest.raises(ValueError):
162 | read_parquet("tests/test_data/nested.parquet", columns=["a", "nested.t", "nested"])
163 | 
164 | 
165 | def test_read_parquet_catch_failed_cast():
166 | """Test that a failed cast to a nested column raises"""
167 | # Load in the example file
168 | with pytest.raises(ValueError):
169 | read_parquet("tests/test_data/not_nestable.parquet")
170 | 
171 | 
172 | def test_read_parquet_test_mixed_struct():
173 | """Test reading a parquet file with mixed struct types"""
174 | # Create the pure-list StructArray
175 | field1 = pa.array([[1, 2], [3, 4], [5, 6]])
176 | field2 = pa.array([["a", "b"], ["b", "c"], ["c", "d"]])
177 | field3 = pa.array([[True, False], [True, False], [True, False]])
178 | struct_array_list = pa.StructArray.from_arrays([field1, field2, field3], ["list1", "list2", "list3"])
179 | 
180 | # Create the value StructArray
181 | field1 = pa.array([1, 2, 3])
182 | field2 = pa.array(["a", "b", "c"])
183 | field3 = pa.array([True, False, True])
184 | struct_array_val = pa.StructArray.from_arrays([field1, field2, field3], ["val1", "val2", "val3"])
185 | 
186 | # Create the mixed-list StructArray
187 | field1 = pa.array([1, 2, 3])
188 | field2 = pa.array(["a", "b", "c"])
189 | field3 = pa.array([[True, False], [True, False], [True, False]])
190 | struct_array_mix = pa.StructArray.from_arrays([field1, field2, field3], ["val1", "val2", "list3"])
191 | 
192 | # Create a PyArrow Table with the StructArray as one of the columns
193 | table = pa.table(
194 | {
195 | "id": pa.array([100, 101, 102]), # Another column
196 | "struct_list": struct_array_list, # Struct column
197 | "struct_value": struct_array_val,
198 | "struct_mix": struct_array_mix,
199 | }
200 | )
201 | 
202 | # Write to a temporary file
203 | with tempfile.TemporaryDirectory() as tmpdir:
204 | pq.write_table(table, os.path.join(tmpdir, "structs.parquet"))
205 | 
206 | # Test full read
207 | nf = read_parquet(os.path.join(tmpdir, "structs.parquet"))
208 | assert nf.columns.tolist() == ["id", "struct_list", "struct_value", "struct_mix"]
209 | assert nf.nested_columns == ["struct_list"]
210 | 
211 | # Test partial read
212 | nf = read_parquet(os.path.join(tmpdir, "structs.parquet"), columns=["id", "struct_mix.list3"])
213 | assert nf.columns.tolist() == ["id", "struct_mix"]
214 | assert nf.nested_columns == ["struct_mix"]
215 | 
216 | # Test partial read with ordering to force reject pops
217 | nf = read_parquet(
218 | os.path.join(tmpdir, "structs.parquet"), columns=["id", "struct_mix.list3", "struct_mix.val1"]
219 | )
220 | assert nf.columns.tolist() == ["id", "list3", "val1"]
221 | assert len(nf.nested_columns) == 0
222 | 
223 | 
224 | def test_to_parquet():
225 | """Test writing a parquet file and reading it back"""
226 | # Load in the example file
227 | nf = read_parquet("tests/test_data/nested.parquet")
228 | 
229 | # Write to a temporary file
230 | with tempfile.TemporaryDirectory() as tmpdir:
231 | nf.to_parquet(os.path.join(tmpdir, "nested.parquet"))
232 | 
233 | # Read the file back in
234 | nf2 = read_parquet(os.path.join(tmpdir, "nested.parquet"))
235 | 
236 | # Check the columns
237 | assert nf.columns.tolist() ==
nf2.columns.tolist() 238 | 239 | # Check the nested columns 240 | assert nf.nested_columns == nf2.nested_columns 241 | 242 | # Check the data 243 | assert_frame_equal(nf, nf2) 244 | 245 | 246 | def test_pandas_read_parquet(): 247 | """Test that pandas can read our serialized files""" 248 | 249 | nf = generate_data(10, 100, seed=1) 250 | with tempfile.TemporaryDirectory() as tmpdir: 251 | nf.to_parquet(os.path.join(tmpdir, "nested_for_pd.parquet")) 252 | # Load in the example file 253 | df = pd.read_parquet(os.path.join(tmpdir, "nested_for_pd.parquet")) 254 | 255 | # Check the columns 256 | assert df.columns.tolist() == ["a", "b", "nested"] 257 | 258 | 259 | def test_read_empty_parquet(): 260 | """Test that we can read empty parquet files""" 261 | orig_nf = generate_data(1, 2).iloc[:0] 262 | 263 | with tempfile.NamedTemporaryFile("wb", suffix="parquet") as tmpfile: 264 | orig_nf.to_parquet(tmpfile.name) 265 | # All columns 266 | # Do not check dtype because of: 267 | # https://github.com/lincc-frameworks/nested-pandas/issues/252 268 | assert_frame_equal(read_parquet(tmpfile.name), orig_nf, check_dtype=False) 269 | # Few columns 270 | assert_frame_equal( 271 | read_parquet( 272 | tmpfile.name, 273 | columns=[ 274 | "a", 275 | "nested.flux", 276 | "nested.band", 277 | ], 278 | ), 279 | orig_nf.drop(["b", "nested.t"], axis=1), 280 | check_dtype=False, 281 | ) 282 | -------------------------------------------------------------------------------- /tests/nested_pandas/series/test_dtype.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pyarrow as pa 3 | import pytest 4 | from nested_pandas.datasets import generate_data 5 | from nested_pandas.nestedframe import NestedFrame 6 | from nested_pandas.series.dtype import NestedDtype 7 | from nested_pandas.series.ext_array import NestedExtensionArray 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "pyarrow_dtype", 12 | [ 13 | pa.struct([pa.field("a", pa.list_(pa.int64()))]), 14 | pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]), 15 | pa.struct( 16 | [ 17 | pa.field("a", pa.list_(pa.int64())), 18 | pa.field("b", pa.list_(pa.struct([pa.field("c", pa.int64())]))), 19 | ] 20 | ), 21 | ], 22 | ) 23 | def test_from_pyarrow_dtype_struct_list(pyarrow_dtype): 24 | """Test that we can construct NestedDtype from pyarrow struct type.""" 25 | dtype = NestedDtype(pyarrow_dtype) 26 | assert dtype.pyarrow_dtype == pyarrow_dtype 27 | 28 | 29 | @pytest.mark.parametrize( 30 | "pyarrow_dtype", 31 | [ 32 | pa.list_(pa.struct([pa.field("a", pa.int64())])), 33 | pa.list_(pa.struct([pa.field("a", pa.int64()), pa.field("b", pa.float64())])), 34 | pa.list_( 35 | pa.struct( 36 | [ 37 | pa.field("a", pa.list_(pa.int64())), 38 | pa.field("b", pa.list_(pa.float64())), 39 | ] 40 | ) 41 | ), 42 | ], 43 | ) 44 | def test_from_pyarrow_dtype_list_struct(pyarrow_dtype): 45 | """Test that we can construct NestedDtype from pyarrow list type.""" 46 | dtype = NestedDtype(pyarrow_dtype) 47 | assert dtype.list_struct_pa_dtype == pyarrow_dtype 48 | 49 | 50 | @pytest.mark.parametrize( 51 | "pyarrow_dtype", 52 | [ 53 | pa.int64(), 54 | pa.list_(pa.int64()), 55 | pa.struct([pa.field("a", pa.int64())]), 56 | pa.struct([pa.field("a", pa.int64()), pa.field("b", pa.float64())]), 57 | pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.float64())]), 58 | ], 59 | ) 60 | def test_from_pyarrow_dtype_raises(pyarrow_dtype): 61 | """Test that we raise an error when constructing NestedDtype from 
invalid pyarrow type.""" 62 | with pytest.raises(ValueError): 63 | NestedDtype(pyarrow_dtype) 64 | 65 | 66 | def test_to_pandas_arrow_dtype(): 67 | """Test that NestedDtype.to_pandas_arrow_dtype() returns the correct pyarrow struct type.""" 68 | dtype = NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()}) 69 | assert dtype.to_pandas_arrow_dtype() == pd.ArrowDtype( 70 | pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) 71 | ) 72 | 73 | 74 | def test_from_pandas_arrow_dtype(): 75 | """Test that we can construct NestedDtype from pandas.ArrowDtype.""" 76 | dtype_from_struct = NestedDtype.from_pandas_arrow_dtype( 77 | pd.ArrowDtype(pa.struct([pa.field("a", pa.list_(pa.int64()))])) 78 | ) 79 | assert dtype_from_struct.pyarrow_dtype == pa.struct([pa.field("a", pa.list_(pa.int64()))]) 80 | dtype_from_list = NestedDtype.from_pandas_arrow_dtype( 81 | pd.ArrowDtype(pa.list_(pa.struct([pa.field("a", pa.int64())]))) 82 | ) 83 | assert dtype_from_list.pyarrow_dtype == pa.struct([pa.field("a", pa.list_(pa.int64()))]) 84 | 85 | 86 | def test_to_pandas_list_struct_arrow_dtype(): 87 | """Test that NestedDtype.to_pandas_arrow_dtype(list_struct=True) returns the correct pyarrow type.""" 88 | dtype = NestedDtype.from_fields({"a": pa.list_(pa.int64()), "b": pa.float64()}) 89 | assert dtype.to_pandas_arrow_dtype(list_struct=True) == pd.ArrowDtype( 90 | pa.list_(pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.float64())])) 91 | ) 92 | 93 | 94 | def test_from_fields(): 95 | """Test NestedDtype.from_fields().""" 96 | fields = {"a": pa.int64(), "b": pa.float64()} 97 | dtype = NestedDtype.from_fields(fields) 98 | assert dtype.pyarrow_dtype == pa.struct( 99 | [pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))] 100 | ) 101 | 102 | 103 | def test_na_value(): 104 | """Test that NestedDtype.na_value is a singleton instance of NAType.""" 105 | dtype = NestedDtype(pa.struct([pa.field("a", pa.list_(pa.int64()))])) 106 | assert dtype.na_value is pd.NA 107 | 108 | 109 | def test_fields(): 110 | """Test NestedDtype.fields property""" 111 | dtype = NestedDtype( 112 | pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) 113 | ) 114 | assert dtype.fields == {"a": pa.int64(), "b": pa.float64()} 115 | 116 | 117 | def test_field_names(): 118 | """Test NestedDtype.field_names property""" 119 | dtype = NestedDtype( 120 | pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) 121 | ) 122 | assert dtype.field_names == ["a", "b"] 123 | 124 | 125 | @pytest.mark.parametrize( 126 | "fields", 127 | [ 128 | {"a": pa.int64(), "b": pa.float64()}, 129 | {"a": pa.int64(), "b": pa.float64(), "c": pa.int64()}, 130 | {"a": pa.string(), "b": pa.float64()}, 131 | # Nested / parametric types are not implemented. 
132 | # {"a": pa.list_(pa.int64()), "b": pa.float64()},
133 | # {"a": pa.list_(pa.int64()), "b": pa.list_(pa.string())},
134 | # {"a": pa.struct([pa.field("a", pa.int64())]), "b": pa.list_(pa.int64())},
135 | ],
136 | )
137 | def test_name_vs_construct_from_string(fields):
138 | """Test that dtype.name is consistent with dtype.construct_from_string(dtype.name)."""
139 | dtype = NestedDtype.from_fields(fields)
140 | assert dtype == NestedDtype.construct_from_string(dtype.name)
141 | 
142 | 
143 | def test_name_multiple_nested():
144 | """Check string representation of a multiple-nested dtype."""
145 | nf = generate_data(10, 2)
146 | # Add a column to nest on
147 | nf = nf.assign(id=[0, 0, 1, 1, 2, 2, 3, 3, 4, 4])
148 | nf = nf.rename(columns={"nested": "inner"})
149 | nnf = NestedFrame.from_flat(nf, base_columns=[], on="id", name="outer")
150 | assert (
151 | nnf["outer"].dtype.name
152 | == "nested<a: [double], b: [double], inner: [nested<t: [double], flux: [double], band: [string]>]>"
153 | )
154 | 
155 | 
156 | @pytest.mark.parametrize(
157 | "s",
158 | [
159 | "float", # not a nested type
160 | "nested(f: [int64])", # must be <> instead
161 | "ts<f: [int64]>", # 'ts' was a previous name, now we use 'nested'
162 | "nested<f>", # no type specified
163 | "nested<>", # no field specified
165 | "nested<int64>", # no field name specified
166 | "nested<[int64]>", # no field name specified
167 | "nested<f:[int64]>", # separator must be ": " with space
168 | "nested<f: [int64],g: [float64]>", # separator must be ", " with space
169 | "nested<f: int64>", # missed [] - nested list
170 | "nested<f: [not_a_type]>", # not an arrow type
171 | "nested<f: [list<item: double>]>", # complex arrow types are not supported
172 | ],
173 | )
174 | def test_construct_from_string_raises(s):
175 | """Test that we raise an error when constructing NestedDtype from invalid string."""
176 | with pytest.raises(TypeError):
177 | NestedDtype.construct_from_string(s)
178 | 
179 | 
180 | def test_construct_array_type():
181 | """Test that NestedDtype.construct_array_type() returns NestedExtensionArray."""
182 | assert NestedDtype.construct_array_type() is NestedExtensionArray
-------------------------------------------------------------------------------- /tests/nested_pandas/series/test_series_utils.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pyarrow as pa
3 | import pytest
4 | from nested_pandas import NestedDtype
5 | from nested_pandas.series.utils import (
6 | nested_types_mapper,
7 | transpose_list_struct_array,
8 | transpose_list_struct_scalar,
9 | transpose_list_struct_type,
10 | transpose_struct_list_array,
11 | transpose_struct_list_type,
12 | validate_struct_list_array_for_equal_lengths,
13 | )
14 | 
15 | 
16 | def test_validate_struct_list_array_for_equal_lengths():
17 | """Test validate_struct_list_array_for_equal_lengths function."""
18 | # Raises for wrong types
19 | with pytest.raises(ValueError):
20 | validate_struct_list_array_for_equal_lengths(pa.array([], type=pa.int64()))
21 | with pytest.raises(ValueError):
22 | validate_struct_list_array_for_equal_lengths(pa.array([], type=pa.list_(pa.int64())))
23 | 
24 | # Raises if one of the fields is not a ListArray
25 | with pytest.raises(ValueError):
26 | validate_struct_list_array_for_equal_lengths(
27 | pa.StructArray.from_arrays([pa.array([[1, 2], [3, 4, 5]]), pa.array([1, 2])], ["a", "b"])
28 | )
29 | 
30 | # Raises for mismatched lengths
31 | with pytest.raises(ValueError):
32 | validate_struct_list_array_for_equal_lengths(
33 | pa.StructArray.from_arrays(
34 | [pa.array([[1, 2], [3, 4, 5]]), pa.array([[1, 2, 3], [4, 5]])], ["a", "b"]
35 | )
36 | )
37 | 
38 | input_array =
pa.StructArray.from_arrays( 39 | arrays=[ 40 | pa.array([[1, 2], [3, 4], [], [5, 6, 7]]), 41 | pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]]), 42 | ], 43 | names=["a", "b"], 44 | ) 45 | assert validate_struct_list_array_for_equal_lengths(input_array) is None 46 | 47 | 48 | def test_transpose_struct_list_type(): 49 | """Test transpose_struct_list_type function.""" 50 | # Raises for wrong types 51 | with pytest.raises(ValueError): 52 | transpose_struct_list_type(pa.int64()) 53 | with pytest.raises(ValueError): 54 | transpose_struct_list_type(pa.list_(pa.int64())) 55 | 56 | # Raises if one of the fields is not a ListType 57 | with pytest.raises(ValueError): 58 | transpose_struct_list_type(pa.struct([("a", pa.int64()), ("b", pa.int64())])) 59 | 60 | input_type = pa.struct([("a", pa.list_(pa.int64())), ("b", pa.list_(pa.string()))]) 61 | expected_output = pa.list_(pa.struct([("a", pa.int64()), ("b", pa.string())])) 62 | assert transpose_struct_list_type(input_type) == expected_output 63 | 64 | 65 | def test_transpose_list_struct_type(): 66 | """Test transpose_list_struct_type function.""" 67 | # Raises for wrong types 68 | with pytest.raises(ValueError): 69 | transpose_list_struct_type(pa.int64()) 70 | with pytest.raises(ValueError): 71 | transpose_list_struct_type(pa.struct([("a", pa.int64()), ("b", pa.int64())])) 72 | 73 | input_type = pa.list_(pa.struct([("a", pa.int64()), ("b", pa.string())])) 74 | expected_output = pa.struct([("a", pa.list_(pa.int64())), ("b", pa.list_(pa.string()))]) 75 | assert transpose_list_struct_type(input_type) == expected_output 76 | 77 | 78 | def test_transpose_struct_list_array(): 79 | """Test transpose_struct_list_array function.""" 80 | input_array = pa.StructArray.from_arrays( 81 | arrays=[ 82 | pa.array([[1, 2], [3, 4], [], [5, 6, 7]]), 83 | pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]]), 84 | ], 85 | names=["a", "b"], 86 | ) 87 | desired = pa.array( 88 | [ 89 | [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}], 90 | [{"a": 3, "b": "y"}, {"a": 4, "b": "x"}], 91 | [], 92 | [{"a": 5, "b": "d"}, {"a": 6, "b": "e"}, {"a": 7, "b": "f"}], 93 | ] 94 | ) 95 | actual = transpose_struct_list_array(input_array) 96 | assert actual == desired 97 | 98 | 99 | def test_transpose_list_struct_array(): 100 | """Test transpose_list_struct_array function.""" 101 | input_array = pa.array( 102 | [ 103 | [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}], 104 | [{"a": 3, "b": "y"}, {"a": 4, "b": "x"}], 105 | [], 106 | [{"a": 5, "b": "d"}, {"a": 6, "b": "e"}, {"a": 7, "b": "f"}], 107 | ] 108 | ) 109 | desired = pa.StructArray.from_arrays( 110 | arrays=[ 111 | pa.array([[1, 2], [3, 4], [], [5, 6, 7]]), 112 | pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]]), 113 | ], 114 | names=["a", "b"], 115 | ) 116 | actual = transpose_list_struct_array(input_array) 117 | assert actual == desired 118 | 119 | 120 | def test_transpose_list_struct_scalar(): 121 | """Test transpose_list_struct_scalar function.""" 122 | input_scalar = pa.scalar([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]) 123 | desired = pa.scalar({"a": [1, 2], "b": ["x", "y"]}) 124 | actual = transpose_list_struct_scalar(input_scalar) 125 | assert actual == desired 126 | 127 | 128 | @pytest.mark.parametrize( 129 | "pa_type,is_nested", 130 | [ 131 | (pa.float64(), False), 132 | (pa.list_(pa.float64()), False), 133 | (pa.list_(pa.struct([("a", pa.float64()), ("b", pa.float64())])), True), 134 | ], 135 | ) 136 | def test_nested_types_mapper(pa_type, is_nested): 137 | """Test nested_types_mapper function.""" 138 | dtype = 
nested_types_mapper(pa_type) 139 | if is_nested: 140 | assert isinstance(dtype, NestedDtype) 141 | assert dtype.list_struct_pa_dtype == pa_type 142 | else: 143 | assert isinstance(dtype, pd.ArrowDtype) 144 | assert dtype.pyarrow_dtype == pa_type 145 | -------------------------------------------------------------------------------- /tests/nested_pandas/test_packaging.py: -------------------------------------------------------------------------------- 1 | import nested_pandas 2 | 3 | 4 | def test_version(): 5 | """Check to see that the version property returns something""" 6 | assert nested_pandas.__version__ is not None 7 | -------------------------------------------------------------------------------- /tests/nested_pandas/utils/test_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from nested_pandas import NestedFrame 5 | from nested_pandas.utils import count_nested 6 | 7 | 8 | @pytest.mark.parametrize("join", [True, False]) 9 | def test_count_nested(join): 10 | """Test the functionality of count nested""" 11 | 12 | # Initialize test data 13 | base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2]) 14 | nested = pd.DataFrame( 15 | data={ 16 | "c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], 17 | "d": [5, 4, 7, 5, 3, 1, 9, 3, 4], 18 | "label": ["b", "a", "b", "b", "a", "a", "b", "a", "b"], 19 | }, 20 | index=[0, 0, 0, 1, 1, 1, 2, 2, 2], 21 | ) 22 | base = base.add_nested(nested, "nested") 23 | 24 | # Test general count 25 | total_counts = count_nested(base, "nested", join=join) 26 | assert all(total_counts["n_nested"].values == 3) 27 | 28 | # Test count by 29 | label_counts = count_nested(base, "nested", by="label", join=join) 30 | 31 | assert all(label_counts["n_nested_a"].values == [1, 2, 1]) 32 | assert all(label_counts["n_nested_b"].values == [2, 1, 2]) 33 | 34 | # Make sure the ordering is alphabetical 35 | # https://github.com/lincc-frameworks/nested-pandas/issues/109 36 | assert label_counts.columns[-1] == "n_nested_b" 37 | assert label_counts.columns[-2] == "n_nested_a" 38 | 39 | # Test join behavior 40 | if join: 41 | assert total_counts.columns.tolist() == base.columns.tolist() + ["n_nested"] 42 | assert label_counts.columns.tolist() == base.columns.tolist() + ["n_nested_a", "n_nested_b"] 43 | else: 44 | assert total_counts.columns.tolist() == ["n_nested"] 45 | assert label_counts.columns.tolist() == ["n_nested_a", "n_nested_b"] 46 | 47 | 48 | def test_check_expr_nesting(): 49 | """ 50 | Test the correctness of the evaluation expression pre-flight checks, which are 51 | used to ensure that an expression-based query does not try to combine base and nested 52 | sub-expressions. 
53 | """ 54 | base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2]) 55 | nested = pd.DataFrame( 56 | data={ 57 | "c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], 58 | "d": [5, 4, 7, 5, 3, 1, 9, 3, 4], 59 | "label": ["b", "a", "b", "b", "a", "a", "b", "a", "b"], 60 | }, 61 | index=[0, 0, 0, 1, 1, 1, 2, 2, 2], 62 | ) 63 | b1 = base.add_nested(nested, "nested") 64 | assert b1.extract_nest_names("a > 2 & nested.c > 1") == {"", "nested"} 65 | assert b1.extract_nest_names("(nested.c > 1) and (nested.d>2)") == {"nested"} 66 | assert b1.extract_nest_names("-1.52e-5 < b < 35.2e2") == {""} 67 | 68 | b2 = base.add_nested(nested.copy(), "n") 69 | assert b2.extract_nest_names("(n.c > 1) and ((b + a) > (b - 1e-8)) or n.d > a") == {"n", ""} 70 | 71 | abc = pd.DataFrame( 72 | data={ 73 | "c": [3, 1, 4, 1, 5, 9, 2, 6, 5], 74 | "d": [1, 4, 1, 2, 1, 3, 5, 6, 2], 75 | "g": ["a", "b", "c", "d", "e", "f", "g", "h", "i"], 76 | }, 77 | index=[0, 0, 0, 1, 1, 1, 2, 2, 2], 78 | ) 79 | b3 = base.add_nested(abc, "abc").add_nested(abc, "c") 80 | assert b3.extract_nest_names("abc.c > 2 & c.d < 5") == {"abc", "c"} 81 | 82 | assert b3.extract_nest_names("(abc.d > 3) & (abc.c == [2, 5])") == {"abc"} 83 | assert b3.extract_nest_names("(abc.d > 3)&(abc.g == 'f')") == {"abc"} 84 | assert b3.extract_nest_names("(abc.d > 3) & (abc.g == 'f')") == {"abc"} 85 | 86 | assert b1.extract_nest_names("a>3") == {""} 87 | assert b1.extract_nest_names("a > 3") == {""} 88 | 89 | b4 = base.add_nested(nested, "test") 90 | assert b4.extract_nest_names("test.c>5&b==2") == {"test", ""} 91 | assert b4.extract_nest_names("test.c > 5 & b == 2") == {"test", ""} 92 | -------------------------------------------------------------------------------- /tests/test_data/nested.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/tests/test_data/nested.parquet -------------------------------------------------------------------------------- /tests/test_data/not_nestable.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/tests/test_data/not_nestable.parquet -------------------------------------------------------------------------------- /tests/test_data/vsx-x-ztfdr22_lc-m31.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lincc-frameworks/nested-pandas/78deda7f896727baa7be7990ab159d9236d9f68c/tests/test_data/vsx-x-ztfdr22_lc-m31.parquet --------------------------------------------------------------------------------