├── .github
│   └── workflows
│       ├── black-ruff.yml
│       ├── check-urls.yml
│       ├── codeql.yml
│       ├── documentation.yml
│       └── wheels-any.yml
├── .gitignore
├── .local.jenkins.lin.yml
├── CHANGELOGS.rst
├── CODE_OF_CONDUCT.md
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── _doc
│   ├── _static
│   │   ├── git_logo.png
│   │   ├── project_ico.ico
│   │   └── project_ico.png
│   ├── api
│   │   ├── connex_split.rst
│   │   ├── dataframe.rst
│   │   ├── dataframe_io.rst
│   │   ├── dataframe_split.rst
│   │   ├── index.rst
│   │   ├── rdata.rst
│   │   ├── rdf.rst
│   │   ├── rexc.rst
│   │   └── rio.rst
│   ├── conf.py
│   ├── examples
│   │   ├── README.txt
│   │   └── first_step.py
│   ├── i_ex.rst
│   ├── index.rst
│   ├── license.rst
│   ├── sg_execution_times.rst
│   └── tutorial
│       └── index.rst
├── _unittests
│   ├── ut_df
│   │   ├── data
│   │   │   ├── buggy_hash.csv
│   │   │   ├── buggy_hash2.csv
│   │   │   ├── classic.json
│   │   │   ├── example.json
│   │   │   └── example2.json
│   │   ├── test_connex_split.py
│   │   ├── test_connex_split_big.py
│   │   ├── test_connex_split_cat.py
│   │   ├── test_dataframe_helpers.py
│   │   ├── test_dataframe_helpers_simple.py
│   │   ├── test_dataframe_io.py
│   │   ├── test_dataframe_io_helpers.py
│   │   ├── test_dataframe_sort.py
│   │   ├── test_pandas_groupbynan.py
│   │   └── test_streaming_dataframe.py
│   └── ut_module
│       └── test_sklearn.py
├── appveyor.yml
├── azure-pipelines.yml
├── pandas_streaming
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   └── dummy.py
│   ├── df
│   │   ├── __init__.py
│   │   ├── connex_split.py
│   │   ├── dataframe.py
│   │   ├── dataframe_helpers.py
│   │   ├── dataframe_io.py
│   │   ├── dataframe_io_helpers.py
│   │   └── dataframe_split.py
│   ├── exc
│   │   ├── __init__.py
│   │   └── exc_streaming.py
│   └── ext_test_case.py
├── pyproject.toml
├── requirements-dev.txt
├── requirements.txt
├── setup.cfg
└── setup.py
/.github/workflows/black-ruff.yml:
--------------------------------------------------------------------------------
1 | name: Black + Ruff Format Checker
2 | on: [push, pull_request]
3 | jobs:
4 | black-format-check:
5 | runs-on: ubuntu-latest
6 | steps:
7 | - uses: actions/checkout@v2
8 | - uses: psf/black@stable
9 | with:
10 | options: "--diff --check"
11 | src: "."
12 | ruff-format-check:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/checkout@v3
16 | - uses: chartboost/ruff-action@v1
17 |
--------------------------------------------------------------------------------
/.github/workflows/check-urls.yml:
--------------------------------------------------------------------------------
1 | name: Check URLs
2 |
3 | on:
4 | pull_request:
5 | branches: [main]
6 | schedule:
7 | # ┌───────────── minute (0 - 59)
8 | # │ ┌───────────── hour (0 - 23)
9 | # │ │ ┌───────────── day of the month (1 - 31)
10 | # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
11 | # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
12 | # │ │ │ │ │
13 | # │ │ │ │ │
14 | # │ │ │ │ │
15 | # * * * * *
16 | - cron: '30 1 * * 0'
17 |
18 | jobs:
19 | build:
20 | runs-on: ubuntu-latest
21 |
22 | steps:
23 | - uses: actions/checkout@v3
24 |
25 | - name: urls-checker-code
26 | uses: urlstechie/urlchecker-action@master
27 | with:
28 | subfolder: pandas_streaming
29 | file_types: .md,.py,.rst,.ipynb
30 | print_all: false
31 | timeout: 2
32 | # retry_count: 2
33 | # exclude_urls: https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz,https://dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz
34 | # exclude_patterns: https://dumps.wikimedia.org/
35 | # force_pass : true
36 |
37 | - name: urls-checker-docs
38 | uses: urlstechie/urlchecker-action@master
39 | with:
40 | subfolder: _doc
41 | file_types: .md,.py,.rst,.ipynb
42 | print_all: false
43 | timeout: 2
44 | # retry_count: 2
45 | # exclude_urls: https://hal.archives-ouvertes.fr/hal-00990252/document
46 | exclude_patterns: https://circleci.com/gh/sdpython/pandas_streaming/
47 | # force_pass : true
48 |
--------------------------------------------------------------------------------
/.github/workflows/codeql.yml:
--------------------------------------------------------------------------------
1 | name: "Code Scanning - Action"
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | pull_request:
7 | branches: [main]
8 | schedule:
9 | # ┌───────────── minute (0 - 59)
10 | # │ ┌───────────── hour (0 - 23)
11 | # │ │ ┌───────────── day of the month (1 - 31)
12 | # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
13 | # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
14 | # │ │ │ │ │
15 | # │ │ │ │ │
16 | # │ │ │ │ │
17 | # * * * * *
18 | - cron: '30 1 * * 0'
19 |
20 | jobs:
21 | CodeQL-Build:
22 | # CodeQL runs on ubuntu-latest, windows-latest, and macos-latest
23 | runs-on: ubuntu-latest
24 |
25 | permissions:
26 | # required for all workflows
27 | security-events: write
28 |
29 | # only required for workflows in private repositories
30 | actions: read
31 | contents: read
32 |
33 | steps:
34 | - name: Checkout repository
35 | uses: actions/checkout@v3
36 |
37 | # Initializes the CodeQL tools for scanning.
38 | - name: Initialize CodeQL
39 | uses: github/codeql-action/init@v2
40 | # Override language selection by uncommenting this and choosing your languages
41 | # with:
42 | # languages: go, javascript, csharp, python, cpp, java, ruby
43 |
44 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java).
45 | # If this step fails, then you should remove it and run the build manually (see below).
46 | - name: Autobuild
47 | uses: github/codeql-action/autobuild@v2
48 |
49 | # ℹ️ Command-line programs to run using the OS shell.
50 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
51 |
52 | # ✏️ If the Autobuild fails above, remove it and uncomment the following
53 | # three lines and modify them (or add more) to build your code if your
54 | # project uses a compiled language
55 |
56 | #- run: |
57 | # make bootstrap
58 | # make release
59 |
60 | - name: Perform CodeQL Analysis
61 | uses: github/codeql-action/analyze@v2
62 |
--------------------------------------------------------------------------------
/.github/workflows/documentation.yml:
--------------------------------------------------------------------------------
1 | name: Documentation and Code Coverage
2 |
3 | on:
4 | push:
5 | pull_request:
6 | types:
7 | - closed
8 | branches:
9 | - main
10 |
11 | jobs:
12 | run:
13 | name: Build documentation on ${{ matrix.os }}
14 | runs-on: ${{ matrix.os }}
15 | strategy:
16 | matrix:
17 | os: [ubuntu-latest]
18 |
19 | steps:
20 | - uses: actions/checkout@v3
21 |
22 | - uses: actions/setup-python@v4
23 | with:
24 | python-version: '3.11'
25 |
26 | - uses: tlylt/install-graphviz@v1
27 |
28 | - name: Install pandoc
29 | run: sudo apt-get install -y pandoc
30 |
31 | - name: Install requirements
32 | run: python -m pip install -r requirements.txt
33 |
34 | - name: Install requirements dev
35 | run: python -m pip install -r requirements-dev.txt
36 |
37 | - name: Cache pip
38 | uses: actions/cache@v2
39 | with:
40 | path: ~/.cache/pip
41 | key: ${{ runner.os }}-pip-${{ hashFiles('requirements-dev.txt') }}
42 | restore-keys: |
43 | ${{ runner.os }}-pip-
44 | ${{ runner.os }}-
45 |
46 | - name: Generate coverage report
47 | run: |
48 | pip install pytest
49 | pip install pytest-cov
50 | export PYTHONPATH=.
51 | pytest --cov=./pandas_streaming/ --cov-report=xml --durations=10 --ignore-glob=**LONG*.py --ignore-glob=**notebook*.py
52 | export PYTHONPATH=
53 |
54 | - name: Upload coverage reports to Codecov
55 | uses: codecov/codecov-action@v3
56 | env:
57 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
58 |
59 | - name: Install
60 | run: python setup.py install
61 |
62 | - name: Copy license, changelogs
63 | run: |
64 | cp LICENSE* ./_doc
65 | cp CHANGELOGS* ./_doc
66 |
67 | - name: Documentation
68 | run: python -m sphinx ./_doc ./dist/html -n -w doc.txt
69 |
70 | - name: Summary
71 | run: cat doc.txt
72 |
73 | - name: Check for errors and warnings
74 | run: |
75 | if [[ $(grep ERROR doc.txt) ]]; then
76 | echo "Documentation produces errors."
77 | grep ERROR doc.txt
78 | exit 1
79 | fi
80 | if [[ $(grep WARNING doc.txt | grep -v 'std:term:y') ]]; then
81 | echo "Documentation produces warnings."
82 | grep WARNING doc.txt
83 | exit 1
84 | fi
85 |
86 | - uses: actions/upload-artifact@v3
87 | with:
88 | path: ./dist/html/**
89 |
--------------------------------------------------------------------------------
/.github/workflows/wheels-any.yml:
--------------------------------------------------------------------------------
1 | name: Build Any Wheel
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | - 'releases/**'
8 |
9 | jobs:
10 | build_wheels:
11 | name: Build wheels on ${{ matrix.os }}
12 | runs-on: ${{ matrix.os }}
13 | strategy:
14 | matrix:
15 | os: [ubuntu-latest]
16 |
17 | steps:
18 | - uses: actions/checkout@v3
19 |
20 | - uses: actions/setup-python@v4
21 | with:
22 | python-version: '3.11'
23 |
24 | - name: build wheel
25 | run: python -m pip wheel .
26 |
27 | - uses: actions/upload-artifact@v3
28 | with:
29 | path: ./pandas_streaming*.whl
30 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.pyd
3 | *.dylib
4 | *.so
5 | *.whl
6 | *.csv
7 | *.zip
8 | coverage.html/*
9 | _cache/*
10 | .coverage
11 | dist/*
12 | build/*
13 | .eggs/*
14 | .hypothesis/*
15 | *egg-info/*
16 | prof
17 | _doc/CHANGELOGS.rst
18 | _doc/LICENSE.txt
19 | _doc/auto_examples/*
20 | _doc/examples/_cache/*
21 | _doc/examples/plot_*.png
22 | _doc/examples/plot_*.xlsx
23 | _doc/examples/*.html
24 | _doc/_static/require.js
25 | _doc/_static/viz.js
26 | _unittests/ut__main/*.png
27 | _unittests/ut__main/_cache/*
28 | _unittests/ut__main/*.html
29 | _unittests/.hypothesis/*
30 |
--------------------------------------------------------------------------------
/.local.jenkins.lin.yml:
--------------------------------------------------------------------------------
1 |
2 | language: python
3 |
4 | python:
5 | - { PATH: "{{Python39}}", VERSION: 3.9, DIST: std, PYINT: python3.9 }
6 |
7 | virtualenv:
8 | - path: {{ospathjoin(root_path, pickname("$NAME_JENKINS", project_name + "_$VERSION_$DIST_$NAME"), "_venv")}}
9 |
10 | install:
11 | - $PYINT -m pip install --upgrade pip
12 | - $PYINT -m pip install --upgrade --no-cache-dir --no-deps --index http://localhost:8067/simple/ jyquickhelper pandas_streaming --extra-index-url=https://pypi.python.org/simple/
13 | - $PYINT -m pip install -r requirements.txt
14 | - $PYINT -m pip install -r requirements-dev.txt
15 | - $PYINT --version
16 | - $PYINT -m pip freeze
17 |
18 | script:
19 | - { CMD: "$PYINT -u setup.py unittests --covtoken=14c7930a-a5c0-405d-a22f-3f9c6feaf0bc", NAME: "UT" }
20 |
21 | after_script:
22 | - $PYINT -u setup.py bdist_wheel
23 | - if [ ${NAME} == "UT" ] then cp dist/*.whl {{root_path}}/../local_pypi/local_pypi_server fi
24 |
25 | documentation:
26 | - if [ ${NAME} == "UT" ] then $PYINT -u setup.py build_sphinx --layout=html fi
27 | - if [ ${NAME} == "UT" ] then cp -R -f _doc/sphinxdoc/build/html dist/html fi
28 |
--------------------------------------------------------------------------------
/CHANGELOGS.rst:
--------------------------------------------------------------------------------
1 |
2 | Change Logs
3 | ===========
4 |
5 | 0.5.1
6 | +++++
7 |
8 | * :pr:`43`: improves reproducibility of function train_test_apart_stratify
9 |
10 | 0.5.0
11 | +++++
12 |
13 | * :pr:`33`: removes pyquickhelper dependency
14 | * :pr:`30`: fixes compatibility with pandas 2.0
15 |
16 | 0.3.239
17 | +++++++
18 |
19 | * :pr:`27`: Fixes json parser when input is a stream (2021-10-26)
20 | * :pr:`26`: Fixes bug while reading json (iterator failed to be created twice) (2021-10-26)
21 | * :pr:`25`: Fixes documentation (2021-10-18)
22 | * :pr:`24`: Implements a first version of sort_values. (2021-10-18)
23 | * :pr:`23`: First version of operator __setitem__ (2021-10-16)
24 | * :pr:`22`: Fixes nan values after pandas update, add documentation example to the unit tests (2021-07-11)
25 | * :pr:`21`: Fixes grouping by nan values after update pandas to 1.3.0 (2021-07-10)
26 | * :pr:`17`: Implements method describe (2021-04-08)
27 |
28 | 0.2.175
29 | +++++++
30 |
31 | * :pr:`16`: Unit tests failing with pandas 1.1.0. (2020-08-06)
32 | * :pr:`15`: implements parameter lines, flatten for read_json (2018-11-21)
33 | * :pr:`14`: implements fillna (2018-10-29)
34 | * :pr:`13`: implement concat for axis=0,1 (2018-10-26)
35 | * :pr:`12`: add groupby_streaming (2018-10-26)
36 | * :pr:`11`: add method add_column (2018-10-26)
37 | * :pr:`10`: plan B to bypass a bug in pandas about read_csv when iterator=True --> closed, pandas has a weird behaviour when names is too small compared to the number of columns (2018-10-26)
38 | * :pr:`9`: head is very slow (2018-10-26)
39 | * :pr:`8`: fix pandas_streaming for pandas 0.23.1 (2018-07-31)
40 | * :pr:`7`: implement read_json (2018-05-17)
41 | * :pr:`6`: add pandas_groupby_nan from pyensae (2018-05-17)
42 | * :pr:`5`: add random_state parameter to splitting functions (2018-02-04)
43 | * :pr:`2`: add method sample, reservoir sampling (2017-11-05)
44 | * :pr:`3`: method train_test_split for out-of-memory datasets (2017-10-21)
45 | * :pr:`1`: Excited for your project (2017-10-10)
46 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Code of Conduct
2 |
3 | We are a community based on openness, as well as friendly and didactic discussions.
4 |
5 | We aspire to treat everybody equally, and value their contributions.
6 |
7 | Decisions are made based on technical merit and consensus.
8 |
9 | Code is not the only way to help the project. Reviewing pull requests,
10 | answering questions to help others on mailing lists or issues, organizing and
11 | teaching tutorials, working on the website, improving the documentation, are
12 | all priceless contributions.
13 |
14 | We abide by the principles of openness, respect, and consideration of others of
15 | the Python Software Foundation: https://www.python.org/psf/codeofconduct/
16 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017-2024, Xavier Dupré
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | prune _doc
2 | prune _unittests
3 | exclude *.bat
4 | exclude *.yml
5 | exclude *.git*
6 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | pandas-streaming: streaming API over pandas
2 | ===========================================
3 |
4 | .. image:: https://ci.appveyor.com/api/projects/status/4te066r8ne1ymmhy?svg=true
5 | :target: https://ci.appveyor.com/project/sdpython/pandas-streaming
6 | :alt: Build Status Windows
7 |
8 | .. image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming
9 | :target: https://dev.azure.com/xavierdupre3/pandas_streaming/
10 |
11 | .. image:: https://badge.fury.io/py/pandas_streaming.svg
12 | :target: http://badge.fury.io/py/pandas_streaming
13 |
14 | .. image:: https://img.shields.io/badge/license-MIT-blue.svg
15 | :alt: MIT License
16 | :target: https://opensource.org/license/MIT/
17 |
18 | .. image:: https://codecov.io/gh/sdpython/pandas-streaming/branch/main/graph/badge.svg?token=0caHX1rhr8
19 | :target: https://codecov.io/gh/sdpython/pandas-streaming
20 |
21 | .. image:: http://img.shields.io/github/issues/sdpython/pandas_streaming.png
22 | :alt: GitHub Issues
23 | :target: https://github.com/sdpython/pandas_streaming/issues
24 |
25 | .. image:: https://pepy.tech/badge/pandas_streaming/month
26 | :target: https://pepy.tech/project/pandas_streaming/month
27 | :alt: Downloads
28 |
29 | .. image:: https://img.shields.io/github/forks/sdpython/pandas_streaming.svg
30 | :target: https://github.com/sdpython/pandas_streaming/
31 | :alt: Forks
32 |
33 | .. image:: https://img.shields.io/github/stars/sdpython/pandas_streaming.svg
34 | :target: https://github.com/sdpython/pandas_streaming/
35 | :alt: Stars
36 |
37 | .. image:: https://img.shields.io/github/repo-size/sdpython/pandas_streaming
38 | :target: https://github.com/sdpython/pandas_streaming/
39 | :alt: size
40 |
41 | `pandas-streaming <https://github.com/sdpython/pandas_streaming>`_
42 | aims at processing big files with `pandas <https://pandas.pydata.org/>`_,
43 | files too big to hold in memory yet too small to be parallelized with a significant gain.
44 | The module replicates a subset of the *pandas* API
45 | and implements other functionalities for machine learning.
46 |
47 | .. code-block:: python
48 |
49 | from pandas_streaming.df import StreamingDataFrame
50 | sdf = StreamingDataFrame.read_csv("filename", sep="\t", encoding="utf-8")
51 |
52 | for df in sdf:
53 | # process this chunk of data
54 | # df is a dataframe
55 | print(df)
56 |
57 | The module can also stream an existing dataframe.
58 |
59 | .. code-block:: python
60 |
61 | import pandas
62 | df = pandas.DataFrame([dict(cf=0, cint=0, cstr="0"),
63 | dict(cf=1, cint=1, cstr="1"),
64 | dict(cf=3, cint=3, cstr="3")])
65 |
66 | from pandas_streaming.df import StreamingDataFrame
67 | sdf = StreamingDataFrame.read_df(df)
68 |
69 | for df in sdf:
70 | # process this chunk of data
71 | # df is a dataframe
72 | print(df)
73 |
74 | It contains other helpers to split datasets into train and test sets
75 | under non-trivial constraints, such as keeping rows which share ids on the same side of the split.
76 |
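The following sketch, based on the examples shipped in ``_doc/examples/first_step.py``,
shows a plain streaming train/test split; the exact keyword arguments may differ
across versions.

.. code-block:: python

    from pandas_streaming.df import StreamingDataFrame

    sdf = StreamingDataFrame.read_csv("filename", sep="\t", encoding="utf-8")

    # split into two StreamingDataFrame without loading everything in memory
    train_sdf, test_sdf = sdf.train_test_split(test_size=0.2)

    # or write both parts directly to disk
    sdf.train_test_split("dataset_split_{}.txt", streaming=False)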
--------------------------------------------------------------------------------
/_doc/_static/git_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdpython/pandas-streaming/4a2927bbc960c8f73f4de188a3c43ddf97015eac/_doc/_static/git_logo.png
--------------------------------------------------------------------------------
/_doc/_static/project_ico.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdpython/pandas-streaming/4a2927bbc960c8f73f4de188a3c43ddf97015eac/_doc/_static/project_ico.ico
--------------------------------------------------------------------------------
/_doc/_static/project_ico.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdpython/pandas-streaming/4a2927bbc960c8f73f4de188a3c43ddf97015eac/_doc/_static/project_ico.png
--------------------------------------------------------------------------------
/_doc/api/connex_split.rst:
--------------------------------------------------------------------------------
1 |
2 | pandas_streaming.df.connex_split
3 | ================================
4 |
5 | .. automodule:: pandas_streaming.df.connex_split
6 | :members:
7 |
--------------------------------------------------------------------------------
/_doc/api/dataframe.rst:
--------------------------------------------------------------------------------
1 |
2 | pandas_streaming.df.dataframe
3 | =============================
4 |
5 | StreamingDataFrameSchemaError
6 | +++++++++++++++++++++++++++++
7 |
8 | .. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrameSchemaError
9 | :members:
10 |
11 | StreamingDataFrame
12 | ++++++++++++++++++
13 |
14 | .. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrame
15 | :members:
16 | :special-members:
17 |
18 | StreamingSeries
19 | +++++++++++++++
20 |
21 | .. autoclass:: pandas_streaming.df.dataframe.StreamingSeries
22 | :members:
23 |
--------------------------------------------------------------------------------
/_doc/api/dataframe_io.rst:
--------------------------------------------------------------------------------
1 |
2 | pandas_streaming.df.dataframe_io
3 | ================================
4 |
5 | .. automodule:: pandas_streaming.df.dataframe_io
6 | :members:
7 |
--------------------------------------------------------------------------------
/_doc/api/dataframe_split.rst:
--------------------------------------------------------------------------------
1 |
2 | pandas_streaming.df.dataframe_split
3 | ===================================
4 |
5 | .. automodule:: pandas_streaming.df.dataframe_split
6 | :members:
7 |
--------------------------------------------------------------------------------
/_doc/api/index.rst:
--------------------------------------------------------------------------------
1 |
2 | API
3 | ===
4 |
5 | .. toctree::
6 |
7 | rdata
8 | rdf
9 | rexc
10 | rio
11 |
--------------------------------------------------------------------------------
/_doc/api/rdata.rst:
--------------------------------------------------------------------------------
1 |
2 | pandas_streaming.data
3 | =====================
4 |
5 | Collection of functions which produce a
6 | :class:`StreamingDataFrame <pandas_streaming.df.dataframe.StreamingDataFrame>`.
7 |
8 | .. autofunction:: pandas_streaming.data.dummy.dummy_streaming_dataframe
9 |
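A hedged usage sketch (assuming the first argument is the number of rows to
generate and that the function is re-exported by ``pandas_streaming.data``;
otherwise import it from ``pandas_streaming.data.dummy``):

.. code-block:: python

    from pandas_streaming.data import dummy_streaming_dataframe

    # a small StreamingDataFrame produced chunk by chunk
    sdf = dummy_streaming_dataframe(100)
    print(sdf.to_dataframe().head())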
--------------------------------------------------------------------------------
/_doc/api/rdf.rst:
--------------------------------------------------------------------------------
1 |
2 | pandas_streaming.df
3 | ===================
4 |
5 | Streaming
6 | +++++++++
7 |
8 | The main class is an interface which mimics the
9 | :class:`pandas.DataFrame` interface to offer
10 | a short list of methods which apply to an
11 | iterator of dataframes. This provides
12 | a streaming version of it. As a result, the creation
13 | of an instance is fast as long as the data is not
14 | processed. Iterators can be chained as many map reduce
15 | frameworks do.
16 |
17 | .. toctree::
18 | :maxdepth: 2
19 |
20 | dataframe
21 |
22 | The module also implements additional useful functions
23 | which are not necessarily specific to the streaming version of the dataframes.
24 | Many methods have been rewritten to support
25 | streaming. Among them, IO methods:
26 | :meth:`read_csv `,
27 | :meth:`read_df `,
28 | :meth:`read_json `.
29 |
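A minimal sketch of the streaming IO path, based on the snippets in the README
(nothing is loaded until the iteration starts):

.. code-block:: python

    from pandas_streaming.df import StreamingDataFrame

    # creating the instance is cheap, the file is read chunk by chunk
    sdf = StreamingDataFrame.read_csv("filename", sep="\t", encoding="utf-8")

    for chunk in sdf:
        # each chunk is a regular pandas.DataFrame
        print(chunk.shape)
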
30 | Data Manipulation
31 | +++++++++++++++++
32 |
33 | .. autofunction:: pandas_streaming.df.dataframe_helpers.dataframe_hash_columns
34 |
35 | .. autofunction:: pandas_streaming.df.connex_split.dataframe_shuffle
36 |
37 | .. autofunction:: pandas_streaming.df.dataframe_helpers.dataframe_unfold
38 |
39 | .. autofunction:: pandas_streaming.df.dataframe_helpers.pandas_groupby_nan
40 |
41 | Complex splits
42 | ++++++++++++++
43 |
44 | Splitting a database into train and test sets is usually simple except
45 | when rows are not independent and share some ids. In that case,
46 | the following functions will try to build two partitions keeping
47 | the ids as separate as possible:
48 | :func:`train_test_apart_stratify `,
49 | :func:`train_test_connex_split `,
50 | :func:`train_test_split_weights `.
51 |
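A short sketch mirroring the unit test ``_unittests/ut_df/test_connex_split.py``:
all rows sharing a user, product or card end up on the same side of the split.

.. code-block:: python

    import pandas
    from pandas_streaming.df import train_test_connex_split

    df = pandas.DataFrame(
        [
            dict(user="UA", prod="PA", card="C1"),
            dict(user="UA", prod="PB", card="C1"),
            dict(user="UB", prod="PC", card="C2"),
            dict(user="UB", prod="PD", card="C2"),
            dict(user="UC", prod="PE", card="C3"),
            dict(user="UC", prod="PF", card="C4"),
            dict(user="UD", prod="PG", card="C5"),
        ]
    )

    train, test = train_test_connex_split(
        df, test_size=0.5, groups=["user", "prod", "card"], fail_imbalanced=0.4
    )
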
52 | Extensions
53 | ++++++++++
54 |
55 | .. toctree::
56 | :maxdepth: 1
57 |
58 | connex_split
59 | dataframe_io
60 | dataframe_split
61 |
--------------------------------------------------------------------------------
/_doc/api/rexc.rst:
--------------------------------------------------------------------------------
1 |
2 | pandas_streaming.exc
3 | ====================
4 |
5 | Exceptions.
6 |
7 | .. autoclass:: pandas_streaming.exc.exc_streaming.StreamingInefficientException
8 |
--------------------------------------------------------------------------------
/_doc/api/rio.rst:
--------------------------------------------------------------------------------
1 |
2 | Inputs / Outputs
3 | ================
4 |
5 | Dataframes / Numpy arrays
6 | +++++++++++++++++++++++++
7 |
8 | `HDF5 <https://www.hdfgroup.org/solutions/hdf5/>`_
9 | is easy to manipulate in the :epkg:`Python` world but difficult
10 | to exchange with other people and other environments.
11 | The two following functions make it easier to collapse many dataframes
12 | or numpy arrays into one single file. The data can be unzipped afterwards,
13 | see :func:`read_zip `,
14 | :func:`to_zip `.
15 |
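A hedged sketch of the round trip (assuming ``to_zip`` takes the object to store
and a destination file, and ``read_zip`` reads it back; check the exact signatures
in ``pandas_streaming.df.dataframe_io``):

.. code-block:: python

    import pandas
    from pandas_streaming.df.dataframe_io import to_zip, read_zip

    df = pandas.DataFrame([dict(a=1, b="e"), dict(a=2, b="f")])

    # collapse the dataframe into a single zip file
    to_zip(df, "data.zip")

    # restore it later, possibly in another environment
    restored = read_zip("data.zip")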
--------------------------------------------------------------------------------
/_doc/conf.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | from sphinx_runpython.github_link import make_linkcode_resolve
4 | from sphinx_runpython.conf_helper import has_dvipng, has_dvisvgm
5 | from pandas_streaming import __version__
6 |
7 |
8 | extensions = [
9 | "nbsphinx",
10 | "sphinx.ext.autodoc",
11 | "sphinx.ext.coverage",
12 | "sphinx.ext.githubpages",
13 | "sphinx.ext.ifconfig",
14 | "sphinx.ext.intersphinx",
15 | "sphinx.ext.linkcode",
16 | "sphinx.ext.viewcode",
17 | "sphinx.ext.napoleon",
18 | "sphinx.ext.todo",
19 | "sphinx_gallery.gen_gallery",
20 | "sphinx_issues",
21 | "sphinx_runpython.blocdefs.sphinx_exref_extension",
22 | "sphinx_runpython.blocdefs.sphinx_mathdef_extension",
23 | "sphinx_runpython.epkg",
24 | "sphinx_runpython.gdot",
25 | "sphinx_runpython.runpython",
26 | "matplotlib.sphinxext.plot_directive",
27 | ]
28 |
29 | if has_dvisvgm():
30 | extensions.append("sphinx.ext.imgmath")
31 | imgmath_image_format = "svg"
32 | elif has_dvipng():
33 | extensions.append("sphinx.ext.pngmath")
34 | imgmath_image_format = "png"
35 | else:
36 | extensions.append("sphinx.ext.mathjax")
37 |
38 | templates_path = ["_templates"]
39 | html_logo = "_static/project_ico.png"
40 | source_suffix = ".rst"
41 | master_doc = "index"
42 | project = "pandas-streaming"
43 | copyright = "2017-2024, Xavier Dupré"
44 | author = "Xavier Dupré"
45 | version = __version__
46 | release = __version__
47 | language = "en"
48 | exclude_patterns = ["auto_examples/*.ipynb"]
49 | pygments_style = "sphinx"
50 | todo_include_todos = True
51 | nbsphinx_execute = "never"
52 |
53 | html_theme = "furo"
54 | html_theme_path = ["_static"]
55 | html_theme_options = {}
56 | html_sourcelink_suffix = ""
57 | html_static_path = ["_static"]
58 |
59 | issues_github_path = "sdpython/pandas-streaming"
60 |
61 | # The following is used by sphinx.ext.linkcode to provide links to github
62 | linkcode_resolve = make_linkcode_resolve(
63 | "pandas_streaming",
64 | (
65 | "https://github.com/sdpython/pandas-streaming/"
66 | "blob/{revision}/{package}/"
67 | "{path}#L{lineno}"
68 | ),
69 | )
70 |
71 | latex_elements = {
72 | "papersize": "a4",
73 | "pointsize": "10pt",
74 | "title": project,
75 | }
76 |
77 | mathjax3_config = {"chtml": {"displayAlign": "left"}}
78 |
79 | intersphinx_mapping = {
80 | "onnx": ("https://onnx.ai/onnx/", None),
81 | "matplotlib": ("https://matplotlib.org/", None),
82 | "numpy": ("https://numpy.org/doc/stable", None),
83 | "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
84 | "python": (f"https://docs.python.org/{sys.version_info.major}", None),
85 | "scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
86 | "sklearn": ("https://scikit-learn.org/stable/", None),
87 | "sklearn-onnx": ("https://onnx.ai/sklearn-onnx/", None),
88 | "torch": ("https://pytorch.org/docs/stable/", None),
89 | }
90 |
91 | # Check intersphinx reference targets exist
92 | nitpicky = True
93 | # See also scikit-learn/scikit-learn#26761
94 | nitpick_ignore = [
95 | ("py:class", "False"),
96 | ("py:class", "True"),
97 | ("py:class", "pipeline.Pipeline"),
98 | ("py:class", "default=sklearn.utils.metadata_routing.UNCHANGED"),
99 | ]
100 |
101 | sphinx_gallery_conf = {
102 | # path to your examples scripts
103 | "examples_dirs": os.path.join(os.path.dirname(__file__), "examples"),
104 | # path where to save gallery generated examples
105 | "gallery_dirs": "auto_examples",
106 | }
107 |
108 | # next
109 |
110 | preamble = """
111 | \\usepackage{etex}
112 | \\usepackage{fixltx2e} % LaTeX patches, \\textsubscript
113 | \\usepackage{cmap} % fix search and cut-and-paste in Acrobat
114 | \\usepackage[raccourcis]{fast-diagram}
115 | \\usepackage{titlesec}
116 | \\usepackage{amsmath}
117 | \\usepackage{amssymb}
118 | \\usepackage{amsfonts}
119 | \\usepackage{graphics}
120 | \\usepackage{epic}
121 | \\usepackage{eepic}
122 | %\\usepackage{pict2e}
123 | %%% Redefined titleformat
124 | \\setlength{\\parindent}{0cm}
125 | \\setlength{\\parskip}{1ex plus 0.5ex minus 0.2ex}
126 | \\newcommand{\\hsp}{\\hspace{20pt}}
127 | \\newcommand{\\acc}[1]{\\left\\{#1\\right\\}}
128 | \\newcommand{\\cro}[1]{\\left[#1\\right]}
129 | \\newcommand{\\pa}[1]{\\left(#1\\right)}
130 | \\newcommand{\\R}{\\mathbb{R}}
131 | \\newcommand{\\HRule}{\\rule{\\linewidth}{0.5mm}}
132 | %\\titleformat{\\chapter}[hang]{\\Huge\\bfseries\\sffamily}{\\thechapter\\hsp}{0pt}{\\Huge\\bfseries\\sffamily}
133 |
134 | \\usepackage[all]{xy}
135 | \\newcommand{\\vecteur}[2]{\\pa{#1,\\dots,#2}}
136 | \\newcommand{\\N}[0]{\\mathbb{N}}
137 | \\newcommand{\\indicatrice}[1]{ {1\\!\\!1}_{\\acc{#1}} }
138 | \\newcommand{\\infegal}[0]{\\leqslant}
139 | \\newcommand{\\supegal}[0]{\\geqslant}
140 | \\newcommand{\\ensemble}[2]{\\acc{#1,\\dots,#2}}
141 | \\newcommand{\\fleche}[1]{\\overrightarrow{ #1 }}
142 | \\newcommand{\\intervalle}[2]{\\left\\{#1,\\cdots,#2\\right\\}}
143 | \\newcommand{\\independant}[0]{\\perp \\!\\!\\! \\perp}
144 | \\newcommand{\\esp}{\\mathbb{E}}
145 | \\newcommand{\\espf}[2]{\\mathbb{E}_{#1}\\pa{#2}}
146 | \\newcommand{\\var}{\\mathbb{V}}
147 | \\newcommand{\\pr}[1]{\\mathbb{P}\\pa{#1}}
148 | \\newcommand{\\loi}[0]{{\\cal L}}
149 | \\newcommand{\\vecteurno}[2]{#1,\\dots,#2}
150 | \\newcommand{\\norm}[1]{\\left\\Vert#1\\right\\Vert}
151 | \\newcommand{\\norme}[1]{\\left\\Vert#1\\right\\Vert}
152 | \\newcommand{\\scal}[2]{\\left<#1,#2\\right>}
153 | \\newcommand{\\dans}[0]{\\rightarrow}
154 | \\newcommand{\\partialfrac}[2]{\\frac{\\partial #1}{\\partial #2}}
155 | \\newcommand{\\partialdfrac}[2]{\\dfrac{\\partial #1}{\\partial #2}}
156 | \\newcommand{\\trace}[1]{tr\\pa{#1}}
157 | \\newcommand{\\sac}[0]{|}
158 | \\newcommand{\\abs}[1]{\\left|#1\\right|}
159 | \\newcommand{\\loinormale}[2]{{\\cal N} \\pa{#1,#2}}
160 | \\newcommand{\\loibinomialea}[1]{{\\cal B} \\pa{#1}}
161 | \\newcommand{\\loibinomiale}[2]{{\\cal B} \\pa{#1,#2}}
162 | \\newcommand{\\loimultinomiale}[1]{{\\cal M} \\pa{#1}}
163 | \\newcommand{\\variance}[1]{\\mathbb{V}\\pa{#1}}
164 | \\newcommand{\\intf}[1]{\\left\\lfloor #1 \\right\\rfloor}
165 | """
166 |
167 | imgmath_latex_preamble = preamble
168 | latex_elements["preamble"] = imgmath_latex_preamble
169 |
170 |
171 | epkg_dictionary = {
172 | "csv": "https://en.wikipedia.org/wiki/Comma-separated_values",
173 | "dask": "https://dask.pydata.org/en/latest/",
174 | "dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html",
175 | "Dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html",
176 | "DataFrame": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html",
177 | "dataframes": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html",
178 | "dill": "https://dill.readthedocs.io/en/latest/dill.html",
179 | "groupby and missing values": "https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html",
180 | "Jupyter": "https://jupyter.org/",
181 | "Hadoop": "http://hadoop.apache.org/",
182 | "ijson": "https://github.com/ICRAR/ijson",
183 | "json": "https://docs.python.org/3/library/json.html",
184 | "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN",
185 | "numpy": "https://numpy.org/",
186 | "pandas": (
187 | "http://pandas.pydata.org/pandas-docs/stable/",
188 | (
189 | "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.html",
190 | 1,
191 | ),
192 | (
193 | "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.{1}.html",
194 | 2,
195 | ),
196 | ),
197 | "pyarrow": "https://arrow.apache.org/docs/python/",
198 | "pyspark": "http://spark.apache.org/docs/2.1.1/api/python/index.html",
199 | "Python": "https://www.python.org/",
200 | "scikit-learn": "https://scikit-learn.org/stable/",
201 | "scikit-multiflow": "https://scikit-multiflow.github.io/",
202 | "sklearn": (
203 | "https://scikit-learn.org/stable/",
204 | ("https://scikit-learn.org/stable/modules/generated/{0}.html", 1),
205 | ("https://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2),
206 | ),
207 | "streamz": "https://streamz.readthedocs.io/en/latest/index.html",
208 | "tornado": "https://www.tornadoweb.org/en/stable/",
209 | "zip": "https://en.wikipedia.org/wiki/ZIP_(file_format)",
210 | }
211 |
--------------------------------------------------------------------------------
/_doc/examples/README.txt:
--------------------------------------------------------------------------------
1 | Gallery of Examples
2 | ===================
3 |
4 |
--------------------------------------------------------------------------------
/_doc/examples/first_step.py:
--------------------------------------------------------------------------------
1 | """
2 | First steps with pandas_streaming
3 | =================================
4 |
5 | A few difference between :epkg:`pandas` and *pandas_streaming*.
6 |
7 | pandas to pandas_streaming
8 | ++++++++++++++++++++++++++
9 | """
10 |
11 | import glob
12 | from pandas import DataFrame
13 | from pandas_streaming.df import StreamingDataFrame
14 |
15 |
16 | df = DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))
17 | df
18 |
19 |
20 | #############################
21 | # We create a streaming dataframe:
22 |
23 |
24 | sdf = StreamingDataFrame.read_df(df)
25 | sdf
26 |
27 |
28 | ################################
29 | #
30 |
31 | sdf.to_dataframe()
32 |
33 |
34 | ########################################
35 | # Internally, StreamingDataFrame implements an iterator on
36 | # dataframes and then tries to replicate the same interface as
37 | # :class:`pandas.DataFrame`, wherever it is possible to
38 | # manipulate data without loading everything into memory.
39 |
40 |
41 | sdf2 = sdf.concat(sdf)
42 | sdf2.to_dataframe()
43 |
44 |
45 | ###############################
46 | #
47 |
48 | m = DataFrame(dict(Y=["a", "b"], Z=[10, 20]))
49 | m
50 |
51 |
52 | ##########################################
53 | #
54 |
55 | sdf3 = sdf2.merge(m, left_on="Y", right_on="Y", how="outer")
56 | sdf3.to_dataframe()
57 |
58 |
59 | ############################################
60 | #
61 |
62 | sdf2.to_dataframe().merge(m, left_on="Y", right_on="Y", how="outer")
63 |
64 |
65 | ############################################
66 | # The order might be different.
67 |
68 |
69 | sdftr, sdfte = sdf2.train_test_split(test_size=0.5)
70 | sdfte.head()
71 |
72 |
73 | ############################################
74 | #
75 |
76 |
77 | sdftr.head()
78 |
79 |
80 | ############################################
81 | # split a big file
82 | # ++++++++++++++++
83 |
84 |
85 | sdf2.to_csv("example.txt")
86 |
87 |
88 | ############################################
89 | #
90 |
91 |
92 | new_sdf = StreamingDataFrame.read_csv("example.txt")
93 | new_sdf.train_test_split("example.{}.txt", streaming=False)
94 |
95 |
96 | ############################################
97 | #
98 |
99 | glob.glob("ex*.txt")
100 |
--------------------------------------------------------------------------------
/_doc/i_ex.rst:
--------------------------------------------------------------------------------
1 |
2 | Examples
3 | ========
4 |
5 | About array
6 | +++++++++++
7 |
8 | .. exreflist::
9 | :contents:
10 | :tag: array
11 |
12 | About DataFrame
13 | +++++++++++++++
14 |
15 | .. exreflist::
16 | :contents:
17 | :tag: dataframe
18 |
19 | About StreamingDataFrame
20 | ++++++++++++++++++++++++
21 |
22 | .. exreflist::
23 | :contents:
24 | :tag: streaming
25 |
--------------------------------------------------------------------------------
/_doc/index.rst:
--------------------------------------------------------------------------------
1 |
2 | .. |gitlogo| image:: _static/git_logo.png
3 | :height: 20
4 |
5 | pandas-streaming: streaming API over pandas
6 | ===========================================
7 |
8 | .. image:: https://ci.appveyor.com/api/projects/status/4te066r8ne1ymmhy?svg=true
9 | :target: https://ci.appveyor.com/project/sdpython/pandas-streaming
10 | :alt: Build Status Windows
11 |
12 | .. image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming
13 | :target: https://dev.azure.com/xavierdupre3/pandas_streaming/
14 |
15 | .. image:: https://badge.fury.io/py/pandas_streaming.svg
16 | :target: http://badge.fury.io/py/pandas-streaming
17 |
18 | .. image:: https://img.shields.io/badge/license-MIT-blue.svg
19 | :alt: MIT License
20 | :target: https://opensource.org/license/MIT/
21 |
22 | .. image:: https://codecov.io/gh/sdpython/pandas-streaming/branch/main/graph/badge.svg?token=0caHX1rhr8
23 | :target: https://codecov.io/gh/sdpython/pandas-streaming
24 |
25 | .. image:: http://img.shields.io/github/issues/sdpython/pandas_streaming.png
26 | :alt: GitHub Issues
27 | :target: https://github.com/sdpython/pandas_streaming/issues
28 |
29 | .. image:: https://pepy.tech/badge/pandas_streaming
30 | :target: https://pypi.org/project/pandas_streaming/
31 | :alt: Downloads
32 |
33 | .. image:: https://img.shields.io/github/forks/sdpython/pandas_streaming.svg
34 | :target: https://github.com/sdpython/pandas_streaming/
35 | :alt: Forks
36 |
37 | .. image:: https://img.shields.io/github/stars/sdpython/pandas_streaming.svg
38 | :target: https://github.com/sdpython/pandas_streaming/
39 | :alt: Stars
40 |
41 | .. image:: https://img.shields.io/github/repo-size/sdpython/pandas_streaming
42 | :target: https://github.com/sdpython/pandas_streaming/
43 | :alt: size
44 |
45 | *pandas_streaming* aims at processing big files with :epkg:`pandas`,
46 | files too big to hold in memory yet too small to be parallelized with a significant gain.
47 | The module replicates a subset of :epkg:`pandas` API
48 | and implements other functionalities for machine learning.
49 |
50 | .. toctree::
51 | :maxdepth: 1
52 | :caption: Contents
53 |
54 | tutorial/index
55 | auto_examples/index
56 | api/index
57 | i_ex
58 |
59 | .. toctree::
60 | :maxdepth: 1
61 | :caption: More
62 |
63 | CHANGELOGS
64 | license
65 |
66 | Sources are available at `sdpython/pandas_streaming <https://github.com/sdpython/pandas_streaming>`_.
67 |
68 | Older versions
69 | ++++++++++++++
70 |
71 | * `0.5.1 <../v0.5.1/index.html>`_
72 | * `0.5.0 <../v0.5.0/index.html>`_
73 |
--------------------------------------------------------------------------------
/_doc/license.rst:
--------------------------------------------------------------------------------
1 | .. _l-license:
2 |
3 | License
4 | =======
5 |
6 | .. include:: LICENSE.txt
7 | :literal:
8 |
--------------------------------------------------------------------------------
/_doc/sg_execution_times.rst:
--------------------------------------------------------------------------------
1 |
2 | :orphan:
3 |
4 | .. _sphx_glr_sg_execution_times:
5 |
6 |
7 | Computation times
8 | =================
9 | **00:00.000** total execution time for 1 file **from all galleries**:
10 |
11 | .. container::
12 |
28 | .. list-table::
29 | :header-rows: 1
30 | :class: table table-striped sg-datatable
31 |
32 | * - Example
33 | - Time
34 | - Mem (MB)
35 | * - :ref:`sphx_glr_auto_examples_first_step.py` (``examples/first_step.py``)
36 | - 00:00.000
37 | - 0.0
38 |
--------------------------------------------------------------------------------
/_doc/tutorial/index.rst:
--------------------------------------------------------------------------------
1 |
2 | Tutorial
3 | ========
4 |
5 | The main class :class:`StreamingDataFrame `
6 | is basically an iterator on dataframes. Altogether, it behaves like a
7 | single dataframe which does not have to fit in memory.
8 | It implements a subset of the functionalities :epkg:`pandas` provides
9 | related to map reduce,
10 | :meth:`concat `,
11 | :meth:`join `.
12 | Both return a :class:`StreamingDataFrame `
13 | as opposed to :meth:`groupby `
14 | which does not.
15 |
16 | The beginning is always the same: we create such an object with one of the
17 | methods :meth:`read_csv `,
18 | :meth:`read_df `,
19 | :meth:`read_str `.
20 | The module was initially created to easily split a dataset into train/test
21 | when it does not fit into memory.
22 |
23 | ::
24 |
25 | from pandas_streaming.df import StreamingDataFrame
26 | sdf = StreamingDataFrame.read_csv("filename", sep="\t")
27 | sdf.train_test_split("dataset_split_{}.txt", sep="\t")
28 |
29 | >>> ['dataset_split_train.txt', 'dataset_split_test.txt']
30 |
31 | Objectives and Competitors
32 | ++++++++++++++++++++++++++
33 |
34 | The first objective is speed.
35 | :class:`StreamingDataFrame `
36 | is useful when the user needs to process a large dataset which does not
37 | fit in memory (*out-of-memory dataset*) or when the user needs to quickly
38 | check an algorithm on the beginning of a big dataset without paying the
39 | cost of loading all the data.
40 |
41 | The second objective is simplicity. The proposed interface
42 | tries to follow the same syntax as :epkg:`pandas`.
43 | That is one of the direction followed by :epkg:`dask`.
44 |
45 | :epkg:`dask` tries to address these two objectives
46 | and also offers parallelization. Based on my experience,
47 | :epkg:`dask` is efficient but tends to be slow for simple things
48 | on medium datasets (a couple of gigabytes). The API is not exactly
49 | the same either, and the parser does not behave exactly the same way.
50 | :epkg:`pyspark` adds some overhead, is more difficult
51 | to install and remains slow when used locally.
52 | :epkg:`pyarrow` is supposed to be the next :epkg:`pandas` but its
53 | scope is larger (it handles streaming datasets from :epkg:`Hadoop`)
54 | and does not yet work with :epkg:`scikit-learn`.
55 | I expect this module to remain useful until
56 | :epkg:`scikit-learn` updates its code to handle
57 | a streaming container, which will probably be
58 | the winner.
59 | :epkg:`streamz` follows a different direction.
60 | It offers parallelisation and relies on :epkg:`tornado` but not
61 | on :epkg:`pandas`, meaning that using it for machine learning
62 | might hide some unexpected loopholes.
63 | :epkg:`scikit-multiflow` does not only implement streaming
64 | containers but also streaming machine learning training.
65 |
66 | One element of design to remember
67 | +++++++++++++++++++++++++++++++++
68 |
69 | The class :class:`StreamingDataFrame `
70 | does not hold an iterator but a function which creates an iterator.
71 | Every time the user writes the following loop, the function is called
72 | to create an iterator which is then used to walk through the data.
73 |
74 | .. runpython::
75 | :showcode:
76 |
77 | import pandas
78 | df = pandas.DataFrame([dict(cf=0, cint=0, cstr="0"), dict(cf=1, cint=1, cstr="1"),
79 | dict(cf=3, cint=3, cstr="3")])
80 |
81 | from pandas_streaming.df import StreamingDataFrame
82 | sdf = StreamingDataFrame.read_df(df, chunksize=2)
83 |
84 | print("First time:")
85 |
86 | for df in sdf:
87 | # process this chunk of data
88 | print(df)
89 |
90 | print("\nSecond time:\n")
91 |
92 | for df in sdf:
93 | # process this chunk of data a second time
94 | print(df)
95 |
96 | The reason why the class cannot directly hold an iterator is that
97 | it is not possible to pickle an iterator. An iterator is also meant to
98 | be used only once: a second loop would not be possible and would
99 | be quite surprising to most users.
100 |
101 | A :class:`StreamingDataFrame `
102 | is also supposed to be *stable*: the two loops in the previous example
103 | should produce the exact same chunks. However, in some cases, the user can choose
104 | not to abide by this constraint. Drawing a sample is one of the reasons:
105 | a user can either choose to draw the same sample every time he goes
106 | through the data, or choose that a different sample should be
107 | drawn each time. A dedicated method indicates which kind of sample
108 | the :class:`StreamingDataFrame `
109 | is producing.
110 |
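As an illustration, a hedged sketch with the ``sample`` method (the method is
listed in the changelog; the ``frac`` keyword is assumed to be forwarded to
:epkg:`pandas`, and whether the sample is redrawn at the next iteration depends
on how it was created)::

    from pandas_streaming.df import StreamingDataFrame

    sdf = StreamingDataFrame.read_csv("filename", sep="\t")
    sample = sdf.sample(frac=0.1)

    for df in sample:
        # first pass on the sampled data
        pass

    for df in sample:
        # a second pass may or may not see the same rows,
        # depending on the kind of sample requested
        pass
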
111 | Check the schema consistency of a large file
112 | ++++++++++++++++++++++++++++++++++++++++++++
113 |
114 | Large files usually come from a database export and,
115 | for some reason, this export failed for a couple of lines.
116 | It can be an *end of line* character not removed from a comment,
117 | or a separator also present in the data. When that happens, :epkg:`pandas`
118 | takes the least strict type as the column type. Sometimes, we prefer to get
119 | an idea of where we could find the error.
120 |
121 | .. runpython::
122 | :showcode:
123 |
124 | import pandas
125 | df = pandas.DataFrame([dict(cf=0, cint=0, cstr="0"), dict(cf=1, cint=1, cstr="1"),
126 | dict(cf=2, cint="s2", cstr="2"), dict(cf=3, cint=3, cstr="3")])
127 | name = "temp_df.csv"
128 | df.to_csv(name, index=False)
129 |
130 | from pandas_streaming.df import StreamingDataFrame
131 | try:
132 | sdf = StreamingDataFrame.read_csv(name, chunksize=2)
133 | for df in sdf:
134 | print(df.dtypes)
135 | except Exception as e:
136 | print("ERROR:", e)
137 |
138 | The method :meth:`__iter__
139 | `
140 | checks that the schema does not change between two iterations.
141 | It can be disabled by adding *check_schema=False* when
142 | the constructor is called.
143 |
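A sketch with the check disabled, passing *check_schema=False* to the
constructor as mentioned above (the constructor is assumed to take a function
creating the iterator of chunks, as described in the previous section)::

    import pandas
    from pandas_streaming.df import StreamingDataFrame

    # the schema is no longer compared between two consecutive chunks
    sdf = StreamingDataFrame(
        lambda: pandas.read_csv("temp_df.csv", chunksize=2),
        check_schema=False,
    )
    for df in sdf:
        print(df.dtypes)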
--------------------------------------------------------------------------------
/_unittests/ut_df/data/buggy_hash2.csv:
--------------------------------------------------------------------------------
1 | 1092397418290.0 a181248367 366498568522.0
2 | 138742792720.0 516e2e745c 73810952621.0
3 | 108082559849.0 1601fecc7f 79402822525.0
4 | 251797282335.0 29d56f63ec 530980115159.0
5 | 651822622544.0 67be9eb2e5 618639148003.0
6 | 817909238810.0 3a24c42894 441595633456.0
7 | 427513930052.0 42fbf1e0a9 759755785197.0
8 | 409652918460.0 e0e09bcb7b 487633962255.0
9 | 126536040328.0 a2c6f80ea6 325262414951.0
10 | 195809963606.0 7d67e33166 58693978128.0
11 | 426363751898.0 4f67c53e66 1037516316531.0
12 | 51702292002.0 37c64b233a 206747200377.0
13 | 945246123121.0 8739a9cebb 639796038157.0
14 |
--------------------------------------------------------------------------------
/_unittests/ut_df/data/classic.json:
--------------------------------------------------------------------------------
1 | [{"name":"cudaGetDeviceCount",
2 | "ph":"X",
3 | "cat":"cuda",
4 | "ts":1634290065724226794,
5 | "dur":800,
6 | "tid":"Thread 2080429824: Runtime API",
7 | "pid":"[89792] Process",
8 | "args":{}},
9 | {"name":"_Z25full_reduce_tensor_kernelIfLi256ELi1ELi1ELi256EL21cudnnReduceTensorOp_t0EL21cudnnNanPropagation_t0EEv17cudnnTensorStructPjS3_PT_S5_S4_bii",
10 | "ph":"X",
11 | "cat":"cuda",
12 | "ts":1634290112071305413,
13 | "dur":1888,
14 | "tid":"_Z25full_reduce_tensor_kernelIfLi256ELi1ELi1ELi256EL21cudnnReduceTensorOp_t0EL21cudnnNanPropagation_t0EEv17cudnnTensorStructPjS3_PT_S5_S4_bii",
15 | "pid":"[0:1] Compute",
16 | "args":{"Grid size":"[ 1, 1, 1 ]",
17 | "Block size":"[ 256, 1, 1 ]"}},
18 | {"name":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E",
19 | "ph":"X",
20 | "cat":"cuda",
21 | "ts":1634290112071308133,
22 | "dur":1440,
23 | "tid":"Compute",
24 | "pid":"[0:1] Overview",
25 | "args":{"Grid size":"[ 1, 1, 1 ]",
26 | "Block size":"[ 1, 256, 1 ]"}},
27 | {"name":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E",
28 | "ph":"X",
29 | "cat":"cuda",
30 | "ts":1634290112071308133,
31 | "dur":1440,
32 | "tid":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E",
33 | "pid":"[0:1] Compute",
34 | "args":{"Grid size":"[ 1, 1, 1 ]",
35 | "Block size":"[ 1, 256, 1 ]"}}]
36 |
--------------------------------------------------------------------------------
/_unittests/ut_df/data/example.json:
--------------------------------------------------------------------------------
1 | {"a": 1, "b": 2}
2 | {"a": 3, "b": 4}
--------------------------------------------------------------------------------
/_unittests/ut_df/data/example2.json:
--------------------------------------------------------------------------------
1 | [{"a":1,"b":2},{"a":3,"b":4}]
--------------------------------------------------------------------------------
/_unittests/ut_df/test_connex_split.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import pandas
3 | from pandas_streaming.ext_test_case import ExtTestCase
4 | from pandas_streaming.df import (
5 | dataframe_shuffle,
6 | train_test_split_weights,
7 | train_test_connex_split,
8 | )
9 |
10 |
11 | class TestConnexSplit(ExtTestCase):
12 | def test_shuffle(self):
13 | df = pandas.DataFrame(
14 | [
15 | dict(a=1, b="e", c=5.6, ind="a1"),
16 | dict(a=2, b="f", c=5.7, ind="a2"),
17 | dict(a=4, b="g", c=5.8, ind="a3"),
18 | dict(a=8, b="h", c=5.9, ind="a4"),
19 | dict(a=16, b="i", c=6.2, ind="a5"),
20 | ]
21 | )
22 | shuffled = dataframe_shuffle(df, random_state=0)
23 | sorted_ = shuffled.sort_values("a")
24 | self.assertEqualDataFrame(df, sorted_)
25 |
26 | df2 = df.set_index("ind")
27 | shuffled = dataframe_shuffle(df2, random_state=0)
28 | sorted_ = shuffled.sort_values("a")
29 | self.assertEqualDataFrame(df2, sorted_)
30 |
31 | df2 = df.set_index(["ind", "c"])
32 | shuffled = dataframe_shuffle(df2, random_state=0)
33 | sorted_ = shuffled.sort_values("a")
34 | self.assertEqualDataFrame(df2, sorted_)
35 |
36 | def test_split_weights_errors(self):
37 | df = pandas.DataFrame(
38 | [
39 | dict(a=1, b="e", c=1),
40 | dict(a=2, b="f", c=1),
41 | dict(a=4, b="g", c=1),
42 | dict(a=8, b="h", c=1),
43 | dict(a=12, b="h", c=1),
44 | dict(a=16, b="i", c=1),
45 | ]
46 | )
47 |
48 | train, test = train_test_split_weights(df, train_size=0.5, weights="c")
49 | self.assertTrue(train is not None)
50 | self.assertTrue(test is not None)
51 | self.assertRaise(
52 | lambda: train_test_split_weights(df, test_size=0.5, weights=[0.5, 0.5]),
53 | ValueError,
54 | "Dimension",
55 | )
56 | self.assertRaise(
57 | lambda: train_test_split_weights(df, test_size=0), ValueError, "null"
58 | )
59 | self.assertRaise(
60 | lambda: train_test_split_weights(df, test_size=0, weights="c"),
61 | ValueError,
62 | "null",
63 | )
64 |
65 | def test_split_weights(self):
66 | df = pandas.DataFrame(
67 | [
68 | dict(a=1, b="e", c=1),
69 | dict(a=2, b="f", c=1),
70 | dict(a=4, b="g", c=1),
71 | dict(a=8, b="h", c=1),
72 | dict(a=12, b="h", c=1),
73 | dict(a=16, b="i", c=1),
74 | ]
75 | )
76 |
77 | train, test = train_test_split_weights(df, test_size=0.5)
78 | self.assertEqual(train.shape[1], test.shape[1])
79 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0])
80 |
81 | train, test = train_test_split_weights(df, test_size=0.5, weights="c")
82 | self.assertEqual(train.shape[1], test.shape[1])
83 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0])
84 |
85 | train, test = train_test_split_weights(df, test_size=0.5, weights=df["c"])
86 | self.assertEqual(train.shape[1], test.shape[1])
87 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0])
88 |
89 | df = pandas.DataFrame(
90 | [
91 | dict(a=1, b="e", c=1),
92 | dict(a=2, b="f", c=2),
93 | dict(a=4, b="g", c=3),
94 | dict(a=8, b="h", c=1),
95 | dict(a=12, b="h", c=2),
96 | dict(a=16, b="i", c=3),
97 | ]
98 | )
99 |
100 | train, test = train_test_split_weights(
101 | df, test_size=0.5, weights="c", fail_imbalanced=0.4
102 | )
103 | self.assertEqual(train.shape[1], test.shape[1])
104 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0])
105 | w1, w2 = train["c"].sum(), test["c"].sum()
106 | delta = abs(w1 - w2) / (w1 + w2)
107 | self.assertGreater(0.4, delta)
108 |
109 | def test_split_connex(self):
110 | df = pandas.DataFrame(
111 | [
112 | dict(user="UA", prod="PA", card="C1"),
113 | dict(user="UA", prod="PB", card="C1"),
114 | dict(user="UB", prod="PC", card="C2"),
115 | dict(user="UB", prod="PD", card="C2"),
116 | dict(user="UC", prod="PE", card="C3"),
117 | dict(user="UC", prod="PF", card="C4"),
118 | dict(user="UD", prod="PG", card="C5"),
119 | ]
120 | )
121 |
122 | train, test = train_test_connex_split( # pylint: disable=W0632
123 | df, test_size=0.5, groups=["user", "prod", "card"], fail_imbalanced=0.4
124 | )
125 |
126 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0])
127 | for col in ["user", "prod", "card"]:
128 | s1 = set(train[col])
129 | s2 = set(test[col])
130 | if s1 & s2:
131 | raise AssertionError(
132 | f"Non empty intersection {s1} & {s2}\n{train}\n{test}"
133 | )
134 |
135 | df["connex"] = "ole"
136 | train, test = train_test_connex_split( # pylint: disable=W0632
137 | df, test_size=0.5, groups=["user", "prod", "card"], fail_imbalanced=0.4
138 | )
139 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0])
140 |
141 | def test_split_connex2(self):
142 | df = pandas.DataFrame(
143 | [
144 | dict(user="UA", prod="PAA", card="C1"),
145 | dict(user="UA", prod="PB", card="C1"),
146 | dict(user="UB", prod="PC", card="C2"),
147 | dict(user="UB", prod="PD", card="C2"),
148 | dict(user="UC", prod="PAA", card="C3"),
149 | dict(user="UC", prod="PF", card="C4"),
150 | dict(user="UD", prod="PG", card="C5"),
151 | ]
152 | )
153 |
154 | train_test_connex_split(
155 | df,
156 | test_size=0.5,
157 | groups=["user", "prod", "card"],
158 | fail_imbalanced=0.5,
159 | return_cnx=True,
160 | )
161 | train, test, stats = train_test_connex_split(
162 | df,
163 | test_size=0.5,
164 | groups=["user", "prod", "card"],
165 | fail_imbalanced=0.5,
166 | return_cnx=True,
167 | random_state=0,
168 | )
169 |
170 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0])
171 | for col in ["user", "prod", "card"]:
172 | s1 = set(train[col])
173 | s2 = set(test[col])
174 | if s1 & s2:
175 | rows = []
176 | for k, v in sorted(stats[0].items()):
177 | rows.append(f"{k}={v}")
178 | raise AssertionError(
179 | "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format( # noqa: UP030
180 | s1, s2, train, test, "\n".join(rows)
181 | )
182 | )
183 |
184 | def test_split_connex_missing(self):
185 | df = pandas.DataFrame(
186 | [
187 | dict(user="UA", prod="PAA", card="C1"),
188 | dict(user="UA", prod="PB", card="C1"),
189 | dict(user="UB", prod="PC", card="C2"),
190 | dict(user="UB", prod="PD", card="C2"),
191 | dict(user="UC", prod="PAA", card="C3"),
192 | dict(user="UC", card="C4"),
193 | dict(user="UD", prod="PG"),
194 | ]
195 | )
196 |
197 | train, test, stats = train_test_connex_split(
198 | df,
199 | test_size=0.5,
200 | groups=["user", "prod", "card"],
201 | fail_imbalanced=0.4,
202 | return_cnx=True,
203 | random_state=0,
204 | )
205 |
206 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0])
207 | for col in ["user", "prod", "card"]:
208 | s1 = set(train[col])
209 | s2 = set(test[col])
210 | if s1 & s2:
211 | rows = []
212 | for k, v in sorted(stats[0].items()):
213 | rows.append(f"{k}={v}")
214 | raise AssertionError(
215 | "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format( # noqa: UP030
216 | s1, s2, train, test, "\n".join(rows)
217 | )
218 | )
219 |
220 |
221 | if __name__ == "__main__":
222 | unittest.main()
223 |
--------------------------------------------------------------------------------
/_unittests/ut_df/test_connex_split_big.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | from collections import Counter
4 | import pandas
5 | from pandas_streaming.ext_test_case import ExtTestCase
6 | from pandas_streaming.df import train_test_connex_split
7 |
8 |
9 | class TestConnexSplitBig(ExtTestCase):
10 | def test_connex_big(self):
11 | data = os.path.join(os.path.dirname(__file__), "data")
12 | name = os.path.join(data, "buggy_hash.csv")
13 | df = pandas.read_csv(name, sep="\t", encoding="utf-8")
14 | train, test, stats = train_test_connex_split(
15 | df,
16 | groups=["cart_id", "mail", "product_id"],
17 | fail_imbalanced=0.9,
18 | return_cnx=True,
19 | )
20 | self.assertGreater(train.shape[0], 0)
21 | self.assertGreater(test.shape[0], 0)
22 | elements = stats[1]["connex"]
23 | counts = Counter(elements)
24 | nbc = len(counts)
25 | maxi = max(counts.values())
26 | self.assertEqual(nbc, 5376)
27 | self.assertEqual(maxi, 14181)
28 |
29 | def test_connex_big_approx(self):
30 | data = os.path.join(os.path.dirname(__file__), "data")
31 | name = os.path.join(data, "buggy_hash.csv")
32 | df = pandas.read_csv(name, sep="\t", encoding="utf-8")
33 | train, test, stats = train_test_connex_split(
34 | df,
35 | groups=["cart_id", "mail", "product_id"],
36 | stop_if_bigger=0.05,
37 | return_cnx=True,
38 | keep_balance=0.8,
39 | )
40 | self.assertGreater(train.shape[0], 0)
41 | self.assertGreater(test.shape[0], 0)
42 | elements = stats[1]["connex"]
43 | counts = Counter(elements)
44 | nbc = len(counts)
45 | maxi = max(counts.values())
46 | self.assertGreater(nbc, 5376)
47 | self.assertLesser(maxi, 14181)
48 |
49 | def test_connex_big_approx_must(self):
50 | data = os.path.join(os.path.dirname(__file__), "data")
51 | name = os.path.join(data, "buggy_hash.csv")
52 | df = pandas.read_csv(name, sep="\t", encoding="utf-8")
53 | train, test, stats = train_test_connex_split(
54 | df,
55 | groups=["cart_id", "mail", "product_id"],
56 | stop_if_bigger=0.05,
57 | return_cnx=True,
58 | keep_balance=0.8,
59 | must_groups=["product_id"],
60 | )
61 | self.assertGreater(train.shape[0], 0)
62 | self.assertGreater(test.shape[0], 0)
63 | elements = stats[1]["connex"]
64 | counts = Counter(elements)
65 | nbc = len(counts)
66 | maxi = max(counts.values())
67 | self.assertGreater(nbc, 5376)
68 | self.assertLesser(maxi, 14181)
69 | train_ids = set(train.product_id)
70 | test_ids = set(test.product_id)
71 | inter = train_ids & test_ids
72 | self.assertEqual(len(inter), 0)
73 |
74 |
75 | if __name__ == "__main__":
76 | unittest.main()
77 |
--------------------------------------------------------------------------------
/_unittests/ut_df/test_connex_split_cat.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from collections import Counter
3 | import pandas
4 | from pandas_streaming.ext_test_case import ExtTestCase
5 | from pandas_streaming.df import train_test_apart_stratify
6 |
7 |
8 | class TestConnexSplitCat(ExtTestCase):
9 | def test_cat_strat(self):
10 | df = pandas.DataFrame(
11 | [
12 | dict(a=1, b="e"),
13 | dict(a=2, b="e"),
14 | dict(a=4, b="f"),
15 | dict(a=8, b="f"),
16 | dict(a=32, b="f"),
17 | dict(a=16, b="f"),
18 | ]
19 | )
20 |
21 | train, test = train_test_apart_stratify(
22 | df, group="a", stratify="b", test_size=0.5
23 | )
24 | self.assertEqual(train.shape[1], test.shape[1])
25 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0])
26 | c1 = Counter(train["b"])
27 |         c2 = Counter(test["b"])
28 | self.assertEqual(c1, c2)
29 |
30 | self.assertRaise(
31 | lambda: train_test_apart_stratify(
32 | df, group=None, stratify="b", test_size=0.5
33 | ),
34 | ValueError,
35 | )
36 | self.assertRaise(
37 | lambda: train_test_apart_stratify(df, group="b", test_size=0.5), ValueError
38 | )
39 |
40 | def test_cat_strat_sorted(self):
41 | df = pandas.DataFrame(
42 | [
43 | dict(a=1, b="e"),
44 | dict(a=2, b="e"),
45 | dict(a=4, b="f"),
46 | dict(a=8, b="f"),
47 | dict(a=32, b="f"),
48 | dict(a=16, b="f"),
49 | ]
50 | )
51 |
52 | train, test = train_test_apart_stratify(
53 | df, group="a", stratify="b", test_size=0.5, sorted_indices=True
54 | )
55 | self.assertEqual(train.shape[1], test.shape[1])
56 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0])
57 | c1 = Counter(train["b"])
58 |         c2 = Counter(test["b"])
59 | self.assertEqual(c1, c2)
60 |
61 | self.assertRaise(
62 | lambda: train_test_apart_stratify(
63 | df, group=None, stratify="b", test_size=0.5, sorted_indices=True
64 | ),
65 | ValueError,
66 | )
67 | self.assertRaise(
68 | lambda: train_test_apart_stratify(df, group="b", test_size=0.5), ValueError
69 | )
70 |
71 | def test_cat_strat_multi(self):
72 | df = pandas.DataFrame(
73 | [
74 | dict(a=1, b="e"),
75 | dict(a=1, b="f"),
76 | dict(a=2, b="e"),
77 | dict(a=2, b="f"),
78 | ]
79 | )
80 |
81 | train, test = train_test_apart_stratify(
82 | df, group="a", stratify="b", test_size=0.5
83 | )
84 | self.assertEqual(train.shape[1], test.shape[1])
85 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0])
86 | c1 = Counter(train["b"])
87 |         c2 = Counter(test["b"])
88 | self.assertEqual(c1, c2)
89 | self.assertEqual(len(set(train["a"])), 1)
90 | self.assertEqual(len(set(test["a"])), 1)
91 | self.assertTrue(set(train["a"]) != set(test["a"]))
92 |
93 | def test_cat_strat_multi_force(self):
94 | df = pandas.DataFrame(
95 | [
96 | dict(a=1, b="e"),
97 | dict(a=1, b="f"),
98 | dict(a=2, b="e"),
99 | dict(a=2, b="f"),
100 | ]
101 | )
102 |
103 | train, test = train_test_apart_stratify(
104 | df, group="a", stratify="b", test_size=0.1, force=True
105 | )
106 | self.assertEqual(train.shape[1], test.shape[1])
107 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0])
108 | c1 = Counter(train["b"])
109 |         c2 = Counter(test["b"])
110 | self.assertEqual(c1, c2)
111 | self.assertEqual(len(set(train["a"])), 1)
112 | self.assertEqual(len(set(test["a"])), 1)
113 | self.assertTrue(set(train["a"]) != set(test["a"]))
114 |
115 |
116 | if __name__ == "__main__":
117 | unittest.main()
118 |
--------------------------------------------------------------------------------
/_unittests/ut_df/test_dataframe_helpers.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | import numpy
4 | import pandas
5 | from pandas_streaming.ext_test_case import ExtTestCase
6 | from pandas_streaming.df import dataframe_hash_columns
7 |
8 |
9 | class TestDataFrameHelpers(ExtTestCase):
10 | def test_hash_columns(self):
11 | df = pandas.DataFrame(
12 | [
13 | dict(a=1, b="e", c=5.6, ind="a1", ai=1),
14 | dict(b="f", c=5.7, ind="a2", ai=2),
15 | dict(a=4, b="g", ind="a3", ai=3),
16 | dict(a=8, b="h", c=5.9, ai=4),
17 | dict(a=16, b="i", c=6.2, ind="a5", ai=5),
18 | ]
19 | )
20 | df2 = dataframe_hash_columns(df)
21 | self.assertEqual(df2.shape, df.shape)
22 | for j in range(df.shape[1]):
23 | self.assertEqual(df.columns[j], df2.columns[j])
24 | self.assertEqual(df.dtypes[j], df2.dtypes[j])
25 | for i in range(df.shape[0]):
26 | v1 = df.iloc[i, j]
27 | v2 = df2.iloc[i, j]
28 | if isinstance(v1, float):
29 | if numpy.isnan(v1):
30 | self.assertTrue(numpy.isnan(v2))
31 | else:
32 | self.assertEqual(type(v1), type(v2))
33 | else:
34 | self.assertEqual(type(v1), type(v2))
35 |
36 | def test_hash_columns_bigger(self):
37 | data = os.path.join(os.path.dirname(__file__), "data")
38 | name = os.path.join(data, "buggy_hash.csv")
39 | df = pandas.read_csv(name, sep="\t", encoding="utf-8")
40 | df2 = dataframe_hash_columns(df)
41 | self.assertEqual(df.shape, df2.shape)
42 |
43 |
44 | if __name__ == "__main__":
45 | unittest.main()
46 |
--------------------------------------------------------------------------------
/_unittests/ut_df/test_dataframe_helpers_simple.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import pandas
3 | import numpy
4 | from pandas_streaming.ext_test_case import ExtTestCase
5 | from pandas_streaming.df import dataframe_unfold
6 | from pandas_streaming.df.dataframe_helpers import hash_int, hash_str, hash_float
7 |
8 |
9 | class TestDataFrameHelpersSimple(ExtTestCase):
10 | def test_unfold(self):
11 | df = pandas.DataFrame([dict(a=1, b="e,f"), dict(a=2, b="g"), dict(a=3)])
12 | df2 = dataframe_unfold(df, "b")
13 |
14 | exp = pandas.DataFrame(
15 | [
16 | dict(a=1, b="e,f", b_unfold="e"),
17 | dict(a=1, b="e,f", b_unfold="f"),
18 | dict(a=2, b="g", b_unfold="g"),
19 | dict(a=3),
20 | ]
21 | )
22 | self.assertEqualDataFrame(df2, exp)
23 |
24 | # fold
25 | folded = df2.groupby("a").apply(
26 | lambda row: (
27 | ",".join(row["b_unfold"].dropna())
28 | if len(row["b_unfold"].dropna()) > 0
29 | else numpy.nan
30 | )
31 | )
32 | bf = folded.reset_index(drop=False)
33 | bf.columns = ["a", "b"]
34 | self.assertEqualDataFrame(df, bf)
35 |
36 | def test_hash_except(self):
37 | self.assertRaise(lambda: hash_int(0.1, 3), ValueError, "numpy.nan expected")
38 | r = hash_int(numpy.nan, 3)
39 | self.assertTrue(numpy.isnan(r))
40 |
41 | self.assertRaise(lambda: hash_str(0.1, 3), ValueError, "numpy.nan expected")
42 | r = hash_str(numpy.nan, 3)
43 | self.assertTrue(numpy.isnan(r))
44 |
45 | self.assertRaise(lambda: hash_float("0.1", 3), TypeError, "isnan")
46 | r = hash_float(numpy.nan, 3)
47 | self.assertTrue(numpy.isnan(r))
48 | r = hash_str("3", 100)
49 | self.assertLess(len(r), 100)
50 |
51 |
52 | if __name__ == "__main__":
53 | unittest.main()
54 |
--------------------------------------------------------------------------------
/_unittests/ut_df/test_dataframe_io.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | import unittest
4 | import io
5 | import zipfile
6 | import numpy
7 | import pandas
8 | from pandas_streaming.ext_test_case import ExtTestCase
9 | from pandas_streaming.df import to_zip, read_zip
10 |
11 |
12 | class TestDataFrameIO(ExtTestCase):
13 | def test_zip_dataframe(self):
14 | df = pandas.DataFrame(
15 | [
16 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
17 | dict(b="f", c=5.7, ind="a2", ai=2),
18 | dict(a=4, b="g", ind="a3", ai=3),
19 | dict(a=8, b="h", c=5.9, ai=4),
20 | dict(a=16, b="i", c=6.2, ind="a5", ai=5),
21 | ]
22 | )
23 |
24 | with tempfile.TemporaryDirectory() as temp:
25 | name = os.path.join(temp, "df.zip")
26 | to_zip(df, name, encoding="utf-8", index=False)
27 | df2 = read_zip(name, encoding="utf-8")
28 | self.assertEqualDataFrame(df, df2)
29 |
30 | st = io.BytesIO()
31 | zp = zipfile.ZipFile(st, "w")
32 | to_zip(df, zp, encoding="utf-8", index=False)
33 | zp.close()
34 |
35 | st = io.BytesIO(st.getvalue())
36 | zp = zipfile.ZipFile(st, "r")
37 | df3 = read_zip(zp, encoding="utf-8")
38 | zp.close()
39 | self.assertEqualDataFrame(df, df3)
40 |
41 | def test_zip_numpy(self):
42 | df = numpy.zeros((3, 4))
43 | df[2, 3] = 1
44 |
45 | with tempfile.TemporaryDirectory() as temp:
46 | name = os.path.join(temp, "df.zip")
47 | to_zip(df, name, "arr.npy")
48 | df2 = read_zip(name, "arr.npy")
49 | self.assertEqualArray(df, df2)
50 |
51 | st = io.BytesIO()
52 | zp = zipfile.ZipFile(st, "w")
53 | to_zip(df, zp, "arr.npy")
54 | zp.close()
55 |
56 | st = io.BytesIO(st.getvalue())
57 | zp = zipfile.ZipFile(st, "r")
58 | df3 = read_zip(zp, "arr.npy")
59 | zp.close()
60 | self.assertEqualArray(df, df3)
61 |
62 |
63 | if __name__ == "__main__":
64 | unittest.main()
65 |
--------------------------------------------------------------------------------
/_unittests/ut_df/test_dataframe_io_helpers.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from io import StringIO, BytesIO
3 | from json import loads
4 | import pandas
5 | from pandas_streaming.ext_test_case import ExtTestCase
6 | from pandas_streaming.df.dataframe_io_helpers import (
7 | enumerate_json_items,
8 | JsonPerRowsStream,
9 | JsonIterator2Stream,
10 | )
11 | from pandas_streaming.df import StreamingDataFrame
12 |
13 |
14 | class TestDataFrameIOHelpers(ExtTestCase):
15 | text_json = b"""
16 | [
17 | {
18 | "glossary": {
19 | "title": "example glossary",
20 | "GlossDiv": {
21 | "title": "S",
22 | "GlossList": [{
23 | "GlossEntry": {
24 | "ID": "SGML",
25 | "SortAs": "SGML",
26 | "GlossTerm": "Standard Generalized Markup Language",
27 | "Acronym": "SGML",
28 | "Abbrev": "ISO 8879:1986",
29 | "GlossDef": {
30 | "para": "A meta-markup language, used to create markup languages such as DocBook.",
31 | "GlossSeeAlso": ["GML", "XML"]
32 | },
33 | "GlossSee": "markup"
34 | }
35 | }]
36 | }
37 | }
38 | },
39 | {
40 | "glossary": {
41 | "title": "example glossary",
42 | "GlossDiv": {
43 | "title": "X",
44 | "GlossList": {
45 | "GlossEntry": [{
46 | "ID": "SGML",
47 | "SortAs": "SGML",
48 | "GlossTerm": "Standard Generalized Markup Language",
49 | "Acronym": "SGML",
50 | "Abbrev": "ISO 8879:1986",
51 | "GlossDef": {
52 | "para": "A meta-markup language, used to create markup languages such as DocBook.",
53 | "GlossSeeAlso": ["GML", "XML"]
54 | },
55 | "GlossSee": "markup"
56 | }]
57 | }
58 | }
59 | }
60 | }
61 | ]
62 | """
63 | text_json_exp = [
64 | {
65 | "glossary": {
66 | "title": "example glossary",
67 | "GlossDiv": {
68 | "title": "S",
69 | "GlossList": [
70 | {
71 | "GlossEntry": {
72 | "ID": "SGML",
73 | "SortAs": "SGML",
74 | "GlossTerm": "Standard Generalized Markup Language",
75 | "Acronym": "SGML",
76 | "Abbrev": "ISO 8879:1986",
77 | "GlossDef": {
78 | "para": "A meta-markup language, used to create markup languages such as DocBook.",
79 | "GlossSeeAlso": ["GML", "XML"],
80 | },
81 | "GlossSee": "markup",
82 | }
83 | }
84 | ],
85 | },
86 | }
87 | },
88 | {
89 | "glossary": {
90 | "title": "example glossary",
91 | "GlossDiv": {
92 | "title": "X",
93 | "GlossList": {
94 | "GlossEntry": [
95 | {
96 | "ID": "SGML",
97 | "SortAs": "SGML",
98 | "GlossTerm": "Standard Generalized Markup Language",
99 | "Acronym": "SGML",
100 | "Abbrev": "ISO 8879:1986",
101 | "GlossDef": {
102 | "para": "A meta-markup language, used to create markup languages such as DocBook.",
103 | "GlossSeeAlso": ["GML", "XML"],
104 | },
105 | "GlossSee": "markup",
106 | }
107 | ]
108 | },
109 | },
110 | }
111 | },
112 | ]
113 |
114 | def test_enumerate_json_items(self):
115 | items = list(enumerate_json_items(TestDataFrameIOHelpers.text_json))
116 | self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items)
117 | items = list(enumerate_json_items(BytesIO(TestDataFrameIOHelpers.text_json)))
118 | self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items)
119 | items = list(enumerate_json_items(BytesIO(TestDataFrameIOHelpers.text_json)))
120 | self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items)
121 |
122 | def test_read_json_raw(self):
123 | data = [
124 | {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
125 | {"name": {"given": "Mose", "family": "Regner"}},
126 | {"id": 2, "name": "FayeRaker"},
127 | ]
128 | exp = """[{"id":1.0,"name":null,"name.family":null,"name.first":"Coleen","name.given":null,"name.last":"Volk"},
129 | {"id":null,"name":null,"name.family":"Regner","name.first":null,"name.given":"Mose","name.last":null},
130 | {"id":2.0,"name":"FayeRaker","name.family":null,"name.first":null,
131 | "name.given":null,"name.last":null}]""".replace(
132 | " ", ""
133 | ).replace(
134 | "\n", ""
135 | )
136 | self.assertRaise(
137 | lambda: StreamingDataFrame.read_json(data), NotImplementedError
138 | )
139 | it = StreamingDataFrame.read_json(data, flatten=True)
140 | dfs = list(it)
141 | self.assertEqual(len(dfs), 1)
142 | js = dfs[0].to_json(orient="records")
143 | js_read = loads(js)
144 | js_exp = loads(exp)
145 | self.assertEqual(js_exp, js_read)
146 |
147 | def test_read_json_raw_head(self):
148 | data = [
149 | {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
150 | {"name": {"given": "Mose", "family": "Regner"}},
151 | {"id": 2, "name": "FayeRaker"},
152 | ]
153 | it = StreamingDataFrame.read_json(data, flatten=True, chunksize=1)
154 | h1 = it.head()
155 | h2 = it.head()
156 | self.assertEqualDataFrame(h1, h2)
157 | self.assertGreater(h1.shape[0], 1)
158 | self.assertGreater(h2.shape[0], 1)
159 |
160 | def test_pandas_json_chunksize(self):
161 | jsonl = """{"a": 1, "b": 2}
162 | {"a": 3, "b": 4}"""
163 | df = pandas.read_json(jsonl, lines=True)
164 | idf = pandas.read_json(jsonl, lines=True, chunksize=2)
165 | ldf = list(idf)
166 | self.assertEqualDataFrame(df, ldf[0])
167 |
168 | def test_read_json_rows(self):
169 | data = """{"a": 1, "b": 2}
170 | {"a": 3, "b": 4}"""
171 | it = StreamingDataFrame.read_json(StringIO(data), lines=True)
172 | dfs = list(it)
173 | self.assertEqual(len(dfs), 1)
174 | js = dfs[0].to_json(orient="records")
175 | self.assertEqual(js, '[{"a":1,"b":2},{"a":3,"b":4}]')
176 |
177 | def test_read_json_rows2(self):
178 | data = b"""{"a": 1, "b": 2}
179 | {"a": 3, "b": 4}"""
180 | dfs = pandas.read_json(BytesIO(data), lines=True)
181 | self.assertEqual(dfs.shape, (2, 2))
182 | it = StreamingDataFrame.read_json(BytesIO(data), lines="stream")
183 | dfs = list(it)
184 | self.assertEqual(len(dfs), 1)
185 | js = dfs[0].to_json(orient="records")
186 | self.assertEqual('[{"a":1,"b":2},{"a":3,"b":4}]', js)
187 |
188 | def test_read_json_rows2_head(self):
189 | data = b"""{"a": 1, "b": 2}
190 | {"a": 3, "b": 4}"""
191 | dfs = pandas.read_json(BytesIO(data), lines=True)
192 | self.assertEqual(dfs.shape, (2, 2))
193 | it = StreamingDataFrame.read_json(BytesIO(data), lines="stream")
194 | h1 = it.head()
195 | h2 = it.head()
196 | self.assertNotEmpty(h1)
197 | self.assertNotEmpty(h2)
198 | self.assertEqualDataFrame(h1, h2)
199 |
200 | def test_read_json_rows_file_head(self):
201 | data = self.abs_path_join(__file__, "data", "example2.json")
202 | dfs = pandas.read_json(data, orient="records")
203 | self.assertEqual(dfs.shape, (2, 2))
204 | it = StreamingDataFrame.read_json(data)
205 | h1 = it.head()
206 | h2 = it.head()
207 | self.assertNotEmpty(h1)
208 | self.assertNotEmpty(h2)
209 | self.assertEqualDataFrame(h1, h2)
210 |
211 | def test_read_json_rows_file_lines_head(self):
212 | data = self.abs_path_join(__file__, "data", "example.json")
213 | dfs = pandas.read_json(data, orient="records", lines=True)
214 | self.assertEqual(dfs.shape, (2, 2))
215 | it = StreamingDataFrame.read_json(data, lines="stream")
216 | h1 = it.head()
217 | h2 = it.head()
218 | self.assertNotEmpty(h1)
219 | self.assertNotEmpty(h2)
220 | self.assertEqualDataFrame(h1, h2)
221 |
222 | def test_read_json_ijson(self):
223 | it = StreamingDataFrame.read_json(BytesIO(TestDataFrameIOHelpers.text_json))
224 | dfs = list(it)
225 | self.assertEqual(len(dfs), 1)
226 | js = dfs[0].to_json(orient="records", lines=True)
227 | jsjson = loads("[" + js.replace("\n", ",").strip(",") + "]")
228 | self.assertEqual(jsjson, TestDataFrameIOHelpers.text_json_exp)
229 |
230 | def test_read_json_stream(self):
231 | text = """{'a': 1}
232 | {'b': 1, 'a', 'r'}"""
233 | st = JsonPerRowsStream(StringIO(text))
234 | val = st.getvalue().replace(" ", "").replace("\n", "")
235 | exp = "[{'a':1},{'b':1,'a','r'}]"
236 | self.assertEqual(val, exp)
237 |
238 | st = JsonPerRowsStream(StringIO(text))
239 | t = st.read(0)
240 | t = st.read(1)
241 | c = ""
242 | while t:
243 | c += t
244 | t = st.read(1)
245 | val = c.replace(" ", "").replace("\n", "")
246 | self.assertEqual(val, exp)
247 |
248 | def test_enumerate_json_items_lines(self):
249 | data = b"""{"a": 1, "b": 2}
250 | {"a": 3, "b": 4}"""
251 | items = list(enumerate_json_items(data, lines=True))
252 | self.assertEqual(items, [{"a": 1, "b": 2}, {"a": 3, "b": 4}])
253 |
254 | def test_read_json_file2(self):
255 | data = b"""{"a": {"c": 1}, "b": [2, 3]}
256 | {"a": {"a": 3}, "b": [4, 5, "r"]}"""
257 |
258 | obj1 = list(enumerate_json_items(BytesIO(data), flatten=False, lines=True))
259 | obj2 = list(enumerate_json_items(BytesIO(data), flatten=True, lines=True))
260 | self.assertNotEqual(obj1, obj2)
261 | self.assertEqual(
262 | obj2,
263 | [
264 | {"a_c": 1, "b_0": 2, "b_1": 3},
265 | {"a_a": 3, "b_0": 4, "b_1": 5, "b_2": "r"},
266 | ],
267 | )
268 |
269 | it = StreamingDataFrame.read_json(BytesIO(data), lines="stream", flatten=True)
270 | dfs = list(it)
271 | self.assertEqual(
272 | ["a_a", "a_c", "b_0", "b_1", "b_2"],
273 | list(sorted(dfs[0].columns)),
274 | )
275 | self.assertEqual(len(dfs), 1)
276 | js = dfs[0].to_json(orient="records", lines=True)
277 | jsjson = loads("[" + js.replace("\n", ",").strip(",") + "]")
278 | exp = [
279 | {"a_a": None, "a_c": 1.0, "b_0": 2, "b_1": 3, "b_2": None},
280 | {"a_a": 3.0, "a_c": None, "b_0": 4, "b_1": 5, "b_2": "r"},
281 | ]
282 | self.assertEqual(exp, jsjson)
283 |
284 | def test_read_json_item(self):
285 | text = TestDataFrameIOHelpers.text_json
286 | st = JsonPerRowsStream(BytesIO(text))
287 | res = []
288 | while True:
289 | n = st.read()
290 | if not n:
291 | break
292 | res.append(n)
293 | self.assertGreater(len(res), 1)
294 |
295 | def test_bug_documentation(self):
296 | items = []
297 | for item in JsonIterator2Stream(
298 | lambda: enumerate_json_items(TestDataFrameIOHelpers.text_json)
299 | ):
300 | items.append(item)
301 | self.assertEqual(len(items), 2)
302 |
303 | def test_read_json_classic(self):
304 | data = self.abs_path_join(__file__, "data", "classic.json")
305 | dfs = pandas.read_json(data, orient="records")
306 | dfs["ts2"] = dfs["ts"].apply(lambda t: t / 1e9)
307 | self.assertEqual(dfs.shape[1], 9)
308 | self.assertGreater(dfs.shape[0], 2)
309 | it = StreamingDataFrame.read_json(data)
310 | it["ts2"] = it["ts"].apply(lambda t: t / 1e9)
311 | h1 = it.to_df()
312 | h2 = it.to_df()
313 | self.assertNotEmpty(h1)
314 | self.assertNotEmpty(h2)
315 | self.assertEqualDataFrame(h1, h2)
316 | self.assertEqual(h1.shape[1], 9)
317 |
318 | def test_read_json_classic_file(self):
319 | data = self.abs_path_join(__file__, "data", "classic.json")
320 | dfs = pandas.read_json(data, orient="records")
321 | self.assertEqual(dfs.shape[1], 8)
322 | self.assertGreater(dfs.shape[0], 2)
323 | with open(data, "r", encoding="utf-8") as f:
324 | it = StreamingDataFrame.read_json(f, orient="records")
325 | h1 = it.to_df()
326 | h2 = it.to_df()
327 | self.assertNotEmpty(h1)
328 | self.assertNotEmpty(h2)
329 | self.assertEqualDataFrame(h1, h2)
330 | self.assertEqual(h1.shape[1], 8)
331 |
332 | def test_read_json_classic_file_formula(self):
333 | data = self.abs_path_join(__file__, "data", "classic.json")
334 | dfs = pandas.read_json(data, orient="records")
335 | dfs["ts2"] = dfs["ts"].apply(lambda t: t / 1e9)
336 | self.assertEqual(dfs.shape[1], 9)
337 | self.assertGreater(dfs.shape[0], 2)
338 | with open(data, "r", encoding="utf-8") as f:
339 | it = StreamingDataFrame.read_json(f)
340 | it["ts2"] = it["ts"].apply(lambda t: t / 1e9)
341 | h1 = it.to_df()
342 | h2 = it.to_df()
343 | self.assertNotEmpty(h1)
344 | self.assertNotEmpty(h2)
345 | self.assertEqualDataFrame(h1, h2)
346 | self.assertEqual(h1.shape[1], 9)
347 |
348 |
349 | if __name__ == "__main__":
350 | unittest.main()
351 |
--------------------------------------------------------------------------------
/_unittests/ut_df/test_dataframe_sort.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | import unittest
4 | import pandas
5 | from pandas_streaming.ext_test_case import ExtTestCase
6 | from pandas_streaming.df import StreamingDataFrame
7 |
8 |
9 | class TestDataFrameSort(ExtTestCase):
10 | def test_sort_values(self):
11 | with tempfile.TemporaryDirectory() as temp:
12 | name = os.path.join(temp, "_data_")
13 | df = pandas.DataFrame(
14 | [
15 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
16 | dict(a=5, b="f", c=5.7, ind="a2", ai=2),
17 | dict(a=4, b="g", ind="a3", ai=3),
18 | dict(a=8, b="h", c=5.9, ai=4),
19 | dict(a=16, b="i", c=6.2, ind="a5", ai=5),
20 | ]
21 | )
22 | sdf = StreamingDataFrame.read_df(df, chunksize=2)
23 | sorted_df = df.sort_values(by="a")
24 | res = sdf.sort_values(by="a", temp_file=name)
25 | res_df = res.to_df()
26 | self.assertEqualDataFrame(sorted_df, res_df)
27 |
28 | def test_sort_values_twice(self):
29 | with tempfile.TemporaryDirectory() as temp:
30 | name = os.path.join(temp, "_data_")
31 | df = pandas.DataFrame(
32 | [
33 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
34 | dict(a=5, b="f", c=5.7, ind="a2", ai=2),
35 | dict(a=4, b="g", ind="a3", ai=3),
36 | dict(a=8, b="h", c=5.9, ai=4),
37 | dict(a=16, b="i", c=6.2, ind="a5", ai=5),
38 | ]
39 | )
40 | sdf = StreamingDataFrame.read_df(df, chunksize=2)
41 | sorted_df = df.sort_values(by="a")
42 | res = sdf.sort_values(by="a", temp_file=name)
43 | res_df = res.to_df()
44 | self.assertEqualDataFrame(sorted_df, res_df)
45 | res_df = res.to_df()
46 | self.assertEqualDataFrame(sorted_df, res_df)
47 |
48 | def test_sort_values_reverse(self):
49 | with tempfile.TemporaryDirectory() as temp:
50 | name = os.path.join(temp, "_data_")
51 | df = pandas.DataFrame(
52 | [
53 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
54 | dict(a=5, b="f", c=5.7, ind="a2", ai=2),
55 | dict(a=4, b="g", ind="a3", ai=3),
56 | dict(a=8, b="h", c=5.9, ai=4),
57 | dict(a=16, b="i", c=6.2, ind="a5", ai=5),
58 | ]
59 | )
60 | sdf = StreamingDataFrame.read_df(df, chunksize=2)
61 | sorted_df = df.sort_values(by="a", ascending=False)
62 | res = sdf.sort_values(by="a", temp_file=name, ascending=False)
63 | res_df = res.to_df()
64 | self.assertEqualDataFrame(sorted_df, res_df)
65 |
66 | def test_sort_values_nan_last(self):
67 | with tempfile.TemporaryDirectory() as temp:
68 | name = os.path.join(temp, "_data_")
69 | df = pandas.DataFrame(
70 | [
71 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
72 | dict(b="f", c=5.7, ind="a2", ai=2),
73 | dict(b="f", c=5.8, ind="a2", ai=2),
74 | dict(a=4, b="g", ind="a3", ai=3),
75 | dict(a=8, b="h", c=5.9, ai=4),
76 | dict(a=16, b="i", c=6.2, ind="a5", ai=5),
77 | ]
78 | )
79 | sdf = StreamingDataFrame.read_df(df, chunksize=2)
80 | sorted_df = df.sort_values(by="a", na_position="last")
81 | res = sdf.sort_values(by="a", temp_file=name, na_position="last")
82 | res_df = res.to_df()
83 | self.assertEqualDataFrame(sorted_df, res_df)
84 |
85 | def test_sort_values_nan_first(self):
86 | with tempfile.TemporaryDirectory() as temp:
87 | name = os.path.join(temp, "_data_")
88 | df = pandas.DataFrame(
89 | [
90 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
91 | dict(b="f", c=5.7, ind="a2", ai=2),
92 | dict(b="f", c=5.8, ind="a2", ai=2),
93 | dict(a=4, b="g", ind="a3", ai=3),
94 | dict(a=8, b="h", c=5.9, ai=4),
95 | dict(a=16, b="i", c=6.2, ind="a5", ai=5),
96 | ]
97 | )
98 | sdf = StreamingDataFrame.read_df(df, chunksize=2)
99 | sorted_df = df.sort_values(by="a", na_position="first")
100 | res = sdf.sort_values(by="a", temp_file=name, na_position="first")
101 | res_df = res.to_df()
102 | self.assertEqualDataFrame(sorted_df, res_df)
103 |
104 |
105 | if __name__ == "__main__":
106 | unittest.main()
107 |
--------------------------------------------------------------------------------
/_unittests/ut_df/test_pandas_groupbynan.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import pandas
3 | import numpy
4 | from scipy.sparse.linalg import lsqr as sparse_lsqr
5 | from pandas_streaming.ext_test_case import ExtTestCase, ignore_warnings
6 | from pandas_streaming.df import pandas_groupby_nan, numpy_types
7 |
8 |
9 | class TestPandasHelper(ExtTestCase):
10 | def test_pandas_groupbynan(self):
11 | self.assertTrue(sparse_lsqr is not None)
12 | types = [(int, -10), (float, -20.2), (str, "e"), (bytes, bytes("a", "ascii"))]
13 | skip = (numpy.bool_, numpy.complex64, numpy.complex128)
14 | types += [(_, _(5)) for _ in numpy_types() if _ not in skip]
15 |
16 | for ty in types:
17 | data = [
18 | {"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]},
19 | {"this": "cst", "type": "tt2=" + str(ty[0]), "value": ty[1]},
20 | {"this": "cst", "type": "row_for_nan"},
21 | ]
22 | df = pandas.DataFrame(data)
23 | gr = pandas_groupby_nan(df, "value")
24 | co = gr.sum()
25 | li = list(co["value"])
26 | try:
27 | self.assertIsInstance(li[-1], float)
28 | except AssertionError as e:
29 | raise AssertionError(f"Issue with {ty}") from e
30 | try:
31 | self.assertTrue(numpy.isnan(li[-1]))
32 | except AssertionError as e:
33 | raise AssertionError(
34 | "Issue with value {}\n--df--\n{}\n--gr--\n{}\n--co--\n{}".format(
35 | li, df, gr.count(), co
36 | )
37 | ) from e
38 |
39 | for ty in types:
40 | data = [
41 | {"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]},
42 | {"this": "cst", "type": "tt2=" + str(ty[0]), "value": ty[1]},
43 | {"this": "cst", "type": "row_for_nan"},
44 | ]
45 | df = pandas.DataFrame(data)
46 | try:
47 | gr = pandas_groupby_nan(df, ("value", "this"))
48 | t = True
49 | raise AssertionError("---")
50 | except (TypeError, KeyError):
51 | t = False
52 | if t:
53 | co = gr.sum()
54 | li = list(co["value"])
55 | self.assertIsInstance(li[-1], float)
56 | self.assertTrue(numpy.isnan(li[-1]))
57 | try:
58 | gr = pandas_groupby_nan(df, ["value", "this"])
59 | t = True
60 | except (TypeError, NotImplementedError):
61 | t = False
62 |
63 | if t:
64 | co = gr.sum()
65 | li = list(co["value"])
66 | self.assertEqual(len(li), 2)
67 |
68 | def test_pandas_groupbynan_tuple(self):
69 | data = [
70 | dict(a="a", b="b", c="c", n=1),
71 | dict(b="b", n=2),
72 | dict(a="a", n=3),
73 | dict(c="c", n=4),
74 | ]
75 | df = pandas.DataFrame(data)
76 | gr = df.groupby(["a", "b", "c"]).sum()
77 | self.assertEqual(gr.shape, (1, 1))
78 |
79 | for nanback in [True, False]:
80 | try:
81 | gr2_ = pandas_groupby_nan(
82 | df, ["a", "b", "c"], nanback=nanback, suffix="NAN"
83 | )
84 | except NotImplementedError:
85 | continue
86 | gr2 = gr2_.sum().sort_values("n")
87 | self.assertEqual(gr2.shape, (4, 4))
88 | d = gr2.to_dict("records")
89 | self.assertEqual(d[0]["a"], "a")
90 | self.assertEqual(d[0]["b"], "b")
91 | self.assertEqual(d[0]["c"], "c")
92 | self.assertEqual(d[0]["n"], 1)
93 | self.assertEqual(d[1]["a"], "NAN")
94 |
95 | def test_pandas_groupbynan_regular(self):
96 | df = pandas.DataFrame([dict(a="a", b=1), dict(a="a", b=2)])
97 | gr = df.groupby(["a"], as_index=False).sum()
98 | gr2_ = pandas_groupby_nan(df, ["a"]).sum()
99 | self.assertEqualDataFrame(gr, gr2_)
100 |
101 | def test_pandas_groupbynan_regular_nanback(self):
102 | df = pandas.DataFrame([dict(a="a", b=1, cc=0), dict(a="a", b=2)])
103 | gr = df.groupby(["a", "cc"]).sum()
104 | self.assertEqual(len(gr), 1)
105 |
106 | def test_pandas_groupbynan_doc(self):
107 | data = [
108 | dict(a=2, ind="a", n=1),
109 | dict(a=2, ind="a"),
110 | dict(a=3, ind="b"),
111 | dict(a=30),
112 | ]
113 | df = pandas.DataFrame(data)
114 | gr2 = pandas_groupby_nan(df, ["ind"]).sum()
115 | ind = list(gr2["ind"])
116 | self.assertTrue(numpy.isnan(ind[-1]))
117 | val = list(gr2["a"])
118 | self.assertEqual(val[-1], 30)
119 |
120 | @ignore_warnings(UserWarning)
121 | def test_pandas_groupbynan_doc2(self):
122 | data = [
123 | dict(a=2, ind="a", n=1),
124 | dict(a=2, ind="a"),
125 | dict(a=3, ind="b"),
126 | dict(a=30),
127 | ]
128 | df = pandas.DataFrame(data)
129 | gr2 = pandas_groupby_nan(df, ["ind", "a"], nanback=False).sum()
130 | ind = list(gr2["ind"])
131 | self.assertEqual(ind[-1], "²nan")
132 |
133 | def test_pandas_groupbynan_doc3(self):
134 | data = [
135 | dict(a=2, ind="a", n=1),
136 | dict(a=2, ind="a"),
137 | dict(a=3, ind="b"),
138 | dict(a=30),
139 | ]
140 | df = pandas.DataFrame(data)
141 | gr2 = pandas_groupby_nan(df, ["ind", "n"]).sum()
142 | ind = list(gr2["ind"])
143 | self.assertTrue(numpy.isnan(ind[-1]))
144 |
145 |
146 | if __name__ == "__main__":
147 | unittest.main()
148 |
--------------------------------------------------------------------------------
/_unittests/ut_df/test_streaming_dataframe.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | import unittest
4 | from io import StringIO
5 | import pandas
6 | import numpy
7 | from pandas_streaming.ext_test_case import ExtTestCase, ignore_warnings
8 | from pandas_streaming.data import dummy_streaming_dataframe
9 | from pandas_streaming.df import StreamingDataFrame
10 | from pandas_streaming.df.dataframe import StreamingDataFrameSchemaError
11 |
12 |
13 | class TestStreamingDataFrame(ExtTestCase):
14 | def test_shape(self):
15 | sdf = dummy_streaming_dataframe(100)
16 | dfs = list(sdf)
17 | self.assertEqual(len(dfs), 10)
18 | self.assertEqual(len(dfs), 10)
19 | shape = sdf.shape
20 | self.assertEqual(shape, (100, 2))
21 |
22 | def test_init(self):
23 | sdf = dummy_streaming_dataframe(100)
24 | df1 = sdf.to_df()
25 | sdf2 = StreamingDataFrame(sdf)
26 | df2 = sdf2.to_df()
27 | self.assertEqualDataFrame(df1, df2)
28 |
29 | def test_to_csv(self):
30 | sdf = dummy_streaming_dataframe(100)
31 | st = sdf.to_csv()
32 | self.assertStartsWith(",cint,cstr\n0,0,s0", st.replace("\r", ""))
33 | st = sdf.to_csv()
34 | self.assertStartsWith(",cint,cstr\n0,0,s0", st.replace("\r", ""))
35 |
36 | def test_iterrows(self):
37 | sdf = dummy_streaming_dataframe(100)
38 | rows = list(sdf.iterrows())
39 | self.assertEqual(sdf.shape[0], len(rows))
40 | rows = list(sdf.iterrows())
41 | self.assertEqual(sdf.shape[0], len(rows))
42 |
43 | def test_head(self):
44 | sdf = dummy_streaming_dataframe(100)
45 | st = sdf.head()
46 | self.assertEqual(st.shape, (5, 2))
47 | st = sdf.head(n=20)
48 | self.assertEqual(st.shape, (20, 2))
49 | st = sdf.head(n=20)
50 | self.assertEqual(st.shape, (20, 2))
51 |
52 | def test_tail(self):
53 | sdf = dummy_streaming_dataframe(100)
54 | st = sdf.tail()
55 | self.assertEqual(st.shape, (5, 2))
56 | st = sdf.tail(n=20)
57 | self.assertEqual(st.shape, (10, 2))
58 |
59 | def test_read_csv(self):
60 | with tempfile.TemporaryDirectory() as temp:
61 | df = pandas.DataFrame(data=dict(a=[5, 6], b=["er", "r"]))
62 | name = os.path.join(temp, "df.csv")
63 | name2 = os.path.join(temp, "df2.csv")
64 | name3 = os.path.join(temp, "df3.csv")
65 | df.to_csv(name, index=False)
66 | df.to_csv(name2, index=True)
67 | sdf = StreamingDataFrame.read_csv(name)
68 | text = sdf.to_csv(index=False)
69 | self.assertRaise(
70 | lambda: StreamingDataFrame.read_csv(name2, index_col=0, chunksize=None),
71 | ValueError,
72 | )
73 | self.assertRaise(
74 | lambda: StreamingDataFrame.read_csv(name2, index_col=0, iterator=False),
75 | ValueError,
76 | )
77 | sdf2 = StreamingDataFrame.read_csv(name2, index_col=0)
78 | text2 = sdf2.to_csv(index=True)
79 | sdf2.to_csv(name3, index=True)
80 | with open(name, "r", encoding="utf-8") as f:
81 | exp = f.read()
82 | with open(name2, "r", encoding="utf-8") as f:
83 | exp2 = f.read()
84 | with open(name3, "r", encoding="utf-8") as f:
85 | text3 = f.read()
86 | self.assertEqual(text.replace("\r", ""), exp)
87 | sdf2 = StreamingDataFrame.read_df(df)
88 | self.assertEqualDataFrame(sdf.to_dataframe(), sdf2.to_dataframe())
89 | self.assertEqual(text2.replace("\r", ""), exp2)
90 | self.assertEqual(
91 | text3.replace("\r", "").replace("\n\n", "\n"), exp2.replace("\r", "")
92 | )
93 |
94 | def test_where(self):
95 | sdf = dummy_streaming_dataframe(100)
96 | cols = sdf.columns
97 | self.assertEqual(list(cols), ["cint", "cstr"])
98 | dts = sdf.dtypes
99 | self.assertEqual(len(dts), 2)
100 | res = sdf.where(lambda row: row["cint"] == 1)
101 | st = res.to_csv()
102 | self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", st.replace("\r", ""))
103 | res = sdf.where(lambda row: row["cint"] == 1)
104 | st = res.to_csv()
105 | self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", st.replace("\r", ""))
106 |
107 | def test_dataframe(self):
108 | sdf = dummy_streaming_dataframe(100)
109 | df = sdf.to_dataframe()
110 | self.assertEqual(df.shape, (100, 2))
111 |
112 | def test_sample(self):
113 | sdf = dummy_streaming_dataframe(100)
114 | res = sdf.sample(frac=0.1)
115 | self.assertLesser(res.shape[0], 30)
116 | self.assertRaise(lambda: sdf.sample(n=5), ValueError)
117 | res = sdf.sample(frac=0.1)
118 | self.assertLesser(res.shape[0], 30)
119 | self.assertRaise(lambda: sdf.sample(n=5), ValueError)
120 |
121 | def test_sample_cache(self):
122 | sdf = dummy_streaming_dataframe(100)
123 | res = sdf.sample(frac=0.1, cache=True)
124 | df1 = res.to_df()
125 | df2 = res.to_df()
126 | self.assertEqualDataFrame(df1, df2)
127 | self.assertTrue(res.is_stable(n=df1.shape[0], do_check=True))
128 | self.assertTrue(res.is_stable(n=df1.shape[0], do_check=False))
129 | res = sdf.sample(frac=0.1, cache=False)
130 | self.assertFalse(res.is_stable(n=df1.shape[0], do_check=False))
131 |
132 | def test_sample_reservoir_cache(self):
133 | sdf = dummy_streaming_dataframe(100)
134 | res = sdf.sample(n=10, cache=True, reservoir=True)
135 | df1 = res.to_df()
136 | df2 = res.to_df()
137 | self.assertEqualDataFrame(df1, df2)
138 | self.assertEqual(df1.shape, (10, res.shape[1]))
139 | self.assertRaise(
140 | lambda: sdf.sample(n=10, cache=False, reservoir=True), ValueError
141 | )
142 | self.assertRaise(
143 | lambda: sdf.sample(frac=0.1, cache=True, reservoir=True), ValueError
144 | )
145 |
146 | def test_apply(self):
147 | sdf = dummy_streaming_dataframe(100)
148 | self.assertNotEmpty(list(sdf))
149 | sdf = sdf.applymap(str)
150 | self.assertNotEmpty(list(sdf))
151 | sdf = sdf.apply(lambda row: row[["cint"]] + "r", axis=1)
152 | self.assertNotEmpty(list(sdf))
153 | text = sdf.to_csv(header=False)
154 | self.assertStartsWith("0,0r\n1,1r\n2,2r\n3,3r", text.replace("\r", ""))
155 |
156 | def test_train_test_split(self):
157 | sdf = dummy_streaming_dataframe(100)
158 | tr, te = sdf.train_test_split(index=False, streaming=False)
159 | self.assertRaise(
160 | lambda: StreamingDataFrame.read_str(tr, chunksize=None), ValueError
161 | )
162 | self.assertRaise(
163 | lambda: StreamingDataFrame.read_str(tr, iterator=False), ValueError
164 | )
165 | StreamingDataFrame.read_str(tr.encode("utf-8"))
166 | trsdf = StreamingDataFrame.read_str(tr)
167 | tesdf = StreamingDataFrame.read_str(te)
168 | trdf = trsdf.to_dataframe()
169 | tedf = tesdf.to_dataframe()
170 | df_exp = sdf.to_dataframe()
171 | df_val = pandas.concat([trdf, tedf])
172 | self.assertEqual(df_exp.shape, df_val.shape)
173 | df_val = df_val.sort_values("cint").reset_index(drop=True)
174 | self.assertEqualDataFrame(df_val, df_exp)
175 |
176 | def test_train_test_split_streaming(self):
177 | sdf = dummy_streaming_dataframe(100, asfloat=True)
178 | trsdf, tesdf = sdf.train_test_split(
179 | streaming=True, unique_rows=True, partitions=[0.7, 0.3]
180 | )
181 | trdf = trsdf.to_dataframe()
182 | tedf = tesdf.to_dataframe()
183 | df_exp = sdf.to_dataframe()
184 | df_val = pandas.concat([trdf, tedf])
185 | self.assertEqual(df_exp.shape, df_val.shape)
186 | df_val = df_val.sort_values("cfloat").reset_index(drop=True)
187 | self.assertEqualDataFrame(df_val, df_exp)
188 | trdf2 = trsdf.to_dataframe()
189 | tedf2 = tesdf.to_dataframe()
190 | df_val = pandas.concat([trdf2, tedf2])
191 | self.assertEqual(df_exp.shape, df_val.shape)
192 | df_val = df_val.sort_values("cfloat").reset_index(drop=True)
193 | self.assertEqualDataFrame(df_val, df_exp)
194 | self.assertEqual(trdf.shape, trdf2.shape)
195 | self.assertEqual(tedf.shape, tedf2.shape)
196 | self.assertGreater(trdf.shape[0], tedf.shape[0])
197 | self.assertGreater(trdf2.shape[0], tedf2.shape[0])
198 |
199 | def test_train_test_split_streaming_tiny(self):
200 | df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))
201 |
202 | sdf2 = StreamingDataFrame.read_df(pandas.concat([df, df]))
203 | sdftr, sdfte = sdf2.train_test_split(test_size=0.5)
204 | df1 = sdfte.head()
205 | df2 = sdfte.head()
206 | if df1 is not None or df2 is not None:
207 | self.assertEqualDataFrame(df1, df2)
208 | df1 = sdftr.head()
209 | df2 = sdftr.head()
210 | if df1 is not None or df2 is not None:
211 | self.assertEqualDataFrame(df1, df2)
212 | sdf = StreamingDataFrame.read_df(df)
213 | sdf2 = sdf.concat(sdf, axis=0)
214 | sdftr, sdfte = sdf2.train_test_split(test_size=0.5)
215 | df1 = sdfte.head()
216 | df2 = sdfte.head()
217 | if df1 is not None or df2 is not None:
218 | self.assertEqualDataFrame(df1, df2)
219 | df1 = sdftr.head()
220 | df2 = sdftr.head()
221 | if df1 is not None or df2 is not None:
222 | self.assertEqualDataFrame(df1, df2)
223 |
224 | def test_train_test_split_streaming_strat(self):
225 | sdf = dummy_streaming_dataframe(
226 | 100, asfloat=True, tify=["t1" if i % 3 else "t0" for i in range(100)]
227 | )
228 | trsdf, tesdf = sdf.train_test_split(
229 | streaming=True, unique_rows=True, stratify="tify"
230 | )
231 | trdf = trsdf.to_dataframe()
232 | tedf = tesdf.to_dataframe()
233 | df_exp = sdf.to_dataframe()
234 | df_val = pandas.concat([trdf, tedf])
235 | self.assertEqual(df_exp.shape, df_val.shape)
236 | df_val = df_val.sort_values("cfloat").reset_index(drop=True)
237 | self.assertEqualDataFrame(df_val, df_exp)
238 | trdf = trsdf.to_dataframe()
239 | tedf = tesdf.to_dataframe()
240 | df_val = pandas.concat([trdf, tedf])
241 | self.assertEqual(df_exp.shape, df_val.shape)
242 | df_val = df_val.sort_values("cfloat").reset_index(drop=True)
243 | self.assertEqualDataFrame(df_val, df_exp)
244 | trgr = trdf.groupby("tify").count()
245 | trgr["part"] = 0
246 | tegr = tedf.groupby("tify").count()
247 | tegr["part"] = 1
248 | gr = pandas.concat([trgr, tegr])
249 | self.assertGreater(gr["cfloat"].min(), 4)
250 |
251 | def test_train_test_split_file(self):
252 | with tempfile.TemporaryDirectory() as temp:
253 | names = [os.path.join(temp, "train.txt"), os.path.join(temp, "test.txt")]
254 | sdf = dummy_streaming_dataframe(100)
255 | sdf.train_test_split(names, index=False, streaming=False)
256 | trsdf = StreamingDataFrame.read_csv(names[0])
257 | tesdf = StreamingDataFrame.read_csv(names[1])
258 | self.assertGreater(trsdf.shape[0], 20)
259 | self.assertGreater(tesdf.shape[0], 20)
260 | trdf = trsdf.to_dataframe()
261 | tedf = tesdf.to_dataframe()
262 | self.assertGreater(trdf.shape[0], 20)
263 | self.assertGreater(tedf.shape[0], 20)
264 | df_exp = sdf.to_dataframe()
265 | df_val = pandas.concat([trdf, tedf])
266 | self.assertEqual(df_exp.shape, df_val.shape)
267 | df_val = df_val.sort_values("cint").reset_index(drop=True)
268 | self.assertEqualDataFrame(df_val, df_exp)
269 |
270 | def test_train_test_split_file_pattern(self):
271 | with tempfile.TemporaryDirectory() as temp:
272 | sdf = dummy_streaming_dataframe(100)
273 | names = os.path.join(temp, "spl_{0}.txt")
274 | self.assertRaise(
275 | lambda: sdf.train_test_split(names, index=False, streaming=False),
276 | ValueError,
277 | )
278 | names = os.path.join(temp, "spl_{}.txt")
279 | tr, te = sdf.train_test_split(names, index=False, streaming=False)
280 | trsdf = StreamingDataFrame.read_csv(tr)
281 | tesdf = StreamingDataFrame.read_csv(te)
282 | trdf = trsdf.to_dataframe()
283 | tedf = tesdf.to_dataframe()
284 | df_exp = sdf.to_dataframe()
285 | df_val = pandas.concat([trdf, tedf])
286 | self.assertEqual(df_exp.shape, df_val.shape)
287 | df_val = df_val.sort_values("cint").reset_index(drop=True)
288 | self.assertEqualDataFrame(df_val, df_exp)
289 |
290 | def test_merge(self):
291 | def compares(a, b, how):
292 | m = a.merge(b, on="cint", indicator=True)
293 | dm = m.to_dataframe()
294 | da = a.to_dataframe()
295 | db = b.to_dataframe()
296 | exp = da.merge(db, on="cint", indicator=True)
297 | self.assertEqualDataFrame(
298 | dm.reset_index(drop=True), exp.reset_index(drop=True)
299 | )
300 |
301 | sdf20 = dummy_streaming_dataframe(20)
302 | sdf30 = dummy_streaming_dataframe(30)
303 | # itself
304 | hows = "inner left right outer".split()
305 | for how in hows:
306 | compares(sdf20, sdf20, how)
307 | compares(sdf20, sdf20, how)
308 | for how in hows:
309 | compares(sdf20, sdf30, how)
310 | compares(sdf20, sdf30, how)
311 | for how in hows:
312 | compares(sdf30, sdf20, how)
313 | compares(sdf30, sdf20, how)
314 | sdf20.merge(sdf20.to_dataframe(), on="cint", indicator=True)
315 |
316 | def test_concatv(self):
317 | sdf20 = dummy_streaming_dataframe(20)
318 | sdf30 = dummy_streaming_dataframe(30)
319 | df20 = sdf20.to_dataframe()
320 | df30 = sdf30.to_dataframe()
321 | df = pandas.concat([df20, df30], axis=0)
322 |
323 | m1 = sdf20.concat(sdf30, axis=0)
324 | self.assertEqualDataFrame(m1.to_dataframe(), df)
325 | m1 = sdf20.concat(df30, axis=0)
326 | self.assertEqualDataFrame(m1.to_dataframe(), df)
327 | m1 = sdf20.concat(map(lambda x: x, [df30]), axis=0) # noqa: C417
328 | self.assertEqualDataFrame(m1.to_dataframe(), df)
329 | m1 = sdf20.concat(map(lambda x: x, [df30]), axis=0) # noqa: C417
330 | self.assertEqualDataFrame(m1.to_dataframe(), df)
331 |
332 | df20["cint"] = df20["cint"].astype(float)
333 | self.assertRaise(
334 | lambda: sdf20.concat(df20).to_dataframe(),
335 | ValueError,
336 | "Frame others[0] do not have the same column types",
337 | )
338 | df30["g"] = 4
339 | self.assertRaise(
340 | lambda: sdf20.concat(df30).to_dataframe(),
341 | ValueError,
342 | "Frame others[0] do not have the same column names",
343 | )
344 |
345 | def test_concath(self):
346 | sdf20 = dummy_streaming_dataframe(20)
347 | sdf30 = dummy_streaming_dataframe(20)
348 | df20 = sdf20.to_dataframe()
349 | df30 = sdf30.to_dataframe()
350 | df = pandas.concat([df20, df30], axis=1)
351 |
352 | m1 = sdf20.concat(sdf30, axis=1)
353 | self.assertEqualDataFrame(m1.to_dataframe(), df)
354 | sdf22 = dummy_streaming_dataframe(22)
355 | sdf25 = dummy_streaming_dataframe(25)
356 | self.assertRaise(
357 | lambda: sdf22.concat(sdf25, axis=1).to_dataframe(), RuntimeError
358 | )
359 |
360 | def test_groupby(self):
361 | df20 = dummy_streaming_dataframe(20).to_dataframe()
362 | df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
363 | sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
364 | gr = sdf20.groupby("key", lambda gr: gr.sum())
365 | gr2 = df20.groupby("key").sum()
366 | self.assertEqualDataFrame(gr, gr2)
367 | self.assertRaise(
368 | lambda: sdf20.groupby("key", in_memory=False), NotImplementedError
369 | )
370 |
371 |         # Do not replace `lambda c: sum(c)` with `sum`, otherwise pandas raises
372 |         # pandas.core.base.SpecificationError: Function names
373 |         # must be unique, found multiple named sum
374 | gr2 = (
375 | df20.drop("cstr", axis=1).groupby("key").agg([numpy.sum, lambda c: sum(c)])
376 | )
377 | gr = sdf20.drop("cstr", axis=1).groupby(
378 | "key", lambda gr: gr.agg([numpy.sum, lambda c: sum(c)])
379 | )
380 | self.assertEqualDataFrame(gr, gr2)
381 |
382 | gr = sdf20.groupby("key", lambda gr: gr.count())
383 | gr2 = df20.groupby("key").count()
384 | self.assertEqualDataFrame(gr, gr2)
385 |
386 | df = pandas.DataFrame(dict(A=[3, 4, 3], B=[5, 6, 7]))
387 | sdf = StreamingDataFrame.read_df(df)
388 | gr = sdf.groupby("A")
389 | gr2 = df.groupby("A").sum()
390 | self.assertEqualDataFrame(gr, gr2)
391 |
392 | def test_groupby_cum(self):
393 | df20 = dummy_streaming_dataframe(20).to_dataframe()
394 | df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
395 | sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
396 | sgr = sdf20.groupby_streaming(
397 | "key", lambda gr: gr.sum(), strategy="cum", as_index=False
398 | )
399 | gr2 = df20.groupby("key", as_index=False).sum()
400 | lastgr = None
401 | for gr in sgr:
402 | self.assertEqual(list(gr.columns), list(gr2.columns))
403 | lastgr = gr
404 | self.assertEqualDataFrame(lastgr, gr2)
405 |
406 | def test_groupby_streaming(self):
407 | df20 = dummy_streaming_dataframe(20).to_dataframe()
408 | df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
409 | sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
410 | sgr = sdf20.groupby_streaming(
411 | "key", lambda gr: gr.sum(), strategy="streaming", as_index=False
412 | )
413 | gr2 = df20.groupby("key", as_index=False).sum()
414 | grs = list(sgr)
415 | gr = pandas.concat(grs).groupby("key", as_index=False).sum()
416 | self.assertEqualDataFrame(gr, gr2)
417 |
418 | def test_groupby_cum_asindex(self):
419 | df20 = dummy_streaming_dataframe(20).to_dataframe()
420 | df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
421 | sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
422 | sgr = sdf20.groupby_streaming(
423 | "key", lambda gr: gr.sum(), strategy="cum", as_index=True
424 | )
425 | gr2 = df20.groupby("key", as_index=True).sum()
426 | lastgr = None
427 | for gr in sgr:
428 | self.assertEqual(list(gr.columns), list(gr2.columns))
429 | lastgr = gr
430 | self.assertEqualDataFrame(lastgr, gr2)
431 |
432 | def test_merge_2(self):
433 | df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))
434 | df2 = pandas.concat([df, df])
435 | sdf = StreamingDataFrame.read_df(df)
436 | sdf2 = sdf.concat(sdf, axis=0)
437 | self.assertEqualDataFrame(df2, sdf2.to_dataframe())
438 | self.assertEqualDataFrame(df2, sdf2.to_dataframe())
439 | m = pandas.DataFrame(dict(Y=["a", "b"], Z=[10, 20]))
440 | jm = df2.merge(m, left_on="Y", right_on="Y", how="outer")
441 | sjm = sdf2.merge(m, left_on="Y", right_on="Y", how="outer")
442 | self.assertEqualDataFrame(
443 | jm.sort_values(["X", "Y"]).reset_index(drop=True),
444 | sjm.to_dataframe().sort_values(["X", "Y"]).reset_index(drop=True),
445 | )
446 |
447 | @ignore_warnings(ResourceWarning)
448 | def test_schema_consistent(self):
449 | df = pandas.DataFrame(
450 | [
451 | dict(cf=0, cint=0, cstr="0"),
452 | dict(cf=1, cint=1, cstr="1"),
453 | dict(cf=2, cint="s2", cstr="2"),
454 | dict(cf=3, cint=3, cstr="3"),
455 | ]
456 | )
457 | with tempfile.TemporaryDirectory() as temp:
458 | name = os.path.join(temp, "df.csv")
459 | stio = StringIO()
460 | df.to_csv(stio, index=False)
461 | self.assertNotEmpty(stio.getvalue())
462 | df.to_csv(name, index=False)
463 | self.assertEqual(df.shape, (4, 3))
464 | sdf = StreamingDataFrame.read_csv(name, chunksize=2)
465 | self.assertRaise(lambda: list(sdf), StreamingDataFrameSchemaError)
466 | sdf = StreamingDataFrame.read_csv(name, chunksize=2, check_schema=False)
467 | pieces = list(sdf)
468 | self.assertEqual(len(pieces), 2)
469 |
470 | def test_getitem(self):
471 | sdf = dummy_streaming_dataframe(100)
472 | sdf2 = sdf[["cint"]]
473 | self.assertEqual(sdf2.shape, (100, 1))
474 | df1 = sdf.to_df()
475 | df2 = sdf2.to_df()
476 | self.assertEqualDataFrame(df1[["cint"]], df2)
477 | self.assertRaise(lambda: sdf[:, "cint"], NotImplementedError)
478 |
479 | @ignore_warnings(ResourceWarning)
480 | def test_read_csv_names(self):
481 | this = os.path.abspath(os.path.dirname(__file__))
482 | data = os.path.join(this, "data", "buggy_hash2.csv")
483 | df = pandas.read_csv(data, sep="\t", names=["A", "B", "C"], header=None)
484 | sdf = StreamingDataFrame.read_csv(
485 | data, sep="\t", names=["A", "B", "C"], chunksize=2, header=None
486 | )
487 | head = sdf.head(n=1)
488 | self.assertEqualDataFrame(df.head(n=1), head)
489 |
490 | def test_add_column(self):
491 | df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))
492 | sdf = StreamingDataFrame.read_df(df)
493 | sdf2 = sdf.add_column("d", lambda _row: 1)
494 | df2 = sdf2.to_dataframe()
495 | df["d"] = 1
496 | self.assertEqualDataFrame(df, df2)
497 |
498 | sdf3 = StreamingDataFrame.read_df(df)
499 | sdf4 = sdf3.add_column("dd", 2)
500 | df4 = sdf4.to_dataframe()
501 | df["dd"] = 2
502 | self.assertEqualDataFrame(df, df4)
503 |
504 | sdfA = StreamingDataFrame.read_df(df)
505 | sdfB = sdfA.add_column("dd12", lambda row: row["dd"] + 10)
506 | dfB = sdfB.to_dataframe()
507 | df["dd12"] = 12
508 | self.assertEqualDataFrame(df, dfB)
509 |
510 | def test_fillna(self):
511 | df = pandas.DataFrame(data=dict(X=[4.5, numpy.nan, 7], Y=["a", "b", numpy.nan]))
512 | sdf = StreamingDataFrame.read_df(df)
513 |
514 | df2 = pandas.DataFrame(data=dict(X=[4.5, 10.0, 7], Y=["a", "b", "NAN"]))
515 | na = sdf.fillna(value=dict(X=10.0, Y="NAN"))
516 | ndf = na.to_df()
517 | self.assertEqualDataFrame(ndf, df2)
518 |
519 | df3 = pandas.DataFrame(data=dict(X=[4.5, 10.0, 7], Y=["a", "b", numpy.nan]))
520 | na = sdf.fillna(value=dict(X=10.0))
521 | ndf = na.to_df()
522 | self.assertEqualDataFrame(ndf, df3)
523 |
524 | def test_describe(self):
525 | x = numpy.arange(100001).astype(numpy.float64) / 100000 - 0.5
526 | y = numpy.arange(100001).astype(numpy.int64)
527 | z = numpy.array([chr(65 + j % 45) for j in y])
528 | df = pandas.DataFrame(data=dict(X=x, Y=y, Z=z))
529 | sdf = StreamingDataFrame.read_df(df)
530 |
531 | desc = sdf.describe()
532 | self.assertEqual(["X", "Y"], list(desc.columns))
533 | self.assertEqual(desc.loc["min", :].tolist(), [-0.5, 0])
534 | self.assertEqual(desc.loc["max", :].tolist(), [0.5, 100000])
535 | self.assertEqualArray(
536 | desc.loc["mean", :], numpy.array([0, 50000], dtype=numpy.float64), atol=1e-8
537 | )
538 | self.assertEqualArray(desc.loc["25%", :], numpy.array([-0.25, 25000]))
539 | self.assertEqualArray(desc.loc["50%", :], numpy.array([0.0, 50000]))
540 | self.assertEqualArray(desc.loc["75%", :], numpy.array([0.25, 75000]))
541 | self.assertEqualArray(
542 | desc.loc["std", :], numpy.array([2.886795e-01, 28867.946472]), atol=1e-4
543 | )
544 |
545 | def test_set_item(self):
546 | df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7]))
547 | self.assertRaise(lambda: StreamingDataFrame(df), TypeError)
548 | sdf = StreamingDataFrame.read_df(df)
549 |
550 | def f():
551 | sdf[["a"]] = 10
552 |
553 | self.assertRaise(f, ValueError)
554 |
555 | def g():
556 | sdf["a"] = [10]
557 |
558 | self.assertRaise(g, NotImplementedError)
559 |
560 | sdf["aa"] = 10
561 | df = sdf.to_df()
562 | ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], aa=[10]))
563 | self.assertEqualDataFrame(df, ddf)
564 | sdf["bb"] = sdf["b"] + 10
565 | df = sdf.to_df()
566 |         ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], aa=[10], bb=[16]))
567 | self.assertEqualDataFrame(df, ddf)
568 |
569 | def test_set_item_function(self):
570 | df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7]))
571 | self.assertRaise(lambda: StreamingDataFrame(df), TypeError)
572 | sdf = StreamingDataFrame.read_df(df)
573 | sdf["bb"] = sdf["b"].apply(lambda x: x + 11)
574 | df = sdf.to_df()
575 |         ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], bb=[17]))
576 | self.assertEqualDataFrame(df, ddf)
577 |
578 |
579 | if __name__ == "__main__":
580 | unittest.main(verbosity=2)
581 |
--------------------------------------------------------------------------------
/_unittests/ut_module/test_sklearn.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import numpy
3 | import pandas
4 | from sklearn.linear_model import LogisticRegression
5 | from pandas_streaming.ext_test_case import ExtTestCase
6 |
7 |
8 | class TestScikitLearn(ExtTestCase):
9 | def test_logistic_regression_check(self):
10 | X = pandas.DataFrame(numpy.array([[0.1, 0.2], [-0.2, 0.3]]))
11 | Y = numpy.array([0, 1])
12 | clq = LogisticRegression(
13 | fit_intercept=False, solver="liblinear", random_state=42
14 | )
15 | clq.fit(X, Y)
16 | pred2 = clq.predict(X)
17 | self.assertEqualArray(numpy.array([0, 1]), pred2)
18 |
19 |
20 | if __name__ == "__main__":
21 | unittest.main()
22 |
--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
1 | image:
2 | - Visual Studio 2019
3 | environment:
4 | matrix:
5 | - PYTHON: "C:\\Python310-x64"
6 | PYTHON_VERSION: "3.10.x"
7 | PYTHON_ARCH: "64"
8 | init:
9 | - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%"
10 |
11 | install:
12 | - "%PYTHON%\\python -m pip install --upgrade pip"
13 | - "%PYTHON%\\Scripts\\pip install -r requirements-dev.txt"
14 | build: off
15 |
16 | before_test:
17 | - "%PYTHON%\\python -u setup.py build_ext --inplace"
18 |
19 | test_script:
20 | - "%PYTHON%\\python -u setup.py unittests"
21 |
22 | after_test:
23 | - "%PYTHON%\\python -u setup.py bdist_wheel"
24 |
25 | artifacts:
26 | - path: dist
27 | name: pandas_streaming
28 |
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
1 | jobs:
2 | - job: 'TestLinuxWheelPip'
3 | pool:
4 | vmImage: 'ubuntu-latest'
5 | strategy:
6 | matrix:
7 | Python311-Linux:
8 | python.version: '3.11'
9 | maxParallel: 3
10 |
11 | steps:
12 | - task: UsePythonVersion@0
13 | inputs:
14 | versionSpec: '$(python.version)'
15 | architecture: 'x64'
16 | - script: sudo apt-get update
17 | displayName: 'AptGet Update'
18 | - script: sudo apt-get install -y graphviz
19 | displayName: 'Install Graphviz'
20 | - script: python -m pip install --upgrade pip setuptools wheel
21 | displayName: 'Install tools'
22 | - script: pip install -r requirements.txt
23 | displayName: 'Install Requirements'
24 | - script: pip install -r requirements-dev.txt
25 | displayName: 'Install Requirements dev'
26 | - script: |
27 | ruff check .
28 | displayName: 'Ruff'
29 | - script: |
30 | black --diff .
31 | displayName: 'Black'
32 | - script: |
33 | python -m pip wheel . --wheel-dir dist -v -v -v
34 | displayName: 'build wheel'
35 | - script: |
36 | python -m pip install . -v -v -v
37 | displayName: 'install wheel'
38 | - script: |
39 | python -m pytest
40 | displayName: 'Runs Unit Tests'
41 | - task: PublishPipelineArtifact@0
42 | inputs:
43 | artifactName: 'wheel-linux-wheel-$(python.version)'
44 | targetPath: 'dist'
45 |
46 | - job: 'TestLinuxNightly'
47 | pool:
48 | vmImage: 'ubuntu-latest'
49 | strategy:
50 | matrix:
51 | Python311-Linux:
52 | python.version: '3.11'
53 | maxParallel: 3
54 |
55 | steps:
56 | - task: UsePythonVersion@0
57 | inputs:
58 | versionSpec: '$(python.version)'
59 | architecture: 'x64'
60 | - script: sudo apt-get update
61 | displayName: 'AptGet Update'
62 | - script: sudo apt-get install -y pandoc
63 | displayName: 'Install Pandoc'
64 | - script: sudo apt-get install -y inkscape
65 | displayName: 'Install Inkscape'
66 | - script: sudo apt-get install -y graphviz
67 | displayName: 'Install Graphviz'
68 | - script: python -m pip install --upgrade pip setuptools wheel
69 | displayName: 'Install tools'
70 | - script: pip install -r requirements.txt
71 | displayName: 'Install Requirements'
72 | - script: pip install -r requirements-dev.txt
73 | displayName: 'Install Requirements dev'
74 | - script: pip uninstall -y scikit-learn
75 | displayName: 'Uninstall scikit-learn'
76 | - script: pip install --pre --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn
77 | displayName: 'Install scikit-learn nightly'
78 | - script: |
79 | ruff check .
80 | displayName: 'Ruff'
81 | - script: |
82 | black --diff .
83 | displayName: 'Black'
84 | - script: |
85 | python -m pytest
86 | displayName: 'Runs Unit Tests'
87 |
88 | - job: 'TestLinux'
89 | pool:
90 | vmImage: 'ubuntu-latest'
91 | strategy:
92 | matrix:
93 | Python311-Linux:
94 | python.version: '3.11'
95 | maxParallel: 3
96 |
97 | steps:
98 | - task: UsePythonVersion@0
99 | inputs:
100 | versionSpec: '$(python.version)'
101 | architecture: 'x64'
102 | - script: sudo apt-get update
103 | displayName: 'AptGet Update'
104 | - script: sudo apt-get install -y pandoc
105 | displayName: 'Install Pandoc'
106 | - script: sudo apt-get install -y inkscape
107 | displayName: 'Install Inkscape'
108 | - script: sudo apt-get install -y graphviz
109 | displayName: 'Install Graphviz'
110 | - script: python -m pip install --upgrade pip setuptools wheel
111 | displayName: 'Install tools'
112 | - script: pip install -r requirements.txt
113 | displayName: 'Install Requirements'
114 | - script: pip install -r requirements-dev.txt
115 | displayName: 'Install Requirements dev'
116 | - script: |
117 | ruff check .
118 | displayName: 'Ruff'
119 | - script: |
120 | black --diff .
121 | displayName: 'Black'
122 | - script: |
123 | python -m pytest --cov
124 | displayName: 'Runs Unit Tests'
125 | - script: |
126 | python -u setup.py bdist_wheel
127 | displayName: 'Build Package'
128 | #- script: |
129 | # python -m sphinx _doc dist/html
130 | # displayName: 'Builds Documentation'
131 | - task: PublishPipelineArtifact@0
132 | inputs:
133 | artifactName: 'wheel-linux-$(python.version)'
134 | targetPath: 'dist'
135 |
136 | - job: 'TestWindows'
137 | pool:
138 | vmImage: 'windows-latest'
139 | strategy:
140 | matrix:
141 | Python311-Windows:
142 | python.version: '3.11'
143 | maxParallel: 3
144 |
145 | steps:
146 | - task: UsePythonVersion@0
147 | inputs:
148 | versionSpec: '$(python.version)'
149 | architecture: 'x64'
150 | - script: python -m pip install --upgrade pip setuptools wheel
151 | displayName: 'Install tools'
152 | - script: pip install -r requirements.txt
153 | displayName: 'Install Requirements'
154 | - script: pip install -r requirements-dev.txt
155 | displayName: 'Install Requirements dev'
156 | - script: |
157 | python -m pytest
158 | displayName: 'Runs Unit Tests'
159 | - script: |
160 | python -u setup.py bdist_wheel
161 | displayName: 'Build Package'
162 | - task: PublishPipelineArtifact@0
163 | inputs:
164 | artifactName: 'wheel-windows-$(python.version)'
165 | targetPath: 'dist'
166 |
167 | - job: 'TestMac'
168 | pool:
169 | vmImage: 'macOS-latest'
170 | strategy:
171 | matrix:
172 | Python311-Mac:
173 | python.version: '3.11'
174 | maxParallel: 3
175 |
176 | steps:
177 | - task: UsePythonVersion@0
178 | inputs:
179 | versionSpec: '$(python.version)'
180 | architecture: 'x64'
181 | - script: gcc --version
182 | displayName: 'gcc version'
183 | #- script: brew upgrade
184 | # displayName: 'brew upgrade'
185 | #- script: brew update
186 | # displayName: 'brew update'
187 | - script: export
188 | displayName: 'export'
189 | - script: gcc --version
190 | displayName: 'gcc version'
191 | - script: python -m pip install --upgrade pip setuptools wheel
192 | displayName: 'Install tools'
193 | - script: pip install -r requirements.txt
194 | displayName: 'Install Requirements'
195 | - script: pip install -r requirements-dev.txt
196 | displayName: 'Install Requirements dev'
197 | - script: |
198 | python -m pytest
199 | displayName: 'Runs Unit Tests'
200 | - script: |
201 | python -u setup.py bdist_wheel
202 | displayName: 'Build Package'
203 | - task: PublishPipelineArtifact@0
204 | inputs:
205 | artifactName: 'wheel-mac-$(python.version)'
206 | targetPath: 'dist'
207 |
208 |
--------------------------------------------------------------------------------
/pandas_streaming/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.5.1"
2 | __author__ = "Xavier Dupré"
3 | __github__ = "https://github.com/sdpython/pandas_streaming"
4 | __url__ = "https://sdpython.github.io/doc/pandas-streaming/dev/"
5 | __license__ = "MIT License"
6 |
--------------------------------------------------------------------------------
/pandas_streaming/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .dummy import dummy_streaming_dataframe
2 |
--------------------------------------------------------------------------------
/pandas_streaming/data/dummy.py:
--------------------------------------------------------------------------------
1 | from pandas import DataFrame
2 | from ..df import StreamingDataFrame
3 |
4 |
5 | def dummy_streaming_dataframe(n, chunksize=10, asfloat=False, **cols):
6 | """
7 | Returns a dummy streaming dataframe
8 | mostly for unit test purposes.
9 |
10 | :param n: number of rows
11 | :param chunksize: chunk size
12 | :param asfloat: use random float and not random int
13 | :param cols: additional columns
14 | :return: a @see cl StreamingDataFrame
15 | """
16 | if asfloat:
17 | df = DataFrame(
18 | dict(
19 | cfloat=[_ + 0.1 for _ in range(n)],
20 | cstr=[f"s{i}" for i in range(n)],
21 | )
22 | )
23 | else:
24 | df = DataFrame(dict(cint=list(range(n)), cstr=[f"s{i}" for i in range(n)]))
25 | for k, v in cols.items():
26 | df[k] = v
27 | return StreamingDataFrame.read_df(df, chunksize=chunksize)
28 |
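
A short usage sketch for the helper above (not part of the file); it only relies on
``StreamingDataFrame.read_df`` and ``to_df`` as exercised in the unit tests, and the
extra ``label`` column is purely illustrative:

    from pandas_streaming.data import dummy_streaming_dataframe

    # 100 rows streamed in chunks of 10, plus an illustrative extra column
    sdf = dummy_streaming_dataframe(
        100, chunksize=10, asfloat=True, label=[i % 2 for i in range(100)]
    )
    # to_df concatenates every chunk back into a single pandas.DataFrame
    df = sdf.to_df()
    print(df.shape)  # expected (100, 3): cfloat, cstr, label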
--------------------------------------------------------------------------------
/pandas_streaming/df/__init__.py:
--------------------------------------------------------------------------------
1 | from .connex_split import (
2 | train_test_split_weights,
3 | train_test_connex_split,
4 | train_test_apart_stratify,
5 | )
6 | from .dataframe import StreamingDataFrame
7 | from .dataframe_helpers import (
8 | dataframe_hash_columns,
9 | dataframe_unfold,
10 | dataframe_shuffle,
11 | )
12 | from .dataframe_helpers import pandas_groupby_nan, numpy_types
13 | from .dataframe_io import to_zip, read_zip
14 |
--------------------------------------------------------------------------------
/pandas_streaming/df/connex_split.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | from logging import getLogger
3 | from typing import Optional, Tuple
4 | import pandas
5 | import numpy
6 | from .dataframe_helpers import dataframe_shuffle
7 |
8 | logger = getLogger("pandas-streaming")
9 |
10 |
11 | class ImbalancedSplitException(Exception):
12 | """
13 | Raised when an imbalanced split is detected.
14 | """
15 |
16 |
17 | def train_test_split_weights(
18 | df,
19 | weights=None,
20 | test_size=0.25,
21 | train_size=None,
22 | shuffle=True,
23 | fail_imbalanced=0.05,
24 | random_state=None,
25 | ):
26 | """
27 | Splits a database into train/test partitions; every row
28 | can have a different weight.
29 |
30 | :param df: :class:`pandas.DataFrame` or
31 | :class:`StreamingDataFrame <pandas_streaming.df.dataframe.StreamingDataFrame>`
32 | :param weights: None or weights or weights column name
33 | :param test_size: ratio for the test partition
34 | (if *train_size* is not specified)
35 | :param train_size: ratio for the train partition
36 | :param shuffle: shuffles before the split
37 | :param fail_imbalanced: raises an exception if relative weights
38 | difference is higher than this value
39 | :param random_state: seed for random generators
40 | :return: train and test :class:`pandas.DataFrame`
41 |
42 | If the dataframe is not shuffled first, the function
43 | will produce two datasets which are unlikely to be randomized,
44 | as it tries to keep the weights balanced between both partitions
45 | without relying on randomness.
46 | """
47 | if hasattr(df, "iter_creation"):
48 | raise NotImplementedError( # pragma: no cover
49 | "Not implemented yet for StreamingDataFrame."
50 | )
51 | if isinstance(df, numpy.ndarray):
52 | raise NotImplementedError( # pragma: no cover
53 | "Not implemented on numpy arrays."
54 | )
55 | if shuffle:
56 | df = dataframe_shuffle(df, random_state=random_state)
57 | if weights is None:
58 | if test_size == 0 or train_size == 0:
59 | raise ValueError(
60 | f"test_size={test_size} or train_size={train_size} cannot be null (1)."
61 | )
62 | from sklearn.model_selection import train_test_split
63 |
64 | return train_test_split(
65 | df, test_size=test_size, train_size=train_size, random_state=random_state
66 | )
67 |
68 | if isinstance(weights, pandas.Series):
69 | weights = list(weights)
70 | elif isinstance(weights, str):
71 | weights = list(df[weights])
72 | if len(weights) != df.shape[0]:
73 | raise ValueError(
74 | "Dimension mismatch between weights and dataframe " # noqa: UP030
75 | "{0} != {1}".format(df.shape[0], len(weights))
76 | )
77 |
78 | p = (1 - test_size) if test_size else None
79 | if train_size is not None:
80 | p = train_size
81 | test_size = 1 - p
82 | if p is None or min(test_size, p) <= 0:
83 | raise ValueError(
84 | f"test_size={test_size} or train_size={train_size} cannot be null (2)."
85 | )
86 | ratio = test_size / p
87 |
88 | if random_state is None:
89 | randint = numpy.random.randint
90 | else:
91 | state = numpy.random.RandomState(random_state)
92 | randint = state.randint
93 |
94 | balance = 0
95 | train_ids = []
96 | test_ids = []
97 | test_weights = 0
98 | train_weights = 0
99 | for i in range(df.shape[0]):
100 | w = weights[i]
101 | if balance == 0:
102 | h = randint(0, 2)  # upper bound is exclusive, draws 0 or 1
103 | totest = h == 0
104 | else:
105 | totest = balance < 0
106 | if totest:
107 | test_ids.append(i)
108 | balance += w
109 | test_weights += w
110 | else:
111 | train_ids.append(i)
112 | balance -= w * ratio
113 | train_weights += w * ratio
114 |
115 | r = abs(train_weights - test_weights) / (1.0 * (train_weights + test_weights))
116 | if r >= fail_imbalanced:
117 | raise ImbalancedSplitException( # pragma: no cover
118 | "Split is imbalanced: train_weights={0} test_weights={1} r={2}." # noqa: UP030
119 | "".format(train_weights, test_weights, r)
120 | )
121 |
122 | return df.iloc[train_ids, :], df.iloc[test_ids, :]
123 |
124 |
125 | def train_test_connex_split(
126 | df,
127 | groups,
128 | test_size=0.25,
129 | train_size=None,
130 | stratify=None,
131 | hash_size=9,
132 | unique_rows=False,
133 | shuffle=True,
134 | fail_imbalanced=0.05,
135 | keep_balance=None,
136 | stop_if_bigger=None,
137 | return_cnx=False,
138 | must_groups=None,
139 | random_state=None,
140 | verbose=0,
141 | ):
142 | """
143 | This split is for a specific case where data is linked
144 | in many ways. Let's assume we have three ids as we have
145 | for online sales: *(product id, user id, card id)*.
146 | As we may need to compute aggregated features,
147 | we need every id not to be present in both train and
148 | test set. The function computes the connected components
149 | and breaks each of them in two parts for train and test.
150 |
151 | :param df: :epkg:`pandas:DataFrame`
152 | :param groups: column names for the ids
153 | :param test_size: ratio for the test partition
154 | (if *train_size* is not specified)
155 | :param train_size: ratio for the train partition
156 | :param stratify: column holding the stratification
157 | :param hash_size: size of the hash to cache information about partition
158 | :param unique_rows: ensures that rows are unique
159 | :param shuffle: shuffles before the split
160 | :param fail_imbalanced: raises an exception if relative weights difference
161 | is higher than this value
162 | :param stop_if_bigger: (float) stops a connected component from growing
163 | bigger than this ratio of elements; this should not be used
164 | unless a big component emerges, the algorithm stops merging
165 | but does not guarantee it returns the best cut,
166 | the value should be close to 0
167 | :param keep_balance: (float), if not None, does not merge connected components
168 | if their relative sizes are too different,
169 | the value should be close to 1
170 | :param return_cnx: returns connected components as a third result
171 | :param must_groups: column names for ids which must not be shared by
172 | train/test partitions
173 | :param random_state: seed for random generator
174 | :param verbose: verbosity (uses logging)
175 | :return: Two :class:`StreamingDataFrame
176 | <pandas_streaming.df.dataframe.StreamingDataFrame>`, one
177 | for train, one for test.
178 |
179 | The list of ids must hold in memory.
180 | There is no streaming implementation for the ids.
181 |
182 | .. exref::
183 | :title: Splits a dataframe, keep ids in separate partitions
184 | :tag: dataframe
185 |
186 | In some data science problems, rows are not independent
187 | and share common values, most of the time ids. In some
188 | specific cases, multiple ids from different columns are
189 | connected and must appear in the same partition.
190 | Checking that each id column is evenly split and does not
191 | appear in both sets is not enough. Connected components
192 | are needed.
193 |
194 | .. runpython::
195 | :showcode:
196 |
197 | from pandas import DataFrame
198 | from pandas_streaming.df import train_test_connex_split
199 |
200 | df = DataFrame([dict(user="UA", prod="PAA", card="C1"),
201 | dict(user="UA", prod="PB", card="C1"),
202 | dict(user="UB", prod="PC", card="C2"),
203 | dict(user="UB", prod="PD", card="C2"),
204 | dict(user="UC", prod="PAA", card="C3"),
205 | dict(user="UC", prod="PF", card="C4"),
206 | dict(user="UD", prod="PG", card="C5"),
207 | ])
208 |
209 | train, test = train_test_connex_split(
210 | df, test_size=0.5, groups=['user', 'prod', 'card'],
211 | fail_imbalanced=0.6)
212 |
213 | print(train)
214 | print(test)
215 |
216 | If *return_cnx* is True, the third result contains:
217 |
218 | * connected components for each id
219 | * the dataframe with connected components as a new column
220 |
221 | .. runpython::
222 | :showcode:
223 |
224 | from pandas import DataFrame
225 | from pandas_streaming.df import train_test_connex_split
226 |
227 | df = DataFrame([dict(user="UA", prod="PAA", card="C1"),
228 | dict(user="UA", prod="PB", card="C1"),
229 | dict(user="UB", prod="PC", card="C2"),
230 | dict(user="UB", prod="PD", card="C2"),
231 | dict(user="UC", prod="PAA", card="C3"),
232 | dict(user="UC", prod="PF", card="C4"),
233 | dict(user="UD", prod="PG", card="C5"),
234 | ])
235 |
236 | train, test, cnx = train_test_connex_split(
237 | df, test_size=0.5, groups=['user', 'prod', 'card'],
238 | fail_imbalanced=0.6, return_cnx=True)
239 |
240 | print(cnx[0])
241 | print(cnx[1])
242 | """
243 | if stratify is not None:
244 | raise NotImplementedError( # pragma: no cover
245 | "Option stratify is not implemented."
246 | )
247 | if groups is None or len(groups) == 0:
248 | raise ValueError( # pragma: no cover
249 | "groups is empty. Use regular train_test_split."
250 | )
251 | if hasattr(df, "iter_creation"):
252 | raise NotImplementedError( # pragma: no cover
253 | "Not implemented yet for StreamingDataFrame."
254 | )
255 | if isinstance(df, numpy.ndarray):
256 | raise NotImplementedError( # pragma: no cover
257 | "Not implemented on numpy arrays."
258 | )
259 | if shuffle:
260 | df = dataframe_shuffle(df, random_state=random_state)
261 |
262 | dfids = df[groups].copy()
263 | if must_groups is not None:
264 | dfids_must = df[must_groups].copy()
265 |
266 | name = "connex"
267 | while name in dfids.columns:
268 | name += "_"
269 | one = "weight"
270 | while one in dfids.columns:
271 | one += "_"
272 |
273 | # Connected components.
274 | elements = list(range(dfids.shape[0]))
275 | counts_cnx = {i: {i} for i in elements}
276 | connex = {}
277 | avoids_merge = {}
278 |
279 | def do_connex_components(dfrows, local_groups, kb, sib):
280 | "run connected components algorithms"
281 | itern = 0
282 | modif = 1
283 |
284 | while modif > 0 and itern < len(elements):
285 | if df.shape[0] > 10000:
286 | logger.info(
287 | "[train_test_connex_split] iteration=%d-#nb connect=%d - "
288 | "modif=%s",
289 | itern,
290 | len(set(elements)),
291 | modif,
292 | )
293 |
294 | modif = 0
295 | itern += 1
296 | for i, row in enumerate(dfrows.itertuples(index=False, name=None)):
297 | vals = [
298 | val
299 | for val in zip(local_groups, row)
300 | if not isinstance(val[1], float) or not numpy.isnan(val[1])
301 | ]
302 |
303 | c = elements[i]
304 |
305 | for val in vals:
306 | if val not in connex:
307 | connex[val] = c
308 | modif += 1
309 |
310 | set_c = set(connex[val] for val in vals)
311 | set_c.add(c)
312 | new_c = min(set_c)
313 |
314 | add_pair_c = []
315 | for c in set_c:
316 | if c == new_c or (new_c, c) in avoids_merge:
317 | continue
318 | if kb is not None:
319 | maxi = min(len(counts_cnx[new_c]), len(counts_cnx[c]))
320 | if maxi > 5:
321 | diff = len(counts_cnx[new_c]) + len(counts_cnx[c]) - maxi
322 | r = diff / float(maxi)
323 | if r > kb:
324 | if verbose: # pragma: no cover
325 | logger.info(
326 | "[train_test_connex_split] balance "
327 | "r=%1.4f>%1.2f, #[%d]=%d, #[%d]=%d",
328 | r,
329 | kb,
330 | new_c,
331 | len(counts_cnx[new_c]),
332 | c,
333 | len(counts_cnx[c]),
334 | )
335 |
336 | continue
337 |
338 | if sib is not None:
339 | r = (len(counts_cnx[new_c]) + len(counts_cnx[c])) / float(
340 | len(elements)
341 | )
342 | if r > sib:
343 | logger.info(
344 | "[train_test_connex_split] "
345 | "no merge r=%1.4f>%1.2f, #[%d]=%d, #[%d]=%d",
346 | r,
347 | sib,
348 | new_c,
349 | len(counts_cnx[new_c]),
350 | c,
351 | len(counts_cnx[c]),
352 | )
353 | avoids_merge[new_c, c] = i
354 | continue
355 |
356 | add_pair_c.append(c)
357 |
358 | if len(add_pair_c) > 0:
359 | for c in add_pair_c:
360 | modif += len(counts_cnx[c])
361 | for ii in counts_cnx[c]:
362 | elements[ii] = new_c
363 | counts_cnx[new_c] = counts_cnx[new_c].union(counts_cnx[c])
364 | counts_cnx[c] = set()
365 |
366 | keys = list(vals)
367 | for val in keys:
368 | if connex[val] == c:
369 | connex[val] = new_c
370 | modif += 1
371 |
372 | if must_groups:
373 | do_connex_components(dfids_must, must_groups, None, None)
374 | do_connex_components(dfids, groups, keep_balance, stop_if_bigger)
375 |
376 | # final
377 | dfids[name] = elements
378 | dfids[one] = 1
379 | grsum = dfids[[name, one]].groupby(name, as_index=False).sum()
380 | for g in groups:
381 | logger.info("[train_test_connex_split] #nb in '%s': %d", g, len(set(dfids[g])))
382 | logger.info(
383 | "[train_test_connex_split] #connex %d/%d", grsum.shape[0], dfids.shape[0]
384 | )
385 | if grsum.shape[0] <= 1:
386 | raise ValueError( # pragma: no cover
387 | "Every element is in the same connected components."
388 | )
389 |
390 | # Statistics: top connected components
391 | if verbose:
392 | # Global statistics
393 | counts = Counter(elements)
394 | cl = [(v, k) for k, v in counts.items()]
395 | cum = 0
396 | maxc = None
397 | logger.info(
398 | "[train_test_connex_split] number of connected components: %d",
399 | len(set(elements)),
400 | )
401 | for i, (v, k) in enumerate(sorted(cl, reverse=True)):
402 | if i == 0:
403 | maxc = k, v
404 | if i >= 10:
405 | break
406 | cum += v
407 | logger.info(
408 | "[train_test_connex_split] c=%s #elements=%s cumulated=%d/%d",
409 | k,
410 | v,
411 | cum,
412 | len(elements),
413 | )
414 |
415 | # Most important component
416 | logger.info(
417 | "[train_test_connex_split] first rows of the biggest component %s", maxc
418 | )
419 | tdf = dfids[dfids[name] == maxc[0]]
420 | logger.info("[train_test_connex_split] %s", tdf.head(n=10))
421 |
422 | # Splits.
423 | train, test = train_test_split_weights(
424 | grsum,
425 | weights=one,
426 | test_size=test_size,
427 | train_size=train_size,
428 | shuffle=shuffle,
429 | fail_imbalanced=fail_imbalanced,
430 | random_state=random_state,
431 | )
432 | train.drop(one, inplace=True, axis=1)
433 | test.drop(one, inplace=True, axis=1)
434 |
435 | # We compute the final dataframe.
436 | def double_merge(d):
437 | "merge twice"
438 | merge1 = dfids.merge(d, left_on=name, right_on=name)
439 | merge2 = df.merge(merge1, left_on=groups, right_on=groups)
440 | return merge2
441 |
442 | train_f = double_merge(train)
443 | test_f = double_merge(test)
444 | if return_cnx:
445 | return train_f, test_f, (connex, dfids)
446 | else:
447 | return train_f, test_f
448 |
449 |
450 | def train_test_apart_stratify(
451 | df: pandas.DataFrame,
452 | group,
453 | test_size: Optional[float] = 0.25,
454 | train_size: Optional[float] = None,
455 | stratify: Optional[str] = None,
456 | force: bool = False,
457 | random_state: Optional[int] = None,
458 | sorted_indices: bool = False,
459 | ) -> Tuple["StreamingDataFrame", "StreamingDataFrame"]: # noqa: F821
460 | """
461 | This split is for a specific case where data is linked
462 | in one way. Let's assume we have two ids as we have
463 | for online sales: *(product id, category id)*.
464 | A product can have multiple categories. We need to have
465 | distinct products on train and test but common categories
466 | on both sides.
467 |
468 | :param df: :epkg:`pandas:DataFrame`
469 | :param group: column name for the ids
470 | :param test_size: ratio for the test partition
471 | (if *train_size* is not specified)
472 | :param train_size: ratio for the train partition
473 | :param stratify: column holding the stratification
474 | :param force: if True, tries to get at least one example on the test side
475 | for each value of the column *stratify*
476 | :param random_state: seed for random generators
477 | :param sorted_indices: sort index first,
478 | see issue `41 <https://github.com/sdpython/pandas_streaming/issues/41>`_
479 | :return: Two :class:`StreamingDataFrame
480 | <pandas_streaming.df.dataframe.StreamingDataFrame>`, one
481 | for train, one for test.
482 |
483 | The list of ids must hold in memory.
484 | There is no streaming implementation for the ids.
485 | This split was implemented for a case of multi-label
486 | classification. A category (*stratify*) is not exclusive
487 | and an observation can be assigned to multiple
488 | categories. In that particular case, the method
489 | :func:`sklearn.model_selection.train_test_split`
490 | cannot be used directly.
491 |
492 | .. runpython::
493 | :showcode:
494 |
495 | import pandas
496 | from pandas_streaming.df import train_test_apart_stratify
497 |
498 | df = pandas.DataFrame([dict(a=1, b="e"),
499 | dict(a=1, b="f"),
500 | dict(a=2, b="e"),
501 | dict(a=2, b="f")])
502 |
503 | train, test = train_test_apart_stratify(
504 | df, group="a", stratify="b", test_size=0.5)
505 | print(train)
506 | print('-----------')
507 | print(test)
508 |
509 | """
510 | if stratify is None:
511 | raise ValueError("stratify must be specified.") # pragma: no cover
512 | if group is None:
513 | raise ValueError("group must be specified.") # pragma: no cover
514 | if hasattr(df, "iter_creation"):
515 | raise NotImplementedError("Not implemented yet for StreamingDataFrame.")
516 | if isinstance(df, numpy.ndarray):
517 | raise NotImplementedError("Not implemented on numpy arrays.")
518 |
519 | p = (1 - test_size) if test_size else None
520 | if train_size is not None:
521 | p = train_size
522 | test_size = 1 - p
523 | if p is None or min(test_size, p) <= 0:
524 | raise ValueError( # pragma: no cover
525 | f"test_size={test_size} or train_size={train_size} cannot be null"
526 | )
527 |
528 | couples = df[[group, stratify]].itertuples(name=None, index=False)
529 | hist = Counter(df[stratify])
530 | sorted_hist = [(v, k) for k, v in hist.items()]
531 | sorted_hist.sort()
532 | ids = {c: set() for c in hist}
533 |
534 | for g, s in couples:
535 | ids[s].add(g)
536 |
537 | if random_state is None:
538 | permutation = numpy.random.permutation
539 | else:
540 | state = numpy.random.RandomState(random_state)
541 | permutation = state.permutation
542 |
543 | split = {}
544 | for _, k in sorted_hist:
545 | indices = sorted(ids[k]) if sorted_indices else ids[k]
546 | not_assigned, assigned = [], []
547 | for c in indices:
548 | if c in split:
549 | assigned.append(c)
550 | else:
551 | not_assigned.append(c)
552 | if len(not_assigned) == 0:
553 | continue
554 | nb_test = sum(split[c] for c in assigned)
555 | expected = min(len(ids[k]), int(test_size * len(ids[k]) + 0.5)) - nb_test
556 | if force and expected == 0 and nb_test == 0:
557 | nb_train = len(assigned) - nb_test
558 | if nb_train > 0 or len(not_assigned) > 1:
559 | expected = min(1, len(not_assigned))
560 | if expected > 0:
561 | not_assigned = list(permutation(not_assigned))
562 | for e in not_assigned[:expected]:
563 | split[e] = 1
564 | for e in not_assigned[expected:]:
565 | split[e] = 0
566 | else:
567 | for c in not_assigned:
568 | split[c] = 0
569 |
570 | train_set = set(k for k, v in split.items() if v == 0)
571 | test_set = set(k for k, v in split.items() if v == 1)
572 | train_df = df[df[group].isin(train_set)]
573 | test_df = df[df[group].isin(test_set)]
574 | return train_df, test_df
575 |
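
Unlike the two other functions in this file, ``train_test_split_weights`` has no inline
example; a minimal sketch of a call, assuming a weight column named ``w`` (the values and
the ``fail_imbalanced`` threshold are illustrative, the split depends on the random seed):

    import pandas
    from pandas_streaming.df import train_test_split_weights

    df = pandas.DataFrame(dict(x=list(range(20)), w=[1] * 10 + [5] * 10))
    # split so that the total weight in column 'w' is roughly balanced 75/25
    train, test = train_test_split_weights(
        df, weights="w", test_size=0.25, fail_imbalanced=0.5, random_state=0
    )
    print(train.shape, test.shape)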
--------------------------------------------------------------------------------
/pandas_streaming/df/dataframe_helpers.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import struct
3 | import warnings
4 | import numpy
5 | from pandas import DataFrame, Index, Series
6 |
7 |
8 | def numpy_types():
9 | """
10 | Returns the list of :epkg:`numpy` available types.
11 |
12 | :return: list of types
13 | """
14 |
15 | return [
16 | numpy.bool_,
17 | numpy.int_,
18 | numpy.intc,
19 | numpy.intp,
20 | numpy.int8,
21 | numpy.int16,
22 | numpy.int32,
23 | numpy.int64,
24 | numpy.uint8,
25 | numpy.uint16,
26 | numpy.uint32,
27 | numpy.uint64,
28 | numpy.float16,
29 | numpy.float32,
30 | numpy.float64,
31 | numpy.complex64,
32 | numpy.complex128,
33 | ]
34 |
35 |
36 | def hash_str(c, hash_length):
37 | """
38 | Hashes a string.
39 |
40 | @param c value to hash
41 | @param hash_length hash_length
42 | @return string
43 | """
44 | if isinstance(c, float):
45 | if numpy.isnan(c):
46 | return c
47 | raise ValueError(f"numpy.nan expected, not {c}")
48 | m = hashlib.sha256()
49 | m.update(c.encode("utf-8"))
50 | r = m.hexdigest()
51 | if len(r) >= hash_length:
52 | return r[:hash_length]
53 | return r
54 |
55 |
56 | def hash_int(c, hash_length):
57 | """
58 | Hashes an integer into an integer.
59 |
60 | @param c value to hash
61 | @param hash_length hash_length
62 | @return int
63 | """
64 | if isinstance(c, float):
65 | if numpy.isnan(c):
66 | return c
67 | else:
68 | raise ValueError(f"numpy.nan expected, not {c}")
69 | else:
70 | b = struct.pack("i", c)
71 | m = hashlib.sha256()
72 | m.update(b)
73 | r = m.hexdigest()
74 | if len(r) >= hash_length:
75 | r = r[:hash_length]
76 | return int(r, 16) % (10**8)
77 |
78 |
79 | def hash_float(c, hash_length):
80 | """
81 | Hashes a float into a float.
82 |
83 | @param c value to hash
84 | @param hash_length hash_length
85 | @return int
86 | """
87 | if numpy.isnan(c):
88 | return c
89 | else:
90 | b = struct.pack("d", c)
91 | m = hashlib.sha256()
92 | m.update(b)
93 | r = m.hexdigest()
94 | if len(r) >= hash_length:
95 | r = r[:hash_length]
96 | i = int(r, 16) % (2**53)
97 | return float(i)
98 |
99 |
100 | def dataframe_hash_columns(df, cols=None, hash_length=10, inplace=False):
101 | """
102 | Hashes a set of columns in a dataframe.
103 | Keeps the same type. Skips missing values.
104 |
105 | @param df dataframe
106 | @param cols columns to hash or None for all of them.
107 | @param hash_length for strings only, length of the hash
108 | @param inplace modifies inplace
109 | @return new dataframe
110 |
111 | This might be useful to anonymize data before
112 | making it public.
113 |
114 | .. exref::
115 | :title: Hashes a set of columns in a dataframe
116 | :tag: dataframe
117 |
118 | .. runpython::
119 | :showcode:
120 |
121 | import pandas
122 | from pandas_streaming.df import dataframe_hash_columns
123 | df = pandas.DataFrame([dict(a=1, b="e", c=5.6, ind="a1", ai=1),
124 | dict(b="f", c=5.7, ind="a2", ai=2),
125 | dict(a=4, b="g", ind="a3", ai=3),
126 | dict(a=8, b="h", c=5.9, ai=4),
127 | dict(a=16, b="i", c=6.2, ind="a5", ai=5)])
128 | print(df)
129 | print('--------------')
130 | df2 = dataframe_hash_columns(df)
131 | print(df2)
132 | """
133 | if cols is None:
134 | cols = list(df.columns)
135 |
136 | if not inplace:
137 | df = df.copy()
138 |
139 | def hash_intl(c):
140 | "hash int"
141 | return hash_int(c, hash_length)
142 |
143 | def hash_strl(c):
144 | "hash string"
145 | return hash_str(c, hash_length)
146 |
147 | def hash_floatl(c):
148 | "hash float"
149 | return hash_float(c, hash_length)
150 |
151 | coltype = dict(zip(df.columns, df.dtypes))
152 | for c in cols:
153 | t = coltype[c]
154 | if t == int: # noqa: E721
155 | df[c] = df[c].apply(hash_intl)
156 | elif t == numpy.int64:
157 | df[c] = df[c].apply(lambda x: numpy.int64(hash_intl(x)))
158 | elif t == float: # noqa: E721
159 | df[c] = df[c].apply(hash_floatl)
160 | elif t == object: # noqa: E721
161 | df[c] = df[c].apply(hash_strl)
162 | else:
163 | raise NotImplementedError( # pragma: no cover
164 | f"Conversion of type {t} in column '{c}' is not implemented"
165 | )
166 |
167 | return df
168 |
169 |
170 | def dataframe_unfold(df, col, new_col=None, sep=","):
171 | """
172 | One column may contain concatenated values.
173 | This function splits these values and multiplies the
174 | rows for each split value.
175 |
176 | @param df dataframe
177 | @param col column with the concatenated values (strings)
178 | @param new_col new column name, if None, use default value.
179 | @param sep separator
180 | @return a new dataframe
181 |
182 | .. exref::
183 | :title: Unfolds a column of a dataframe.
184 | :tag: dataframe
185 |
186 | .. runpython::
187 | :showcode:
188 |
189 | import pandas
190 | import numpy
191 | from pandas_streaming.df import dataframe_unfold
192 |
193 | df = pandas.DataFrame([dict(a=1, b="e,f"),
194 | dict(a=2, b="g"),
195 | dict(a=3)])
196 | print(df)
197 | df2 = dataframe_unfold(df, "b")
198 | print('----------')
199 | print(df2)
200 |
201 | # To fold:
202 | folded = df2.groupby('a').apply(
203 | lambda row: ','.join(row['b_unfold'].dropna())
204 | if len(row['b_unfold'].dropna()) > 0 else numpy.nan)
205 | print('----------')
206 | print(folded)
207 | """
208 | if new_col is None:
209 | col_name = col + "_unfold"
210 | else:
211 | col_name = new_col
212 | temp_col = "__index__"
213 | while temp_col in df.columns:
214 | temp_col += "_"
215 | rows = []
216 | for i, v in enumerate(df[col]):
217 | if isinstance(v, str):
218 | spl = v.split(sep)
219 | for vs in spl:
220 | rows.append({col: v, col_name: vs, temp_col: i})
221 | else:
222 | rows.append({col: v, col_name: v, temp_col: i})
223 | df = df.copy()
224 | df[temp_col] = list(range(df.shape[0]))
225 | dfj = DataFrame(rows)
226 | res = df.merge(dfj, on=[col, temp_col])
227 | return res.drop(temp_col, axis=1).copy()
228 |
229 |
230 | def dataframe_shuffle(df, random_state=None):
231 | """
232 | Shuffles a dataframe.
233 |
234 | :param df: :epkg:`pandas:DataFrame`
235 | :param random_state: seed
236 | :return: new :epkg:`pandas:DataFrame`
237 |
238 | .. exref::
239 | :title: Shuffles the rows of a dataframe
240 | :tag: dataframe
241 |
242 | .. runpython::
243 | :showcode:
244 |
245 | import pandas
246 | from pandas_streaming.df import dataframe_shuffle
247 |
248 | df = pandas.DataFrame([dict(a=1, b="e", c=5.6, ind="a1"),
249 | dict(a=2, b="f", c=5.7, ind="a2"),
250 | dict(a=4, b="g", c=5.8, ind="a3"),
251 | dict(a=8, b="h", c=5.9, ind="a4"),
252 | dict(a=16, b="i", c=6.2, ind="a5")])
253 | print(df)
254 | print('----------')
255 |
256 | shuffled = dataframe_shuffle(df, random_state=0)
257 | print(shuffled)
258 | """
259 | if random_state is not None:
260 | state = numpy.random.RandomState(random_state)
261 | permutation = state.permutation
262 | else:
263 | permutation = numpy.random.permutation
264 | ori_cols = list(df.columns)
265 | scols = set(ori_cols)
266 |
267 | no_index = df.reset_index(drop=False)
268 | keep_cols = [_ for _ in no_index.columns if _ not in scols]
269 | index = no_index.index
270 | index = permutation(index)
271 | shuffled = no_index.iloc[index, :]
272 | res = shuffled.set_index(keep_cols)[ori_cols]
273 | res.index.names = df.index.names
274 | return res
275 |
276 |
277 | def pandas_fillna(df, by, hasna=None, suffix=None):
278 | """
279 | Replaces the :epkg:`nan` values with something not :epkg:`nan`.
280 | Mostly used by @see fn pandas_groupby_nan.
281 |
282 | :param df: dataframe
283 | :param by: list of columns for which we need to replace nan
284 | :param hasna: None or list of columns for which we need to replace NaN
285 | :param suffix: suffix used to build a replacement for the NaN values
286 | :return: list of values chosen for each column, new dataframe (new copy)
287 | """
288 | suffix = suffix if suffix else "²nan"
289 | df = df.copy()
290 | rep = {}
291 | for c in by:
292 | if hasna is not None and c not in hasna:
293 | continue
294 | if df[c].dtype in (str, bytes, object):
295 | se = set(df[c].dropna())
296 | val = se.pop()
297 | if isinstance(val, str):
298 | cst = suffix
299 | val = ""
300 | elif isinstance(val, bytes):
301 | cst = b"_"
302 | else:
303 | raise TypeError( # pragma: no cover
304 | "Unable to determine a constant for type='{0}' dtype='{1}'".format( # noqa: UP030
305 | val, df[c].dtype
306 | )
307 | )
308 | val += cst
309 | while val in se:
310 | val += suffix
311 | df[c].fillna(val, inplace=True)
312 | rep[c] = val
313 | else:
314 | dr = df[c].dropna()
315 | mi = abs(dr.min())
316 | ma = abs(dr.max())
317 | val = ma + mi
318 | if val == ma and not isinstance(val, str):
319 | val += ma + 1.0
320 | if val <= ma:
321 | raise ValueError( # pragma: no cover
322 | "Unable to find a different value for column '{}' v='{}: "
323 | "min={} max={}".format(c, val, mi, ma)
324 | )
325 | df[c].fillna(val, inplace=True)
326 | rep[c] = val
327 | return rep, df
328 |
329 |
330 | def pandas_groupby_nan(
331 | df, by, axis=0, as_index=False, suffix=None, nanback=True, **kwargs
332 | ):
333 | """
334 | Does a *groupby* while keeping missing values (:epkg:`nan`).
335 |
336 | :param df: dataframe
337 | :param by: column or list of columns
338 | :param axis: only 0 is allowed
339 | :param as_index: should be False
340 | :param suffix: None or a string
341 | :param nanback: put :epkg:`nan` back in the index,
342 | otherwise it leaves a replacement for :epkg:`nan`.
343 | (does not work when grouping by multiple columns)
344 | :param kwargs: other parameters sent to
345 | `groupby
346 | <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html>`_
347 | :return: groupby results
348 |
349 | See :epkg:`groupby and missing values`.
350 | If no :epkg:`nan` is detected, the function falls back in regular
351 | :epkg:`pandas:DataFrame:groupby` which has the following
352 | behavior.
353 |
354 | .. exref::
355 | :title: Group a dataframe by one column including nan values
356 | :tag: dataframe
357 |
358 | The regular :epkg:`pandas:dataframe:GroupBy` of a
359 | :epkg:`pandas:DataFrame` removes every :epkg:`nan`
360 | values from the index.
361 |
362 | .. runpython::
363 | :showcode:
364 |
365 | from pandas import DataFrame
366 |
367 | data = [dict(a=2, ind="a", n=1),
368 | dict(a=2, ind="a"),
369 | dict(a=3, ind="b"),
370 | dict(a=30)]
371 | df = DataFrame(data)
372 | print(df)
373 | gr = df.groupby(["ind"]).sum()
374 | print(gr)
375 |
376 | Function @see fn pandas_groupby_nan keeps them.
377 |
378 | .. runpython::
379 | :showcode:
380 |
381 | from pandas import DataFrame
382 | from pandas_streaming.df import pandas_groupby_nan
383 |
384 | data = [dict(a=2, ind="a", n=1),
385 | dict(a=2, ind="a"),
386 | dict(a=3, ind="b"),
387 | dict(a=30)]
388 | df = DataFrame(data)
389 | gr2 = pandas_groupby_nan(df, ["ind"]).sum()
390 | print(gr2)
391 | """
392 | if nanback and suffix is None:
393 | try:
394 | res = df.groupby(by, axis=axis, as_index=as_index, dropna=False, **kwargs)
395 | except TypeError:
396 | # old version of pandas
397 | res = None
398 | if res is not None:
399 | if suffix is None:
400 | return res
401 | res.index = Series(res.index).replace(numpy.nan, suffix)
402 | return res
403 | if axis != 0:
404 | raise NotImplementedError("axis should be 0")
405 | if as_index:
406 | raise NotImplementedError("as_index must be False")
407 | if isinstance(by, tuple):
408 | raise TypeError("by should be of list not tuple")
409 | if not isinstance(by, list):
410 | by = [by]
411 | hasna = {}
412 | for b in by:
413 | h = df[b].isnull().values.any()
414 | if h:
415 | hasna[b] = True
416 | if len(hasna) > 0:
417 | rep, df_copy = pandas_fillna(df, by, hasna, suffix=suffix)
418 | res = df_copy.groupby(by, axis=axis, as_index=as_index, **kwargs)
419 | if len(by) == 1:
420 | if not nanback:
421 | dummy = DataFrame([{"a": "a"}])
422 | do = dummy.dtypes[0]
423 | typ = dict(zip(df.columns, df.dtypes))
424 | if typ[by[0]] != do:
425 | warnings.warn( # pragma: no cover
426 | f"[pandas_groupby_nan] NaN value: {rep}", stacklevel=0
427 | )
428 | return res
429 | for b in by:
430 | fnan = rep[b]
431 | if fnan in res.grouper.groups:
432 | res.grouper.groups[numpy.nan] = res.grouper.groups[fnan]
433 | del res.grouper.groups[fnan]
434 | new_val = [
435 | (numpy.nan if b == fnan else b) for b in res.grouper.result_index
436 | ]
437 | res.grouper.groupings[0]._group_index = Index(new_val)
438 | res.grouper.groupings[0].obj[b].replace(fnan, numpy.nan, inplace=True)
439 | if hasattr(res.grouper, "grouping"):
440 | if isinstance(res.grouper.groupings[0].grouper, numpy.ndarray):
441 | arr = numpy.array(new_val)
442 | res.grouper.groupings[0].grouper = arr
443 | if (
444 | hasattr(res.grouper.groupings[0], "_cache")
445 | and "result_index" in res.grouper.groupings[0]._cache
446 | ):
447 | del res.grouper.groupings[0]._cache["result_index"]
448 | else:
449 | raise NotImplementedError(
450 | "Not implemented for type: {0}".format( # noqa: UP030
451 | type(res.grouper.groupings[0].grouper)
452 | )
453 | )
454 | else:
455 | grouper = res.grouper._get_grouper()
456 | if isinstance(grouper, numpy.ndarray):
457 | arr = numpy.array(new_val)
458 | res.grouper.groupings[0].grouping_vector = arr
459 | if (
460 | hasattr(res.grouper.groupings[0], "_cache")
461 | and "result_index" in res.grouper.groupings[0]._cache
462 | ):
463 | index = res.grouper.groupings[0]._cache["result_index"]
464 | if len(rep) == 1:
465 | key = list(rep.values())[0] # noqa: RUF015
466 | new_index = numpy.array(index)
467 | for i in range(len(new_index)):
468 | if new_index[i] == key:
469 | new_index[i] = numpy.nan
470 | res.grouper.groupings[0]._cache["result_index"] = (
471 | index.__class__(new_index)
472 | )
473 | else:
474 | raise NotImplementedError( # pragma: no cover
475 | "NaN values not implemented for multiindex."
476 | )
477 | else:
478 | raise NotImplementedError( # pragma: no cover
479 | "Not implemented for type: {0}".format( # noqa: UP030
480 | type(res.grouper.groupings[0].grouper)
481 | )
482 | )
483 | res.grouper._cache["result_index"] = res.grouper.groupings[
484 | 0
485 | ]._group_index
486 | else:
487 | if not nanback:
488 | dummy = DataFrame([{"a": "a"}])
489 | do = dummy.dtypes[0]
490 | typ = dict(zip(df.columns, df.dtypes))
491 | for b in by:
492 | if typ[b] != do:
493 | warnings.warn( # pragma: no cover
494 | f"[pandas_groupby_nan] NaN values: {rep}", stacklevel=0
495 | )
496 | break
497 | return res
498 | raise NotImplementedError(
499 | "Not yet implemented. Replacing pseudo nan values by real nan "
500 | "values is not as easy as it looks. Use nanback=False"
501 | )
502 |
503 | # keys = list(res.grouper.groups.keys())
504 | # didit = False
505 | # mapping = {}
506 | # for key in keys:
507 | # new_key = list(key)
508 | # mod = False
509 | # for k, b in enumerate(by):
510 | # if b not in rep:
511 | # continue
512 | # fnan = rep[b]
513 | # if key[k] == fnan:
514 | # new_key[k] = numpy.nan
515 | # mod = True
516 | # didit = True
517 | # mapping[fnan] = numpy.nan
518 | # if mod:
519 | # new_key = tuple(new_key)
520 | # mapping[key] = new_key
521 | # res.grouper.groups[new_key] = res.grouper.groups[key]
522 | # del res.grouper.groups[key]
523 | # if didit:
524 | # # this code does not work
525 | # vnan = numpy.nan
526 | # new_index = list(mapping.get(v, v)
527 | # for v in res.grouper.result_index)
528 | # names = res.grouper.result_index.names
529 | # # index = MultiIndex.from_tuples(tuples=new_index, names=names)
530 | # # res.grouper.result_index = index # does not work cannot set
531 | # # values for [result_index]
532 | # for k in range(len(res.grouper.groupings)):
533 | # grou = res.grouper.groupings[k]
534 | # new_val = list(mapping.get(v, v) for v in grou)
535 | # grou._group_index = Index(new_val)
536 | # b = names[k]
537 | # if b in rep:
538 | # vv = rep[b]
539 | # grou.obj[b].replace(vv, vnan, inplace=True)
540 | # if isinstance(grou.grouper, numpy.ndarray):
541 | # grou.grouper = numpy.array(new_val)
542 | # else:
543 | # raise NotImplementedError(
544 | # "Not implemented for type: {0}".format(
545 | # type(grou.grouper)))
546 | # del res.grouper._cache
547 | return res
548 | return df.groupby(by, axis=axis, **kwargs)
549 |
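
``pandas_fillna`` has no inline example in its docstring; a small sketch of what it
returns, assuming one string column and one float column (the replacement constants
depend on the column types and values):

    import numpy
    import pandas
    from pandas_streaming.df.dataframe_helpers import pandas_fillna

    df = pandas.DataFrame(dict(ind=["a", "a", numpy.nan, "b"],
                               value=[1.0, 2.0, 3.0, numpy.nan]))
    rep, df2 = pandas_fillna(df, by=["ind", "value"])
    # rep maps every processed column to the constant used instead of NaN
    print(rep)
    print(df2)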
--------------------------------------------------------------------------------
/pandas_streaming/df/dataframe_io.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import zipfile
4 | import pandas
5 | import numpy
6 |
7 |
8 | def to_zip(df, zipfilename, zname="df.csv", **kwargs):
9 | """
10 | Saves a :epkg:`Dataframe` into a :epkg:`zip` file.
11 | It can be read by :meth:`read_zip`.
12 |
13 | :param df: :epkg:`dataframe` or :class:`numpy.ndarray`
14 | :param zipfilename: a :class:`zipfile.ZipFile` or a filename
15 | :param zname: a filename in the zipfile
16 | :param kwargs: parameters for :meth:`pandas.DataFrame.to_csv` or
17 | :func:`numpy.save`
18 | :return: zipfilename
19 |
20 | .. exref::
21 | :title: Saves and reads a dataframe in a zip file
22 | :tag: dataframe
23 |
24 | This shows an example of how to save and read a
25 | :class:`pandas.DataFrame` directly into a zip file.
26 |
27 | .. runpython::
28 | :showcode:
29 |
30 | import pandas
31 | from pandas_streaming.df import to_zip, read_zip
32 |
33 | df = pandas.DataFrame([dict(a=1, b="e"),
34 | dict(b="f", a=5.7)])
35 |
36 | name = "dfs.zip"
37 | to_zip(df, name, encoding="utf-8", index=False)
38 | df2 = read_zip(name, encoding="utf-8")
39 | print(df2)
40 |
41 | .. exref::
42 | :title: Saves and reads a numpy array in a zip file
43 | :tag: array
44 |
45 | This shows an example of how to save and read a
46 | :class:`numpy.ndarray` directly into a zip file.
47 |
48 | .. runpython::
49 | :showcode:
50 |
51 | import numpy
52 | from pandas_streaming.df import to_zip, read_zip
53 |
54 | arr = numpy.array([[0.5, 1.5], [0.4, 1.6]])
55 |
56 | name = "dfsa.zip"
57 | to_zip(arr, name, 'arr.npy')
58 | arr2 = read_zip(name, 'arr.npy')
59 | print(arr2)
60 | """
61 | if isinstance(df, pandas.DataFrame):
62 | stb = io.StringIO()
63 | ext = os.path.splitext(zname)[-1]
64 | if ext == ".npy":
65 | raise ValueError( # pragma: no cover
66 | "Extension '.npy' cannot be used to save a dataframe."
67 | )
68 | df.to_csv(stb, **kwargs)
69 | elif isinstance(df, numpy.ndarray):
70 | stb = io.BytesIO()
71 | ext = os.path.splitext(zname)[-1]
72 | if ext != ".npy":
73 | raise ValueError( # pragma: no cover
74 | "Extension '.npy' is required when saving a numpy array."
75 | )
76 | numpy.save(stb, df, **kwargs)
77 | else:
78 | raise TypeError(f"Type not handled {type(df)}") # pragma: no cover
79 | text = stb.getvalue()
80 |
81 | if isinstance(zipfilename, str):
82 | ext = os.path.splitext(zipfilename)[-1]
83 | if ext != ".zip":
84 | raise NotImplementedError( # pragma: no cover
85 | f"Only zip files are implemented, not '{ext}'."
86 | )
87 | zf = zipfile.ZipFile(zipfilename, "w") # pylint: disable=R1732
88 | close = True
89 | elif isinstance(zipfilename, zipfile.ZipFile):
90 | zf = zipfilename
91 | close = False
92 | else:
93 | raise TypeError( # pragma: no cover
94 | f"No implementation for type '{type(zipfilename)}'"
95 | )
96 |
97 | zf.writestr(zname, text)
98 | if close:
99 | zf.close()
100 |
101 |
102 | def read_zip(zipfilename, zname=None, **kwargs):
103 | """
104 | Reads a :epkg:`dataframe` from a :epkg:`zip` file.
105 | It can be saved by :meth:`to_zip`.
106 |
107 | :param zipfilename: a :class:`zipfile.ZipFile` or a filename
108 | :param zname: a filename in zipfile, if None, takes the first one
109 | :param kwargs: parameters for :func:`pandas.read_csv`
110 | :return: :class:`pandas.DataFrame` or :class:`numpy.ndarray`
111 | """
112 | if isinstance(zipfilename, str):
113 | ext = os.path.splitext(zipfilename)[-1]
114 | if ext != ".zip":
115 | raise NotImplementedError( # pragma: no cover
116 | f"Only zip files are supported, not '{ext}'."
117 | )
118 | zf = zipfile.ZipFile(zipfilename, "r") # pylint: disable=R1732
119 | close = True
120 | elif isinstance(zipfilename, zipfile.ZipFile):
121 | zf = zipfilename
122 | close = False
123 | else:
124 | raise TypeError( # pragma: no cover
125 | f"No implementation for type '{type(zipfilename)}'"
126 | )
127 |
128 | if zname is None:
129 | zname = zf.namelist()[0]
130 | content = zf.read(zname)
131 | stb = io.BytesIO(content)
132 | ext = os.path.splitext(zname)[-1]
133 | if ext == ".npy":
134 | df = numpy.load(stb, **kwargs)
135 | else:
136 | df = pandas.read_csv(stb, **kwargs)
137 |
138 | if close:
139 | zf.close()
140 |
141 | return df
142 |
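
Both functions also accept an already opened ``zipfile.ZipFile``, which makes it possible
to store several objects in the same archive; a hedged sketch (member names are
illustrative):

    import zipfile
    import numpy
    import pandas
    from pandas_streaming.df import to_zip, read_zip

    df = pandas.DataFrame(dict(a=[1, 2], b=["e", "f"]))
    arr = numpy.array([[0.5, 1.5], [0.4, 1.6]])

    # write two members into a single archive
    with zipfile.ZipFile("bundle.zip", "w") as zf:
        to_zip(df, zf, "df.csv", index=False)
        to_zip(arr, zf, "arr.npy")

    # read them back, selecting each member by name
    with zipfile.ZipFile("bundle.zip", "r") as zf:
        df2 = read_zip(zf, "df.csv")
        arr2 = read_zip(zf, "arr.npy")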
--------------------------------------------------------------------------------
/pandas_streaming/df/dataframe_io_helpers.py:
--------------------------------------------------------------------------------
1 | import os
2 | from io import StringIO, BytesIO
3 |
4 | try:
5 | from ujson import dumps
6 | except ImportError: # pragma: no cover
7 | from json import dumps
8 |
9 |
10 | class JsonPerRowsStream:
11 | """
12 | Reads a :epkg:`json` stream and adds
13 | ``,``, ``[``, ``]`` to convert a stream containing
14 | one :epkg:`json` object per row into one single :epkg:`json` object.
15 | It mostly implements methods *readline* and *read*.
16 |
17 | :param st: stream
18 | """
19 |
20 | def __init__(self, st):
21 | self.st = st
22 | self.begin = True
23 | self.newline = False
24 | self.end = True
25 |
26 | def seek(self, offset):
27 | """
28 | Change the stream position to the given byte offset.
29 |
30 | :param offset: offset, only 0 is implemented
31 | """
32 | self.st.seek(offset)
33 |
34 | def readline(self, size=-1):
35 | """
36 | Reads a line, adds ``,``, ``[``, ``]`` if needed.
37 | So the number of read characters is not necessarily
38 | the requested one but could be greater.
39 | """
40 | text = self.st.readline(size)
41 | if size == 0:
42 | return text
43 | if self.newline:
44 | text = "," + text
45 | self.newline = False
46 | elif self.begin:
47 | text = "[" + text
48 | self.begin = False
49 |
50 | if text.endswith("\n"):
51 | self.newline = True
52 | return text
53 | if len(text) == 0 or len(text) < size:
54 | if self.end:
55 | self.end = False
56 | return text + "]"
57 | return text
58 | return text
59 |
60 | def read(self, size=-1):
61 | """
62 | Reads characters, adds ``,``, ``[``, ``]`` if needed.
63 | So the number of read characters is not necessarily
64 | the requested one but could be greater.
65 | """
66 | text = self.st.read(size)
67 | if isinstance(text, bytes):
68 | cst = b"\n", b"\n,", b",", b"[", b"]"
69 | else:
70 | cst = "\n", "\n,", ",", "[", "]"
71 | if size == 0:
72 | return text
73 | if len(text) > 1:
74 | t1, t2 = text[: len(text) - 1], text[len(text) - 1 :]
75 | t1 = t1.replace(cst[0], cst[1])
76 | text = t1 + t2
77 |
78 | if self.newline:
79 | text = cst[2] + text
80 | self.newline = False
81 | elif self.begin:
82 | text = cst[3] + text
83 | self.begin = False
84 |
85 | if text.endswith(cst[0]):
86 | self.newline = True
87 | return text
88 | if len(text) == 0 or len(text) < size:
89 | if self.end:
90 | self.end = False
91 | return text + cst[4]
92 | return text
93 | return text
94 |
95 | def getvalue(self):
96 | """
97 | Returns the whole stream content.
98 | """
99 |
100 | def byline():
101 | line = self.readline()
102 | while line:
103 | yield line
104 | line = self.readline()
105 |
106 | return "".join(byline())
107 |
108 |
109 | def flatten_dictionary(dico, sep="_"):
110 | """
111 | Flattens a dictionary with nested structure to a dictionary with no
112 | hierarchy.
113 |
114 | :param dico: dictionary to flatten
115 | :param sep: string to separate dictionary keys by
116 | :return: flattened dictionary
117 |
118 | Inspired from `flatten_json
119 | `_.
120 | """
121 | flattened_dict = {}
122 |
123 | def _flatten(obj, key):
124 | if obj is None:
125 | flattened_dict[key] = obj
126 | elif isinstance(obj, dict):
127 | for k, v in obj.items():
128 | if not isinstance(k, str):
129 | raise TypeError("All keys must be strings.") # pragma: no cover
130 | k2 = k if key is None else f"{key}{sep}{k}"
131 | _flatten(v, k2)
132 | elif isinstance(obj, (list, set)):
133 | for index, item in enumerate(obj):
134 | k2 = str(index) if key is None else f"{key}{sep}{index}"
135 | _flatten(item, k2)
136 | else:
137 | flattened_dict[key] = obj
138 |
139 | _flatten(dico, None)
140 | return flattened_dict
141 |
142 |
143 | def enumerate_json_items(
144 | filename, encoding=None, lines=False, flatten=False, verbose=0
145 | ):
146 | """
147 | Enumerates items from a :epkg:`JSON` file or string.
148 |
149 | :param filename: filename or string or stream to parse
150 | :param encoding: encoding
151 | :param lines: one record per row
152 | :param flatten: call @see fn flatten_dictionary
153 | :param verbose: verbosity (based on :epkg:`tqdm`)
154 | :return: iterator on records at first level.
155 |
156 | It assumes the syntax follows the format: ``[ {"id":1, ...}, {"id": 2, ...}, ...]``.
157 | However, if option *lines* is true, the function considers that the
158 | stream or file has one record per row as follows:
159 |
160 | {"id":1, ...}
161 | {"id": 2, ...}
162 |
163 | .. exref::
164 | :title: Processes a json file by streaming.
165 |
166 | The module :epkg:`ijson` can read a :epkg:`JSON` file by streaming.
167 | This module is needed because a record can be written on multiple lines.
168 | This function leverages it and produces the following results.
169 |
170 | .. runpython::
171 | :showcode:
172 |
173 | from pandas_streaming.df.dataframe_io_helpers import enumerate_json_items
174 |
175 | text_json = b'''
176 | [
177 | {
178 | "glossary": {
179 | "title": "example glossary",
180 | "GlossDiv": {
181 | "title": "S",
182 | "GlossList": [{
183 | "GlossEntry": {
184 | "ID": "SGML",
185 | "SortAs": "SGML",
186 | "GlossTerm": "Standard Generalized Markup Language",
187 | "Acronym": "SGML",
188 | "Abbrev": "ISO 8879:1986",
189 | "GlossDef": {
190 | "para": "A meta-markup language, used to create markup languages such as DocBook.",
191 | "GlossSeeAlso": ["GML", "XML"]
192 | },
193 | "GlossSee": "markup"
194 | }
195 | }]
196 | }
197 | }
198 | },
199 | {
200 | "glossary": {
201 | "title": "example glossary",
202 | "GlossDiv": {
203 | "title": "S",
204 | "GlossList": {
205 | "GlossEntry": [{
206 | "ID": "SGML",
207 | "SortAs": "SGML",
208 | "GlossTerm": "Standard Generalized Markup Language",
209 | "Acronym": "SGML",
210 | "Abbrev": "ISO 8879:1986",
211 | "GlossDef": {
212 | "para": "A meta-markup language, used to create markup languages such as DocBook.",
213 | "GlossSeeAlso": ["GML", "XML"]
214 | },
215 | "GlossSee": "markup"
216 | }]
217 | }
218 | }
219 | }
220 | }
221 | ]
222 | '''
223 |
224 | for item in enumerate_json_items(text_json):
225 | print(item)
226 |
227 | The parsed json must have an empty line at the end otherwise
228 | the following exception is raised:
229 | `ijson.common.IncompleteJSONError:
230 | parse error: unallowed token at this point in JSON text`.
231 | """
232 | if isinstance(filename, str):
233 | if "{" not in filename and os.path.exists(filename):
234 | with open(filename, "r", encoding=encoding) as f:
235 | for el in enumerate_json_items(
236 | f, encoding=encoding, lines=lines, flatten=flatten
237 | ):
238 | yield el
239 | else:
240 | st = StringIO(filename)
241 | for el in enumerate_json_items(
242 | st, encoding=encoding, lines=lines, flatten=flatten
243 | ):
244 | yield el
245 | elif isinstance(filename, bytes):
246 | st = BytesIO(filename)
247 | for el in enumerate_json_items(
248 | st, encoding=encoding, lines=lines, flatten=flatten
249 | ):
250 | yield el
251 | elif lines:
252 | for el in enumerate_json_items(
253 | JsonPerRowsStream(filename), encoding=encoding, lines=False, flatten=flatten
254 | ):
255 | yield el
256 | else:
257 | if hasattr(filename, "seek"):
258 | filename.seek(0)
259 | import ijson
260 |
261 | parser = ijson.parse(filename)
262 | current = None
263 | curkey = None
264 | stack = []
265 | nbyield = 0
266 | if verbose:
267 | from tqdm import tqdm
268 |
269 | loop = tqdm(enumerate(parser))
270 | else:
271 | loop = enumerate(parser)
272 | for i, (_, event, value) in loop:
273 | if verbose:
274 | loop.set_description(f"process row {i}-event={event!r}")
275 | if event == "start_array":
276 | if curkey is None:
277 | current = []
278 | else:
279 | if not isinstance(current, dict):
280 | raise RuntimeError( # pragma: no cover
281 | f"Type issue {type(current)}"
282 | )
283 | c = []
284 | current[curkey] = c # pylint: disable=E1137
285 | current = c
286 | curkey = None
287 | stack.append(current)
288 | elif event == "end_array":
289 | stack.pop()
290 | if len(stack) == 0:
291 | # We should be done.
292 | current = None
293 | else:
294 | current = stack[-1]
295 | elif event == "start_map":
296 | c = {}
297 | if curkey is None:
298 | if current is None:
299 | current = []
300 | current.append(c)
301 | else:
302 | current[curkey] = c # pylint: disable=E1137
303 | stack.append(c)
304 | current = c
305 | curkey = None
306 | elif event == "end_map":
307 | stack.pop()
308 | current = stack[-1]
309 | if len(stack) == 1:
310 | nbyield += 1
311 | if flatten:
312 | yield flatten_dictionary(current[-1])
313 | else:
314 | yield current[-1]
315 | # We clear the memory.
316 | current.clear()
317 | elif event == "map_key":
318 | curkey = value
319 | elif event in {"string", "number", "boolean"}:
320 | if curkey is None:
321 | current.append(value)
322 | else:
323 | current[curkey] = value # pylint: disable=E1137
324 | curkey = None
325 | elif event == "null":
326 | if curkey is None:
327 | current.append(None)
328 | else:
329 | current[curkey] = None # pylint: disable=E1137
330 | curkey = None
331 | else:
332 | raise ValueError(f"Unknown event '{event}'") # pragma: no cover
333 |
334 |
335 | class JsonIterator2Stream:
336 | """
337 | Transforms an iterator on :epkg:`JSON` items
338 | into a stream which returns one item as a string every time
339 | method *read* is called.
340 | The iterator could be one returned by @see fn enumerate_json_items.
341 |
342 | :param it: iterator
343 | :param kwargs: arguments to :class:`json.dumps`
344 |
345 | .. exref::
346 | :title: Reshape a json file
347 |
348 | The function @see fn enumerate_json_items reads any
349 | :epkg:`json` even if every record is split over
350 | multiple lines. Class @see cl JsonIterator2Stream
351 | mocks this iterator as a stream. Each row is a single item.
352 |
353 | .. runpython::
354 | :showcode:
355 |
356 | from pandas_streaming.df.dataframe_io_helpers import enumerate_json_items, JsonIterator2Stream
357 |
358 | text_json = b'''
359 | [
360 | {
361 | "glossary": {
362 | "title": "example glossary",
363 | "GlossDiv": {
364 | "title": "S",
365 | "GlossList": [{
366 | "GlossEntry": {
367 | "ID": "SGML",
368 | "SortAs": "SGML",
369 | "GlossTerm": "Standard Generalized Markup Language",
370 | "Acronym": "SGML",
371 | "Abbrev": "ISO 8879:1986",
372 | "GlossDef": {
373 | "para": "A meta-markup language, used to create markup languages such as DocBook.",
374 | "GlossSeeAlso": ["GML", "XML"]
375 | },
376 | "GlossSee": "markup"
377 | }
378 | }]
379 | }
380 | }
381 | },
382 | {
383 | "glossary": {
384 | "title": "example glossary",
385 | "GlossDiv": {
386 | "title": "S",
387 | "GlossList": {
388 | "GlossEntry": [{
389 | "ID": "SGML",
390 | "SortAs": "SGML",
391 | "GlossTerm": "Standard Generalized Markup Language",
392 | "Acronym": "SGML",
393 | "Abbrev": "ISO 8879:1986",
394 | "GlossDef": {
395 | "para": "A meta-markup language, used to create markup languages such as DocBook.",
396 | "GlossSeeAlso": ["GML", "XML"]
397 | },
398 | "GlossSee": "markup"
399 | }]
400 | }
401 | }
402 | }
403 | }
404 | ]
405 | '''
406 |
407 | for item in JsonIterator2Stream(lambda: enumerate_json_items(text_json)):
408 | print(item)
409 |
410 | .. versionchanged:: 0.3
411 | The class takes a function which outputs an iterator and not an iterator.
412 | `JsonIterator2Stream(enumerate_json_items(text_json))` needs to be rewritten
413 |         into `JsonIterator2Stream(lambda: enumerate_json_items(text_json))`.
414 | """
415 |
416 | def __init__(self, it, **kwargs):
417 | self.it = it
418 | self.kwargs = kwargs
419 | self.it0 = it()
420 |
421 | def seek(self, offset):
422 | """
423 | Change the stream position to the given byte offset.
424 |
425 | :param offset: offset, only 0 is implemented
426 | """
427 | if offset != 0:
428 |             raise NotImplementedError("The iterator can only go back to the beginning.")
429 | self.it0 = self.it()
430 |
431 | def write(self):
432 | """
433 | The class does not write.
434 | """
435 | raise NotImplementedError()
436 |
437 | def read(self):
438 | """
439 | Reads the next item and returns it as a string.
440 | """
441 | try:
442 | value = next(self.it0)
443 | return dumps(value, **self.kwargs)
444 | except StopIteration:
445 | return None
446 |
447 | def __iter__(self):
448 | """
449 | Iterates on each row. The behaviour is a bit tricky.
450 |         It is implemented to be swallowed by :func:`pandas.read_json` which
451 |         uses :func:`itertools.islice` to go through the items.
452 |         It calls `__iter__` multiple times but expects the
453 |         iterator to continue from where it stopped the last time.
454 | """
455 | for value in self.it0:
456 | yield dumps(value, **self.kwargs)
457 |
--------------------------------------------------------------------------------
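A minimal usage sketch for the two helpers above (the sample JSON and the
``flatten`` keyword are assumptions based on the docstrings, not code taken
from the repository):

    from pandas_streaming.df.dataframe_io_helpers import (
        JsonIterator2Stream,
        enumerate_json_items,
    )

    text_json = b'[{"a": 1, "b": {"c": 2}}, {"a": 3, "b": {"c": 4}}]'

    # Every record of the JSON array comes back as a dictionary,
    # flattened when flatten=True.
    for item in enumerate_json_items(text_json, flatten=True):
        print(item)

    # The same iterator wrapped as a stream: read() returns one JSON
    # string per record and None once the iterator is exhausted.
    stream = JsonIterator2Stream(lambda: enumerate_json_items(text_json))
    while True:
        row = stream.read()
        if row is None:
            break
        print(row)

--------------------------------------------------------------------------------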
/pandas_streaming/df/dataframe_split.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import pickle
3 | import random
4 | import warnings
5 | from io import StringIO
6 | import pandas
7 |
8 |
9 | def sklearn_train_test_split(
10 | self, path_or_buf=None, export_method="to_csv", names=None, **kwargs
11 | ):
12 | """
13 | Randomly splits a dataframe into smaller pieces.
14 |     The function writes the partitions and returns the file names or buffer contents.
15 | The function relies on :func:`sklearn.model_selection.train_test_split`.
16 | It does not handle stratified version of it.
17 |
18 | :param self: see :class:`StreamingDataFrame
19 |         <pandas_streaming.df.dataframe.StreamingDataFrame>`
20 | :param path_or_buf: a string, a list of strings or buffers, if it is a
21 | string, it must contain ``{}`` like ``partition{}.txt``
22 | :param export_method: method used to store the partitions, by default
23 | :meth:`pandas.DataFrame.to_csv`
24 | :param names: partitions names, by default ``('train', 'test')``
25 | :param kwargs: parameters for the export function and
26 | :func:`sklearn.model_selection.train_test_split`.
27 | :return: outputs of the exports functions
28 |
29 | The function cannot return two iterators or two
30 |     :class:`StreamingDataFrame
31 |     <pandas_streaming.df.dataframe.StreamingDataFrame>`
32 | because running through one
33 | means running through the other. We can assume both
34 | splits do not hold in memory and we cannot run through
35 | the same iterator again as random draws would be different.
36 | We need to store the results into files or buffers.
37 |
38 | .. warning::
39 | The method *export_method* must write the data in
40 |         *append* mode and allow streaming.
41 | """
42 | if kwargs.get("stratify") is not None:
43 | raise NotImplementedError( # pragma: no cover
44 | "No implementation yet for the stratified version."
45 | )
46 | with warnings.catch_warnings():
47 | warnings.filterwarnings("ignore", category=ImportWarning)
48 | from sklearn.model_selection import train_test_split
49 |
50 | opts = ["test_size", "train_size", "random_state", "shuffle", "stratify"]
51 | split_ops = {}
52 | for o in opts:
53 | if o in kwargs:
54 | split_ops[o] = kwargs[o]
55 | del kwargs[o]
56 |
57 | exportf_ = getattr(pandas.DataFrame, export_method)
58 | if export_method == "to_csv" and "mode" not in kwargs:
59 | exportf = lambda *a, **kw: exportf_(*a, mode="a", **kw) # noqa: E731
60 | else:
61 | exportf = exportf_
62 |
63 | if isinstance(path_or_buf, str):
64 | if "{}" not in path_or_buf:
65 | raise ValueError("path_or_buf must contain {} to insert the partition name")
66 | if names is None:
67 | names = ["train", "test"]
68 | elif len(names) != len(path_or_buf):
69 | raise ValueError( # pragma: no cover
70 | "names and path_or_buf must have the same length"
71 | )
72 | path_or_buf = [path_or_buf.format(n) for n in names]
73 | elif path_or_buf is None:
74 | path_or_buf = [None, None]
75 | else:
76 | if not isinstance(path_or_buf, list):
77 | raise TypeError( # pragma: no cover
78 | "path_or_buf must be a list or a string"
79 | )
80 |
81 | bufs = []
82 | close = []
83 | for p in path_or_buf:
84 | if p is None:
85 | st = StringIO()
86 | cl = False
87 | elif isinstance(p, str):
88 | st = open(p, "w", encoding=kwargs.get("encoding")) # noqa: SIM115
89 | cl = True
90 | else:
91 | st = p
92 | cl = False
93 | bufs.append(st)
94 | close.append(cl)
95 |
96 | for df in self:
97 | train, test = train_test_split(df, **split_ops)
98 | exportf(train, bufs[0], **kwargs)
99 | exportf(test, bufs[1], **kwargs)
100 | kwargs["header"] = False
101 |
102 | for b, c in zip(bufs, close):
103 | if c:
104 | b.close()
105 | return [
106 | st.getvalue() if isinstance(st, StringIO) else p
107 | for st, p in zip(bufs, path_or_buf)
108 | ]
109 |
110 |
111 | def sklearn_train_test_split_streaming(
112 | self, test_size=0.25, train_size=None, stratify=None, hash_size=9, unique_rows=False
113 | ):
114 | """
115 | Randomly splits a dataframe into smaller pieces.
116 |     The function returns two streaming dataframes. Unlike
117 |     :func:`sklearn_train_test_split`, it does not rely on
118 |     :func:`sklearn.model_selection.train_test_split` and handles stratification.
119 |
120 | :param self: see :class:`StreamingDataFrame
121 |         <pandas_streaming.df.dataframe.StreamingDataFrame>`
122 | :param test_size: ratio for the test partition
123 | (if *train_size* is not specified)
124 | :param train_size: ratio for the train partition
125 | :param stratify: column holding the stratification
126 | :param hash_size: size of the hash to cache information about partition
127 | :param unique_rows: ensures that rows are unique
128 |     :return: Two :class:`StreamingDataFrame
129 |         <pandas_streaming.df.dataframe.StreamingDataFrame>`,
130 | one for train, one for test.
131 |
132 | The function returns two iterators or two
133 |     :class:`StreamingDataFrame
134 |     <pandas_streaming.df.dataframe.StreamingDataFrame>`. It
135 | tries to do everything without writing anything on disk
136 | but it requires to store the repartition somehow.
137 | This function hashes every row and maps the hash with a part
138 | (train or test). This cache must hold in memory otherwise the
139 |     function fails. The two returned iterators must not be iterated
140 |     for the first time at the same time. The first pass is used to
141 | build the cache. The function changes the order of rows if
142 | the parameter *stratify* is not null. The cache has a side effect:
143 |     identical rows always end up in the same partition.
144 | If that is not what you want, you should add an index column
145 | or a random one.
146 | """
147 | p = (1 - test_size) if test_size else None
148 | if train_size is not None:
149 | p = train_size
150 |     n = 2 * max(1 / p, 1 / (1 - p))  # minimal buffer size before rows are split
151 |
152 | static_schema = []
153 |
154 | def iterator_rows():
155 | "iterates on rows"
156 | counts = {}
157 | memory = {}
158 | pos_col = None
159 | for df in self:
160 | if pos_col is None:
161 | static_schema.append(list(df.columns))
162 | static_schema.append(list(df.dtypes))
163 | static_schema.append(df.shape[0])
164 | if stratify is not None:
165 | pos_col = list(df.columns).index(stratify)
166 | else:
167 | pos_col = -1
168 |
169 | for obs in df.itertuples(index=False, name=None):
170 | strat = 0 if stratify is None else obs[pos_col]
171 | if strat not in memory:
172 | memory[strat] = []
173 | memory[strat].append(obs)
174 |
175 | for k, v in memory.items():
176 |                 if len(v) >= n + random.randint(0, 10):  # enough rows buffered
177 | vr = list(range(len(v)))
178 |                     # shuffle the indices randomly
179 | random.shuffle(vr)
180 | if (0, k) in counts:
181 | tt = counts[1, k] + counts[0, k]
182 | delta = -int(counts[0, k] - tt * p + 0.5)
183 | else:
184 | delta = 0
185 | i = int(len(v) * p + 0.5)
186 | i += delta
187 | i = max(0, min(len(v), i))
188 | one = set(vr[:i])
189 | for d, obs_ in enumerate(v):
190 | yield obs_, 0 if d in one else 1
191 | if (0, k) not in counts:
192 | counts[0, k] = i
193 | counts[1, k] = len(v) - i
194 | else:
195 | counts[0, k] += i
196 | counts[1, k] += len(v) - i
197 |                     # the yielded rows are cleared from memory
198 | v.clear()
199 |
200 |         # Once the stream is exhausted, the remaining buffered
201 |         # observations still need to be distributed.
202 | for k, v in memory.items():
203 | vr = list(range(len(v)))
204 |             # shuffle the indices randomly
205 | random.shuffle(vr)
206 | if (0, k) in counts:
207 | tt = counts[1, k] + counts[0, k]
208 | delta = -int(counts[0, k] - tt * p + 0.5)
209 | else:
210 | delta = 0
211 | i = int(len(v) * p + 0.5)
212 | i += delta
213 | i = max(0, min(len(v), i))
214 | one = set(vr[:i])
215 | for d, obs in enumerate(v):
216 | yield obs, 0 if d in one else 1
217 | if (0, k) not in counts:
218 | counts[0, k] = i
219 | counts[1, k] = len(v) - i
220 | else:
221 | counts[0, k] += i
222 | counts[1, k] += len(v) - i
223 |
224 | def h11(w):
225 | "pickle and hash"
226 | b = pickle.dumps(w)
227 | return hashlib.md5(b).hexdigest()[:hash_size]
228 |
229 | # We store the repartition in a cache.
230 | cache = {}
231 |
232 | def iterator_internal(part_requested):
233 | "internal iterator on dataframes"
234 | iy = 0
235 | accumul = []
236 | if len(cache) == 0:
237 | for obs, part in iterator_rows():
238 | h = h11(obs)
239 | if unique_rows and h in cache:
240 | raise ValueError(
241 | "A row or at least its hash is already cached. " # noqa: UP030
242 | "Increase hash_size or check for duplicates "
243 | "('{0}')\n{1}.".format(h, obs)
244 | )
245 | if h not in cache:
246 | cache[h] = part
247 | else:
248 | part = cache[h]
249 | if part == part_requested:
250 | accumul.append(obs)
251 | if len(accumul) >= static_schema[2]:
252 | dfo = pandas.DataFrame(accumul, columns=static_schema[0])
253 | self.ensure_dtype(dfo, static_schema[1])
254 | iy += dfo.shape[0]
255 | accumul.clear()
256 | yield dfo
257 | else:
258 | for df in self:
259 | for obs in df.itertuples(index=False, name=None):
260 | h = h11(obs)
261 | part = cache.get(h)
262 | if part is None:
263 | raise ValueError( # pragma: no cover
264 | f"Second iteration. A row was "
265 | f"never met in the first one\n{obs}"
266 | )
267 | if part == part_requested:
268 | accumul.append(obs)
269 | if len(accumul) >= static_schema[2]:
270 | dfo = pandas.DataFrame(accumul, columns=static_schema[0])
271 | self.ensure_dtype(dfo, static_schema[1])
272 | iy += dfo.shape[0]
273 | accumul.clear()
274 | yield dfo
275 | if len(accumul) > 0:
276 | dfo = pandas.DataFrame(accumul, columns=static_schema[0])
277 | self.ensure_dtype(dfo, static_schema[1])
278 | iy += dfo.shape[0]
279 | yield dfo
280 |
281 | return (
282 | self.__class__(lambda: iterator_internal(0)),
283 | self.__class__(lambda: iterator_internal(1)),
284 | )
285 |
--------------------------------------------------------------------------------
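A minimal sketch of how the two split functions can be called on a streaming
dataframe (it assumes ``StreamingDataFrame.read_df``, ``to_dataframe`` and the
import path ``pandas_streaming.df`` behave as in the rest of the package; the
toy data is made up):

    import pandas
    from pandas_streaming.df import StreamingDataFrame
    from pandas_streaming.df.dataframe_split import (
        sklearn_train_test_split,
        sklearn_train_test_split_streaming,
    )

    df = pandas.DataFrame(dict(x=range(10), y=["a", "b"] * 5))
    sdf = StreamingDataFrame.read_df(df, chunksize=4)

    # Buffer-based split: with path_or_buf=None both partitions come back
    # as CSV strings built in memory.
    train_csv, test_csv = sklearn_train_test_split(sdf, index=False, test_size=0.3)

    # Streaming split: two StreamingDataFrame are returned, the cache of
    # row hashes is filled the first time one of them is iterated.
    train_sdf, test_sdf = sklearn_train_test_split_streaming(sdf, test_size=0.3)
    print(train_sdf.to_dataframe().shape, test_sdf.to_dataframe().shape)

--------------------------------------------------------------------------------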
/pandas_streaming/exc/__init__.py:
--------------------------------------------------------------------------------
1 | from .exc_streaming import StreamingInefficientException # noqa: F401
2 |
--------------------------------------------------------------------------------
/pandas_streaming/exc/exc_streaming.py:
--------------------------------------------------------------------------------
1 | class StreamingInefficientException(Exception):
2 | """
3 | Kind of operations doable with a :epkg:`pandas:DataFrame`
4 | but which should not be done in streaming mode.
5 | """
6 |
7 | def __init__(self, meth):
8 | """
9 | This method is inefficient in streaming mode
10 | and not implemented.
11 |
12 | :param meth: inefficient method
13 | """
14 | Exception.__init__(self, f"{meth} should not be done in streaming mode.")
15 |
--------------------------------------------------------------------------------
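A tiny sketch of the intended use of this exception (the wrapper function
below is hypothetical and not part of the package):

    from pandas_streaming.exc import StreamingInefficientException

    def streaming_sort_values(*args, **kwargs):
        # Sorting requires the whole dataframe in memory, so a streaming
        # implementation rejects it explicitly.
        raise StreamingInefficientException("DataFrame.sort_values")

    try:
        streaming_sort_values()
    except StreamingInefficientException as e:
        print(e)  # DataFrame.sort_values should not be done in streaming mode.

--------------------------------------------------------------------------------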
/pandas_streaming/ext_test_case.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import unittest
4 | import warnings
5 | from contextlib import redirect_stderr, redirect_stdout
6 | from io import StringIO
7 | from typing import Any, Callable, List, Optional
8 |
9 | import numpy
10 | from numpy.testing import assert_allclose
11 |
12 |
13 | def unit_test_going():
14 | """
15 |     Returns True if the environment variable ``UNITTEST_GOING`` equals 1,
16 |     telling the script it runs within unit tests. It helps keeping them short.
17 | """
18 | going = int(os.environ.get("UNITTEST_GOING", 0))
19 | return going == 1
20 |
21 |
22 | def ignore_warnings(warns: List[Warning]) -> Callable:
23 | """
24 | Catches warnings.
25 |
26 | :param warns: warnings to ignore
27 | """
28 |
29 | def wrapper(fct):
30 | if warns is None:
31 | raise AssertionError(f"warns cannot be None for '{fct}'.")
32 |
33 | def call_f(self):
34 | with warnings.catch_warnings():
35 | warnings.simplefilter("ignore", warns)
36 | return fct(self)
37 |
38 | return call_f
39 |
40 | return wrapper
41 |
42 |
43 | class sys_path_append:
44 | """
45 | Stores the content of :epkg:`*py:sys:path` and
46 | restores it afterwards.
47 | """
48 |
49 | def __init__(self, paths, position=-1):
50 | """
51 | :param paths: paths to add
52 | :param position: where to add it
53 | """
54 | self.to_add = paths if isinstance(paths, list) else [paths]
55 | self.position = position
56 |
57 | def __enter__(self):
58 | """
59 | Modifies ``sys.path``.
60 | """
61 | self.store = sys.path.copy()
62 | if self.position == -1:
63 | sys.path.extend(self.to_add)
64 | else:
65 | for p in reversed(self.to_add):
66 | sys.path.insert(self.position, p)
67 |
68 | def __exit__(self, exc_type, exc_value, traceback):
69 | """
70 |         Restores ``sys.path``.
71 | """
72 | sys.path = self.store
73 |
74 |
75 | class ExtTestCase(unittest.TestCase):
76 | _warns = []
77 |
78 | def assertExists(self, name):
79 | if not os.path.exists(name):
80 |             raise AssertionError(f"File or folder {name!r} does not exist.")
81 |
82 | def assertEqualArray(
83 | self,
84 | expected: numpy.ndarray,
85 | value: numpy.ndarray,
86 | atol: float = 0,
87 | rtol: float = 0,
88 | ):
89 | self.assertEqual(expected.dtype, value.dtype)
90 | self.assertEqual(expected.shape, value.shape)
91 | assert_allclose(expected, value, atol=atol, rtol=rtol)
92 |
93 | def assertEqualDataFrame(self, d1, d2, **kwargs):
94 | """
95 | Checks that two dataframes are equal.
96 | Calls :func:`pandas.testing.assert_frame_equal`.
97 | """
98 | from pandas.testing import assert_frame_equal
99 |
100 | assert_frame_equal(d1, d2, **kwargs)
101 |
102 | def assertAlmostEqual(
103 | self,
104 | expected: numpy.ndarray,
105 | value: numpy.ndarray,
106 | atol: float = 0,
107 | rtol: float = 0,
108 | ):
109 | if not isinstance(expected, numpy.ndarray):
110 | expected = numpy.array(expected)
111 | if not isinstance(value, numpy.ndarray):
112 | value = numpy.array(value).astype(expected.dtype)
113 | self.assertEqualArray(expected, value, atol=atol, rtol=rtol)
114 |
115 | def assertRaise(
116 | self, fct: Callable, exc_type: Exception, msg: Optional[str] = None
117 | ):
118 | try:
119 | fct()
120 | except exc_type as e:
121 | if not isinstance(e, exc_type):
122 | raise AssertionError(f"Unexpected exception {type(e)!r}.") from e
123 | if msg is None:
124 | return
125 | if msg not in str(e):
126 | raise AssertionError(f"Unexpected error message {e!r}.") from e
127 | return
128 | raise AssertionError("No exception was raised.")
129 |
130 | def assertEmpty(self, value: Any):
131 | if value is None:
132 | return
133 | if len(value) == 0:
134 | return
135 | raise AssertionError(f"value is not empty: {value!r}.")
136 |
137 | def assertNotEmpty(self, value: Any):
138 | if value is None:
139 | raise AssertionError(f"value is empty: {value!r}.")
140 | if isinstance(value, (list, dict, tuple, set)):
141 | if len(value) == 0:
142 | raise AssertionError(f"value is empty: {value!r}.")
143 |
144 | def assertStartsWith(self, prefix: str, full: str):
145 | if not full.startswith(prefix):
146 |             raise AssertionError(f"{full!r} does not start with prefix {prefix!r}.")
147 |
148 | def assertLesser(self, x, y, strict=False):
149 | """
150 | Checks that ``x <= y``.
151 | """
152 | if x > y or (strict and x == y):
153 | raise AssertionError(
154 | "x >{2} y with x={0} and y={1}".format( # noqa: UP030
155 | ExtTestCase._format_str(x),
156 | ExtTestCase._format_str(y),
157 | "" if strict else "=",
158 | )
159 | )
160 |
161 | @staticmethod
162 | def abs_path_join(filename: str, *args: List[str]):
163 | """
164 | Returns an absolute and normalized path from this location.
165 |
166 | :param filename: filename, the folder which contains it
167 | is used as the base
168 | :param args: list of subpaths to the previous path
169 | :return: absolute and normalized path
170 | """
171 | dirname = os.path.join(os.path.dirname(filename), *args)
172 | return os.path.normpath(os.path.abspath(dirname))
173 |
174 | @classmethod
175 | def tearDownClass(cls):
176 | for name, line, w in cls._warns:
177 | warnings.warn(f"\n{name}:{line}: {type(w)}\n {str(w)}", stacklevel=0)
178 |
179 | def capture(self, fct: Callable):
180 | """
181 | Runs a function and capture standard output and error.
182 |
183 | :param fct: function to run
184 | :return: result of *fct*, output, error
185 | """
186 | sout = StringIO()
187 | serr = StringIO()
188 | with redirect_stdout(sout), redirect_stderr(serr):
189 | res = fct()
190 | return res, sout.getvalue(), serr.getvalue()
191 |
--------------------------------------------------------------------------------
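A short sketch of a test case built on ``ExtTestCase`` (the test class and its
content are illustrative only):

    import unittest
    import pandas
    from pandas_streaming.ext_test_case import ExtTestCase

    class TestExample(ExtTestCase):
        def test_frames_equal(self):
            df1 = pandas.DataFrame(dict(a=[1, 2], b=["x", "y"]))
            self.assertEqualDataFrame(df1, df1.copy())
            self.assertNotEmpty(df1)

        def test_raises(self):
            # assertRaise checks both the exception type and a substring
            # of its message.
            self.assertRaise(lambda: 1 / 0, ZeroDivisionError, "division")

        def test_capture(self):
            res, out, err = self.capture(lambda: print("hello") or 42)
            self.assertEqual(res, 42)
            self.assertStartsWith("hello", out)

    if __name__ == "__main__":
        unittest.main()

--------------------------------------------------------------------------------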
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.ruff]
2 |
3 | # Exclude a variety of commonly ignored directories.
4 | exclude = [
5 | ".eggs",
6 | ".git",
7 | "build",
8 | "dist",
9 | ]
10 |
11 | line-length = 88
12 |
13 | [tool.ruff.lint]
14 | select = [
15 | "B", # flake8-bugbear
16 | "C4", # flake8-comprehensions
17 | #"D", # pydocstyle
18 | "E", # pycodestyle
19 | "F", # Pyflakes
20 | "G", # flake8-logging-format
21 | #"I", # isort
22 | "ISC", # flake8-implicit-str-concat
23 | "LOG", # flake8-logging
24 | #"N", # pep8-naming
25 | #"NPY", # modern numpy
26 | #"PERF", # Perflint
27 | "PIE", # flake8-pie
28 | "PYI", # flake8-pyi
29 | "RUF", # Ruff-specific rules
30 | "SIM", # flake8-simplify
31 | "SLOT", # flake8-slot
32 | "T10", # flake8-debugger
33 | #"TID", # Disallow relative imports
34 | #"TRY", # flake8-try-except-raise
35 | "UP", # pyupgrade
36 | "W", # pycodestyle
37 | "YTT", # flake8-2020
38 | ]
39 |
40 | [tool.ruff.lint.per-file-ignores]
41 | "**" = ["B905", "C401", "C408", "C413", "RUF012", "RUF100", "RUF010", "SIM108", "SIM910", "SIM110", "SIM102", "SIM114", "SIM103", "UP015", "UP027", "UP031", "UP034", "UP032", "UP006", "UP035", "UP007", "UP038"]
42 | "**/plot*.py" = ["B018"]
43 | "_doc/examples/**.py" = ["E402", "F811", "B018"]
44 | "_unittests/ut_df/test_dataframe_io_helpers.py" = ["E501"]
45 | "pandas_streaming/data/__init__.py" = ["F401"]
46 | "pandas_streaming/df/__init__.py" = ["F401"]
47 | "pandas_streaming/df/dataframe_io_helpers.py" = ["E501"]
48 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | autopep8
2 | black
3 | coverage
4 | furo
5 | ijson
6 | jupyter_sphinx
7 | jyquickhelper
8 | matplotlib
9 | nbsphinx
10 | pandas>=1.1.0
11 | pandocfilters
12 | Pillow
13 | pycodestyle
14 | pylint>=2.14.0
15 | pytest
16 | pytest-cov
17 | ruff
18 | scikit-learn
19 | scipy
20 | sphinx
21 | sphinx-issues
22 | git+https://github.com/sdpython/sphinx-runpython.git
23 | sphinx_gallery
24 | ujson
25 | wheel
26 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [options]
2 | packages = find:
3 |
4 | [options.packages.find]
5 | include = pandas_streaming*
6 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from setuptools import setup
4 |
5 | ######################
6 | # beginning of setup
7 | ######################
8 |
9 |
10 | here = os.path.dirname(__file__)
11 | if here == "":
12 | here = "."
13 | package_data = {"pandas_streaming.validation": ["*.css", "*.js"]}
14 |
15 | try:
16 | with open(os.path.join(here, "requirements.txt"), "r") as f:
17 | requirements = f.read().strip(" \n\r\t").split("\n")
18 | except FileNotFoundError:
19 | requirements = []
20 | if len(requirements) == 0 or requirements == [""]:
21 | requirements = ["pandas"]
22 |
23 | try:
24 | with open(os.path.join(here, "README.rst"), "r", encoding="utf-8") as f:
25 | long_description = "pandas-streaming:" + f.read().split("pandas-streaming:")[1]
26 | except FileNotFoundError:
27 | long_description = ""
28 |
29 | version_str = "0.1.0"
30 | with open(os.path.join(here, "pandas_streaming/__init__.py"), "r") as f:
31 | line = [
32 | _
33 | for _ in [_.strip("\r\n ") for _ in f.readlines()]
34 | if _.startswith("__version__")
35 | ]
36 | if len(line) > 0:
37 | version_str = line[0].split("=")[1].strip('" ')
38 |
39 |
40 | setup(
41 | name="pandas-streaming",
42 | version=version_str,
43 |     description="Processes large dataframes with pandas in streaming mode",
44 | long_description=long_description,
45 | author="Xavier Dupré",
46 | author_email="xavier.dupre@gmail.com",
47 | url="https://github.com/sdpython/pandas-streaming",
48 | package_data=package_data,
49 | setup_requires=["numpy", "scipy"],
50 | install_requires=requirements,
51 | classifiers=[
52 | "Intended Audience :: Science/Research",
53 | "Intended Audience :: Developers",
54 | "License :: OSI Approved :: MIT License",
55 | "Programming Language :: C",
56 | "Programming Language :: Python",
57 | "Topic :: Software Development",
58 | "Topic :: Scientific/Engineering",
59 | "Development Status :: 5 - Production/Stable",
60 | "Operating System :: Microsoft :: Windows",
61 | "Operating System :: POSIX",
62 | "Operating System :: Unix",
63 | "Operating System :: MacOS",
64 | "Programming Language :: Python :: 3",
65 | "Programming Language :: Python :: 3.8",
66 | "Programming Language :: Python :: 3.9",
67 | "Programming Language :: Python :: 3.10",
68 | "Programming Language :: Python :: 3.11",
69 | ],
70 | )
71 |
--------------------------------------------------------------------------------