├── .github └── workflows │ ├── black-ruff.yml │ ├── check-urls.yml │ ├── codeql.yml │ ├── documentation.yml │ └── wheels-any.yml ├── .gitignore ├── .local.jenkins.lin.yml ├── CHANGELOGS.rst ├── CODE_OF_CONDUCT.md ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── _doc ├── _static │ ├── git_logo.png │ ├── project_ico.ico │ └── project_ico.png ├── api │ ├── connex_split.rst │ ├── dataframe.rst │ ├── dataframe_io.rst │ ├── dataframe_split.rst │ ├── index.rst │ ├── rdata.rst │ ├── rdf.rst │ ├── rexc.rst │ └── rio.rst ├── conf.py ├── examples │ ├── README.txt │ └── first_step.py ├── i_ex.rst ├── index.rst ├── license.rst ├── sg_execution_times.rst └── tutorial │ └── index.rst ├── _unittests ├── ut_df │ ├── data │ │ ├── buggy_hash.csv │ │ ├── buggy_hash2.csv │ │ ├── classic.json │ │ ├── example.json │ │ └── example2.json │ ├── test_connex_split.py │ ├── test_connex_split_big.py │ ├── test_connex_split_cat.py │ ├── test_dataframe_helpers.py │ ├── test_dataframe_helpers_simple.py │ ├── test_dataframe_io.py │ ├── test_dataframe_io_helpers.py │ ├── test_dataframe_sort.py │ ├── test_pandas_groupbynan.py │ └── test_streaming_dataframe.py └── ut_module │ └── test_sklearn.py ├── appveyor.yml ├── azure-pipelines.yml ├── pandas_streaming ├── __init__.py ├── data │ ├── __init__.py │ └── dummy.py ├── df │ ├── __init__.py │ ├── connex_split.py │ ├── dataframe.py │ ├── dataframe_helpers.py │ ├── dataframe_io.py │ ├── dataframe_io_helpers.py │ └── dataframe_split.py ├── exc │ ├── __init__.py │ └── exc_streaming.py └── ext_test_case.py ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg └── setup.py /.github/workflows/black-ruff.yml: -------------------------------------------------------------------------------- 1 | name: Black + Ruff Format Checker 2 | on: [push, pull_request] 3 | jobs: 4 | black-format-check: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v2 8 | - uses: psf/black@stable 9 | with: 10 | options: "--diff --check" 11 | src: "." 
12 | ruff-format-check: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - uses: chartboost/ruff-action@v1 17 | -------------------------------------------------------------------------------- /.github/workflows/check-urls.yml: -------------------------------------------------------------------------------- 1 | name: Check URLs 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | schedule: 7 | # ┌───────────── minute (0 - 59) 8 | # │ ┌───────────── hour (0 - 23) 9 | # │ │ ┌───────────── day of the month (1 - 31) 10 | # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) 11 | # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) 12 | # │ │ │ │ │ 13 | # │ │ │ │ │ 14 | # │ │ │ │ │ 15 | # * * * * * 16 | - cron: '30 1 * * 0' 17 | 18 | jobs: 19 | build: 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | 25 | - name: urls-checker-code 26 | uses: urlstechie/urlchecker-action@master 27 | with: 28 | subfolder: pandas_streaming 29 | file_types: .md,.py,.rst,.ipynb 30 | print_all: false 31 | timeout: 2 32 | retry_count# : 2 33 | # exclude_urls: https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz,https://dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz 34 | # exclude_patterns: https://dumps.wikimedia.org/ 35 | # force_pass : true 36 | 37 | - name: urls-checker-docs 38 | uses: urlstechie/urlchecker-action@master 39 | with: 40 | subfolder: _doc 41 | file_types: .md,.py,.rst,.ipynb 42 | print_all: false 43 | timeout: 2 44 | retry_count# : 2 45 | # exclude_urls: https://hal.archives-ouvertes.fr/hal-00990252/document 46 | exclude_patterns: https://circleci.com/gh/sdpython/pandas_streaming/ 47 | # force_pass : true 48 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "Code Scanning - Action" 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | schedule: 9 | # ┌───────────── minute (0 - 59) 10 | # │ ┌───────────── hour (0 - 23) 11 | # │ │ ┌───────────── day of the month (1 - 31) 12 | # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) 13 | # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) 14 | # │ │ │ │ │ 15 | # │ │ │ │ │ 16 | # │ │ │ │ │ 17 | # * * * * * 18 | - cron: '30 1 * * 0' 19 | 20 | jobs: 21 | CodeQL-Build: 22 | # CodeQL runs on ubuntu-latest, windows-latest, and macos-latest 23 | runs-on: ubuntu-latest 24 | 25 | permissions: 26 | # required for all workflows 27 | security-events: write 28 | 29 | # only required for workflows in private repositories 30 | actions: read 31 | contents: read 32 | 33 | steps: 34 | - name: Checkout repository 35 | uses: actions/checkout@v3 36 | 37 | # Initializes the CodeQL tools for scanning. 38 | - name: Initialize CodeQL 39 | uses: github/codeql-action/init@v2 40 | # Override language selection by uncommenting this and choosing your languages 41 | # with: 42 | # languages: go, javascript, csharp, python, cpp, java, ruby 43 | 44 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). 45 | # If this step fails, then you should remove it and run the build manually (see below). 46 | - name: Autobuild 47 | uses: github/codeql-action/autobuild@v2 48 | 49 | # ℹ️ Command-line programs to run using the OS shell. 
50 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 51 | 52 | # ✏️ If the Autobuild fails above, remove it and uncomment the following 53 | # three lines and modify them (or add more) to build your code if your 54 | # project uses a compiled language 55 | 56 | #- run: | 57 | # make bootstrap 58 | # make release 59 | 60 | - name: Perform CodeQL Analysis 61 | uses: github/codeql-action/analyze@v2 62 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: Documentation and Code Coverage 2 | 3 | on: 4 | push: 5 | pull_request: 6 | types: 7 | - closed 8 | branches: 9 | - main 10 | 11 | jobs: 12 | run: 13 | name: Build documentation on ${{ matrix.os }} 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | os: [ubuntu-latest] 18 | 19 | steps: 20 | - uses: actions/checkout@v3 21 | 22 | - uses: actions/setup-python@v4 23 | with: 24 | python-version: '3.11' 25 | 26 | - uses: tlylt/install-graphviz@v1 27 | 28 | - name: Install pandoc 29 | run: sudo apt-get install -y pandoc 30 | 31 | - name: Install requirements 32 | run: python -m pip install -r requirements.txt 33 | 34 | - name: Install requirements dev 35 | run: python -m pip install -r requirements-dev.txt 36 | 37 | - name: Cache pip 38 | uses: actions/cache@v2 39 | with: 40 | path: ~/.cache/pip 41 | key: ${{ runner.os }}-pip-${{ hashFiles('requirements-dev.txt') }} 42 | restore-keys: | 43 | ${{ runner.os }}-pip- 44 | ${{ runner.os }}- 45 | 46 | - name: Generate coverage report 47 | run: | 48 | pip install pytest 49 | pip install pytest-cov 50 | export PYTHONPATH=. 51 | pytest --cov=./pandas_streaming/ --cov-report=xml --durations=10 --ignore-glob=**LONG*.py --ignore-glob=**notebook*.py 52 | export PYTHONPATH= 53 | 54 | - name: Upload coverage reports to Codecov 55 | uses: codecov/codecov-action@v3 56 | env: 57 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 58 | 59 | - name: Install 60 | run: python setup.py install 61 | 62 | - name: Copy license, changelogs 63 | run: | 64 | cp LICENSE* ./_doc 65 | cp CHANGELOGS* ./_doc 66 | 67 | - name: Documentation 68 | run: python -m sphinx ./_doc ./dist/html -n -w doc.txt 69 | 70 | - name: Summary 71 | run: cat doc.txt 72 | 73 | - name: Check for errors and warnings 74 | run: | 75 | if [[ $(grep ERROR doc.txt) ]]; then 76 | echo "Documentation produces errors." 77 | grep ERROR doc.txt 78 | exit 1 79 | fi 80 | if [[ $(grep WARNING doc.txt | grep -v 'std:term:y') ]]; then 81 | echo "Documentation produces warnings." 82 | grep WARNING doc.txt 83 | exit 1 84 | fi 85 | 86 | - uses: actions/upload-artifact@v3 87 | with: 88 | path: ./dist/html/** 89 | -------------------------------------------------------------------------------- /.github/workflows/wheels-any.yml: -------------------------------------------------------------------------------- 1 | name: Build Any Wheel 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - 'releases/**' 8 | 9 | jobs: 10 | build_wheels: 11 | name: Build wheels on ${{ matrix.os }} 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest] 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: actions/setup-python@v4 21 | with: 22 | python-version: '3.11' 23 | 24 | - name: build wheel 25 | run: python -m pip wheel . 
26 | 27 | - uses: actions/upload-artifact@v3 28 | with: 29 | path: ./pandas_streaming*.whl 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyd 3 | *.dylib 4 | *.so 5 | *.whl 6 | *.csv 7 | *.zip 8 | coverage.html/* 9 | _cache/* 10 | .coverage 11 | dist/* 12 | build/* 13 | .eggs/* 14 | .hypothesis/* 15 | *egg-info/* 16 | prof 17 | _doc/CHANGELOGS.rst 18 | _doc/LICENSE.txt 19 | _doc/auto_examples/* 20 | _doc/examples/_cache/* 21 | _doc/examples/plot_*.png 22 | _doc/examples/plot_*.xlsx 23 | _doc/examples/*.html 24 | _doc/_static/require.js 25 | _doc/_static/viz.js 26 | _unittests/ut__main/*.png 27 | _unittests/ut__main/_cache/* 28 | _unittests/ut__main/*.html 29 | _unittests/.hypothesis/* 30 | -------------------------------------------------------------------------------- /.local.jenkins.lin.yml: -------------------------------------------------------------------------------- 1 | 2 | language: python 3 | 4 | python: 5 | - { PATH: "{{Python39}}", VERSION: 3.9, DIST: std, PYINT: python3.9 } 6 | 7 | virtualenv: 8 | - path: {{ospathjoin(root_path, pickname("$NAME_JENKINS", project_name + "_$VERSION_$DIST_$NAME"), "_venv")}} 9 | 10 | install: 11 | - $PYINT -m pip install --upgrade pip 12 | - $PYINT -m pip install --upgrade --no-cache-dir --no-deps --index http://localhost:8067/simple/ jyquickhelper pandas_streaming --extra-index-url=https://pypi.python.org/simple/ 13 | - $PYINT -m pip install -r requirements.txt 14 | - $PYINT -m pip install -r requirements-dev.txt 15 | - $PYINT --version 16 | - $PYINT -m pip freeze 17 | 18 | script: 19 | - { CMD: "$PYINT -u setup.py unittests --covtoken=14c7930a-a5c0-405d-a22f-3f9c6feaf0bc", NAME: "UT" } 20 | 21 | after_script: 22 | - $PYINT -u setup.py bdist_wheel 23 | - if [ ${NAME} == "UT" ] then cp dist/*.whl {{root_path}}/../local_pypi/local_pypi_server fi 24 | 25 | documentation: 26 | - if [ ${NAME} == "UT" ] then $PYINT -u setup.py build_sphinx --layout=html fi 27 | - if [ ${NAME} == "UT" ] then cp -R -f _doc/sphinxdoc/build/html dist/html fi 28 | -------------------------------------------------------------------------------- /CHANGELOGS.rst: -------------------------------------------------------------------------------- 1 | 2 | Change Logs 3 | =========== 4 | 5 | 0.5.1 6 | +++++ 7 | 8 | * :pr:`43`: improves reproducibility of function train_test_apart_stratify 9 | 10 | 0.5.0 11 | +++++ 12 | 13 | * :pr:`33`: removes pyquickhelper dependency 14 | * :pr:`30`: fix compatiblity with pandas 2.0 15 | 16 | 0.3.239 17 | +++++++ 18 | 19 | * :pr:`27`: Fixes json parser when input is a stream (2021-10-26) 20 | * :pr:`26`: Fixes bug while reading json (iterator failed to be created twice) (2021-10-26) 21 | * :pr:`25`: Fixes documentation (2021-10-18) 22 | * :pr:`24`: Implements a first version of sort_values. (2021-10-18) 23 | * :pr:`23`: First version of operator __setitem__ (2021-10-16) 24 | * :pr:`22`: Fixes nan values after pandas update, add documentation example to the unit tests (2021-07-11) 25 | * :pr:`21`: Fixes grouping by nan values after update pandas to 1.3.0 (2021-07-10) 26 | * :pr:`17`: Implements method describe (2021-04-08) 27 | 28 | 0.2.175 29 | +++++++ 30 | 31 | * :pr:`16`: Unit tests failing with pandas 1.1.0. 
(2020-08-06) 32 | * :pr:`15`: implements parameter lines, flatten for read_json (2018-11-21) 33 | * :pr:`14`: implements fillna (2018-10-29) 34 | * :pr:`13`: implement concat for axis=0,1 (2018-10-26) 35 | * :pr:`12`: add groupby_streaming (2018-10-26) 36 | * :pr:`11`: add method add_column (2018-10-26) 37 | * :pr:`10`: plan B to bypass a bug in pandas about read_csv when iterator=True --> closed, pandas has a weird behaviour when names is too small compare to the number of columns (2018-10-26) 38 | * :pr:`9`: head is very slow (2018-10-26) 39 | * :pr:`8`: fix pandas_streaming for pandas 0.23.1 (2018-07-31) 40 | * :pr:`7`: implement read_json (2018-05-17) 41 | * :pr:`6`: add pandas_groupby_nan from pyensae (2018-05-17) 42 | * :pr:`5`: add random_state parameter to splitting functions (2018-02-04) 43 | * :pr:`2`: add method sample, resevoir sampling (2017-11-05) 44 | * :pr:`3`: method train_test_split for out-of-memory datasets (2017-10-21) 45 | * :pr:`1`: Excited for your project (2017-10-10) 46 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | We are a community based on openness, as well as friendly and didactic discussions. 4 | 5 | We aspire to treat everybody equally, and value their contributions. 6 | 7 | Decisions are made based on technical merit and consensus. 8 | 9 | Code is not the only way to help the project. Reviewing pull requests, 10 | answering questions to help others on mailing lists or issues, organizing and 11 | teaching tutorials, working on the website, improving the documentation, are 12 | all priceless contributions. 13 | 14 | We abide by the principles of openness, respect, and consideration of others of 15 | the Python Software Foundation: https://www.python.org/psf/codeofconduct/ 16 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017-2024, Xavier Dupré 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune _doc 2 | prune _unittests 3 | exclude *.bat 4 | exclude *.yml 5 | exclude *.git* 6 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | pandas-streaming: streaming API over pandas 2 | =========================================== 3 | 4 | .. image:: https://ci.appveyor.com/api/projects/status/4te066r8ne1ymmhy?svg=true 5 | :target: https://ci.appveyor.com/project/sdpython/pandas-streaming 6 | :alt: Build Status Windows 7 | 8 | .. image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming 9 | :target: https://dev.azure.com/xavierdupre3/pandas_streaming/ 10 | 11 | .. image:: https://badge.fury.io/py/pandas_streaming.svg 12 | :target: http://badge.fury.io/py/pandas_streaming 13 | 14 | .. image:: https://img.shields.io/badge/license-MIT-blue.svg 15 | :alt: MIT License 16 | :target: https://opensource.org/license/MIT/ 17 | 18 | .. image:: https://codecov.io/gh/sdpython/pandas-streaming/branch/main/graph/badge.svg?token=0caHX1rhr8 19 | :target: https://codecov.io/gh/sdpython/pandas-streaming 20 | 21 | .. image:: http://img.shields.io/github/issues/sdpython/pandas_streaming.png 22 | :alt: GitHub Issues 23 | :target: https://github.com/sdpython/pandas_streaming/issues 24 | 25 | .. image:: https://pepy.tech/badge/pandas_streaming/month 26 | :target: https://pepy.tech/project/pandas_streaming/month 27 | :alt: Downloads 28 | 29 | .. image:: https://img.shields.io/github/forks/sdpython/pandas_streaming.svg 30 | :target: https://github.com/sdpython/pandas_streaming/ 31 | :alt: Forks 32 | 33 | .. image:: https://img.shields.io/github/stars/sdpython/pandas_streaming.svg 34 | :target: https://github.com/sdpython/pandas_streaming/ 35 | :alt: Stars 36 | 37 | .. image:: https://img.shields.io/github/repo-size/sdpython/pandas_streaming 38 | :target: https://github.com/sdpython/pandas_streaming/ 39 | :alt: size 40 | 41 | `pandas-streaming `_ 42 | aims at processing big files with `pandas `_, 43 | too big to hold in memory, too small to be parallelized with a significant gain. 44 | The module replicates a subset of *pandas* API 45 | and implements other functionalities for machine learning. 46 | 47 | .. code-block:: python 48 | 49 | from pandas_streaming.df import StreamingDataFrame 50 | sdf = StreamingDataFrame.read_csv("filename", sep="\t", encoding="utf-8") 51 | 52 | for df in sdf: 53 | # process this chunk of data 54 | # df is a dataframe 55 | print(df) 56 | 57 | The module can also stream an existing dataframe. 58 | 59 | .. code-block:: python 60 | 61 | import pandas 62 | df = pandas.DataFrame([dict(cf=0, cint=0, cstr="0"), 63 | dict(cf=1, cint=1, cstr="1"), 64 | dict(cf=3, cint=3, cstr="3")]) 65 | 66 | from pandas_streaming.df import StreamingDataFrame 67 | sdf = StreamingDataFrame.read_df(df) 68 | 69 | for df in sdf: 70 | # process this chunk of data 71 | # df is a dataframe 72 | print(df) 73 | 74 | It contains other helpers to split datasets into 75 | train and test with some weird constraints. 
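For example, the ``train_test_connex_split`` function keeps all rows sharing the same ids (user, product, ...) on the same side of the split. The snippet below is a minimal sketch taken from the unit tests; the column names and parameters are only illustrative.

.. code-block:: python

    import pandas
    from pandas_streaming.df import train_test_connex_split

    df = pandas.DataFrame([dict(user="UA", prod="PA", card="C1"),
                           dict(user="UA", prod="PB", card="C1"),
                           dict(user="UB", prod="PC", card="C2"),
                           dict(user="UB", prod="PD", card="C2"),
                           dict(user="UC", prod="PE", card="C3"),
                           dict(user="UC", prod="PF", card="C4"),
                           dict(user="UD", prod="PG", card="C5")])

    # rows sharing a user, product or card end up on the same side of the split
    train, test = train_test_connex_split(
        df, test_size=0.5, groups=["user", "prod", "card"], fail_imbalanced=0.4)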
76 | -------------------------------------------------------------------------------- /_doc/_static/git_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/pandas-streaming/4a2927bbc960c8f73f4de188a3c43ddf97015eac/_doc/_static/git_logo.png -------------------------------------------------------------------------------- /_doc/_static/project_ico.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/pandas-streaming/4a2927bbc960c8f73f4de188a3c43ddf97015eac/_doc/_static/project_ico.ico -------------------------------------------------------------------------------- /_doc/_static/project_ico.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/pandas-streaming/4a2927bbc960c8f73f4de188a3c43ddf97015eac/_doc/_static/project_ico.png -------------------------------------------------------------------------------- /_doc/api/connex_split.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.df.connex_split 3 | ================================ 4 | 5 | .. automodule:: pandas_streaming.df.connex_split 6 | :members: 7 | -------------------------------------------------------------------------------- /_doc/api/dataframe.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.df.dataframe 3 | ============================= 4 | 5 | StreamingDataFrameSchemaError 6 | +++++++++++++++++++++++++++++ 7 | 8 | .. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrameSchemaError 9 | :members: 10 | 11 | StreamingDataFrame 12 | ++++++++++++++++++ 13 | 14 | .. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrame 15 | :members: 16 | :special-members: 17 | 18 | StreamingSeries 19 | +++++++++++++++ 20 | 21 | .. autoclass:: pandas_streaming.df.dataframe.StreamingSeries 22 | :members: 23 | -------------------------------------------------------------------------------- /_doc/api/dataframe_io.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.df.dataframe_io 3 | ================================ 4 | 5 | .. automodule:: pandas_streaming.df.dataframe_io 6 | :members: 7 | -------------------------------------------------------------------------------- /_doc/api/dataframe_split.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.df.dataframe_split 3 | =================================== 4 | 5 | .. automodule:: pandas_streaming.df.dataframe_split 6 | :members: 7 | -------------------------------------------------------------------------------- /_doc/api/index.rst: -------------------------------------------------------------------------------- 1 | 2 | API 3 | === 4 | 5 | .. toctree:: 6 | 7 | rdata 8 | rdf 9 | rexc 10 | rio 11 | -------------------------------------------------------------------------------- /_doc/api/rdata.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.data 3 | ===================== 4 | 5 | Collection of functions which produces 6 | :class:`StreamingDataFrame `. 7 | 8 | .. 
autofunction:: pandas_streaming.data.dummy.dummy_streaming_dataframe 9 | -------------------------------------------------------------------------------- /_doc/api/rdf.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.df 3 | =================== 4 | 5 | Streaming 6 | +++++++++ 7 | 8 | The main class is an interface which mimics the 9 | :class:`pandas.DataFrame` interface to offer 10 | a short list of methods which apply to an 11 | iterator of dataframes. This provides, in effect, 12 | a streaming version of it. As a result, the creation 13 | of an instance is fast as long as the data is not 14 | processed. Iterators can be chained, as many map-reduce 15 | frameworks do. 16 | 17 | .. toctree:: 18 | :maxdepth: 2 19 | 20 | dataframe 21 | 22 | The module implements additional useful functions 23 | which are not necessarily specific to the streaming version of the dataframes. 24 | Many methods have been rewritten to support 25 | streaming. Among them, IO methods: 26 | :meth:`read_csv `, 27 | :meth:`read_df `, 28 | :meth:`read_json `. 29 | 30 | Data Manipulation 31 | +++++++++++++++++ 32 | 33 | .. autofunction:: pandas_streaming.df.dataframe_helpers.dataframe_hash_columns 34 | 35 | .. autofunction:: pandas_streaming.df.connex_split.dataframe_shuffle 36 | 37 | .. autofunction:: pandas_streaming.df.dataframe_helpers.dataframe_unfold 38 | 39 | .. autofunction:: pandas_streaming.df.dataframe_helpers.pandas_groupby_nan 40 | 41 | Complex splits 42 | ++++++++++++++ 43 | 44 | Splitting a database into train and test is usually simple except 45 | if rows are not independent and share some ids. In that case, 46 | the following functions will try to build two partitions keeping 47 | ids separate, or as separate as possible: 48 | :func:`train_test_apart_stratify `, 49 | :func:`train_test_connex_split `, 50 | :func:`train_test_split_weights `. 51 | 52 | Extensions 53 | ++++++++++ 54 | 55 | .. toctree:: 56 | :maxdepth: 1 57 | 58 | connex_split 59 | dataframe_io 60 | dataframe_split 61 | -------------------------------------------------------------------------------- /_doc/api/rexc.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.exc 3 | ==================== 4 | 5 | Exceptions. 6 | 7 | .. autoclass:: pandas_streaming.exc.exc_streaming.StreamingInefficientException 8 | -------------------------------------------------------------------------------- /_doc/api/rio.rst: -------------------------------------------------------------------------------- 1 | 2 | Inputs / Outputs 3 | ================ 4 | 5 | Dataframes / Numpy arrays 6 | +++++++++++++++++++++++++ 7 | 8 | `HDF5 `_ 9 | is easy to manipulate in the :epkg:`Python` world but difficult 10 | to exchange with other people and other environments. 11 | The two following functions make it easier to collapse many dataframes 12 | or numpy arrays into a single file. The data can be unzipped afterwards, 13 | see :func:`read_zip `, 14 | :func:`to_zip `.
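A minimal usage sketch, mirrored from the unit tests (the file name below is arbitrary):

.. code-block:: python

    import pandas
    from pandas_streaming.df import to_zip, read_zip

    df = pandas.DataFrame([dict(a=1, b="e"), dict(a=2, b="f")])

    # collapse the dataframe into a single zip file, then read it back
    to_zip(df, "df.zip", encoding="utf-8", index=False)
    df2 = read_zip("df.zip", encoding="utf-8")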
15 | -------------------------------------------------------------------------------- /_doc/conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from sphinx_runpython.github_link import make_linkcode_resolve 4 | from sphinx_runpython.conf_helper import has_dvipng, has_dvisvgm 5 | from pandas_streaming import __version__ 6 | 7 | 8 | extensions = [ 9 | "nbsphinx", 10 | "sphinx.ext.autodoc", 11 | "sphinx.ext.coverage", 12 | "sphinx.ext.githubpages", 13 | "sphinx.ext.ifconfig", 14 | "sphinx.ext.intersphinx", 15 | "sphinx.ext.linkcode", 16 | "sphinx.ext.viewcode", 17 | "sphinx.ext.napoleon", 18 | "sphinx.ext.todo", 19 | "sphinx_gallery.gen_gallery", 20 | "sphinx_issues", 21 | "sphinx_runpython.blocdefs.sphinx_exref_extension", 22 | "sphinx_runpython.blocdefs.sphinx_mathdef_extension", 23 | "sphinx_runpython.epkg", 24 | "sphinx_runpython.gdot", 25 | "sphinx_runpython.runpython", 26 | "matplotlib.sphinxext.plot_directive", 27 | ] 28 | 29 | if has_dvisvgm(): 30 | extensions.append("sphinx.ext.imgmath") 31 | imgmath_image_format = "svg" 32 | elif has_dvipng(): 33 | extensions.append("sphinx.ext.pngmath") 34 | imgmath_image_format = "png" 35 | else: 36 | extensions.append("sphinx.ext.mathjax") 37 | 38 | templates_path = ["_templates"] 39 | html_logo = "_static/project_ico.png" 40 | source_suffix = ".rst" 41 | master_doc = "index" 42 | project = "pandas-streaming" 43 | copyright = "2017-2024, Xavier Dupré" 44 | author = "Xavier Dupré" 45 | version = __version__ 46 | release = __version__ 47 | language = "en" 48 | exclude_patterns = ["auto_examples/*.ipynb"] 49 | pygments_style = "sphinx" 50 | todo_include_todos = True 51 | nbsphinx_execute = "never" 52 | 53 | html_theme = "furo" 54 | html_theme_path = ["_static"] 55 | html_theme_options = {} 56 | html_sourcelink_suffix = "" 57 | html_static_path = ["_static"] 58 | 59 | issues_github_path = "sdpython/pandas-streaming" 60 | 61 | # The following is used by sphinx.ext.linkcode to provide links to github 62 | linkcode_resolve = make_linkcode_resolve( 63 | "pandas_streaming", 64 | ( 65 | "https://github.com/sdpython/pandas-streaming/" 66 | "blob/{revision}/{package}/" 67 | "{path}#L{lineno}" 68 | ), 69 | ) 70 | 71 | latex_elements = { 72 | "papersize": "a4", 73 | "pointsize": "10pt", 74 | "title": project, 75 | } 76 | 77 | mathjax3_config = {"chtml": {"displayAlign": "left"}} 78 | 79 | intersphinx_mapping = { 80 | "onnx": ("https://onnx.ai/onnx/", None), 81 | "matplotlib": ("https://matplotlib.org/", None), 82 | "numpy": ("https://numpy.org/doc/stable", None), 83 | "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), 84 | "python": (f"https://docs.python.org/{sys.version_info.major}", None), 85 | "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), 86 | "sklearn": ("https://scikit-learn.org/stable/", None), 87 | "sklearn-onnx": ("https://onnx.ai/sklearn-onnx/", None), 88 | "torch": ("https://pytorch.org/docs/stable/", None), 89 | } 90 | 91 | # Check intersphinx reference targets exist 92 | nitpicky = True 93 | # See also scikit-learn/scikit-learn#26761 94 | nitpick_ignore = [ 95 | ("py:class", "False"), 96 | ("py:class", "True"), 97 | ("py:class", "pipeline.Pipeline"), 98 | ("py:class", "default=sklearn.utils.metadata_routing.UNCHANGED"), 99 | ] 100 | 101 | sphinx_gallery_conf = { 102 | # path to your examples scripts 103 | "examples_dirs": os.path.join(os.path.dirname(__file__), "examples"), 104 | # path where to save gallery generated examples 105 | 
"gallery_dirs": "auto_examples", 106 | } 107 | 108 | # next 109 | 110 | preamble = """ 111 | \\usepackage{etex} 112 | \\usepackage{fixltx2e} % LaTeX patches, \\textsubscript 113 | \\usepackage{cmap} % fix search and cut-and-paste in Acrobat 114 | \\usepackage[raccourcis]{fast-diagram} 115 | \\usepackage{titlesec} 116 | \\usepackage{amsmath} 117 | \\usepackage{amssymb} 118 | \\usepackage{amsfonts} 119 | \\usepackage{graphics} 120 | \\usepackage{epic} 121 | \\usepackage{eepic} 122 | %\\usepackage{pict2e} 123 | %%% Redefined titleformat 124 | \\setlength{\\parindent}{0cm} 125 | \\setlength{\\parskip}{1ex plus 0.5ex minus 0.2ex} 126 | \\newcommand{\\hsp}{\\hspace{20pt}} 127 | \\newcommand{\\acc}[1]{\\left\\{#1\\right\\}} 128 | \\newcommand{\\cro}[1]{\\left[#1\\right]} 129 | \\newcommand{\\pa}[1]{\\left(#1\\right)} 130 | \\newcommand{\\R}{\\mathbb{R}} 131 | \\newcommand{\\HRule}{\\rule{\\linewidth}{0.5mm}} 132 | %\\titleformat{\\chapter}[hang]{\\Huge\\bfseries\\sffamily}{\\thechapter\\hsp}{0pt}{\\Huge\\bfseries\\sffamily} 133 | 134 | \\usepackage[all]{xy} 135 | \\newcommand{\\vecteur}[2]{\\pa{#1,\\dots,#2}} 136 | \\newcommand{\\N}[0]{\\mathbb{N}} 137 | \\newcommand{\\indicatrice}[1]{ {1\\!\\!1}_{\\acc{#1}} } 138 | \\newcommand{\\infegal}[0]{\\leqslant} 139 | \\newcommand{\\supegal}[0]{\\geqslant} 140 | \\newcommand{\\ensemble}[2]{\\acc{#1,\\dots,#2}} 141 | \\newcommand{\\fleche}[1]{\\overrightarrow{ #1 }} 142 | \\newcommand{\\intervalle}[2]{\\left\\{#1,\\cdots,#2\\right\\}} 143 | \\newcommand{\\independant}[0]{\\perp \\!\\!\\! \\perp} 144 | \\newcommand{\\esp}{\\mathbb{E}} 145 | \\newcommand{\\espf}[2]{\\mathbb{E}_{#1}\\pa{#2}} 146 | \\newcommand{\\var}{\\mathbb{V}} 147 | \\newcommand{\\pr}[1]{\\mathbb{P}\\pa{#1}} 148 | \\newcommand{\\loi}[0]{{\\cal L}} 149 | \\newcommand{\\vecteurno}[2]{#1,\\dots,#2} 150 | \\newcommand{\\norm}[1]{\\left\\Vert#1\\right\\Vert} 151 | \\newcommand{\\norme}[1]{\\left\\Vert#1\\right\\Vert} 152 | \\newcommand{\\scal}[2]{\\left<#1,#2\\right>} 153 | \\newcommand{\\dans}[0]{\\rightarrow} 154 | \\newcommand{\\partialfrac}[2]{\\frac{\\partial #1}{\\partial #2}} 155 | \\newcommand{\\partialdfrac}[2]{\\dfrac{\\partial #1}{\\partial #2}} 156 | \\newcommand{\\trace}[1]{tr\\pa{#1}} 157 | \\newcommand{\\sac}[0]{|} 158 | \\newcommand{\\abs}[1]{\\left|#1\\right|} 159 | \\newcommand{\\loinormale}[2]{{\\cal N} \\pa{#1,#2}} 160 | \\newcommand{\\loibinomialea}[1]{{\\cal B} \\pa{#1}} 161 | \\newcommand{\\loibinomiale}[2]{{\\cal B} \\pa{#1,#2}} 162 | \\newcommand{\\loimultinomiale}[1]{{\\cal M} \\pa{#1}} 163 | \\newcommand{\\variance}[1]{\\mathbb{V}\\pa{#1}} 164 | \\newcommand{\\intf}[1]{\\left\\lfloor #1 \\right\\rfloor} 165 | """ 166 | 167 | imgmath_latex_preamble = preamble 168 | latex_elements["preamble"] = imgmath_latex_preamble 169 | 170 | 171 | epkg_dictionary = { 172 | "csv": "https://en.wikipedia.org/wiki/Comma-separated_values", 173 | "dask": "https://dask.pydata.org/en/latest/", 174 | "dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", 175 | "Dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", 176 | "DataFrame": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", 177 | "dataframes": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", 178 | "dill": "https://dill.readthedocs.io/en/latest/dill.html", 179 | "groupby and missing values": "https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html", 180 | "Jupyter": 
"https://jupyter.org/", 181 | "Hadoop": "http://hadoop.apache.org/", 182 | "ijson": "https://github.com/ICRAR/ijson", 183 | "json": "https://docs.python.org/3/library/json.html", 184 | "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", 185 | "numpy": "https://numpy.org/", 186 | "pandas": ( 187 | "http://pandas.pydata.org/pandas-docs/stable/", 188 | ( 189 | "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.html", 190 | 1, 191 | ), 192 | ( 193 | "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.{1}.html", 194 | 2, 195 | ), 196 | ), 197 | "pyarrow": "https://arrow.apache.org/docs/python/", 198 | "pyspark": "http://spark.apache.org/docs/2.1.1/api/python/index.html", 199 | "Python": "https://www.python.org/", 200 | "scikit-learn": "https://scikit-learn.org/stable/", 201 | "scikit-multiflow": "https://scikit-multiflow.github.io/", 202 | "sklearn": ( 203 | "https://scikit-learn.org/stable/", 204 | ("https://scikit-learn.org/stable/modules/generated/{0}.html", 1), 205 | ("https://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), 206 | ), 207 | "streamz": "https://streamz.readthedocs.io/en/latest/index.html", 208 | "tornado": "https://www.tornadoweb.org/en/stable/", 209 | "zip": "https://en.wikipedia.org/wiki/ZIP_(file_format)", 210 | } 211 | -------------------------------------------------------------------------------- /_doc/examples/README.txt: -------------------------------------------------------------------------------- 1 | Gallery of Examples 2 | =================== 3 | 4 | -------------------------------------------------------------------------------- /_doc/examples/first_step.py: -------------------------------------------------------------------------------- 1 | """ 2 | First steps with pandas_streaming 3 | ================================= 4 | 5 | A few difference between :epkg:`pandas` and *pandas_streaming*. 6 | 7 | pandas to pandas_streaming 8 | ++++++++++++++++++++++++++ 9 | """ 10 | 11 | import glob 12 | from pandas import DataFrame 13 | from pandas_streaming.df import StreamingDataFrame 14 | 15 | 16 | df = DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"])) 17 | df 18 | 19 | 20 | ############################# 21 | # We create a streaming dataframe: 22 | 23 | 24 | sdf = StreamingDataFrame.read_df(df) 25 | sdf 26 | 27 | 28 | ################################ 29 | # 30 | 31 | sdf.to_dataframe() 32 | 33 | 34 | ######################################## 35 | # Internally, StreamingDataFrame implements an iterator on 36 | # dataframes and then tries to replicate the same interface as 37 | # :class:`pandas.DataFrame` possibly wherever it is possible to 38 | # manipulate data without loading everything into memory. 39 | 40 | 41 | sdf2 = sdf.concat(sdf) 42 | sdf2.to_dataframe() 43 | 44 | 45 | ############################### 46 | # 47 | 48 | m = DataFrame(dict(Y=["a", "b"], Z=[10, 20])) 49 | m 50 | 51 | 52 | ########################################## 53 | # 54 | 55 | sdf3 = sdf2.merge(m, left_on="Y", right_on="Y", how="outer") 56 | sdf3.to_dataframe() 57 | 58 | 59 | ############################################ 60 | # 61 | 62 | sdf2.to_dataframe().merge(m, left_on="Y", right_on="Y", how="outer") 63 | 64 | 65 | ############################################ 66 | # The order might be different. 
67 | 68 | 69 | sdftr, sdfte = sdf2.train_test_split(test_size=0.5) 70 | sdfte.head() 71 | 72 | 73 | ############################################ 74 | # 75 | 76 | 77 | sdftr.head() 78 | 79 | 80 | ############################################ 81 | # split a big file 82 | # ++++++++++++++++ 83 | 84 | 85 | sdf2.to_csv("example.txt") 86 | 87 | 88 | ############################################ 89 | # 90 | 91 | 92 | new_sdf = StreamingDataFrame.read_csv("example.txt") 93 | new_sdf.train_test_split("example.{}.txt", streaming=False) 94 | 95 | 96 | ############################################ 97 | # 98 | 99 | glob.glob("ex*.txt") 100 | -------------------------------------------------------------------------------- /_doc/i_ex.rst: -------------------------------------------------------------------------------- 1 | 2 | Examples 3 | ======== 4 | 5 | About array 6 | +++++++++++ 7 | 8 | .. exreflist:: 9 | :contents: 10 | :tag: array 11 | 12 | About DataFrame 13 | +++++++++++++++ 14 | 15 | .. exreflist:: 16 | :contents: 17 | :tag: dataframe 18 | 19 | About StreamingDataFrame 20 | ++++++++++++++++++++++++ 21 | 22 | .. exreflist:: 23 | :contents: 24 | :tag: streaming 25 | -------------------------------------------------------------------------------- /_doc/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. |gitlogo| image:: _static/git_logo.png 3 | :height: 20 4 | 5 | pandas-streaming: streaming API over pandas 6 | =========================================== 7 | 8 | .. image:: https://ci.appveyor.com/api/projects/status/4te066r8ne1ymmhy?svg=true 9 | :target: https://ci.appveyor.com/project/sdpython/pandas-streaming 10 | :alt: Build Status Windows 11 | 12 | .. image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming 13 | :target: https://dev.azure.com/xavierdupre3/pandas_streaming/ 14 | 15 | .. image:: https://badge.fury.io/py/pandas_streaming.svg 16 | :target: http://badge.fury.io/py/pandas-streaming 17 | 18 | .. image:: https://img.shields.io/badge/license-MIT-blue.svg 19 | :alt: MIT License 20 | :target: https://opensource.org/license/MIT/ 21 | 22 | .. image:: https://codecov.io/gh/sdpython/pandas-streaming/branch/main/graph/badge.svg?token=0caHX1rhr8 23 | :target: https://codecov.io/gh/sdpython/pandas-streaming 24 | 25 | .. image:: http://img.shields.io/github/issues/sdpython/pandas_streaming.png 26 | :alt: GitHub Issues 27 | :target: https://github.com/sdpython/pandas_streaming/issues 28 | 29 | .. image:: https://pepy.tech/badge/pandas_streaming 30 | :target: https://pypi.org/project/pandas_streaming/ 31 | :alt: Downloads 32 | 33 | .. image:: https://img.shields.io/github/forks/sdpython/pandas_streaming.svg 34 | :target: https://github.com/sdpython/pandas_streaming/ 35 | :alt: Forks 36 | 37 | .. image:: https://img.shields.io/github/stars/sdpython/pandas_streaming.svg 38 | :target: https://github.com/sdpython/pandas_streaming/ 39 | :alt: Stars 40 | 41 | .. image:: https://img.shields.io/github/repo-size/sdpython/pandas_streaming 42 | :target: https://github.com/sdpython/pandas_streaming/ 43 | :alt: size 44 | 45 | *pandas_streaming* aims at processing big files with :epkg:`pandas`, 46 | too big to hold in memory, too small to be parallelized with a significant gain. 47 | The module replicates a subset of :epkg:`pandas` API 48 | and implements other functionalities for machine learning. 49 | 50 | .. 
toctree:: 51 | :maxdepth: 1 52 | :caption: Contents 53 | 54 | tutorial/index 55 | auto_examples/index 56 | api/index 57 | i_ex 58 | 59 | .. toctree:: 60 | :maxdepth: 1 61 | :caption: More 62 | 63 | CHANGELOGS 64 | license 65 | 66 | Sources are available at `sdpython/pandas_streaming `_. 67 | 68 | Older versions 69 | ++++++++++++++ 70 | 71 | * `0.5.1 <../v0.5.1/index.html>`_ 72 | * `0.5.0 <../v0.5.0/index.html>`_ 73 | -------------------------------------------------------------------------------- /_doc/license.rst: -------------------------------------------------------------------------------- 1 | .. _l-license: 2 | 3 | License 4 | ======= 5 | 6 | .. include:: LICENSE.txt 7 | :literal: 8 | -------------------------------------------------------------------------------- /_doc/sg_execution_times.rst: -------------------------------------------------------------------------------- 1 | 2 | :orphan: 3 | 4 | .. _sphx_glr_sg_execution_times: 5 | 6 | 7 | Computation times 8 | ================= 9 | **00:00.000** total execution time for 1 file **from all galleries**: 10 | 11 | .. container:: 12 | 13 | .. raw:: html 14 | 15 | 19 | 20 | 21 | 22 | 27 | 28 | .. list-table:: 29 | :header-rows: 1 30 | :class: table table-striped sg-datatable 31 | 32 | * - Example 33 | - Time 34 | - Mem (MB) 35 | * - :ref:`sphx_glr_auto_examples_first_step.py` (``examples/first_step.py``) 36 | - 00:00.000 37 | - 0.0 38 | -------------------------------------------------------------------------------- /_doc/tutorial/index.rst: -------------------------------------------------------------------------------- 1 | 2 | Tutorial 3 | ======== 4 | 5 | The main class :class:`StreamingDataFrame ` 6 | is basically an iterator on dataframes. Altogether, it is a 7 | single dataframe which does not have to fit in memory. 8 | It implements a subset of the functionalities :epkg:`pandas` provides 9 | related to map reduce such as 10 | :meth:`concat `, 11 | :meth:`join `. 12 | Both return a :class:`StreamingDataFrame ` 13 | as opposed to :meth:`groupby ` 14 | which does not. 15 | 16 | The beginning is always the same: we create such an object with one 17 | of the methods :meth:`read_csv `, 18 | :meth:`read_df `, 19 | :meth:`read_str `. 20 | The module was initially created to easily split a dataset into train/test 21 | when it does not fit into memory. 22 | 23 | :: 24 | 25 | from pandas_streaming.df import StreamingDataFrame 26 | sdf = StreamingDataFrame.read_csv("", sep="\t") 27 | sdf.train_test_split("dataset_split_{}.txt", sep="\t") 28 | 29 | >>> ['dataset_split_train.txt', 'dataset_split_test.txt'] 30 | 31 | Objectives and Competitors 32 | ++++++++++++++++++++++++++ 33 | 34 | The first objective is speed. 35 | :class:`StreamingDataFrame ` 36 | is useful when the user needs to process a large data set which does not 37 | hold in memory (*out-of-memory dataset*) or when the user needs to quickly 38 | check an algorithm on the beginning of a big dataset without paying the 39 | cost of loading the data. 40 | 41 | The second objective is simplicity. The proposed interface 42 | tries to follow the same syntax as :epkg:`pandas`. 43 | That is one of the directions followed by :epkg:`dask`. 44 | 45 | :epkg:`dask` tries to address these two objectives 46 | and also offers parallelization. Based on my experience, 47 | :epkg:`dask` is efficient but tends to be slow for simple things 48 | on medium datasets (a couple of gigabytes). The API is not exactly 49 | the same either. The parser does not behave exactly the same.
50 | :epkg:`pyspark` carries a bit of overhead, is more difficult 51 | to install and still slow if it is used locally. 52 | :epkg:`pyarrow` is supposed to be the next :epkg:`pandas` but its 53 | scope is larger (it handles streaming datasets from :epkg:`Hadoop`) 54 | and does not work yet with :epkg:`scikit-learn`. 55 | I expect this module to remain relevant until 56 | :epkg:`scikit-learn` updates its code to handle 57 | a streaming container. That one will probably be 58 | the winner. 59 | :epkg:`streamz` follows a different direction. 60 | It offers parallelisation and relies on :epkg:`tornado` but not 61 | on :epkg:`pandas`, meaning that using it for machine learning 62 | might hide some unexpected loopholes. 63 | :epkg:`scikit-multiflow` does not only implement streaming 64 | containers but also streaming machine learning training. 65 | 66 | One element of design to remember 67 | +++++++++++++++++++++++++++++++++ 68 | 69 | The class :class:`StreamingDataFrame ` 70 | does not hold an iterator but a function which creates an iterator. 71 | Every time the user writes the following loop, the function is called 72 | to create an iterator which is then used to walk through the data. 73 | 74 | .. runpython:: 75 | :showcode: 76 | 77 | import pandas 78 | df = pandas.DataFrame([dict(cf=0, cint=0, cstr="0"), dict(cf=1, cint=1, cstr="1"), 79 | dict(cf=3, cint=3, cstr="3")]) 80 | 81 | from pandas_streaming.df import StreamingDataFrame 82 | sdf = StreamingDataFrame.read_df(df, chunksize=2) 83 | 84 | print("First time:") 85 | 86 | for df in sdf: 87 | # process this chunk of data 88 | print(df) 89 | 90 | print("\nSecond time:\n") 91 | 92 | for df in sdf: 93 | # process this chunk of data a second time 94 | print(df) 95 | 96 | The reason why the class cannot directly hold an iterator is that 97 | it is not possible to pickle an iterator. An iterator is meant to 98 | be used only once: a second loop would not be possible and would 99 | be quite surprising to most users. 100 | 101 | A :class:`StreamingDataFrame ` 102 | is also supposed to be *stable*: the two loops in the previous example 103 | should produce the exact same chunks. However, in some cases, the user can choose 104 | not to abide by this constraint. Drawing a sample is one of the reasons. 105 | A user can either choose to draw the same sample every time he goes 106 | through the data, or choose that a different sample should be 107 | drawn each time. The following method indicates which kind of sample 108 | the :class:`StreamingDataFrame ` 109 | is producing. 110 | 111 | Check the schema consistency of a large file 112 | ++++++++++++++++++++++++++++++++++++++++++++ 113 | 114 | Large files usually come from an export of a database and, 115 | for some reason, this export failed for a couple of lines. 116 | It can be an *end of line* character not removed from a comment, 117 | or a separator also present in the data. When that happens, :epkg:`pandas` 118 | takes the least strict type as the column type. Sometimes, we prefer to get 119 | an idea of where we could find the error. 120 | 121 | ..
runpython:: 122 | :showcode: 123 | 124 | import pandas 125 | df = pandas.DataFrame([dict(cf=0, cint=0, cstr="0"), dict(cf=1, cint=1, cstr="1"), 126 | dict(cf=2, cint="s2", cstr="2"), dict(cf=3, cint=3, cstr="3")]) 127 | name = "temp_df.csv" 128 | df.to_csv(name, index=False) 129 | 130 | from pandas_streaming.df import StreamingDataFrame 131 | try: 132 | sdf = StreamingDataFrame.read_csv(name, chunksize=2) 133 | for df in sdf: 134 | print(df.dtypes) 135 | except Exception as e: 136 | print("ERROR:", e) 137 | 138 | The method :meth:`__iter__ 139 | ` 140 | checks that the schema does not change between two iterations. 141 | It can be disabled by adding *check_schema=False* when 142 | the constructor is called. 143 | -------------------------------------------------------------------------------- /_unittests/ut_df/data/buggy_hash2.csv: -------------------------------------------------------------------------------- 1 | 1092397418290.0 a181248367 366498568522.0 2 | 138742792720.0 516e2e745c 73810952621.0 3 | 108082559849.0 1601fecc7f 79402822525.0 4 | 251797282335.0 29d56f63ec 530980115159.0 5 | 651822622544.0 67be9eb2e5 618639148003.0 6 | 817909238810.0 3a24c42894 441595633456.0 7 | 427513930052.0 42fbf1e0a9 759755785197.0 8 | 409652918460.0 e0e09bcb7b 487633962255.0 9 | 126536040328.0 a2c6f80ea6 325262414951.0 10 | 195809963606.0 7d67e33166 58693978128.0 11 | 426363751898.0 4f67c53e66 1037516316531.0 12 | 51702292002.0 37c64b233a 206747200377.0 13 | 945246123121.0 8739a9cebb 639796038157.0 14 | -------------------------------------------------------------------------------- /_unittests/ut_df/data/classic.json: -------------------------------------------------------------------------------- 1 | [{"name":"cudaGetDeviceCount", 2 | "ph":"X", 3 | "cat":"cuda", 4 | "ts":1634290065724226794, 5 | "dur":800, 6 | "tid":"Thread 2080429824: Runtime API", 7 | "pid":"[89792] Process", 8 | "args":{}}, 9 | {"name":"_Z25full_reduce_tensor_kernelIfLi256ELi1ELi1ELi256EL21cudnnReduceTensorOp_t0EL21cudnnNanPropagation_t0EEv17cudnnTensorStructPjS3_PT_S5_S4_bii", 10 | "ph":"X", 11 | "cat":"cuda", 12 | "ts":1634290112071305413, 13 | "dur":1888, 14 | "tid":"_Z25full_reduce_tensor_kernelIfLi256ELi1ELi1ELi256EL21cudnnReduceTensorOp_t0EL21cudnnNanPropagation_t0EEv17cudnnTensorStructPjS3_PT_S5_S4_bii", 15 | "pid":"[0:1] Compute", 16 | "args":{"Grid size":"[ 1, 1, 1 ]", 17 | "Block size":"[ 256, 1, 1 ]"}}, 18 | {"name":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E", 19 | "ph":"X", 20 | "cat":"cuda", 21 | "ts":1634290112071308133, 22 | "dur":1440, 23 | "tid":"Compute", 24 | "pid":"[0:1] Overview", 25 | "args":{"Grid size":"[ 1, 1, 1 ]", 26 | "Block size":"[ 1, 256, 1 ]"}}, 27 | {"name":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E", 28 | "ph":"X", 29 | "cat":"cuda", 30 | "ts":1634290112071308133, 31 | "dur":1440, 32 | "tid":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E", 33 | "pid":"[0:1] Compute", 34 | "args":{"Grid size":"[ 1, 1, 1 ]", 35 | "Block size":"[ 1, 256, 1 ]"}}] 36 | -------------------------------------------------------------------------------- /_unittests/ut_df/data/example.json: -------------------------------------------------------------------------------- 1 | {"a": 1, "b": 2} 2 | {"a": 3, "b": 4} -------------------------------------------------------------------------------- 
/_unittests/ut_df/data/example2.json: -------------------------------------------------------------------------------- 1 | [{"a":1,"b":2},{"a":3,"b":4}] -------------------------------------------------------------------------------- /_unittests/ut_df/test_connex_split.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas 3 | from pandas_streaming.ext_test_case import ExtTestCase 4 | from pandas_streaming.df import ( 5 | dataframe_shuffle, 6 | train_test_split_weights, 7 | train_test_connex_split, 8 | ) 9 | 10 | 11 | class TestConnexSplit(ExtTestCase): 12 | def test_shuffle(self): 13 | df = pandas.DataFrame( 14 | [ 15 | dict(a=1, b="e", c=5.6, ind="a1"), 16 | dict(a=2, b="f", c=5.7, ind="a2"), 17 | dict(a=4, b="g", c=5.8, ind="a3"), 18 | dict(a=8, b="h", c=5.9, ind="a4"), 19 | dict(a=16, b="i", c=6.2, ind="a5"), 20 | ] 21 | ) 22 | shuffled = dataframe_shuffle(df, random_state=0) 23 | sorted_ = shuffled.sort_values("a") 24 | self.assertEqualDataFrame(df, sorted_) 25 | 26 | df2 = df.set_index("ind") 27 | shuffled = dataframe_shuffle(df2, random_state=0) 28 | sorted_ = shuffled.sort_values("a") 29 | self.assertEqualDataFrame(df2, sorted_) 30 | 31 | df2 = df.set_index(["ind", "c"]) 32 | shuffled = dataframe_shuffle(df2, random_state=0) 33 | sorted_ = shuffled.sort_values("a") 34 | self.assertEqualDataFrame(df2, sorted_) 35 | 36 | def test_split_weights_errors(self): 37 | df = pandas.DataFrame( 38 | [ 39 | dict(a=1, b="e", c=1), 40 | dict(a=2, b="f", c=1), 41 | dict(a=4, b="g", c=1), 42 | dict(a=8, b="h", c=1), 43 | dict(a=12, b="h", c=1), 44 | dict(a=16, b="i", c=1), 45 | ] 46 | ) 47 | 48 | train, test = train_test_split_weights(df, train_size=0.5, weights="c") 49 | self.assertTrue(train is not None) 50 | self.assertTrue(test is not None) 51 | self.assertRaise( 52 | lambda: train_test_split_weights(df, test_size=0.5, weights=[0.5, 0.5]), 53 | ValueError, 54 | "Dimension", 55 | ) 56 | self.assertRaise( 57 | lambda: train_test_split_weights(df, test_size=0), ValueError, "null" 58 | ) 59 | self.assertRaise( 60 | lambda: train_test_split_weights(df, test_size=0, weights="c"), 61 | ValueError, 62 | "null", 63 | ) 64 | 65 | def test_split_weights(self): 66 | df = pandas.DataFrame( 67 | [ 68 | dict(a=1, b="e", c=1), 69 | dict(a=2, b="f", c=1), 70 | dict(a=4, b="g", c=1), 71 | dict(a=8, b="h", c=1), 72 | dict(a=12, b="h", c=1), 73 | dict(a=16, b="i", c=1), 74 | ] 75 | ) 76 | 77 | train, test = train_test_split_weights(df, test_size=0.5) 78 | self.assertEqual(train.shape[1], test.shape[1]) 79 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 80 | 81 | train, test = train_test_split_weights(df, test_size=0.5, weights="c") 82 | self.assertEqual(train.shape[1], test.shape[1]) 83 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 84 | 85 | train, test = train_test_split_weights(df, test_size=0.5, weights=df["c"]) 86 | self.assertEqual(train.shape[1], test.shape[1]) 87 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 88 | 89 | df = pandas.DataFrame( 90 | [ 91 | dict(a=1, b="e", c=1), 92 | dict(a=2, b="f", c=2), 93 | dict(a=4, b="g", c=3), 94 | dict(a=8, b="h", c=1), 95 | dict(a=12, b="h", c=2), 96 | dict(a=16, b="i", c=3), 97 | ] 98 | ) 99 | 100 | train, test = train_test_split_weights( 101 | df, test_size=0.5, weights="c", fail_imbalanced=0.4 102 | ) 103 | self.assertEqual(train.shape[1], test.shape[1]) 104 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 105 | w1, w2 = 
train["c"].sum(), test["c"].sum() 106 | delta = abs(w1 - w2) / (w1 + w2) 107 | self.assertGreater(0.4, delta) 108 | 109 | def test_split_connex(self): 110 | df = pandas.DataFrame( 111 | [ 112 | dict(user="UA", prod="PA", card="C1"), 113 | dict(user="UA", prod="PB", card="C1"), 114 | dict(user="UB", prod="PC", card="C2"), 115 | dict(user="UB", prod="PD", card="C2"), 116 | dict(user="UC", prod="PE", card="C3"), 117 | dict(user="UC", prod="PF", card="C4"), 118 | dict(user="UD", prod="PG", card="C5"), 119 | ] 120 | ) 121 | 122 | train, test = train_test_connex_split( # pylint: disable=W0632 123 | df, test_size=0.5, groups=["user", "prod", "card"], fail_imbalanced=0.4 124 | ) 125 | 126 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 127 | for col in ["user", "prod", "card"]: 128 | s1 = set(train[col]) 129 | s2 = set(test[col]) 130 | if s1 & s2: 131 | raise AssertionError( 132 | f"Non empty intersection {s1} & {s2}\n{train}\n{test}" 133 | ) 134 | 135 | df["connex"] = "ole" 136 | train, test = train_test_connex_split( # pylint: disable=W0632 137 | df, test_size=0.5, groups=["user", "prod", "card"], fail_imbalanced=0.4 138 | ) 139 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 140 | 141 | def test_split_connex2(self): 142 | df = pandas.DataFrame( 143 | [ 144 | dict(user="UA", prod="PAA", card="C1"), 145 | dict(user="UA", prod="PB", card="C1"), 146 | dict(user="UB", prod="PC", card="C2"), 147 | dict(user="UB", prod="PD", card="C2"), 148 | dict(user="UC", prod="PAA", card="C3"), 149 | dict(user="UC", prod="PF", card="C4"), 150 | dict(user="UD", prod="PG", card="C5"), 151 | ] 152 | ) 153 | 154 | train_test_connex_split( 155 | df, 156 | test_size=0.5, 157 | groups=["user", "prod", "card"], 158 | fail_imbalanced=0.5, 159 | return_cnx=True, 160 | ) 161 | train, test, stats = train_test_connex_split( 162 | df, 163 | test_size=0.5, 164 | groups=["user", "prod", "card"], 165 | fail_imbalanced=0.5, 166 | return_cnx=True, 167 | random_state=0, 168 | ) 169 | 170 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 171 | for col in ["user", "prod", "card"]: 172 | s1 = set(train[col]) 173 | s2 = set(test[col]) 174 | if s1 & s2: 175 | rows = [] 176 | for k, v in sorted(stats[0].items()): 177 | rows.append(f"{k}={v}") 178 | raise AssertionError( 179 | "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format( # noqa: UP030 180 | s1, s2, train, test, "\n".join(rows) 181 | ) 182 | ) 183 | 184 | def test_split_connex_missing(self): 185 | df = pandas.DataFrame( 186 | [ 187 | dict(user="UA", prod="PAA", card="C1"), 188 | dict(user="UA", prod="PB", card="C1"), 189 | dict(user="UB", prod="PC", card="C2"), 190 | dict(user="UB", prod="PD", card="C2"), 191 | dict(user="UC", prod="PAA", card="C3"), 192 | dict(user="UC", card="C4"), 193 | dict(user="UD", prod="PG"), 194 | ] 195 | ) 196 | 197 | train, test, stats = train_test_connex_split( 198 | df, 199 | test_size=0.5, 200 | groups=["user", "prod", "card"], 201 | fail_imbalanced=0.4, 202 | return_cnx=True, 203 | random_state=0, 204 | ) 205 | 206 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 207 | for col in ["user", "prod", "card"]: 208 | s1 = set(train[col]) 209 | s2 = set(test[col]) 210 | if s1 & s2: 211 | rows = [] 212 | for k, v in sorted(stats[0].items()): 213 | rows.append(f"{k}={v}") 214 | raise AssertionError( 215 | "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format( # noqa: UP030 216 | s1, s2, train, test, "\n".join(rows) 217 | ) 218 | ) 219 | 220 | 221 | if __name__ == "__main__": 222 | 
unittest.main() 223 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_connex_split_big.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from collections import Counter 4 | import pandas 5 | from pandas_streaming.ext_test_case import ExtTestCase 6 | from pandas_streaming.df import train_test_connex_split 7 | 8 | 9 | class TestConnexSplitBig(ExtTestCase): 10 | def test_connex_big(self): 11 | data = os.path.join(os.path.dirname(__file__), "data") 12 | name = os.path.join(data, "buggy_hash.csv") 13 | df = pandas.read_csv(name, sep="\t", encoding="utf-8") 14 | train, test, stats = train_test_connex_split( 15 | df, 16 | groups=["cart_id", "mail", "product_id"], 17 | fail_imbalanced=0.9, 18 | return_cnx=True, 19 | ) 20 | self.assertGreater(train.shape[0], 0) 21 | self.assertGreater(test.shape[0], 0) 22 | elements = stats[1]["connex"] 23 | counts = Counter(elements) 24 | nbc = len(counts) 25 | maxi = max(counts.values()) 26 | self.assertEqual(nbc, 5376) 27 | self.assertEqual(maxi, 14181) 28 | 29 | def test_connex_big_approx(self): 30 | data = os.path.join(os.path.dirname(__file__), "data") 31 | name = os.path.join(data, "buggy_hash.csv") 32 | df = pandas.read_csv(name, sep="\t", encoding="utf-8") 33 | train, test, stats = train_test_connex_split( 34 | df, 35 | groups=["cart_id", "mail", "product_id"], 36 | stop_if_bigger=0.05, 37 | return_cnx=True, 38 | keep_balance=0.8, 39 | ) 40 | self.assertGreater(train.shape[0], 0) 41 | self.assertGreater(test.shape[0], 0) 42 | elements = stats[1]["connex"] 43 | counts = Counter(elements) 44 | nbc = len(counts) 45 | maxi = max(counts.values()) 46 | self.assertGreater(nbc, 5376) 47 | self.assertLesser(maxi, 14181) 48 | 49 | def test_connex_big_approx_must(self): 50 | data = os.path.join(os.path.dirname(__file__), "data") 51 | name = os.path.join(data, "buggy_hash.csv") 52 | df = pandas.read_csv(name, sep="\t", encoding="utf-8") 53 | train, test, stats = train_test_connex_split( 54 | df, 55 | groups=["cart_id", "mail", "product_id"], 56 | stop_if_bigger=0.05, 57 | return_cnx=True, 58 | keep_balance=0.8, 59 | must_groups=["product_id"], 60 | ) 61 | self.assertGreater(train.shape[0], 0) 62 | self.assertGreater(test.shape[0], 0) 63 | elements = stats[1]["connex"] 64 | counts = Counter(elements) 65 | nbc = len(counts) 66 | maxi = max(counts.values()) 67 | self.assertGreater(nbc, 5376) 68 | self.assertLesser(maxi, 14181) 69 | train_ids = set(train.product_id) 70 | test_ids = set(test.product_id) 71 | inter = train_ids & test_ids 72 | self.assertEqual(len(inter), 0) 73 | 74 | 75 | if __name__ == "__main__": 76 | unittest.main() 77 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_connex_split_cat.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from collections import Counter 3 | import pandas 4 | from pandas_streaming.ext_test_case import ExtTestCase 5 | from pandas_streaming.df import train_test_apart_stratify 6 | 7 | 8 | class TestConnexSplitCat(ExtTestCase): 9 | def test_cat_strat(self): 10 | df = pandas.DataFrame( 11 | [ 12 | dict(a=1, b="e"), 13 | dict(a=2, b="e"), 14 | dict(a=4, b="f"), 15 | dict(a=8, b="f"), 16 | dict(a=32, b="f"), 17 | dict(a=16, b="f"), 18 | ] 19 | ) 20 | 21 | train, test = train_test_apart_stratify( 22 | df, group="a", stratify="b", test_size=0.5 23 | ) 24 | self.assertEqual(train.shape[1], 
test.shape[1]) 25 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 26 | c1 = Counter(train["b"]) 27 | c2 = Counter(train["b"]) 28 | self.assertEqual(c1, c2) 29 | 30 | self.assertRaise( 31 | lambda: train_test_apart_stratify( 32 | df, group=None, stratify="b", test_size=0.5 33 | ), 34 | ValueError, 35 | ) 36 | self.assertRaise( 37 | lambda: train_test_apart_stratify(df, group="b", test_size=0.5), ValueError 38 | ) 39 | 40 | def test_cat_strat_sorted(self): 41 | df = pandas.DataFrame( 42 | [ 43 | dict(a=1, b="e"), 44 | dict(a=2, b="e"), 45 | dict(a=4, b="f"), 46 | dict(a=8, b="f"), 47 | dict(a=32, b="f"), 48 | dict(a=16, b="f"), 49 | ] 50 | ) 51 | 52 | train, test = train_test_apart_stratify( 53 | df, group="a", stratify="b", test_size=0.5, sorted_indices=True 54 | ) 55 | self.assertEqual(train.shape[1], test.shape[1]) 56 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 57 | c1 = Counter(train["b"]) 58 | c2 = Counter(train["b"]) 59 | self.assertEqual(c1, c2) 60 | 61 | self.assertRaise( 62 | lambda: train_test_apart_stratify( 63 | df, group=None, stratify="b", test_size=0.5, sorted_indices=True 64 | ), 65 | ValueError, 66 | ) 67 | self.assertRaise( 68 | lambda: train_test_apart_stratify(df, group="b", test_size=0.5), ValueError 69 | ) 70 | 71 | def test_cat_strat_multi(self): 72 | df = pandas.DataFrame( 73 | [ 74 | dict(a=1, b="e"), 75 | dict(a=1, b="f"), 76 | dict(a=2, b="e"), 77 | dict(a=2, b="f"), 78 | ] 79 | ) 80 | 81 | train, test = train_test_apart_stratify( 82 | df, group="a", stratify="b", test_size=0.5 83 | ) 84 | self.assertEqual(train.shape[1], test.shape[1]) 85 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 86 | c1 = Counter(train["b"]) 87 | c2 = Counter(train["b"]) 88 | self.assertEqual(c1, c2) 89 | self.assertEqual(len(set(train["a"])), 1) 90 | self.assertEqual(len(set(test["a"])), 1) 91 | self.assertTrue(set(train["a"]) != set(test["a"])) 92 | 93 | def test_cat_strat_multi_force(self): 94 | df = pandas.DataFrame( 95 | [ 96 | dict(a=1, b="e"), 97 | dict(a=1, b="f"), 98 | dict(a=2, b="e"), 99 | dict(a=2, b="f"), 100 | ] 101 | ) 102 | 103 | train, test = train_test_apart_stratify( 104 | df, group="a", stratify="b", test_size=0.1, force=True 105 | ) 106 | self.assertEqual(train.shape[1], test.shape[1]) 107 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 108 | c1 = Counter(train["b"]) 109 | c2 = Counter(train["b"]) 110 | self.assertEqual(c1, c2) 111 | self.assertEqual(len(set(train["a"])), 1) 112 | self.assertEqual(len(set(test["a"])), 1) 113 | self.assertTrue(set(train["a"]) != set(test["a"])) 114 | 115 | 116 | if __name__ == "__main__": 117 | unittest.main() 118 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_dataframe_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import numpy 4 | import pandas 5 | from pandas_streaming.ext_test_case import ExtTestCase 6 | from pandas_streaming.df import dataframe_hash_columns 7 | 8 | 9 | class TestDataFrameHelpers(ExtTestCase): 10 | def test_hash_columns(self): 11 | df = pandas.DataFrame( 12 | [ 13 | dict(a=1, b="e", c=5.6, ind="a1", ai=1), 14 | dict(b="f", c=5.7, ind="a2", ai=2), 15 | dict(a=4, b="g", ind="a3", ai=3), 16 | dict(a=8, b="h", c=5.9, ai=4), 17 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 18 | ] 19 | ) 20 | df2 = dataframe_hash_columns(df) 21 | self.assertEqual(df2.shape, df.shape) 22 | for j in range(df.shape[1]): 23 | 
self.assertEqual(df.columns[j], df2.columns[j]) 24 | self.assertEqual(df.dtypes[j], df2.dtypes[j]) 25 | for i in range(df.shape[0]): 26 | v1 = df.iloc[i, j] 27 | v2 = df2.iloc[i, j] 28 | if isinstance(v1, float): 29 | if numpy.isnan(v1): 30 | self.assertTrue(numpy.isnan(v2)) 31 | else: 32 | self.assertEqual(type(v1), type(v2)) 33 | else: 34 | self.assertEqual(type(v1), type(v2)) 35 | 36 | def test_hash_columns_bigger(self): 37 | data = os.path.join(os.path.dirname(__file__), "data") 38 | name = os.path.join(data, "buggy_hash.csv") 39 | df = pandas.read_csv(name, sep="\t", encoding="utf-8") 40 | df2 = dataframe_hash_columns(df) 41 | self.assertEqual(df.shape, df2.shape) 42 | 43 | 44 | if __name__ == "__main__": 45 | unittest.main() 46 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_dataframe_helpers_simple.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas 3 | import numpy 4 | from pandas_streaming.ext_test_case import ExtTestCase 5 | from pandas_streaming.df import dataframe_unfold 6 | from pandas_streaming.df.dataframe_helpers import hash_int, hash_str, hash_float 7 | 8 | 9 | class TestDataFrameHelpersSimple(ExtTestCase): 10 | def test_unfold(self): 11 | df = pandas.DataFrame([dict(a=1, b="e,f"), dict(a=2, b="g"), dict(a=3)]) 12 | df2 = dataframe_unfold(df, "b") 13 | 14 | exp = pandas.DataFrame( 15 | [ 16 | dict(a=1, b="e,f", b_unfold="e"), 17 | dict(a=1, b="e,f", b_unfold="f"), 18 | dict(a=2, b="g", b_unfold="g"), 19 | dict(a=3), 20 | ] 21 | ) 22 | self.assertEqualDataFrame(df2, exp) 23 | 24 | # fold 25 | folded = df2.groupby("a").apply( 26 | lambda row: ( 27 | ",".join(row["b_unfold"].dropna()) 28 | if len(row["b_unfold"].dropna()) > 0 29 | else numpy.nan 30 | ) 31 | ) 32 | bf = folded.reset_index(drop=False) 33 | bf.columns = ["a", "b"] 34 | self.assertEqualDataFrame(df, bf) 35 | 36 | def test_hash_except(self): 37 | self.assertRaise(lambda: hash_int(0.1, 3), ValueError, "numpy.nan expected") 38 | r = hash_int(numpy.nan, 3) 39 | self.assertTrue(numpy.isnan(r)) 40 | 41 | self.assertRaise(lambda: hash_str(0.1, 3), ValueError, "numpy.nan expected") 42 | r = hash_str(numpy.nan, 3) 43 | self.assertTrue(numpy.isnan(r)) 44 | 45 | self.assertRaise(lambda: hash_float("0.1", 3), TypeError, "isnan") 46 | r = hash_float(numpy.nan, 3) 47 | self.assertTrue(numpy.isnan(r)) 48 | r = hash_str("3", 100) 49 | self.assertLess(len(r), 100) 50 | 51 | 52 | if __name__ == "__main__": 53 | unittest.main() 54 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_dataframe_io.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import unittest 4 | import io 5 | import zipfile 6 | import numpy 7 | import pandas 8 | from pandas_streaming.ext_test_case import ExtTestCase 9 | from pandas_streaming.df import to_zip, read_zip 10 | 11 | 12 | class TestDataFrameIO(ExtTestCase): 13 | def test_zip_dataframe(self): 14 | df = pandas.DataFrame( 15 | [ 16 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1), 17 | dict(b="f", c=5.7, ind="a2", ai=2), 18 | dict(a=4, b="g", ind="a3", ai=3), 19 | dict(a=8, b="h", c=5.9, ai=4), 20 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 21 | ] 22 | ) 23 | 24 | with tempfile.TemporaryDirectory() as temp: 25 | name = os.path.join(temp, "df.zip") 26 | to_zip(df, name, encoding="utf-8", index=False) 27 | df2 = read_zip(name, encoding="utf-8") 28 | 
self.assertEqualDataFrame(df, df2) 29 | 30 | st = io.BytesIO() 31 | zp = zipfile.ZipFile(st, "w") 32 | to_zip(df, zp, encoding="utf-8", index=False) 33 | zp.close() 34 | 35 | st = io.BytesIO(st.getvalue()) 36 | zp = zipfile.ZipFile(st, "r") 37 | df3 = read_zip(zp, encoding="utf-8") 38 | zp.close() 39 | self.assertEqualDataFrame(df, df3) 40 | 41 | def test_zip_numpy(self): 42 | df = numpy.zeros((3, 4)) 43 | df[2, 3] = 1 44 | 45 | with tempfile.TemporaryDirectory() as temp: 46 | name = os.path.join(temp, "df.zip") 47 | to_zip(df, name, "arr.npy") 48 | df2 = read_zip(name, "arr.npy") 49 | self.assertEqualArray(df, df2) 50 | 51 | st = io.BytesIO() 52 | zp = zipfile.ZipFile(st, "w") 53 | to_zip(df, zp, "arr.npy") 54 | zp.close() 55 | 56 | st = io.BytesIO(st.getvalue()) 57 | zp = zipfile.ZipFile(st, "r") 58 | df3 = read_zip(zp, "arr.npy") 59 | zp.close() 60 | self.assertEqualArray(df, df3) 61 | 62 | 63 | if __name__ == "__main__": 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_dataframe_io_helpers.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from io import StringIO, BytesIO 3 | from json import loads 4 | import pandas 5 | from pandas_streaming.ext_test_case import ExtTestCase 6 | from pandas_streaming.df.dataframe_io_helpers import ( 7 | enumerate_json_items, 8 | JsonPerRowsStream, 9 | JsonIterator2Stream, 10 | ) 11 | from pandas_streaming.df import StreamingDataFrame 12 | 13 | 14 | class TestDataFrameIOHelpers(ExtTestCase): 15 | text_json = b""" 16 | [ 17 | { 18 | "glossary": { 19 | "title": "example glossary", 20 | "GlossDiv": { 21 | "title": "S", 22 | "GlossList": [{ 23 | "GlossEntry": { 24 | "ID": "SGML", 25 | "SortAs": "SGML", 26 | "GlossTerm": "Standard Generalized Markup Language", 27 | "Acronym": "SGML", 28 | "Abbrev": "ISO 8879:1986", 29 | "GlossDef": { 30 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 31 | "GlossSeeAlso": ["GML", "XML"] 32 | }, 33 | "GlossSee": "markup" 34 | } 35 | }] 36 | } 37 | } 38 | }, 39 | { 40 | "glossary": { 41 | "title": "example glossary", 42 | "GlossDiv": { 43 | "title": "X", 44 | "GlossList": { 45 | "GlossEntry": [{ 46 | "ID": "SGML", 47 | "SortAs": "SGML", 48 | "GlossTerm": "Standard Generalized Markup Language", 49 | "Acronym": "SGML", 50 | "Abbrev": "ISO 8879:1986", 51 | "GlossDef": { 52 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 53 | "GlossSeeAlso": ["GML", "XML"] 54 | }, 55 | "GlossSee": "markup" 56 | }] 57 | } 58 | } 59 | } 60 | } 61 | ] 62 | """ 63 | text_json_exp = [ 64 | { 65 | "glossary": { 66 | "title": "example glossary", 67 | "GlossDiv": { 68 | "title": "S", 69 | "GlossList": [ 70 | { 71 | "GlossEntry": { 72 | "ID": "SGML", 73 | "SortAs": "SGML", 74 | "GlossTerm": "Standard Generalized Markup Language", 75 | "Acronym": "SGML", 76 | "Abbrev": "ISO 8879:1986", 77 | "GlossDef": { 78 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 79 | "GlossSeeAlso": ["GML", "XML"], 80 | }, 81 | "GlossSee": "markup", 82 | } 83 | } 84 | ], 85 | }, 86 | } 87 | }, 88 | { 89 | "glossary": { 90 | "title": "example glossary", 91 | "GlossDiv": { 92 | "title": "X", 93 | "GlossList": { 94 | "GlossEntry": [ 95 | { 96 | "ID": "SGML", 97 | "SortAs": "SGML", 98 | "GlossTerm": "Standard Generalized Markup Language", 99 | "Acronym": "SGML", 100 | "Abbrev": "ISO 8879:1986", 101 | "GlossDef": { 102 | "para": "A 
meta-markup language, used to create markup languages such as DocBook.", 103 | "GlossSeeAlso": ["GML", "XML"], 104 | }, 105 | "GlossSee": "markup", 106 | } 107 | ] 108 | }, 109 | }, 110 | } 111 | }, 112 | ] 113 | 114 | def test_enumerate_json_items(self): 115 | items = list(enumerate_json_items(TestDataFrameIOHelpers.text_json)) 116 | self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items) 117 | items = list(enumerate_json_items(BytesIO(TestDataFrameIOHelpers.text_json))) 118 | self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items) 119 | items = list(enumerate_json_items(BytesIO(TestDataFrameIOHelpers.text_json))) 120 | self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items) 121 | 122 | def test_read_json_raw(self): 123 | data = [ 124 | {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, 125 | {"name": {"given": "Mose", "family": "Regner"}}, 126 | {"id": 2, "name": "FayeRaker"}, 127 | ] 128 | exp = """[{"id":1.0,"name":null,"name.family":null,"name.first":"Coleen","name.given":null,"name.last":"Volk"}, 129 | {"id":null,"name":null,"name.family":"Regner","name.first":null,"name.given":"Mose","name.last":null}, 130 | {"id":2.0,"name":"FayeRaker","name.family":null,"name.first":null, 131 | "name.given":null,"name.last":null}]""".replace( 132 | " ", "" 133 | ).replace( 134 | "\n", "" 135 | ) 136 | self.assertRaise( 137 | lambda: StreamingDataFrame.read_json(data), NotImplementedError 138 | ) 139 | it = StreamingDataFrame.read_json(data, flatten=True) 140 | dfs = list(it) 141 | self.assertEqual(len(dfs), 1) 142 | js = dfs[0].to_json(orient="records") 143 | js_read = loads(js) 144 | js_exp = loads(exp) 145 | self.assertEqual(js_exp, js_read) 146 | 147 | def test_read_json_raw_head(self): 148 | data = [ 149 | {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, 150 | {"name": {"given": "Mose", "family": "Regner"}}, 151 | {"id": 2, "name": "FayeRaker"}, 152 | ] 153 | it = StreamingDataFrame.read_json(data, flatten=True, chunksize=1) 154 | h1 = it.head() 155 | h2 = it.head() 156 | self.assertEqualDataFrame(h1, h2) 157 | self.assertGreater(h1.shape[0], 1) 158 | self.assertGreater(h2.shape[0], 1) 159 | 160 | def test_pandas_json_chunksize(self): 161 | jsonl = """{"a": 1, "b": 2} 162 | {"a": 3, "b": 4}""" 163 | df = pandas.read_json(jsonl, lines=True) 164 | idf = pandas.read_json(jsonl, lines=True, chunksize=2) 165 | ldf = list(idf) 166 | self.assertEqualDataFrame(df, ldf[0]) 167 | 168 | def test_read_json_rows(self): 169 | data = """{"a": 1, "b": 2} 170 | {"a": 3, "b": 4}""" 171 | it = StreamingDataFrame.read_json(StringIO(data), lines=True) 172 | dfs = list(it) 173 | self.assertEqual(len(dfs), 1) 174 | js = dfs[0].to_json(orient="records") 175 | self.assertEqual(js, '[{"a":1,"b":2},{"a":3,"b":4}]') 176 | 177 | def test_read_json_rows2(self): 178 | data = b"""{"a": 1, "b": 2} 179 | {"a": 3, "b": 4}""" 180 | dfs = pandas.read_json(BytesIO(data), lines=True) 181 | self.assertEqual(dfs.shape, (2, 2)) 182 | it = StreamingDataFrame.read_json(BytesIO(data), lines="stream") 183 | dfs = list(it) 184 | self.assertEqual(len(dfs), 1) 185 | js = dfs[0].to_json(orient="records") 186 | self.assertEqual('[{"a":1,"b":2},{"a":3,"b":4}]', js) 187 | 188 | def test_read_json_rows2_head(self): 189 | data = b"""{"a": 1, "b": 2} 190 | {"a": 3, "b": 4}""" 191 | dfs = pandas.read_json(BytesIO(data), lines=True) 192 | self.assertEqual(dfs.shape, (2, 2)) 193 | it = StreamingDataFrame.read_json(BytesIO(data), lines="stream") 194 | h1 = it.head() 195 | h2 = it.head() 196 | self.assertNotEmpty(h1) 197 | 
self.assertNotEmpty(h2) 198 | self.assertEqualDataFrame(h1, h2) 199 | 200 | def test_read_json_rows_file_head(self): 201 | data = self.abs_path_join(__file__, "data", "example2.json") 202 | dfs = pandas.read_json(data, orient="records") 203 | self.assertEqual(dfs.shape, (2, 2)) 204 | it = StreamingDataFrame.read_json(data) 205 | h1 = it.head() 206 | h2 = it.head() 207 | self.assertNotEmpty(h1) 208 | self.assertNotEmpty(h2) 209 | self.assertEqualDataFrame(h1, h2) 210 | 211 | def test_read_json_rows_file_lines_head(self): 212 | data = self.abs_path_join(__file__, "data", "example.json") 213 | dfs = pandas.read_json(data, orient="records", lines=True) 214 | self.assertEqual(dfs.shape, (2, 2)) 215 | it = StreamingDataFrame.read_json(data, lines="stream") 216 | h1 = it.head() 217 | h2 = it.head() 218 | self.assertNotEmpty(h1) 219 | self.assertNotEmpty(h2) 220 | self.assertEqualDataFrame(h1, h2) 221 | 222 | def test_read_json_ijson(self): 223 | it = StreamingDataFrame.read_json(BytesIO(TestDataFrameIOHelpers.text_json)) 224 | dfs = list(it) 225 | self.assertEqual(len(dfs), 1) 226 | js = dfs[0].to_json(orient="records", lines=True) 227 | jsjson = loads("[" + js.replace("\n", ",").strip(",") + "]") 228 | self.assertEqual(jsjson, TestDataFrameIOHelpers.text_json_exp) 229 | 230 | def test_read_json_stream(self): 231 | text = """{'a': 1} 232 | {'b': 1, 'a', 'r'}""" 233 | st = JsonPerRowsStream(StringIO(text)) 234 | val = st.getvalue().replace(" ", "").replace("\n", "") 235 | exp = "[{'a':1},{'b':1,'a','r'}]" 236 | self.assertEqual(val, exp) 237 | 238 | st = JsonPerRowsStream(StringIO(text)) 239 | t = st.read(0) 240 | t = st.read(1) 241 | c = "" 242 | while t: 243 | c += t 244 | t = st.read(1) 245 | val = c.replace(" ", "").replace("\n", "") 246 | self.assertEqual(val, exp) 247 | 248 | def test_enumerate_json_items_lines(self): 249 | data = b"""{"a": 1, "b": 2} 250 | {"a": 3, "b": 4}""" 251 | items = list(enumerate_json_items(data, lines=True)) 252 | self.assertEqual(items, [{"a": 1, "b": 2}, {"a": 3, "b": 4}]) 253 | 254 | def test_read_json_file2(self): 255 | data = b"""{"a": {"c": 1}, "b": [2, 3]} 256 | {"a": {"a": 3}, "b": [4, 5, "r"]}""" 257 | 258 | obj1 = list(enumerate_json_items(BytesIO(data), flatten=False, lines=True)) 259 | obj2 = list(enumerate_json_items(BytesIO(data), flatten=True, lines=True)) 260 | self.assertNotEqual(obj1, obj2) 261 | self.assertEqual( 262 | obj2, 263 | [ 264 | {"a_c": 1, "b_0": 2, "b_1": 3}, 265 | {"a_a": 3, "b_0": 4, "b_1": 5, "b_2": "r"}, 266 | ], 267 | ) 268 | 269 | it = StreamingDataFrame.read_json(BytesIO(data), lines="stream", flatten=True) 270 | dfs = list(it) 271 | self.assertEqual( 272 | ["a_a", "a_c", "b_0", "b_1", "b_2"], 273 | list(sorted(dfs[0].columns)), 274 | ) 275 | self.assertEqual(len(dfs), 1) 276 | js = dfs[0].to_json(orient="records", lines=True) 277 | jsjson = loads("[" + js.replace("\n", ",").strip(",") + "]") 278 | exp = [ 279 | {"a_a": None, "a_c": 1.0, "b_0": 2, "b_1": 3, "b_2": None}, 280 | {"a_a": 3.0, "a_c": None, "b_0": 4, "b_1": 5, "b_2": "r"}, 281 | ] 282 | self.assertEqual(exp, jsjson) 283 | 284 | def test_read_json_item(self): 285 | text = TestDataFrameIOHelpers.text_json 286 | st = JsonPerRowsStream(BytesIO(text)) 287 | res = [] 288 | while True: 289 | n = st.read() 290 | if not n: 291 | break 292 | res.append(n) 293 | self.assertGreater(len(res), 1) 294 | 295 | def test_bug_documentation(self): 296 | items = [] 297 | for item in JsonIterator2Stream( 298 | lambda: enumerate_json_items(TestDataFrameIOHelpers.text_json) 299 | ): 300 | 
items.append(item) 301 | self.assertEqual(len(items), 2) 302 | 303 | def test_read_json_classic(self): 304 | data = self.abs_path_join(__file__, "data", "classic.json") 305 | dfs = pandas.read_json(data, orient="records") 306 | dfs["ts2"] = dfs["ts"].apply(lambda t: t / 1e9) 307 | self.assertEqual(dfs.shape[1], 9) 308 | self.assertGreater(dfs.shape[0], 2) 309 | it = StreamingDataFrame.read_json(data) 310 | it["ts2"] = it["ts"].apply(lambda t: t / 1e9) 311 | h1 = it.to_df() 312 | h2 = it.to_df() 313 | self.assertNotEmpty(h1) 314 | self.assertNotEmpty(h2) 315 | self.assertEqualDataFrame(h1, h2) 316 | self.assertEqual(h1.shape[1], 9) 317 | 318 | def test_read_json_classic_file(self): 319 | data = self.abs_path_join(__file__, "data", "classic.json") 320 | dfs = pandas.read_json(data, orient="records") 321 | self.assertEqual(dfs.shape[1], 8) 322 | self.assertGreater(dfs.shape[0], 2) 323 | with open(data, "r", encoding="utf-8") as f: 324 | it = StreamingDataFrame.read_json(f, orient="records") 325 | h1 = it.to_df() 326 | h2 = it.to_df() 327 | self.assertNotEmpty(h1) 328 | self.assertNotEmpty(h2) 329 | self.assertEqualDataFrame(h1, h2) 330 | self.assertEqual(h1.shape[1], 8) 331 | 332 | def test_read_json_classic_file_formula(self): 333 | data = self.abs_path_join(__file__, "data", "classic.json") 334 | dfs = pandas.read_json(data, orient="records") 335 | dfs["ts2"] = dfs["ts"].apply(lambda t: t / 1e9) 336 | self.assertEqual(dfs.shape[1], 9) 337 | self.assertGreater(dfs.shape[0], 2) 338 | with open(data, "r", encoding="utf-8") as f: 339 | it = StreamingDataFrame.read_json(f) 340 | it["ts2"] = it["ts"].apply(lambda t: t / 1e9) 341 | h1 = it.to_df() 342 | h2 = it.to_df() 343 | self.assertNotEmpty(h1) 344 | self.assertNotEmpty(h2) 345 | self.assertEqualDataFrame(h1, h2) 346 | self.assertEqual(h1.shape[1], 9) 347 | 348 | 349 | if __name__ == "__main__": 350 | unittest.main() 351 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_dataframe_sort.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import unittest 4 | import pandas 5 | from pandas_streaming.ext_test_case import ExtTestCase 6 | from pandas_streaming.df import StreamingDataFrame 7 | 8 | 9 | class TestDataFrameSort(ExtTestCase): 10 | def test_sort_values(self): 11 | with tempfile.TemporaryDirectory() as temp: 12 | name = os.path.join(temp, "_data_") 13 | df = pandas.DataFrame( 14 | [ 15 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1), 16 | dict(a=5, b="f", c=5.7, ind="a2", ai=2), 17 | dict(a=4, b="g", ind="a3", ai=3), 18 | dict(a=8, b="h", c=5.9, ai=4), 19 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 20 | ] 21 | ) 22 | sdf = StreamingDataFrame.read_df(df, chunksize=2) 23 | sorted_df = df.sort_values(by="a") 24 | res = sdf.sort_values(by="a", temp_file=name) 25 | res_df = res.to_df() 26 | self.assertEqualDataFrame(sorted_df, res_df) 27 | 28 | def test_sort_values_twice(self): 29 | with tempfile.TemporaryDirectory() as temp: 30 | name = os.path.join(temp, "_data_") 31 | df = pandas.DataFrame( 32 | [ 33 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1), 34 | dict(a=5, b="f", c=5.7, ind="a2", ai=2), 35 | dict(a=4, b="g", ind="a3", ai=3), 36 | dict(a=8, b="h", c=5.9, ai=4), 37 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 38 | ] 39 | ) 40 | sdf = StreamingDataFrame.read_df(df, chunksize=2) 41 | sorted_df = df.sort_values(by="a") 42 | res = sdf.sort_values(by="a", temp_file=name) 43 | res_df = res.to_df() 44 | 
self.assertEqualDataFrame(sorted_df, res_df) 45 | res_df = res.to_df() 46 | self.assertEqualDataFrame(sorted_df, res_df) 47 | 48 | def test_sort_values_reverse(self): 49 | with tempfile.TemporaryDirectory() as temp: 50 | name = os.path.join(temp, "_data_") 51 | df = pandas.DataFrame( 52 | [ 53 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1), 54 | dict(a=5, b="f", c=5.7, ind="a2", ai=2), 55 | dict(a=4, b="g", ind="a3", ai=3), 56 | dict(a=8, b="h", c=5.9, ai=4), 57 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 58 | ] 59 | ) 60 | sdf = StreamingDataFrame.read_df(df, chunksize=2) 61 | sorted_df = df.sort_values(by="a", ascending=False) 62 | res = sdf.sort_values(by="a", temp_file=name, ascending=False) 63 | res_df = res.to_df() 64 | self.assertEqualDataFrame(sorted_df, res_df) 65 | 66 | def test_sort_values_nan_last(self): 67 | with tempfile.TemporaryDirectory() as temp: 68 | name = os.path.join(temp, "_data_") 69 | df = pandas.DataFrame( 70 | [ 71 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1), 72 | dict(b="f", c=5.7, ind="a2", ai=2), 73 | dict(b="f", c=5.8, ind="a2", ai=2), 74 | dict(a=4, b="g", ind="a3", ai=3), 75 | dict(a=8, b="h", c=5.9, ai=4), 76 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 77 | ] 78 | ) 79 | sdf = StreamingDataFrame.read_df(df, chunksize=2) 80 | sorted_df = df.sort_values(by="a", na_position="last") 81 | res = sdf.sort_values(by="a", temp_file=name, na_position="last") 82 | res_df = res.to_df() 83 | self.assertEqualDataFrame(sorted_df, res_df) 84 | 85 | def test_sort_values_nan_first(self): 86 | with tempfile.TemporaryDirectory() as temp: 87 | name = os.path.join(temp, "_data_") 88 | df = pandas.DataFrame( 89 | [ 90 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1), 91 | dict(b="f", c=5.7, ind="a2", ai=2), 92 | dict(b="f", c=5.8, ind="a2", ai=2), 93 | dict(a=4, b="g", ind="a3", ai=3), 94 | dict(a=8, b="h", c=5.9, ai=4), 95 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 96 | ] 97 | ) 98 | sdf = StreamingDataFrame.read_df(df, chunksize=2) 99 | sorted_df = df.sort_values(by="a", na_position="first") 100 | res = sdf.sort_values(by="a", temp_file=name, na_position="first") 101 | res_df = res.to_df() 102 | self.assertEqualDataFrame(sorted_df, res_df) 103 | 104 | 105 | if __name__ == "__main__": 106 | unittest.main() 107 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_pandas_groupbynan.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas 3 | import numpy 4 | from scipy.sparse.linalg import lsqr as sparse_lsqr 5 | from pandas_streaming.ext_test_case import ExtTestCase, ignore_warnings 6 | from pandas_streaming.df import pandas_groupby_nan, numpy_types 7 | 8 | 9 | class TestPandasHelper(ExtTestCase): 10 | def test_pandas_groupbynan(self): 11 | self.assertTrue(sparse_lsqr is not None) 12 | types = [(int, -10), (float, -20.2), (str, "e"), (bytes, bytes("a", "ascii"))] 13 | skip = (numpy.bool_, numpy.complex64, numpy.complex128) 14 | types += [(_, _(5)) for _ in numpy_types() if _ not in skip] 15 | 16 | for ty in types: 17 | data = [ 18 | {"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]}, 19 | {"this": "cst", "type": "tt2=" + str(ty[0]), "value": ty[1]}, 20 | {"this": "cst", "type": "row_for_nan"}, 21 | ] 22 | df = pandas.DataFrame(data) 23 | gr = pandas_groupby_nan(df, "value") 24 | co = gr.sum() 25 | li = list(co["value"]) 26 | try: 27 | self.assertIsInstance(li[-1], float) 28 | except AssertionError as e: 29 | raise AssertionError(f"Issue with {ty}") from e 30 | 
try: 31 | self.assertTrue(numpy.isnan(li[-1])) 32 | except AssertionError as e: 33 | raise AssertionError( 34 | "Issue with value {}\n--df--\n{}\n--gr--\n{}\n--co--\n{}".format( 35 | li, df, gr.count(), co 36 | ) 37 | ) from e 38 | 39 | for ty in types: 40 | data = [ 41 | {"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]}, 42 | {"this": "cst", "type": "tt2=" + str(ty[0]), "value": ty[1]}, 43 | {"this": "cst", "type": "row_for_nan"}, 44 | ] 45 | df = pandas.DataFrame(data) 46 | try: 47 | gr = pandas_groupby_nan(df, ("value", "this")) 48 | t = True 49 | raise AssertionError("---") 50 | except (TypeError, KeyError): 51 | t = False 52 | if t: 53 | co = gr.sum() 54 | li = list(co["value"]) 55 | self.assertIsInstance(li[-1], float) 56 | self.assertTrue(numpy.isnan(li[-1])) 57 | try: 58 | gr = pandas_groupby_nan(df, ["value", "this"]) 59 | t = True 60 | except (TypeError, NotImplementedError): 61 | t = False 62 | 63 | if t: 64 | co = gr.sum() 65 | li = list(co["value"]) 66 | self.assertEqual(len(li), 2) 67 | 68 | def test_pandas_groupbynan_tuple(self): 69 | data = [ 70 | dict(a="a", b="b", c="c", n=1), 71 | dict(b="b", n=2), 72 | dict(a="a", n=3), 73 | dict(c="c", n=4), 74 | ] 75 | df = pandas.DataFrame(data) 76 | gr = df.groupby(["a", "b", "c"]).sum() 77 | self.assertEqual(gr.shape, (1, 1)) 78 | 79 | for nanback in [True, False]: 80 | try: 81 | gr2_ = pandas_groupby_nan( 82 | df, ["a", "b", "c"], nanback=nanback, suffix="NAN" 83 | ) 84 | except NotImplementedError: 85 | continue 86 | gr2 = gr2_.sum().sort_values("n") 87 | self.assertEqual(gr2.shape, (4, 4)) 88 | d = gr2.to_dict("records") 89 | self.assertEqual(d[0]["a"], "a") 90 | self.assertEqual(d[0]["b"], "b") 91 | self.assertEqual(d[0]["c"], "c") 92 | self.assertEqual(d[0]["n"], 1) 93 | self.assertEqual(d[1]["a"], "NAN") 94 | 95 | def test_pandas_groupbynan_regular(self): 96 | df = pandas.DataFrame([dict(a="a", b=1), dict(a="a", b=2)]) 97 | gr = df.groupby(["a"], as_index=False).sum() 98 | gr2_ = pandas_groupby_nan(df, ["a"]).sum() 99 | self.assertEqualDataFrame(gr, gr2_) 100 | 101 | def test_pandas_groupbynan_regular_nanback(self): 102 | df = pandas.DataFrame([dict(a="a", b=1, cc=0), dict(a="a", b=2)]) 103 | gr = df.groupby(["a", "cc"]).sum() 104 | self.assertEqual(len(gr), 1) 105 | 106 | def test_pandas_groupbynan_doc(self): 107 | data = [ 108 | dict(a=2, ind="a", n=1), 109 | dict(a=2, ind="a"), 110 | dict(a=3, ind="b"), 111 | dict(a=30), 112 | ] 113 | df = pandas.DataFrame(data) 114 | gr2 = pandas_groupby_nan(df, ["ind"]).sum() 115 | ind = list(gr2["ind"]) 116 | self.assertTrue(numpy.isnan(ind[-1])) 117 | val = list(gr2["a"]) 118 | self.assertEqual(val[-1], 30) 119 | 120 | @ignore_warnings(UserWarning) 121 | def test_pandas_groupbynan_doc2(self): 122 | data = [ 123 | dict(a=2, ind="a", n=1), 124 | dict(a=2, ind="a"), 125 | dict(a=3, ind="b"), 126 | dict(a=30), 127 | ] 128 | df = pandas.DataFrame(data) 129 | gr2 = pandas_groupby_nan(df, ["ind", "a"], nanback=False).sum() 130 | ind = list(gr2["ind"]) 131 | self.assertEqual(ind[-1], "²nan") 132 | 133 | def test_pandas_groupbynan_doc3(self): 134 | data = [ 135 | dict(a=2, ind="a", n=1), 136 | dict(a=2, ind="a"), 137 | dict(a=3, ind="b"), 138 | dict(a=30), 139 | ] 140 | df = pandas.DataFrame(data) 141 | gr2 = pandas_groupby_nan(df, ["ind", "n"]).sum() 142 | ind = list(gr2["ind"]) 143 | self.assertTrue(numpy.isnan(ind[-1])) 144 | 145 | 146 | if __name__ == "__main__": 147 | unittest.main() 148 | -------------------------------------------------------------------------------- 
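[Editor's aside — illustrative sketch, not a file of the repository] The tests above exercise pandas_groupby_nan which, unlike pandas' own groupby, keeps rows whose group key is NaN as a group of their own. A minimal usage sketch, assuming the package is installed and relying only on the call pattern the tests themselves show (pandas_groupby_nan(df, keys).sum()):

    import pandas
    from pandas_streaming.df import pandas_groupby_nan

    # three rows, the last one has no value for "ind" (its group key is NaN)
    df = pandas.DataFrame([dict(a=2, ind="a"), dict(a=3, ind="b"), dict(a=30)])

    dropped = df.groupby(["ind"]).sum()           # pandas drops the NaN key: 2 groups
    kept = pandas_groupby_nan(df, ["ind"]).sum()  # the NaN key is kept: 3 groups
    print(dropped.shape[0], kept.shape[0])        # expected to print: 2 3

The tests above also pass optional parameters such as nanback and suffix, which appear to control how the NaN placeholder is represented in the result; see test_pandas_groupbynan_tuple and test_pandas_groupbynan_doc2 for the behaviour actually asserted.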
/_unittests/ut_df/test_streaming_dataframe.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import unittest 4 | from io import StringIO 5 | import pandas 6 | import numpy 7 | from pandas_streaming.ext_test_case import ExtTestCase, ignore_warnings 8 | from pandas_streaming.data import dummy_streaming_dataframe 9 | from pandas_streaming.df import StreamingDataFrame 10 | from pandas_streaming.df.dataframe import StreamingDataFrameSchemaError 11 | 12 | 13 | class TestStreamingDataFrame(ExtTestCase): 14 | def test_shape(self): 15 | sdf = dummy_streaming_dataframe(100) 16 | dfs = list(sdf) 17 | self.assertEqual(len(dfs), 10) 18 | self.assertEqual(len(dfs), 10) 19 | shape = sdf.shape 20 | self.assertEqual(shape, (100, 2)) 21 | 22 | def test_init(self): 23 | sdf = dummy_streaming_dataframe(100) 24 | df1 = sdf.to_df() 25 | sdf2 = StreamingDataFrame(sdf) 26 | df2 = sdf2.to_df() 27 | self.assertEqualDataFrame(df1, df2) 28 | 29 | def test_to_csv(self): 30 | sdf = dummy_streaming_dataframe(100) 31 | st = sdf.to_csv() 32 | self.assertStartsWith(",cint,cstr\n0,0,s0", st.replace("\r", "")) 33 | st = sdf.to_csv() 34 | self.assertStartsWith(",cint,cstr\n0,0,s0", st.replace("\r", "")) 35 | 36 | def test_iterrows(self): 37 | sdf = dummy_streaming_dataframe(100) 38 | rows = list(sdf.iterrows()) 39 | self.assertEqual(sdf.shape[0], len(rows)) 40 | rows = list(sdf.iterrows()) 41 | self.assertEqual(sdf.shape[0], len(rows)) 42 | 43 | def test_head(self): 44 | sdf = dummy_streaming_dataframe(100) 45 | st = sdf.head() 46 | self.assertEqual(st.shape, (5, 2)) 47 | st = sdf.head(n=20) 48 | self.assertEqual(st.shape, (20, 2)) 49 | st = sdf.head(n=20) 50 | self.assertEqual(st.shape, (20, 2)) 51 | 52 | def test_tail(self): 53 | sdf = dummy_streaming_dataframe(100) 54 | st = sdf.tail() 55 | self.assertEqual(st.shape, (5, 2)) 56 | st = sdf.tail(n=20) 57 | self.assertEqual(st.shape, (10, 2)) 58 | 59 | def test_read_csv(self): 60 | with tempfile.TemporaryDirectory() as temp: 61 | df = pandas.DataFrame(data=dict(a=[5, 6], b=["er", "r"])) 62 | name = os.path.join(temp, "df.csv") 63 | name2 = os.path.join(temp, "df2.csv") 64 | name3 = os.path.join(temp, "df3.csv") 65 | df.to_csv(name, index=False) 66 | df.to_csv(name2, index=True) 67 | sdf = StreamingDataFrame.read_csv(name) 68 | text = sdf.to_csv(index=False) 69 | self.assertRaise( 70 | lambda: StreamingDataFrame.read_csv(name2, index_col=0, chunksize=None), 71 | ValueError, 72 | ) 73 | self.assertRaise( 74 | lambda: StreamingDataFrame.read_csv(name2, index_col=0, iterator=False), 75 | ValueError, 76 | ) 77 | sdf2 = StreamingDataFrame.read_csv(name2, index_col=0) 78 | text2 = sdf2.to_csv(index=True) 79 | sdf2.to_csv(name3, index=True) 80 | with open(name, "r", encoding="utf-8") as f: 81 | exp = f.read() 82 | with open(name2, "r", encoding="utf-8") as f: 83 | exp2 = f.read() 84 | with open(name3, "r", encoding="utf-8") as f: 85 | text3 = f.read() 86 | self.assertEqual(text.replace("\r", ""), exp) 87 | sdf2 = StreamingDataFrame.read_df(df) 88 | self.assertEqualDataFrame(sdf.to_dataframe(), sdf2.to_dataframe()) 89 | self.assertEqual(text2.replace("\r", ""), exp2) 90 | self.assertEqual( 91 | text3.replace("\r", "").replace("\n\n", "\n"), exp2.replace("\r", "") 92 | ) 93 | 94 | def test_where(self): 95 | sdf = dummy_streaming_dataframe(100) 96 | cols = sdf.columns 97 | self.assertEqual(list(cols), ["cint", "cstr"]) 98 | dts = sdf.dtypes 99 | self.assertEqual(len(dts), 2) 100 | res = sdf.where(lambda row: 
row["cint"] == 1) 101 | st = res.to_csv() 102 | self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", st.replace("\r", "")) 103 | res = sdf.where(lambda row: row["cint"] == 1) 104 | st = res.to_csv() 105 | self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", st.replace("\r", "")) 106 | 107 | def test_dataframe(self): 108 | sdf = dummy_streaming_dataframe(100) 109 | df = sdf.to_dataframe() 110 | self.assertEqual(df.shape, (100, 2)) 111 | 112 | def test_sample(self): 113 | sdf = dummy_streaming_dataframe(100) 114 | res = sdf.sample(frac=0.1) 115 | self.assertLesser(res.shape[0], 30) 116 | self.assertRaise(lambda: sdf.sample(n=5), ValueError) 117 | res = sdf.sample(frac=0.1) 118 | self.assertLesser(res.shape[0], 30) 119 | self.assertRaise(lambda: sdf.sample(n=5), ValueError) 120 | 121 | def test_sample_cache(self): 122 | sdf = dummy_streaming_dataframe(100) 123 | res = sdf.sample(frac=0.1, cache=True) 124 | df1 = res.to_df() 125 | df2 = res.to_df() 126 | self.assertEqualDataFrame(df1, df2) 127 | self.assertTrue(res.is_stable(n=df1.shape[0], do_check=True)) 128 | self.assertTrue(res.is_stable(n=df1.shape[0], do_check=False)) 129 | res = sdf.sample(frac=0.1, cache=False) 130 | self.assertFalse(res.is_stable(n=df1.shape[0], do_check=False)) 131 | 132 | def test_sample_reservoir_cache(self): 133 | sdf = dummy_streaming_dataframe(100) 134 | res = sdf.sample(n=10, cache=True, reservoir=True) 135 | df1 = res.to_df() 136 | df2 = res.to_df() 137 | self.assertEqualDataFrame(df1, df2) 138 | self.assertEqual(df1.shape, (10, res.shape[1])) 139 | self.assertRaise( 140 | lambda: sdf.sample(n=10, cache=False, reservoir=True), ValueError 141 | ) 142 | self.assertRaise( 143 | lambda: sdf.sample(frac=0.1, cache=True, reservoir=True), ValueError 144 | ) 145 | 146 | def test_apply(self): 147 | sdf = dummy_streaming_dataframe(100) 148 | self.assertNotEmpty(list(sdf)) 149 | sdf = sdf.applymap(str) 150 | self.assertNotEmpty(list(sdf)) 151 | sdf = sdf.apply(lambda row: row[["cint"]] + "r", axis=1) 152 | self.assertNotEmpty(list(sdf)) 153 | text = sdf.to_csv(header=False) 154 | self.assertStartsWith("0,0r\n1,1r\n2,2r\n3,3r", text.replace("\r", "")) 155 | 156 | def test_train_test_split(self): 157 | sdf = dummy_streaming_dataframe(100) 158 | tr, te = sdf.train_test_split(index=False, streaming=False) 159 | self.assertRaise( 160 | lambda: StreamingDataFrame.read_str(tr, chunksize=None), ValueError 161 | ) 162 | self.assertRaise( 163 | lambda: StreamingDataFrame.read_str(tr, iterator=False), ValueError 164 | ) 165 | StreamingDataFrame.read_str(tr.encode("utf-8")) 166 | trsdf = StreamingDataFrame.read_str(tr) 167 | tesdf = StreamingDataFrame.read_str(te) 168 | trdf = trsdf.to_dataframe() 169 | tedf = tesdf.to_dataframe() 170 | df_exp = sdf.to_dataframe() 171 | df_val = pandas.concat([trdf, tedf]) 172 | self.assertEqual(df_exp.shape, df_val.shape) 173 | df_val = df_val.sort_values("cint").reset_index(drop=True) 174 | self.assertEqualDataFrame(df_val, df_exp) 175 | 176 | def test_train_test_split_streaming(self): 177 | sdf = dummy_streaming_dataframe(100, asfloat=True) 178 | trsdf, tesdf = sdf.train_test_split( 179 | streaming=True, unique_rows=True, partitions=[0.7, 0.3] 180 | ) 181 | trdf = trsdf.to_dataframe() 182 | tedf = tesdf.to_dataframe() 183 | df_exp = sdf.to_dataframe() 184 | df_val = pandas.concat([trdf, tedf]) 185 | self.assertEqual(df_exp.shape, df_val.shape) 186 | df_val = df_val.sort_values("cfloat").reset_index(drop=True) 187 | self.assertEqualDataFrame(df_val, df_exp) 188 | trdf2 = trsdf.to_dataframe() 189 | 
tedf2 = tesdf.to_dataframe() 190 | df_val = pandas.concat([trdf2, tedf2]) 191 | self.assertEqual(df_exp.shape, df_val.shape) 192 | df_val = df_val.sort_values("cfloat").reset_index(drop=True) 193 | self.assertEqualDataFrame(df_val, df_exp) 194 | self.assertEqual(trdf.shape, trdf2.shape) 195 | self.assertEqual(tedf.shape, tedf2.shape) 196 | self.assertGreater(trdf.shape[0], tedf.shape[0]) 197 | self.assertGreater(trdf2.shape[0], tedf2.shape[0]) 198 | 199 | def test_train_test_split_streaming_tiny(self): 200 | df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"])) 201 | 202 | sdf2 = StreamingDataFrame.read_df(pandas.concat([df, df])) 203 | sdftr, sdfte = sdf2.train_test_split(test_size=0.5) 204 | df1 = sdfte.head() 205 | df2 = sdfte.head() 206 | if df1 is not None or df2 is not None: 207 | self.assertEqualDataFrame(df1, df2) 208 | df1 = sdftr.head() 209 | df2 = sdftr.head() 210 | if df1 is not None or df2 is not None: 211 | self.assertEqualDataFrame(df1, df2) 212 | sdf = StreamingDataFrame.read_df(df) 213 | sdf2 = sdf.concat(sdf, axis=0) 214 | sdftr, sdfte = sdf2.train_test_split(test_size=0.5) 215 | df1 = sdfte.head() 216 | df2 = sdfte.head() 217 | if df1 is not None or df2 is not None: 218 | self.assertEqualDataFrame(df1, df2) 219 | df1 = sdftr.head() 220 | df2 = sdftr.head() 221 | if df1 is not None or df2 is not None: 222 | self.assertEqualDataFrame(df1, df2) 223 | 224 | def test_train_test_split_streaming_strat(self): 225 | sdf = dummy_streaming_dataframe( 226 | 100, asfloat=True, tify=["t1" if i % 3 else "t0" for i in range(100)] 227 | ) 228 | trsdf, tesdf = sdf.train_test_split( 229 | streaming=True, unique_rows=True, stratify="tify" 230 | ) 231 | trdf = trsdf.to_dataframe() 232 | tedf = tesdf.to_dataframe() 233 | df_exp = sdf.to_dataframe() 234 | df_val = pandas.concat([trdf, tedf]) 235 | self.assertEqual(df_exp.shape, df_val.shape) 236 | df_val = df_val.sort_values("cfloat").reset_index(drop=True) 237 | self.assertEqualDataFrame(df_val, df_exp) 238 | trdf = trsdf.to_dataframe() 239 | tedf = tesdf.to_dataframe() 240 | df_val = pandas.concat([trdf, tedf]) 241 | self.assertEqual(df_exp.shape, df_val.shape) 242 | df_val = df_val.sort_values("cfloat").reset_index(drop=True) 243 | self.assertEqualDataFrame(df_val, df_exp) 244 | trgr = trdf.groupby("tify").count() 245 | trgr["part"] = 0 246 | tegr = tedf.groupby("tify").count() 247 | tegr["part"] = 1 248 | gr = pandas.concat([trgr, tegr]) 249 | self.assertGreater(gr["cfloat"].min(), 4) 250 | 251 | def test_train_test_split_file(self): 252 | with tempfile.TemporaryDirectory() as temp: 253 | names = [os.path.join(temp, "train.txt"), os.path.join(temp, "test.txt")] 254 | sdf = dummy_streaming_dataframe(100) 255 | sdf.train_test_split(names, index=False, streaming=False) 256 | trsdf = StreamingDataFrame.read_csv(names[0]) 257 | tesdf = StreamingDataFrame.read_csv(names[1]) 258 | self.assertGreater(trsdf.shape[0], 20) 259 | self.assertGreater(tesdf.shape[0], 20) 260 | trdf = trsdf.to_dataframe() 261 | tedf = tesdf.to_dataframe() 262 | self.assertGreater(trdf.shape[0], 20) 263 | self.assertGreater(tedf.shape[0], 20) 264 | df_exp = sdf.to_dataframe() 265 | df_val = pandas.concat([trdf, tedf]) 266 | self.assertEqual(df_exp.shape, df_val.shape) 267 | df_val = df_val.sort_values("cint").reset_index(drop=True) 268 | self.assertEqualDataFrame(df_val, df_exp) 269 | 270 | def test_train_test_split_file_pattern(self): 271 | with tempfile.TemporaryDirectory() as temp: 272 | sdf = dummy_streaming_dataframe(100) 273 | names = os.path.join(temp, 
"spl_{0}.txt") 274 | self.assertRaise( 275 | lambda: sdf.train_test_split(names, index=False, streaming=False), 276 | ValueError, 277 | ) 278 | names = os.path.join(temp, "spl_{}.txt") 279 | tr, te = sdf.train_test_split(names, index=False, streaming=False) 280 | trsdf = StreamingDataFrame.read_csv(tr) 281 | tesdf = StreamingDataFrame.read_csv(te) 282 | trdf = trsdf.to_dataframe() 283 | tedf = tesdf.to_dataframe() 284 | df_exp = sdf.to_dataframe() 285 | df_val = pandas.concat([trdf, tedf]) 286 | self.assertEqual(df_exp.shape, df_val.shape) 287 | df_val = df_val.sort_values("cint").reset_index(drop=True) 288 | self.assertEqualDataFrame(df_val, df_exp) 289 | 290 | def test_merge(self): 291 | def compares(a, b, how): 292 | m = a.merge(b, on="cint", indicator=True) 293 | dm = m.to_dataframe() 294 | da = a.to_dataframe() 295 | db = b.to_dataframe() 296 | exp = da.merge(db, on="cint", indicator=True) 297 | self.assertEqualDataFrame( 298 | dm.reset_index(drop=True), exp.reset_index(drop=True) 299 | ) 300 | 301 | sdf20 = dummy_streaming_dataframe(20) 302 | sdf30 = dummy_streaming_dataframe(30) 303 | # itself 304 | hows = "inner left right outer".split() 305 | for how in hows: 306 | compares(sdf20, sdf20, how) 307 | compares(sdf20, sdf20, how) 308 | for how in hows: 309 | compares(sdf20, sdf30, how) 310 | compares(sdf20, sdf30, how) 311 | for how in hows: 312 | compares(sdf30, sdf20, how) 313 | compares(sdf30, sdf20, how) 314 | sdf20.merge(sdf20.to_dataframe(), on="cint", indicator=True) 315 | 316 | def test_concatv(self): 317 | sdf20 = dummy_streaming_dataframe(20) 318 | sdf30 = dummy_streaming_dataframe(30) 319 | df20 = sdf20.to_dataframe() 320 | df30 = sdf30.to_dataframe() 321 | df = pandas.concat([df20, df30], axis=0) 322 | 323 | m1 = sdf20.concat(sdf30, axis=0) 324 | self.assertEqualDataFrame(m1.to_dataframe(), df) 325 | m1 = sdf20.concat(df30, axis=0) 326 | self.assertEqualDataFrame(m1.to_dataframe(), df) 327 | m1 = sdf20.concat(map(lambda x: x, [df30]), axis=0) # noqa: C417 328 | self.assertEqualDataFrame(m1.to_dataframe(), df) 329 | m1 = sdf20.concat(map(lambda x: x, [df30]), axis=0) # noqa: C417 330 | self.assertEqualDataFrame(m1.to_dataframe(), df) 331 | 332 | df20["cint"] = df20["cint"].astype(float) 333 | self.assertRaise( 334 | lambda: sdf20.concat(df20).to_dataframe(), 335 | ValueError, 336 | "Frame others[0] do not have the same column types", 337 | ) 338 | df30["g"] = 4 339 | self.assertRaise( 340 | lambda: sdf20.concat(df30).to_dataframe(), 341 | ValueError, 342 | "Frame others[0] do not have the same column names", 343 | ) 344 | 345 | def test_concath(self): 346 | sdf20 = dummy_streaming_dataframe(20) 347 | sdf30 = dummy_streaming_dataframe(20) 348 | df20 = sdf20.to_dataframe() 349 | df30 = sdf30.to_dataframe() 350 | df = pandas.concat([df20, df30], axis=1) 351 | 352 | m1 = sdf20.concat(sdf30, axis=1) 353 | self.assertEqualDataFrame(m1.to_dataframe(), df) 354 | sdf22 = dummy_streaming_dataframe(22) 355 | sdf25 = dummy_streaming_dataframe(25) 356 | self.assertRaise( 357 | lambda: sdf22.concat(sdf25, axis=1).to_dataframe(), RuntimeError 358 | ) 359 | 360 | def test_groupby(self): 361 | df20 = dummy_streaming_dataframe(20).to_dataframe() 362 | df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) 363 | sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) 364 | gr = sdf20.groupby("key", lambda gr: gr.sum()) 365 | gr2 = df20.groupby("key").sum() 366 | self.assertEqualDataFrame(gr, gr2) 367 | self.assertRaise( 368 | lambda: sdf20.groupby("key", in_memory=False), NotImplementedError 
369 | ) 370 | 371 | # Do not replace lambda c:sum(c) by sum or... 372 | # pandas.core.base.SpecificationError: Function names 373 | # must be unique, found multiple named sum 374 | gr2 = ( 375 | df20.drop("cstr", axis=1).groupby("key").agg([numpy.sum, lambda c: sum(c)]) 376 | ) 377 | gr = sdf20.drop("cstr", axis=1).groupby( 378 | "key", lambda gr: gr.agg([numpy.sum, lambda c: sum(c)]) 379 | ) 380 | self.assertEqualDataFrame(gr, gr2) 381 | 382 | gr = sdf20.groupby("key", lambda gr: gr.count()) 383 | gr2 = df20.groupby("key").count() 384 | self.assertEqualDataFrame(gr, gr2) 385 | 386 | df = pandas.DataFrame(dict(A=[3, 4, 3], B=[5, 6, 7])) 387 | sdf = StreamingDataFrame.read_df(df) 388 | gr = sdf.groupby("A") 389 | gr2 = df.groupby("A").sum() 390 | self.assertEqualDataFrame(gr, gr2) 391 | 392 | def test_groupby_cum(self): 393 | df20 = dummy_streaming_dataframe(20).to_dataframe() 394 | df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) 395 | sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) 396 | sgr = sdf20.groupby_streaming( 397 | "key", lambda gr: gr.sum(), strategy="cum", as_index=False 398 | ) 399 | gr2 = df20.groupby("key", as_index=False).sum() 400 | lastgr = None 401 | for gr in sgr: 402 | self.assertEqual(list(gr.columns), list(gr2.columns)) 403 | lastgr = gr 404 | self.assertEqualDataFrame(lastgr, gr2) 405 | 406 | def test_groupby_streaming(self): 407 | df20 = dummy_streaming_dataframe(20).to_dataframe() 408 | df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) 409 | sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) 410 | sgr = sdf20.groupby_streaming( 411 | "key", lambda gr: gr.sum(), strategy="streaming", as_index=False 412 | ) 413 | gr2 = df20.groupby("key", as_index=False).sum() 414 | grs = list(sgr) 415 | gr = pandas.concat(grs).groupby("key", as_index=False).sum() 416 | self.assertEqualDataFrame(gr, gr2) 417 | 418 | def test_groupby_cum_asindex(self): 419 | df20 = dummy_streaming_dataframe(20).to_dataframe() 420 | df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) 421 | sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) 422 | sgr = sdf20.groupby_streaming( 423 | "key", lambda gr: gr.sum(), strategy="cum", as_index=True 424 | ) 425 | gr2 = df20.groupby("key", as_index=True).sum() 426 | lastgr = None 427 | for gr in sgr: 428 | self.assertEqual(list(gr.columns), list(gr2.columns)) 429 | lastgr = gr 430 | self.assertEqualDataFrame(lastgr, gr2) 431 | 432 | def test_merge_2(self): 433 | df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"])) 434 | df2 = pandas.concat([df, df]) 435 | sdf = StreamingDataFrame.read_df(df) 436 | sdf2 = sdf.concat(sdf, axis=0) 437 | self.assertEqualDataFrame(df2, sdf2.to_dataframe()) 438 | self.assertEqualDataFrame(df2, sdf2.to_dataframe()) 439 | m = pandas.DataFrame(dict(Y=["a", "b"], Z=[10, 20])) 440 | jm = df2.merge(m, left_on="Y", right_on="Y", how="outer") 441 | sjm = sdf2.merge(m, left_on="Y", right_on="Y", how="outer") 442 | self.assertEqualDataFrame( 443 | jm.sort_values(["X", "Y"]).reset_index(drop=True), 444 | sjm.to_dataframe().sort_values(["X", "Y"]).reset_index(drop=True), 445 | ) 446 | 447 | @ignore_warnings(ResourceWarning) 448 | def test_schema_consistent(self): 449 | df = pandas.DataFrame( 450 | [ 451 | dict(cf=0, cint=0, cstr="0"), 452 | dict(cf=1, cint=1, cstr="1"), 453 | dict(cf=2, cint="s2", cstr="2"), 454 | dict(cf=3, cint=3, cstr="3"), 455 | ] 456 | ) 457 | with tempfile.TemporaryDirectory() as temp: 458 | name = os.path.join(temp, "df.csv") 459 | stio = StringIO() 460 | df.to_csv(stio, index=False) 
461 | self.assertNotEmpty(stio.getvalue()) 462 | df.to_csv(name, index=False) 463 | self.assertEqual(df.shape, (4, 3)) 464 | sdf = StreamingDataFrame.read_csv(name, chunksize=2) 465 | self.assertRaise(lambda: list(sdf), StreamingDataFrameSchemaError) 466 | sdf = StreamingDataFrame.read_csv(name, chunksize=2, check_schema=False) 467 | pieces = list(sdf) 468 | self.assertEqual(len(pieces), 2) 469 | 470 | def test_getitem(self): 471 | sdf = dummy_streaming_dataframe(100) 472 | sdf2 = sdf[["cint"]] 473 | self.assertEqual(sdf2.shape, (100, 1)) 474 | df1 = sdf.to_df() 475 | df2 = sdf2.to_df() 476 | self.assertEqualDataFrame(df1[["cint"]], df2) 477 | self.assertRaise(lambda: sdf[:, "cint"], NotImplementedError) 478 | 479 | @ignore_warnings(ResourceWarning) 480 | def test_read_csv_names(self): 481 | this = os.path.abspath(os.path.dirname(__file__)) 482 | data = os.path.join(this, "data", "buggy_hash2.csv") 483 | df = pandas.read_csv(data, sep="\t", names=["A", "B", "C"], header=None) 484 | sdf = StreamingDataFrame.read_csv( 485 | data, sep="\t", names=["A", "B", "C"], chunksize=2, header=None 486 | ) 487 | head = sdf.head(n=1) 488 | self.assertEqualDataFrame(df.head(n=1), head) 489 | 490 | def test_add_column(self): 491 | df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"])) 492 | sdf = StreamingDataFrame.read_df(df) 493 | sdf2 = sdf.add_column("d", lambda _row: 1) 494 | df2 = sdf2.to_dataframe() 495 | df["d"] = 1 496 | self.assertEqualDataFrame(df, df2) 497 | 498 | sdf3 = StreamingDataFrame.read_df(df) 499 | sdf4 = sdf3.add_column("dd", 2) 500 | df4 = sdf4.to_dataframe() 501 | df["dd"] = 2 502 | self.assertEqualDataFrame(df, df4) 503 | 504 | sdfA = StreamingDataFrame.read_df(df) 505 | sdfB = sdfA.add_column("dd12", lambda row: row["dd"] + 10) 506 | dfB = sdfB.to_dataframe() 507 | df["dd12"] = 12 508 | self.assertEqualDataFrame(df, dfB) 509 | 510 | def test_fillna(self): 511 | df = pandas.DataFrame(data=dict(X=[4.5, numpy.nan, 7], Y=["a", "b", numpy.nan])) 512 | sdf = StreamingDataFrame.read_df(df) 513 | 514 | df2 = pandas.DataFrame(data=dict(X=[4.5, 10.0, 7], Y=["a", "b", "NAN"])) 515 | na = sdf.fillna(value=dict(X=10.0, Y="NAN")) 516 | ndf = na.to_df() 517 | self.assertEqualDataFrame(ndf, df2) 518 | 519 | df3 = pandas.DataFrame(data=dict(X=[4.5, 10.0, 7], Y=["a", "b", numpy.nan])) 520 | na = sdf.fillna(value=dict(X=10.0)) 521 | ndf = na.to_df() 522 | self.assertEqualDataFrame(ndf, df3) 523 | 524 | def test_describe(self): 525 | x = numpy.arange(100001).astype(numpy.float64) / 100000 - 0.5 526 | y = numpy.arange(100001).astype(numpy.int64) 527 | z = numpy.array([chr(65 + j % 45) for j in y]) 528 | df = pandas.DataFrame(data=dict(X=x, Y=y, Z=z)) 529 | sdf = StreamingDataFrame.read_df(df) 530 | 531 | desc = sdf.describe() 532 | self.assertEqual(["X", "Y"], list(desc.columns)) 533 | self.assertEqual(desc.loc["min", :].tolist(), [-0.5, 0]) 534 | self.assertEqual(desc.loc["max", :].tolist(), [0.5, 100000]) 535 | self.assertEqualArray( 536 | desc.loc["mean", :], numpy.array([0, 50000], dtype=numpy.float64), atol=1e-8 537 | ) 538 | self.assertEqualArray(desc.loc["25%", :], numpy.array([-0.25, 25000])) 539 | self.assertEqualArray(desc.loc["50%", :], numpy.array([0.0, 50000])) 540 | self.assertEqualArray(desc.loc["75%", :], numpy.array([0.25, 75000])) 541 | self.assertEqualArray( 542 | desc.loc["std", :], numpy.array([2.886795e-01, 28867.946472]), atol=1e-4 543 | ) 544 | 545 | def test_set_item(self): 546 | df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7])) 547 | self.assertRaise(lambda: 
StreamingDataFrame(df), TypeError) 548 | sdf = StreamingDataFrame.read_df(df) 549 | 550 | def f(): 551 | sdf[["a"]] = 10 552 | 553 | self.assertRaise(f, ValueError) 554 | 555 | def g(): 556 | sdf["a"] = [10] 557 | 558 | self.assertRaise(g, NotImplementedError) 559 | 560 | sdf["aa"] = 10 561 | df = sdf.to_df() 562 | ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], aa=[10])) 563 | self.assertEqualDataFrame(df, ddf) 564 | sdf["bb"] = sdf["b"] + 10 565 | df = sdf.to_df() 566 | ddf = ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], aa=[10], bb=[16])) 567 | self.assertEqualDataFrame(df, ddf) 568 | 569 | def test_set_item_function(self): 570 | df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7])) 571 | self.assertRaise(lambda: StreamingDataFrame(df), TypeError) 572 | sdf = StreamingDataFrame.read_df(df) 573 | sdf["bb"] = sdf["b"].apply(lambda x: x + 11) 574 | df = sdf.to_df() 575 | ddf = ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], bb=[17])) 576 | self.assertEqualDataFrame(df, ddf) 577 | 578 | 579 | if __name__ == "__main__": 580 | unittest.main(verbosity=2) 581 | -------------------------------------------------------------------------------- /_unittests/ut_module/test_sklearn.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy 3 | import pandas 4 | from sklearn.linear_model import LogisticRegression 5 | from pandas_streaming.ext_test_case import ExtTestCase 6 | 7 | 8 | class TestScikitLearn(ExtTestCase): 9 | def test_logistic_regression_check(self): 10 | X = pandas.DataFrame(numpy.array([[0.1, 0.2], [-0.2, 0.3]])) 11 | Y = numpy.array([0, 1]) 12 | clq = LogisticRegression( 13 | fit_intercept=False, solver="liblinear", random_state=42 14 | ) 15 | clq.fit(X, Y) 16 | pred2 = clq.predict(X) 17 | self.assertEqualArray(numpy.array([0, 1]), pred2) 18 | 19 | 20 | if __name__ == "__main__": 21 | unittest.main() 22 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | image: 2 | - Visual Studio 2019 3 | environment: 4 | matrix: 5 | - PYTHON: "C:\\Python310-x64" 6 | PYTHON_VERSION: "3.10.x" 7 | PYTHON_ARCH: "64" 8 | init: 9 | - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%" 10 | 11 | install: 12 | - "%PYTHON%\\python -m pip install --upgrade pip" 13 | - "%PYTHON%\\Scripts\\pip install -r requirements-dev.txt" 14 | build: off 15 | 16 | before_test: 17 | - "%PYTHON%\\python -u setup.py build_ext --inplace" 18 | 19 | test_script: 20 | - "%PYTHON%\\python -u setup.py unittests" 21 | 22 | after_test: 23 | - "%PYTHON%\\python -u setup.py bdist_wheel" 24 | 25 | artifacts: 26 | - path: dist 27 | name: pandas_streaming 28 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | jobs: 2 | - job: 'TestLinuxWheelPip' 3 | pool: 4 | vmImage: 'ubuntu-latest' 5 | strategy: 6 | matrix: 7 | Python311-Linux: 8 | python.version: '3.11' 9 | maxParallel: 3 10 | 11 | steps: 12 | - task: UsePythonVersion@0 13 | inputs: 14 | versionSpec: '$(python.version)' 15 | architecture: 'x64' 16 | - script: sudo apt-get update 17 | displayName: 'AptGet Update' 18 | - script: sudo apt-get install -y graphviz 19 | displayName: 'Install Graphviz' 20 | - script: python -m pip install --upgrade pip setuptools wheel 21 | displayName: 'Install tools' 22 | - script: pip install -r 
requirements.txt 23 | displayName: 'Install Requirements' 24 | - script: pip install -r requirements-dev.txt 25 | displayName: 'Install Requirements dev' 26 | - script: | 27 | ruff check . 28 | displayName: 'Ruff' 29 | - script: | 30 | black --diff . 31 | displayName: 'Black' 32 | - script: | 33 | python -m pip wheel . --wheel-dir dist -v -v -v 34 | displayName: 'build wheel' 35 | - script: | 36 | python -m pip install . -v -v -v 37 | displayName: 'install wheel' 38 | - script: | 39 | python -m pytest 40 | displayName: 'Runs Unit Tests' 41 | - task: PublishPipelineArtifact@0 42 | inputs: 43 | artifactName: 'wheel-linux-wheel-$(python.version)' 44 | targetPath: 'dist' 45 | 46 | - job: 'TestLinuxNightly' 47 | pool: 48 | vmImage: 'ubuntu-latest' 49 | strategy: 50 | matrix: 51 | Python311-Linux: 52 | python.version: '3.11' 53 | maxParallel: 3 54 | 55 | steps: 56 | - task: UsePythonVersion@0 57 | inputs: 58 | versionSpec: '$(python.version)' 59 | architecture: 'x64' 60 | - script: sudo apt-get update 61 | displayName: 'AptGet Update' 62 | - script: sudo apt-get install -y pandoc 63 | displayName: 'Install Pandoc' 64 | - script: sudo apt-get install -y inkscape 65 | displayName: 'Install Inkscape' 66 | - script: sudo apt-get install -y graphviz 67 | displayName: 'Install Graphviz' 68 | - script: python -m pip install --upgrade pip setuptools wheel 69 | displayName: 'Install tools' 70 | - script: pip install -r requirements.txt 71 | displayName: 'Install Requirements' 72 | - script: pip install -r requirements-dev.txt 73 | displayName: 'Install Requirements dev' 74 | - script: pip uninstall -y scikit-learn 75 | displayName: 'Uninstall scikit-learn' 76 | - script: pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn 77 | displayName: 'Install scikit-learn nightly' 78 | - script: | 79 | ruff check . 80 | displayName: 'Ruff' 81 | - script: | 82 | black --diff . 83 | displayName: 'Black' 84 | - script: | 85 | python -m pytest 86 | displayName: 'Runs Unit Tests' 87 | 88 | - job: 'TestLinux' 89 | pool: 90 | vmImage: 'ubuntu-latest' 91 | strategy: 92 | matrix: 93 | Python311-Linux: 94 | python.version: '3.11' 95 | maxParallel: 3 96 | 97 | steps: 98 | - task: UsePythonVersion@0 99 | inputs: 100 | versionSpec: '$(python.version)' 101 | architecture: 'x64' 102 | - script: sudo apt-get update 103 | displayName: 'AptGet Update' 104 | - script: sudo apt-get install -y pandoc 105 | displayName: 'Install Pandoc' 106 | - script: sudo apt-get install -y inkscape 107 | displayName: 'Install Inkscape' 108 | - script: sudo apt-get install -y graphviz 109 | displayName: 'Install Graphviz' 110 | - script: python -m pip install --upgrade pip setuptools wheel 111 | displayName: 'Install tools' 112 | - script: pip install -r requirements.txt 113 | displayName: 'Install Requirements' 114 | - script: pip install -r requirements-dev.txt 115 | displayName: 'Install Requirements dev' 116 | - script: | 117 | ruff check . 118 | displayName: 'Ruff' 119 | - script: | 120 | black --diff . 
121 | displayName: 'Black' 122 | - script: | 123 | python -m pytest --cov 124 | displayName: 'Runs Unit Tests' 125 | - script: | 126 | python -u setup.py bdist_wheel 127 | displayName: 'Build Package' 128 | #- script: | 129 | # python -m sphinx _doc dist/html 130 | # displayName: 'Builds Documentation' 131 | - task: PublishPipelineArtifact@0 132 | inputs: 133 | artifactName: 'wheel-linux-$(python.version)' 134 | targetPath: 'dist' 135 | 136 | - job: 'TestWindows' 137 | pool: 138 | vmImage: 'windows-latest' 139 | strategy: 140 | matrix: 141 | Python311-Windows: 142 | python.version: '3.11' 143 | maxParallel: 3 144 | 145 | steps: 146 | - task: UsePythonVersion@0 147 | inputs: 148 | versionSpec: '$(python.version)' 149 | architecture: 'x64' 150 | - script: python -m pip install --upgrade pip setuptools wheel 151 | displayName: 'Install tools' 152 | - script: pip install -r requirements.txt 153 | displayName: 'Install Requirements' 154 | - script: pip install -r requirements-dev.txt 155 | displayName: 'Install Requirements dev' 156 | - script: | 157 | python -m pytest 158 | displayName: 'Runs Unit Tests' 159 | - script: | 160 | python -u setup.py bdist_wheel 161 | displayName: 'Build Package' 162 | - task: PublishPipelineArtifact@0 163 | inputs: 164 | artifactName: 'wheel-windows-$(python.version)' 165 | targetPath: 'dist' 166 | 167 | - job: 'TestMac' 168 | pool: 169 | vmImage: 'macOS-latest' 170 | strategy: 171 | matrix: 172 | Python311-Mac: 173 | python.version: '3.11' 174 | maxParallel: 3 175 | 176 | steps: 177 | - task: UsePythonVersion@0 178 | inputs: 179 | versionSpec: '$(python.version)' 180 | architecture: 'x64' 181 | - script: gcc --version 182 | displayName: 'gcc version' 183 | #- script: brew upgrade 184 | # displayName: 'brew upgrade' 185 | #- script: brew update 186 | # displayName: 'brew update' 187 | - script: export 188 | displayName: 'export' 189 | - script: gcc --version 190 | displayName: 'gcc version' 191 | - script: python -m pip install --upgrade pip setuptools wheel 192 | displayName: 'Install tools' 193 | - script: pip install -r requirements.txt 194 | displayName: 'Install Requirements' 195 | - script: pip install -r requirements-dev.txt 196 | displayName: 'Install Requirements dev' 197 | - script: | 198 | python -m pytest 199 | displayName: 'Runs Unit Tests' 200 | - script: | 201 | python -u setup.py bdist_wheel 202 | displayName: 'Build Package' 203 | - task: PublishPipelineArtifact@0 204 | inputs: 205 | artifactName: 'wheel-mac-$(python.version)' 206 | targetPath: 'dist' 207 | 208 | -------------------------------------------------------------------------------- /pandas_streaming/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.5.1" 2 | __author__ = "Xavier Dupré" 3 | __github__ = "https://github.com/sdpython/pandas_streaming" 4 | __url__ = "https://sdpython.github.io/doc/pandas-streaming/dev/" 5 | __license__ = "MIT License" 6 | -------------------------------------------------------------------------------- /pandas_streaming/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dummy import dummy_streaming_dataframe 2 | -------------------------------------------------------------------------------- /pandas_streaming/data/dummy.py: -------------------------------------------------------------------------------- 1 | from pandas import DataFrame 2 | from ..df import StreamingDataFrame 3 | 4 | 5 | def dummy_streaming_dataframe(n, chunksize=10, 
asfloat=False, **cols): 6 | """ 7 | Returns a dummy streaming dataframe 8 | mostly for unit test purposes. 9 | 10 | :param n: number of rows 11 | :param chunksize: chunk size 12 | :param asfloat: use random float and not random int 13 | :param cols: additional columns 14 | :return: a @see cl StreamingDataFrame 15 | """ 16 | if asfloat: 17 | df = DataFrame( 18 | dict( 19 | cfloat=[_ + 0.1 for _ in range(n)], 20 | cstr=[f"s{i}" for i in range(n)], 21 | ) 22 | ) 23 | else: 24 | df = DataFrame(dict(cint=list(range(n)), cstr=[f"s{i}" for i in range(n)])) 25 | for k, v in cols.items(): 26 | df[k] = v 27 | return StreamingDataFrame.read_df(df, chunksize=chunksize) 28 | -------------------------------------------------------------------------------- /pandas_streaming/df/__init__.py: -------------------------------------------------------------------------------- 1 | from .connex_split import ( 2 | train_test_split_weights, 3 | train_test_connex_split, 4 | train_test_apart_stratify, 5 | ) 6 | from .dataframe import StreamingDataFrame 7 | from .dataframe_helpers import ( 8 | dataframe_hash_columns, 9 | dataframe_unfold, 10 | dataframe_shuffle, 11 | ) 12 | from .dataframe_helpers import pandas_groupby_nan, numpy_types 13 | from .dataframe_io import to_zip, read_zip 14 | -------------------------------------------------------------------------------- /pandas_streaming/df/connex_split.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from logging import getLogger 3 | from typing import Optional, Tuple 4 | import pandas 5 | import numpy 6 | from .dataframe_helpers import dataframe_shuffle 7 | 8 | logger = getLogger("pandas-streaming") 9 | 10 | 11 | class ImbalancedSplitException(Exception): 12 | """ 13 | Raised when an imbalanced split is detected. 14 | """ 15 | 16 | 17 | def train_test_split_weights( 18 | df, 19 | weights=None, 20 | test_size=0.25, 21 | train_size=None, 22 | shuffle=True, 23 | fail_imbalanced=0.05, 24 | random_state=None, 25 | ): 26 | """ 27 | Splits a database in train/test given, every row 28 | can have a different weight. 29 | 30 | :param df: :class:`pandas.DataFrame` or see 31 | :class:`StreamingDataFrame ` 32 | :param weights: None or weights or weights column name 33 | :param test_size: ratio for the test partition 34 | (if *train_size* is not specified) 35 | :param train_size: ratio for the train partition 36 | :param shuffle: shuffles before the split 37 | :param fail_imbalanced: raises an exception if relative weights 38 | difference is higher than this value 39 | :param random_state: seed for random generators 40 | :return: train and test :class:`pandas.DataFrame` 41 | 42 | If the dataframe is not shuffled first, the function 43 | will produce two datasets which are unlikely to be randomized 44 | as the function tries to keep equal weights among both paths 45 | without using randomness. 46 | """ 47 | if hasattr(df, "iter_creation"): 48 | raise NotImplementedError( # pragma: no cover 49 | "Not implemented yet for StreamingDataFrame." 50 | ) 51 | if isinstance(df, numpy.ndarray): 52 | raise NotImplementedError( # pragma: no cover 53 | "Not implemented on numpy arrays." 54 | ) 55 | if shuffle: 56 | df = dataframe_shuffle(df, random_state=random_state) 57 | if weights is None: 58 | if test_size == 0 or train_size == 0: 59 | raise ValueError( 60 | f"test_size={test_size} or train_size={train_size} cannot be null (1)." 
61 | ) 62 | from sklearn.model_selection import train_test_split 63 | 64 | return train_test_split( 65 | df, test_size=test_size, train_size=train_size, random_state=random_state 66 | ) 67 | 68 | if isinstance(weights, pandas.Series): 69 | weights = list(weights) 70 | elif isinstance(weights, str): 71 | weights = list(df[weights]) 72 | if len(weights) != df.shape[0]: 73 | raise ValueError( 74 | "Dimension mismatch between weights and dataframe " # noqa: UP030 75 | "{0} != {1}".format(df.shape[0], len(weights)) 76 | ) 77 | 78 | p = (1 - test_size) if test_size else None 79 | if train_size is not None: 80 | p = train_size 81 | test_size = 1 - p 82 | if p is None or min(test_size, p) <= 0: 83 | raise ValueError( 84 | f"test_size={test_size} or train_size={train_size} cannot be null (2)." 85 | ) 86 | ratio = test_size / p 87 | 88 | if random_state is None: 89 | randint = numpy.random.randint 90 | else: 91 | state = numpy.random.RandomState(random_state) 92 | randint = state.randint 93 | 94 | balance = 0 95 | train_ids = [] 96 | test_ids = [] 97 | test_weights = 0 98 | train_weights = 0 99 | for i in range(df.shape[0]): 100 | w = weights[i] 101 | if balance == 0: 102 | h = randint(0, 1) 103 | totest = h == 0 104 | else: 105 | totest = balance < 0 106 | if totest: 107 | test_ids.append(i) 108 | balance += w 109 | test_weights += w 110 | else: 111 | train_ids.append(i) 112 | balance -= w * ratio 113 | train_weights += w * ratio 114 | 115 | r = abs(train_weights - test_weights) / (1.0 * (train_weights + test_weights)) 116 | if r >= fail_imbalanced: 117 | raise ImbalancedSplitException( # pragma: no cover 118 | "Split is imbalanced: train_weights={0} test_weights={1} r={2}." # noqa: UP030 119 | "".format(train_weights, test_weights, r) 120 | ) 121 | 122 | return df.iloc[train_ids, :], df.iloc[test_ids, :] 123 | 124 | 125 | def train_test_connex_split( 126 | df, 127 | groups, 128 | test_size=0.25, 129 | train_size=None, 130 | stratify=None, 131 | hash_size=9, 132 | unique_rows=False, 133 | shuffle=True, 134 | fail_imbalanced=0.05, 135 | keep_balance=None, 136 | stop_if_bigger=None, 137 | return_cnx=False, 138 | must_groups=None, 139 | random_state=None, 140 | verbose=0, 141 | ): 142 | """ 143 | This split is for a specific case where data is linked 144 | in many ways. Let's assume we have three ids as we have 145 | for online sales: *(product id, user id, card id)*. 146 | As we may need to compute aggregated features, 147 | we need every id not to be present in both train and 148 | test set. The function computes the connected components 149 | and breaks each of them in two parts for train and test. 
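    For instance, rows sharing any id always end up on the same side.
    A minimal sketch (the ids below are made up for the illustration)::

        from pandas import DataFrame
        from pandas_streaming.df import train_test_connex_split

        # The first two rows share card C1, so they belong to the same
        # connected component and can never be separated by the split.
        df = DataFrame([dict(user="UA", prod="PA", card="C1"),
                        dict(user="UB", prod="PB", card="C1"),
                        dict(user="UC", prod="PC", card="C2")])
        train, test = train_test_connex_split(
            df, groups=["user", "prod", "card"], test_size=0.5,
            fail_imbalanced=1.0)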
150 | 151 | :param df: :epkg:`pandas:DataFrame` 152 | :param groups: column names for the ids 153 | :param test_size: ratio for the test partition 154 | (if *train_size* is not specified) 155 | :param train_size: ratio for the train partition 156 | :param stratify: column holding the stratification 157 | :param hash_size: size of the hash to cache information about partition 158 | :param unique_rows: ensures that rows are unique 159 | :param shuffle: shuffles before the split 160 | :param fail_imbalanced: raises an exception if relative weights difference 161 | is higher than this value 162 | :param stop_if_bigger: (float) stops a connected component from growing 163 | bigger than this ratio of elements, this should not be used 164 | unless a big component emerges, the algorithm stops merging 165 | but does not guarantee it returns the best cut, 166 | the value should be close to 0 167 | :param keep_balance: (float), if not None, does not merge connected components 168 | if their relative sizes are too different, 169 | the value should be close to 1 170 | :param return_cnx: returns connected components as a third result 171 | :param must_groups: column name for ids which must not be shared by 172 | train/test partitions 173 | :param random_state: seed for random generator 174 | :param verbose: verbosity (uses logging) 175 | :return: Two see :class:`StreamingDataFrame 176 | `, one 177 | for train, one for test. 178 | 179 | The list of ids must hold in memory. 180 | There is no streaming implementation for the ids. 181 | 182 | .. exref:: 183 | :title: Splits a dataframe, keeps ids in separate partitions 184 | :tag: dataframe 185 | 186 | In some data science problems, rows are not independent 187 | and share common values, most of the time ids. In some 188 | specific cases, multiple ids from different columns are 189 | connected and must appear in the same partition. 190 | Testing that each id column is evenly split and does not 191 | appear in both sets is not enough. Connected components 192 | are needed. 193 | 194 | .. runpython:: 195 | :showcode: 196 | 197 | from pandas import DataFrame 198 | from pandas_streaming.df import train_test_connex_split 199 | 200 | df = DataFrame([dict(user="UA", prod="PAA", card="C1"), 201 | dict(user="UA", prod="PB", card="C1"), 202 | dict(user="UB", prod="PC", card="C2"), 203 | dict(user="UB", prod="PD", card="C2"), 204 | dict(user="UC", prod="PAA", card="C3"), 205 | dict(user="UC", prod="PF", card="C4"), 206 | dict(user="UD", prod="PG", card="C5"), 207 | ]) 208 | 209 | train, test = train_test_connex_split( 210 | df, test_size=0.5, groups=['user', 'prod', 'card'], 211 | fail_imbalanced=0.6) 212 | 213 | print(train) 214 | print(test) 215 | 216 | If *return_cnx* is True, the third result contains: 217 | 218 | * connected components for each id 219 | * the dataframe with connected components as a new column 220 | 221 | ..
runpython:: 222 | :showcode: 223 | 224 | from pandas import DataFrame 225 | from pandas_streaming.df import train_test_connex_split 226 | 227 | df = DataFrame([dict(user="UA", prod="PAA", card="C1"), 228 | dict(user="UA", prod="PB", card="C1"), 229 | dict(user="UB", prod="PC", card="C2"), 230 | dict(user="UB", prod="PD", card="C2"), 231 | dict(user="UC", prod="PAA", card="C3"), 232 | dict(user="UC", prod="PF", card="C4"), 233 | dict(user="UD", prod="PG", card="C5"), 234 | ]) 235 | 236 | train, test, cnx = train_test_connex_split( 237 | df, test_size=0.5, groups=['user', 'prod', 'card'], 238 | fail_imbalanced=0.6, return_cnx=True) 239 | 240 | print(cnx[0]) 241 | print(cnx[1]) 242 | """ 243 | if stratify is not None: 244 | raise NotImplementedError( # pragma: no cover 245 | "Option stratify is not implemented." 246 | ) 247 | if groups is None or len(groups) == 0: 248 | raise ValueError( # pragma: no cover 249 | "groups is empty. Use regular train_test_split." 250 | ) 251 | if hasattr(df, "iter_creation"): 252 | raise NotImplementedError( # pragma: no cover 253 | "Not implemented yet for StreamingDataFrame." 254 | ) 255 | if isinstance(df, numpy.ndarray): 256 | raise NotImplementedError( # pragma: no cover 257 | "Not implemented on numpy arrays." 258 | ) 259 | if shuffle: 260 | df = dataframe_shuffle(df, random_state=random_state) 261 | 262 | dfids = df[groups].copy() 263 | if must_groups is not None: 264 | dfids_must = df[must_groups].copy() 265 | 266 | name = "connex" 267 | while name in dfids.columns: 268 | name += "_" 269 | one = "weight" 270 | while one in dfids.columns: 271 | one += "_" 272 | 273 | # Connected components. 274 | elements = list(range(dfids.shape[0])) 275 | counts_cnx = {i: {i} for i in elements} 276 | connex = {} 277 | avoids_merge = {} 278 | 279 | def do_connex_components(dfrows, local_groups, kb, sib): 280 | "run connected components algorithms" 281 | itern = 0 282 | modif = 1 283 | 284 | while modif > 0 and itern < len(elements): 285 | if df.shape[0] > 10000: 286 | logger.info( 287 | "[train_test_connex_split] iteration=%d-#nb connect=%d - " 288 | "modif=%s", 289 | itern, 290 | len(set(elements)), 291 | modif, 292 | ) 293 | 294 | modif = 0 295 | itern += 1 296 | for i, row in enumerate(dfrows.itertuples(index=False, name=None)): 297 | vals = [ 298 | val 299 | for val in zip(local_groups, row) 300 | if not isinstance(val[1], float) or not numpy.isnan(val[1]) 301 | ] 302 | 303 | c = elements[i] 304 | 305 | for val in vals: 306 | if val not in connex: 307 | connex[val] = c 308 | modif += 1 309 | 310 | set_c = set(connex[val] for val in vals) 311 | set_c.add(c) 312 | new_c = min(set_c) 313 | 314 | add_pair_c = [] 315 | for c in set_c: 316 | if c == new_c or (new_c, c) in avoids_merge: 317 | continue 318 | if kb is not None: 319 | maxi = min(len(counts_cnx[new_c]), len(counts_cnx[c])) 320 | if maxi > 5: 321 | diff = len(counts_cnx[new_c]) + len(counts_cnx[c]) - maxi 322 | r = diff / float(maxi) 323 | if r > kb: 324 | if verbose: # pragma: no cover 325 | logger.info( 326 | "[train_test_connex_split] balance " 327 | "r=%1.4f>%1.2f, #[%d]=%d, #[%d]=%d", 328 | r, 329 | kb, 330 | new_c, 331 | len(counts_cnx[new_c]), 332 | c, 333 | len(counts_cnx[c]), 334 | ) 335 | 336 | continue 337 | 338 | if sib is not None: 339 | r = (len(counts_cnx[new_c]) + len(counts_cnx[c])) / float( 340 | len(elements) 341 | ) 342 | if r > sib: 343 | logger.info( 344 | "[train_test_connex_split] " 345 | "no merge r=%1.4f>%1.2f, #[%d]=%d, #[%d]=%d", 346 | r, 347 | sib, 348 | new_c, 349 | 
len(counts_cnx[new_c]), 350 | c, 351 | len(counts_cnx[c]), 352 | ) 353 | avoids_merge[new_c, c] = i 354 | continue 355 | 356 | add_pair_c.append(c) 357 | 358 | if len(add_pair_c) > 0: 359 | for c in add_pair_c: 360 | modif += len(counts_cnx[c]) 361 | for ii in counts_cnx[c]: 362 | elements[ii] = new_c 363 | counts_cnx[new_c] = counts_cnx[new_c].union(counts_cnx[c]) 364 | counts_cnx[c] = set() 365 | 366 | keys = list(vals) 367 | for val in keys: 368 | if connex[val] == c: 369 | connex[val] = new_c 370 | modif += 1 371 | 372 | if must_groups: 373 | do_connex_components(dfids_must, must_groups, None, None) 374 | do_connex_components(dfids, groups, keep_balance, stop_if_bigger) 375 | 376 | # final 377 | dfids[name] = elements 378 | dfids[one] = 1 379 | grsum = dfids[[name, one]].groupby(name, as_index=False).sum() 380 | for g in groups: 381 | logger.info("[train_test_connex_split] #nb in '%d':", len(set(dfids[g]))) 382 | logger.info( 383 | "[train_test_connex_split] #connex %d/%d", grsum.shape[0], dfids.shape[0] 384 | ) 385 | if grsum.shape[0] <= 1: 386 | raise ValueError( # pragma: no cover 387 | "Every element is in the same connected components." 388 | ) 389 | 390 | # Statistics: top connected components 391 | if verbose: 392 | # Global statistics 393 | counts = Counter(elements) 394 | cl = [(v, k) for k, v in counts.items()] 395 | cum = 0 396 | maxc = None 397 | logger.info( 398 | "[train_test_connex_split] number of connected components: %d", 399 | len(set(elements)), 400 | ) 401 | for i, (v, k) in enumerate(sorted(cl, reverse=True)): 402 | if i == 0: 403 | maxc = k, v 404 | if i >= 10: 405 | break 406 | cum += v 407 | logger.info( 408 | "[train_test_connex_split] c=%s #elements=%s cumulated=%d/%d", 409 | k, 410 | v, 411 | cum, 412 | len(elements), 413 | ) 414 | 415 | # Most important component 416 | logger.info( 417 | "[train_test_connex_split] first row of the biggest component %d", maxc 418 | ) 419 | tdf = dfids[dfids[name] == maxc[0]] 420 | logger.info("[train_test_connex_split] % s", tdf.head(n=10)) 421 | 422 | # Splits. 423 | train, test = train_test_split_weights( 424 | grsum, 425 | weights=one, 426 | test_size=test_size, 427 | train_size=train_size, 428 | shuffle=shuffle, 429 | fail_imbalanced=fail_imbalanced, 430 | random_state=random_state, 431 | ) 432 | train.drop(one, inplace=True, axis=1) 433 | test.drop(one, inplace=True, axis=1) 434 | 435 | # We compute the final dataframe. 436 | def double_merge(d): 437 | "merge twice" 438 | merge1 = dfids.merge(d, left_on=name, right_on=name) 439 | merge2 = df.merge(merge1, left_on=groups, right_on=groups) 440 | return merge2 441 | 442 | train_f = double_merge(train) 443 | test_f = double_merge(test) 444 | if return_cnx: 445 | return train_f, test_f, (connex, dfids) 446 | else: 447 | return train_f, test_f 448 | 449 | 450 | def train_test_apart_stratify( 451 | df: pandas.DataFrame, 452 | group, 453 | test_size: Optional[float] = 0.25, 454 | train_size: Optional[float] = None, 455 | stratify: Optional[str] = None, 456 | force: bool = False, 457 | random_state: Optional[int] = None, 458 | sorted_indices: bool = False, 459 | ) -> Tuple["StreamingDataFrame", "StreamingDataFrame"]: # noqa: F821 460 | """ 461 | This split is for a specific case where data is linked 462 | in one way. Let's assume we have two ids as we have 463 | for online sales: *(product id, category id)*. 464 | A product can have multiple categories. We need to have 465 | distinct products on train and test but common categories 466 | on both sides. 
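    For instance (a small sketch with hypothetical product/category pairs),
    every product id lands on exactly one side while a category may appear
    on both::

        import pandas
        from pandas_streaming.df import train_test_apart_stratify

        df = pandas.DataFrame([dict(prod="P1", cat="c1"),
                               dict(prod="P1", cat="c2"),
                               dict(prod="P2", cat="c1"),
                               dict(prod="P3", cat="c2")])
        train, test = train_test_apart_stratify(
            df, group="prod", stratify="cat", test_size=0.5)
        # product ids never overlap between the two partitions
        assert not (set(train["prod"]) & set(test["prod"]))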
467 | 468 | :param df: :epkg:`pandas:DataFrame` 469 | :param group: columns name for the ids 470 | :param test_size: ratio for the test partition 471 | (if *train_size* is not specified) 472 | :param train_size: ratio for the train partition 473 | :param stratify: column holding the stratification 474 | :param force: if True, tries to get at least one example on the test side 475 | for each value of the column *stratify* 476 | :param random_state: seed for random generators 477 | :param sorted_indices: sort index first, 478 | see issue `41 ` 479 | :return: Two see :class:`StreamingDataFrame 480 | `, one 481 | for train, one for test. 482 | 483 | The list of ids must hold in memory. 484 | There is no streaming implementation for the ids. 485 | This split was implemented for a case of a multi-label 486 | classification. A category (*stratify*) is not exclusive 487 | and an observation can be assigned to multiple 488 | categories. In that particular case, the method 489 | :func:`sklearn.model_selection.train_test_split` 490 | can not directly be used. 491 | 492 | .. runpython:: 493 | :showcode: 494 | 495 | import pandas 496 | from pandas_streaming.df import train_test_apart_stratify 497 | 498 | df = pandas.DataFrame([dict(a=1, b="e"), 499 | dict(a=1, b="f"), 500 | dict(a=2, b="e"), 501 | dict(a=2, b="f")]) 502 | 503 | train, test = train_test_apart_stratify( 504 | df, group="a", stratify="b", test_size=0.5) 505 | print(train) 506 | print('-----------') 507 | print(test) 508 | 509 | """ 510 | if stratify is None: 511 | raise ValueError("stratify must be specified.") # pragma: no cover 512 | if group is None: 513 | raise ValueError("group must be specified.") # pragma: no cover 514 | if hasattr(df, "iter_creation"): 515 | raise NotImplementedError("Not implemented yet for StreamingDataFrame.") 516 | if isinstance(df, numpy.ndarray): 517 | raise NotImplementedError("Not implemented on numpy arrays.") 518 | 519 | p = (1 - test_size) if test_size else None 520 | if train_size is not None: 521 | p = train_size 522 | test_size = 1 - p 523 | if p is None or min(test_size, p) <= 0: 524 | raise ValueError( # pragma: no cover 525 | f"test_size={test_size} or train_size={train_size} cannot be null" 526 | ) 527 | 528 | couples = df[[group, stratify]].itertuples(name=None, index=False) 529 | hist = Counter(df[stratify]) 530 | sorted_hist = [(v, k) for k, v in hist.items()] 531 | sorted_hist.sort() 532 | ids = {c: set() for c in hist} 533 | 534 | for g, s in couples: 535 | ids[s].add(g) 536 | 537 | if random_state is None: 538 | permutation = numpy.random.permutation 539 | else: 540 | state = numpy.random.RandomState(random_state) 541 | permutation = state.permutation 542 | 543 | split = {} 544 | for _, k in sorted_hist: 545 | indices = sorted(ids[k]) if sorted_indices else ids[k] 546 | not_assigned, assigned = [], [] 547 | for c in indices: 548 | if c in split: 549 | assigned.append(c) 550 | else: 551 | not_assigned.append(c) 552 | if len(not_assigned) == 0: 553 | continue 554 | nb_test = sum(split[c] for c in assigned) 555 | expected = min(len(ids[k]), int(test_size * len(ids[k]) + 0.5)) - nb_test 556 | if force and expected == 0 and nb_test == 0: 557 | nb_train = len(assigned) - nb_test 558 | if nb_train > 0 or len(not_assigned) > 1: 559 | expected = min(1, len(not_assigned)) 560 | if expected > 0: 561 | permutation(not_assigned) 562 | for e in not_assigned[:expected]: 563 | split[e] = 1 564 | for e in not_assigned[expected:]: 565 | split[e] = 0 566 | else: 567 | for c in not_assigned: 568 | split[c] = 0 
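    # At this point ``split`` maps every group id to 0 (train) or 1 (test);
    # the original rows are recovered below with ``isin`` on that mapping.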
569 | 570 | train_set = set(k for k, v in split.items() if v == 0) 571 | test_set = set(k for k, v in split.items() if v == 1) 572 | train_df = df[df[group].isin(train_set)] 573 | test_df = df[df[group].isin(test_set)] 574 | return train_df, test_df 575 | -------------------------------------------------------------------------------- /pandas_streaming/df/dataframe_helpers.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import struct 3 | import warnings 4 | import numpy 5 | from pandas import DataFrame, Index, Series 6 | 7 | 8 | def numpy_types(): 9 | """ 10 | Returns the list of :epkg:`numpy` available types. 11 | 12 | :return: list of types 13 | """ 14 | 15 | return [ 16 | numpy.bool_, 17 | numpy.int_, 18 | numpy.intc, 19 | numpy.intp, 20 | numpy.int8, 21 | numpy.int16, 22 | numpy.int32, 23 | numpy.int64, 24 | numpy.uint8, 25 | numpy.uint16, 26 | numpy.uint32, 27 | numpy.uint64, 28 | numpy.float16, 29 | numpy.float32, 30 | numpy.float64, 31 | numpy.complex64, 32 | numpy.complex128, 33 | ] 34 | 35 | 36 | def hash_str(c, hash_length): 37 | """ 38 | Hashes a string. 39 | 40 | @param c value to hash 41 | @param hash_length hash_length 42 | @return string 43 | """ 44 | if isinstance(c, float): 45 | if numpy.isnan(c): 46 | return c 47 | raise ValueError(f"numpy.nan expected, not {c}") 48 | m = hashlib.sha256() 49 | m.update(c.encode("utf-8")) 50 | r = m.hexdigest() 51 | if len(r) >= hash_length: 52 | return r[:hash_length] 53 | return r 54 | 55 | 56 | def hash_int(c, hash_length): 57 | """ 58 | Hashes an integer into an integer. 59 | 60 | @param c value to hash 61 | @param hash_length hash_length 62 | @return int 63 | """ 64 | if isinstance(c, float): 65 | if numpy.isnan(c): 66 | return c 67 | else: 68 | raise ValueError(f"numpy.nan expected, not {c}") 69 | else: 70 | b = struct.pack("i", c) 71 | m = hashlib.sha256() 72 | m.update(b) 73 | r = m.hexdigest() 74 | if len(r) >= hash_length: 75 | r = r[:hash_length] 76 | return int(r, 16) % (10**8) 77 | 78 | 79 | def hash_float(c, hash_length): 80 | """ 81 | Hashes a float into a float. 82 | 83 | @param c value to hash 84 | @param hash_length hash_length 85 | @return int 86 | """ 87 | if numpy.isnan(c): 88 | return c 89 | else: 90 | b = struct.pack("d", c) 91 | m = hashlib.sha256() 92 | m.update(b) 93 | r = m.hexdigest() 94 | if len(r) >= hash_length: 95 | r = r[:hash_length] 96 | i = int(r, 16) % (2**53) 97 | return float(i) 98 | 99 | 100 | def dataframe_hash_columns(df, cols=None, hash_length=10, inplace=False): 101 | """ 102 | Hashes a set of columns in a dataframe. 103 | Keeps the same type. Skips missing values. 104 | 105 | @param df dataframe 106 | @param cols columns to hash or None for alls. 107 | @param hash_length for strings only, length of the hash 108 | @param inplace modifies inplace 109 | @return new dataframe 110 | 111 | This might be useful to anonimized data before 112 | making it public. 113 | 114 | .. exref:: 115 | :title: Hashes a set of columns in a dataframe 116 | :tag: dataframe 117 | 118 | .. 
runpython:: 119 | :showcode: 120 | 121 | import pandas 122 | from pandas_streaming.df import dataframe_hash_columns 123 | df = pandas.DataFrame([dict(a=1, b="e", c=5.6, ind="a1", ai=1), 124 | dict(b="f", c=5.7, ind="a2", ai=2), 125 | dict(a=4, b="g", ind="a3", ai=3), 126 | dict(a=8, b="h", c=5.9, ai=4), 127 | dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) 128 | print(df) 129 | print('--------------') 130 | df2 = dataframe_hash_columns(df) 131 | print(df2) 132 | """ 133 | if cols is None: 134 | cols = list(df.columns) 135 | 136 | if not inplace: 137 | df = df.copy() 138 | 139 | def hash_intl(c): 140 | "hash int" 141 | return hash_int(c, hash_length) 142 | 143 | def hash_strl(c): 144 | "hash string" 145 | return hash_str(c, hash_length) 146 | 147 | def hash_floatl(c): 148 | "hash float" 149 | return hash_float(c, hash_length) 150 | 151 | coltype = dict(zip(df.columns, df.dtypes)) 152 | for c in cols: 153 | t = coltype[c] 154 | if t == int: # noqa: E721 155 | df[c] = df[c].apply(hash_intl) 156 | elif t == numpy.int64: 157 | df[c] = df[c].apply(lambda x: numpy.int64(hash_intl(x))) 158 | elif t == float: # noqa: E721 159 | df[c] = df[c].apply(hash_floatl) 160 | elif t == object: # noqa: E721 161 | df[c] = df[c].apply(hash_strl) 162 | else: 163 | raise NotImplementedError( # pragma: no cover 164 | f"Conversion of type {t} in column '{c}' is not implemented" 165 | ) 166 | 167 | return df 168 | 169 | 170 | def dataframe_unfold(df, col, new_col=None, sep=","): 171 | """ 172 | One column may contain concatenated values. 173 | This function splits these values and multiplies the 174 | rows for each split value. 175 | 176 | @param df dataframe 177 | @param col column with the concatenated values (strings) 178 | @param new_col new column name, if None, use default value. 179 | @param sep separator 180 | @return a new dataframe 181 | 182 | .. exref:: 183 | :title: Unfolds a column of a dataframe. 184 | :tag: dataframe 185 | 186 | .. runpython:: 187 | :showcode: 188 | 189 | import pandas 190 | import numpy 191 | from pandas_streaming.df import dataframe_unfold 192 | 193 | df = pandas.DataFrame([dict(a=1, b="e,f"), 194 | dict(a=2, b="g"), 195 | dict(a=3)]) 196 | print(df) 197 | df2 = dataframe_unfold(df, "b") 198 | print('----------') 199 | print(df2) 200 | 201 | # To fold: 202 | folded = df2.groupby('a').apply( 203 | lambda row: ','.join(row['b_unfold'].dropna()) 204 | if len(row['b_unfold'].dropna()) > 0 else numpy.nan) 205 | print('----------') 206 | print(folded) 207 | """ 208 | if new_col is None: 209 | col_name = col + "_unfold" 210 | else: 211 | col_name = new_col 212 | temp_col = "__index__" 213 | while temp_col in df.columns: 214 | temp_col += "_" 215 | rows = [] 216 | for i, v in enumerate(df[col]): 217 | if isinstance(v, str): 218 | spl = v.split(sep) 219 | for vs in spl: 220 | rows.append({col: v, col_name: vs, temp_col: i}) 221 | else: 222 | rows.append({col: v, col_name: v, temp_col: i}) 223 | df = df.copy() 224 | df[temp_col] = list(range(df.shape[0])) 225 | dfj = DataFrame(rows) 226 | res = df.merge(dfj, on=[col, temp_col]) 227 | return res.drop(temp_col, axis=1).copy() 228 | 229 | 230 | def dataframe_shuffle(df, random_state=None): 231 | """ 232 | Shuffles a dataframe. 233 | 234 | :param df: :epkg:`pandas:DataFrame` 235 | :param random_state: seed 236 | :return: new :epkg:`pandas:DataFrame` 237 | 238 | .. exref:: 239 | :title: Shuffles the rows of a dataframe 240 | :tag: dataframe 241 | 242 | .. 
runpython:: 243 | :showcode: 244 | 245 | import pandas 246 | from pandas_streaming.df import dataframe_shuffle 247 | 248 | df = pandas.DataFrame([dict(a=1, b="e", c=5.6, ind="a1"), 249 | dict(a=2, b="f", c=5.7, ind="a2"), 250 | dict(a=4, b="g", c=5.8, ind="a3"), 251 | dict(a=8, b="h", c=5.9, ind="a4"), 252 | dict(a=16, b="i", c=6.2, ind="a5")]) 253 | print(df) 254 | print('----------') 255 | 256 | shuffled = dataframe_shuffle(df, random_state=0) 257 | print(shuffled) 258 | """ 259 | if random_state is not None: 260 | state = numpy.random.RandomState(random_state) 261 | permutation = state.permutation 262 | else: 263 | permutation = numpy.random.permutation 264 | ori_cols = list(df.columns) 265 | scols = set(ori_cols) 266 | 267 | no_index = df.reset_index(drop=False) 268 | keep_cols = [_ for _ in no_index.columns if _ not in scols] 269 | index = no_index.index 270 | index = permutation(index) 271 | shuffled = no_index.iloc[index, :] 272 | res = shuffled.set_index(keep_cols)[ori_cols] 273 | res.index.names = df.index.names 274 | return res 275 | 276 | 277 | def pandas_fillna(df, by, hasna=None, suffix=None): 278 | """ 279 | Replaces the :epkg:`nan` values for something not :epkg:`nan`. 280 | Mostly used by @see fn pandas_groupby_nan. 281 | 282 | :param df: dataframe 283 | :param by: list of columns for which we need to replace nan 284 | :param hasna: None or list of columns for which we need to replace NaN 285 | :param suffix: use a prefix for the NaN value 286 | :return: list of values chosen for each column, new dataframe (new copy) 287 | """ 288 | suffix = suffix if suffix else "²nan" 289 | df = df.copy() 290 | rep = {} 291 | for c in by: 292 | if hasna is not None and c not in hasna: 293 | continue 294 | if df[c].dtype in (str, bytes, object): 295 | se = set(df[c].dropna()) 296 | val = se.pop() 297 | if isinstance(val, str): 298 | cst = suffix 299 | val = "" 300 | elif isinstance(val, bytes): 301 | cst = b"_" 302 | else: 303 | raise TypeError( # pragma: no cover 304 | "Unable to determine a constant for type='{0}' dtype='{1}'".format( # noqa: UP030 305 | val, df[c].dtype 306 | ) 307 | ) 308 | val += cst 309 | while val in se: 310 | val += suffix 311 | df[c].fillna(val, inplace=True) 312 | rep[c] = val 313 | else: 314 | dr = df[c].dropna() 315 | mi = abs(dr.min()) 316 | ma = abs(dr.max()) 317 | val = ma + mi 318 | if val == ma and not isinstance(val, str): 319 | val += ma + 1.0 320 | if val <= ma: 321 | raise ValueError( # pragma: no cover 322 | "Unable to find a different value for column '{}' v='{}: " 323 | "min={} max={}".format(c, val, mi, ma) 324 | ) 325 | df[c].fillna(val, inplace=True) 326 | rep[c] = val 327 | return rep, df 328 | 329 | 330 | def pandas_groupby_nan( 331 | df, by, axis=0, as_index=False, suffix=None, nanback=True, **kwargs 332 | ): 333 | """ 334 | Does a *groupby* including keeping missing values (:epkg:`nan`). 335 | 336 | :param df: dataframe 337 | :param by: column or list of columns 338 | :param axis: only 0 is allowed 339 | :param as_index: should be False 340 | :param suffix: None or a string 341 | :param nanback: put :epkg:`nan` back in the index, 342 | otherwise it leaves a replacement for :epkg:`nan`. 343 | (does not work when grouping by multiple columns) 344 | :param kwargs: other parameters sent to 345 | `groupby `_ 347 | :return: groupby results 348 | 349 | See :epkg:`groupby and missing values`. 350 | If no :epkg:`nan` is detected, the function falls back in regular 351 | :epkg:`pandas:DataFrame:groupby` which has the following 352 | behavior. 
353 | 354 | .. exref:: 355 | :title: Group a dataframe by one column including nan values 356 | :tag: dataframe 357 | 358 | The regular :epkg:`pandas:dataframe:GroupBy` of a 359 | :epkg:`pandas:DataFrame` removes every :epkg:`nan` 360 | values from the index. 361 | 362 | .. runpython:: 363 | :showcode: 364 | 365 | from pandas import DataFrame 366 | 367 | data = [dict(a=2, ind="a", n=1), 368 | dict(a=2, ind="a"), 369 | dict(a=3, ind="b"), 370 | dict(a=30)] 371 | df = DataFrame(data) 372 | print(df) 373 | gr = df.groupby(["ind"]).sum() 374 | print(gr) 375 | 376 | Function @see fn pandas_groupby_nan modifies keeps them. 377 | 378 | .. runpython:: 379 | :showcode: 380 | 381 | from pandas import DataFrame 382 | from pandas_streaming.df import pandas_groupby_nan 383 | 384 | data = [dict(a=2, ind="a", n=1), 385 | dict(a=2, ind="a"), 386 | dict(a=3, ind="b"), 387 | dict(a=30)] 388 | df = DataFrame(data) 389 | gr2 = pandas_groupby_nan(df, ["ind"]).sum() 390 | print(gr2) 391 | """ 392 | if nanback and suffix is None: 393 | try: 394 | res = df.groupby(by, axis=axis, as_index=as_index, dropna=False, **kwargs) 395 | except TypeError: 396 | # old version of pandas 397 | res = None 398 | if res is not None: 399 | if suffix is None: 400 | return res 401 | res.index = Series(res.index).replace(numpy.nan, suffix) 402 | return res 403 | if axis != 0: 404 | raise NotImplementedError("axis should be 0") 405 | if as_index: 406 | raise NotImplementedError("as_index must be False") 407 | if isinstance(by, tuple): 408 | raise TypeError("by should be of list not tuple") 409 | if not isinstance(by, list): 410 | by = [by] 411 | hasna = {} 412 | for b in by: 413 | h = df[b].isnull().values.any() 414 | if h: 415 | hasna[b] = True 416 | if len(hasna) > 0: 417 | rep, df_copy = pandas_fillna(df, by, hasna, suffix=suffix) 418 | res = df_copy.groupby(by, axis=axis, as_index=as_index, **kwargs) 419 | if len(by) == 1: 420 | if not nanback: 421 | dummy = DataFrame([{"a": "a"}]) 422 | do = dummy.dtypes[0] 423 | typ = dict(zip(df.columns, df.dtypes)) 424 | if typ[by[0]] != do: 425 | warnings.warn( # pragma: no cover 426 | f"[pandas_groupby_nan] NaN value: {rep}", stacklevel=0 427 | ) 428 | return res 429 | for b in by: 430 | fnan = rep[b] 431 | if fnan in res.grouper.groups: 432 | res.grouper.groups[numpy.nan] = res.grouper.groups[fnan] 433 | del res.grouper.groups[fnan] 434 | new_val = [ 435 | (numpy.nan if b == fnan else b) for b in res.grouper.result_index 436 | ] 437 | res.grouper.groupings[0]._group_index = Index(new_val) 438 | res.grouper.groupings[0].obj[b].replace(fnan, numpy.nan, inplace=True) 439 | if hasattr(res.grouper, "grouping"): 440 | if isinstance(res.grouper.groupings[0].grouper, numpy.ndarray): 441 | arr = numpy.array(new_val) 442 | res.grouper.groupings[0].grouper = arr 443 | if ( 444 | hasattr(res.grouper.groupings[0], "_cache") 445 | and "result_index" in res.grouper.groupings[0]._cache 446 | ): 447 | del res.grouper.groupings[0]._cache["result_index"] 448 | else: 449 | raise NotImplementedError( 450 | "Not implemented for type: {0}".format( # noqa: UP030 451 | type(res.grouper.groupings[0].grouper) 452 | ) 453 | ) 454 | else: 455 | grouper = res.grouper._get_grouper() 456 | if isinstance(grouper, numpy.ndarray): 457 | arr = numpy.array(new_val) 458 | res.grouper.groupings[0].grouping_vector = arr 459 | if ( 460 | hasattr(res.grouper.groupings[0], "_cache") 461 | and "result_index" in res.grouper.groupings[0]._cache 462 | ): 463 | index = res.grouper.groupings[0]._cache["result_index"] 464 | if len(rep) == 
1: 465 | key = list(rep.values())[0] # noqa: RUF015 466 | new_index = numpy.array(index) 467 | for i in range(len(new_index)): 468 | if new_index[i] == key: 469 | new_index[i] = numpy.nan 470 | res.grouper.groupings[0]._cache["result_index"] = ( 471 | index.__class__(new_index) 472 | ) 473 | else: 474 | raise NotImplementedError( # pragma: no cover 475 | "NaN values not implemented for multiindex." 476 | ) 477 | else: 478 | raise NotImplementedError( # pragma: no cover 479 | "Not implemented for type: {0}".format( # noqa: UP030 480 | type(res.grouper.groupings[0].grouper) 481 | ) 482 | ) 483 | res.grouper._cache["result_index"] = res.grouper.groupings[ 484 | 0 485 | ]._group_index 486 | else: 487 | if not nanback: 488 | dummy = DataFrame([{"a": "a"}]) 489 | do = dummy.dtypes[0] 490 | typ = dict(zip(df.columns, df.dtypes)) 491 | for b in by: 492 | if typ[b] != do: 493 | warnings.warn( # pragma: no cover 494 | f"[pandas_groupby_nan] NaN values: {rep}", stacklevel=0 495 | ) 496 | break 497 | return res 498 | raise NotImplementedError( 499 | "Not yet implemented. Replacing pseudo nan values by real nan " 500 | "values is not as easy as it looks. Use nanback=False" 501 | ) 502 | 503 | # keys = list(res.grouper.groups.keys()) 504 | # didit = False 505 | # mapping = {} 506 | # for key in keys: 507 | # new_key = list(key) 508 | # mod = False 509 | # for k, b in enumerate(by): 510 | # if b not in rep: 511 | # continue 512 | # fnan = rep[b] 513 | # if key[k] == fnan: 514 | # new_key[k] = numpy.nan 515 | # mod = True 516 | # didit = True 517 | # mapping[fnan] = numpy.nan 518 | # if mod: 519 | # new_key = tuple(new_key) 520 | # mapping[key] = new_key 521 | # res.grouper.groups[new_key] = res.grouper.groups[key] 522 | # del res.grouper.groups[key] 523 | # if didit: 524 | # # this code deos not work 525 | # vnan = numpy.nan 526 | # new_index = list(mapping.get(v, v) 527 | # for v in res.grouper.result_index) 528 | # names = res.grouper.result_index.names 529 | # # index = MultiIndex.from_tuples(tuples=new_index, names=names) 530 | # # res.grouper.result_index = index # does not work cannot set 531 | # # values for [result_index] 532 | # for k in range(len(res.grouper.groupings)): 533 | # grou = res.grouper.groupings[k] 534 | # new_val = list(mapping.get(v, v) for v in grou) 535 | # grou._group_index = Index(new_val) 536 | # b = names[k] 537 | # if b in rep: 538 | # vv = rep[b] 539 | # grou.obj[b].replace(vv, vnan, inplace=True) 540 | # if isinstance(grou.grouper, numpy.ndarray): 541 | # grou.grouper = numpy.array(new_val) 542 | # else: 543 | # raise NotImplementedError( 544 | # "Not implemented for type: {0}".format( 545 | # type(grou.grouper))) 546 | # del res.grouper._cache 547 | return res 548 | return df.groupby(by, axis=axis, **kwargs) 549 | -------------------------------------------------------------------------------- /pandas_streaming/df/dataframe_io.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import zipfile 4 | import pandas 5 | import numpy 6 | 7 | 8 | def to_zip(df, zipfilename, zname="df.csv", **kwargs): 9 | """ 10 | Saves a :epkg:`Dataframe` into a :epkg:`zip` file. 11 | It can be read by :meth:`read_zip`. 12 | 13 | :param df: :epkg:`dataframe` or :class:`numpy.ndarray` 14 | :param zipfilename: a :class:`zipfile.ZipFile` or a filename 15 | :param zname: a filename in the zipfile 16 | :param kwargs: parameters for :meth:`pandas.DataFrame.to_csv` or 17 | :func:`numpy.save` 18 | :return: zipfilename 19 | 20 | .. 
exref:: 21 | :title: Saves and reads a dataframe in a zip file 22 | :tag: dataframe 23 | 24 | This shows an example on how to save and read a 25 | :class:`pandas.DataFrame` directly into a zip file. 26 | 27 | .. runpython:: 28 | :showcode: 29 | 30 | import pandas 31 | from pandas_streaming.df import to_zip, read_zip 32 | 33 | df = pandas.DataFrame([dict(a=1, b="e"), 34 | dict(b="f", a=5.7)]) 35 | 36 | name = "dfs.zip" 37 | to_zip(df, name, encoding="utf-8", index=False) 38 | df2 = read_zip(name, encoding="utf-8") 39 | print(df2) 40 | 41 | .. exref:: 42 | :title: Saves and reads a numpy array in a zip file 43 | :tag: array 44 | 45 | This shows an example on how to save and read a 46 | :class:`numpy.ndarray` directly into a zip file. 47 | 48 | .. runpython:: 49 | :showcode: 50 | 51 | import numpy 52 | from pandas_streaming.df import to_zip, read_zip 53 | 54 | arr = numpy.array([[0.5, 1.5], [0.4, 1.6]]) 55 | 56 | name = "dfsa.zip" 57 | to_zip(arr, name, 'arr.npy') 58 | arr2 = read_zip(name, 'arr.npy') 59 | print(arr2) 60 | """ 61 | if isinstance(df, pandas.DataFrame): 62 | stb = io.StringIO() 63 | ext = os.path.splitext(zname)[-1] 64 | if ext == ".npy": 65 | raise ValueError( # pragma: no cover 66 | "Extension '.npy' cannot be used to save a dataframe." 67 | ) 68 | df.to_csv(stb, **kwargs) 69 | elif isinstance(df, numpy.ndarray): 70 | stb = io.BytesIO() 71 | ext = os.path.splitext(zname)[-1] 72 | if ext != ".npy": 73 | raise ValueError( # pragma: no cover 74 | "Extension '.npy' is required when saving a numpy array." 75 | ) 76 | numpy.save(stb, df, **kwargs) 77 | else: 78 | raise TypeError(f"Type not handled {type(df)}") # pragma: no cover 79 | text = stb.getvalue() 80 | 81 | if isinstance(zipfilename, str): 82 | ext = os.path.splitext(zipfilename)[-1] 83 | if ext != ".zip": 84 | raise NotImplementedError( # pragma: no cover 85 | f"Only zip file are implemented not '{ext}'." 86 | ) 87 | zf = zipfile.ZipFile(zipfilename, "w") # pylint: disable=R1732 88 | close = True 89 | elif isinstance(zipfilename, zipfile.ZipFile): 90 | zf = zipfilename 91 | close = False 92 | else: 93 | raise TypeError( # pragma: no cover 94 | f"No implementation for type '{type(zipfilename)}'" 95 | ) 96 | 97 | zf.writestr(zname, text) 98 | if close: 99 | zf.close() 100 | 101 | 102 | def read_zip(zipfilename, zname=None, **kwargs): 103 | """ 104 | Reads a :epkg:`dataframe` from a :epkg:`zip` file. 105 | It can be saved by :meth:`to_zip`. 106 | 107 | :param zipfilename: a :class:`zipfile.ZipFile` or a filename 108 | :param zname: a filename in zipfile, if None, takes the first one 109 | :param kwargs: parameters for :func:`pandas.read_csv` 110 | :return: :class:`pandas.DataFrame` or :class:`numpy.ndarray` 111 | """ 112 | if isinstance(zipfilename, str): 113 | ext = os.path.splitext(zipfilename)[-1] 114 | if ext != ".zip": 115 | raise NotImplementedError( # pragma: no cover 116 | f"Only zip files are supported not '{ext}'." 
117 | ) 118 | zf = zipfile.ZipFile(zipfilename, "r") # pylint: disable=R1732 119 | close = True 120 | elif isinstance(zipfilename, zipfile.ZipFile): 121 | zf = zipfilename 122 | close = False 123 | else: 124 | raise TypeError( # pragma: no cover 125 | f"No implementation for type '{type(zipfilename)}'" 126 | ) 127 | 128 | if zname is None: 129 | zname = zf.namelist()[0] 130 | content = zf.read(zname) 131 | stb = io.BytesIO(content) 132 | ext = os.path.splitext(zname)[-1] 133 | if ext == ".npy": 134 | df = numpy.load(stb, **kwargs) 135 | else: 136 | df = pandas.read_csv(stb, **kwargs) 137 | 138 | if close: 139 | zf.close() 140 | 141 | return df 142 | -------------------------------------------------------------------------------- /pandas_streaming/df/dataframe_io_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | from io import StringIO, BytesIO 3 | 4 | try: 5 | from ujson import dumps 6 | except ImportError: # pragma: no cover 7 | from json import dumps 8 | 9 | 10 | class JsonPerRowsStream: 11 | """ 12 | Reads a :epkg:`json` streams and adds 13 | ``,``, ``[``, ``]`` to convert a stream containing 14 | one :epkg:`json` object per row into one single :epkg:`json` object. 15 | It only implements method *readline*. 16 | 17 | :param st: stream 18 | """ 19 | 20 | def __init__(self, st): 21 | self.st = st 22 | self.begin = True 23 | self.newline = False 24 | self.end = True 25 | 26 | def seek(self, offset): 27 | """ 28 | Change the stream position to the given byte offset. 29 | 30 | :param offset: offset, only 0 is implemented 31 | """ 32 | self.st.seek(offset) 33 | 34 | def readline(self, size=-1): 35 | """ 36 | Reads a line, adds ``,``, ``[``, ``]`` if needed. 37 | So the number of read characters is not recessarily 38 | the requested one but could be greater. 39 | """ 40 | text = self.st.readline(size) 41 | if size == 0: 42 | return text 43 | if self.newline: 44 | text = "," + text 45 | self.newline = False 46 | elif self.begin: 47 | text = "[" + text 48 | self.begin = False 49 | 50 | if text.endswith("\n"): 51 | self.newline = True 52 | return text 53 | if len(text) == 0 or len(text) < size: 54 | if self.end: 55 | self.end = False 56 | return text + "]" 57 | return text 58 | return text 59 | 60 | def read(self, size=-1): 61 | """ 62 | Reads characters, adds ``,``, ``[``, ``]`` if needed. 63 | So the number of read characters is not recessarily 64 | the requested one but could be greater. 65 | """ 66 | text = self.st.read(size) 67 | if isinstance(text, bytes): 68 | cst = b"\n", b"\n,", b",", b"[", b"]" 69 | else: 70 | cst = "\n", "\n,", ",", "[", "]" 71 | if size == 0: 72 | return text 73 | if len(text) > 1: 74 | t1, t2 = text[: len(text) - 1], text[len(text) - 1 :] 75 | t1 = t1.replace(cst[0], cst[1]) 76 | text = t1 + t2 77 | 78 | if self.newline: 79 | text = cst[2] + text 80 | self.newline = False 81 | elif self.begin: 82 | text = cst[3] + text 83 | self.begin = False 84 | 85 | if text.endswith(cst[0]): 86 | self.newline = True 87 | return text 88 | if len(text) == 0 or len(text) < size: 89 | if self.end: 90 | self.end = False 91 | return text + cst[4] 92 | return text 93 | return text 94 | 95 | def getvalue(self): 96 | """ 97 | Returns the whole stream content. 
98 | """ 99 | 100 | def byline(): 101 | line = self.readline() 102 | while line: 103 | yield line 104 | line = self.readline() 105 | 106 | return "".join(byline()) 107 | 108 | 109 | def flatten_dictionary(dico, sep="_"): 110 | """ 111 | Flattens a dictionary with nested structure to a dictionary with no 112 | hierarchy. 113 | 114 | :param dico: dictionary to flatten 115 | :param sep: string to separate dictionary keys by 116 | :return: flattened dictionary 117 | 118 | Inspired from `flatten_json 119 | `_. 120 | """ 121 | flattened_dict = {} 122 | 123 | def _flatten(obj, key): 124 | if obj is None: 125 | flattened_dict[key] = obj 126 | elif isinstance(obj, dict): 127 | for k, v in obj.items(): 128 | if not isinstance(k, str): 129 | raise TypeError("All keys must a string.") # pragma: no cover 130 | k2 = k if key is None else f"{key}{sep}{k}" 131 | _flatten(v, k2) 132 | elif isinstance(obj, (list, set)): 133 | for index, item in enumerate(obj): 134 | k2 = k if key is None else f"{key}{sep}{index}" 135 | _flatten(item, k2) 136 | else: 137 | flattened_dict[key] = obj 138 | 139 | _flatten(dico, None) 140 | return flattened_dict 141 | 142 | 143 | def enumerate_json_items( 144 | filename, encoding=None, lines=False, flatten=False, verbose=0 145 | ): 146 | """ 147 | Enumerates items from a :epkg:`JSON` file or string. 148 | 149 | :param filename: filename or string or stream to parse 150 | :param encoding: encoding 151 | :param lines: one record per row 152 | :param flatten: call @see fn flatten_dictionary 153 | :param verbose: verbosity (based on :epkg:`tqdm`) 154 | :return: iterator on records at first level. 155 | 156 | It assumes the syntax follows the format: ``[ {"id":1, ...}, {"id": 2, ...}, ...]``. 157 | However, if option *lines* if true, the function considers that the 158 | stream or file does have one record per row as follows: 159 | 160 | {"id":1, ...} 161 | {"id": 2, ...} 162 | 163 | .. exref:: 164 | :title: Processes a json file by streaming. 165 | 166 | The module :epkg:`ijson` can read a :epkg:`JSON` file by streaming. 167 | This module is needed because a record can be written on multiple lines. 168 | This function leverages it produces the following results. 169 | 170 | .. 
runpython:: 171 | :showcode: 172 | 173 | from pandas_streaming.df.dataframe_io_helpers import enumerate_json_items 174 | 175 | text_json = b''' 176 | [ 177 | { 178 | "glossary": { 179 | "title": "example glossary", 180 | "GlossDiv": { 181 | "title": "S", 182 | "GlossList": [{ 183 | "GlossEntry": { 184 | "ID": "SGML", 185 | "SortAs": "SGML", 186 | "GlossTerm": "Standard Generalized Markup Language", 187 | "Acronym": "SGML", 188 | "Abbrev": "ISO 8879:1986", 189 | "GlossDef": { 190 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 191 | "GlossSeeAlso": ["GML", "XML"] 192 | }, 193 | "GlossSee": "markup" 194 | } 195 | }] 196 | } 197 | } 198 | }, 199 | { 200 | "glossary": { 201 | "title": "example glossary", 202 | "GlossDiv": { 203 | "title": "S", 204 | "GlossList": { 205 | "GlossEntry": [{ 206 | "ID": "SGML", 207 | "SortAs": "SGML", 208 | "GlossTerm": "Standard Generalized Markup Language", 209 | "Acronym": "SGML", 210 | "Abbrev": "ISO 8879:1986", 211 | "GlossDef": { 212 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 213 | "GlossSeeAlso": ["GML", "XML"] 214 | }, 215 | "GlossSee": "markup" 216 | }] 217 | } 218 | } 219 | } 220 | } 221 | ] 222 | ''' 223 | 224 | for item in enumerate_json_items(text_json): 225 | print(item) 226 | 227 | The parsed json must have an empty line at the end otherwise 228 | the following exception is raised: 229 | `ijson.common.IncompleteJSONError: ` 230 | `parse error: unallowed token at this point in JSON text`. 231 | """ 232 | if isinstance(filename, str): 233 | if "{" not in filename and os.path.exists(filename): 234 | with open(filename, "r", encoding=encoding) as f: 235 | for el in enumerate_json_items( 236 | f, encoding=encoding, lines=lines, flatten=flatten 237 | ): 238 | yield el 239 | else: 240 | st = StringIO(filename) 241 | for el in enumerate_json_items( 242 | st, encoding=encoding, lines=lines, flatten=flatten 243 | ): 244 | yield el 245 | elif isinstance(filename, bytes): 246 | st = BytesIO(filename) 247 | for el in enumerate_json_items( 248 | st, encoding=encoding, lines=lines, flatten=flatten 249 | ): 250 | yield el 251 | elif lines: 252 | for el in enumerate_json_items( 253 | JsonPerRowsStream(filename), encoding=encoding, lines=False, flatten=flatten 254 | ): 255 | yield el 256 | else: 257 | if hasattr(filename, "seek"): 258 | filename.seek(0) 259 | import ijson 260 | 261 | parser = ijson.parse(filename) 262 | current = None 263 | curkey = None 264 | stack = [] 265 | nbyield = 0 266 | if verbose: 267 | from tqdm import tqdm 268 | 269 | loop = tqdm(enumerate(parser)) 270 | else: 271 | loop = enumerate(parser) 272 | for i, (_, event, value) in loop: 273 | if verbose: 274 | loop.set_description(f"process row {i}-event={event!r}") 275 | if event == "start_array": 276 | if curkey is None: 277 | current = [] 278 | else: 279 | if not isinstance(current, dict): 280 | raise RuntimeError( # pragma: no cover 281 | f"Type issue {type(current)}" 282 | ) 283 | c = [] 284 | current[curkey] = c # pylint: disable=E1137 285 | current = c 286 | curkey = None 287 | stack.append(current) 288 | elif event == "end_array": 289 | stack.pop() 290 | if len(stack) == 0: 291 | # We should be done. 
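                    # The outermost array has been closed: the stack is empty
                    # and there is nothing left to accumulate.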
292 | current = None 293 | else: 294 | current = stack[-1] 295 | elif event == "start_map": 296 | c = {} 297 | if curkey is None: 298 | if current is None: 299 | current = [] 300 | current.append(c) 301 | else: 302 | current[curkey] = c # pylint: disable=E1137 303 | stack.append(c) 304 | current = c 305 | curkey = None 306 | elif event == "end_map": 307 | stack.pop() 308 | current = stack[-1] 309 | if len(stack) == 1: 310 | nbyield += 1 311 | if flatten: 312 | yield flatten_dictionary(current[-1]) 313 | else: 314 | yield current[-1] 315 | # We clear the memory. 316 | current.clear() 317 | elif event == "map_key": 318 | curkey = value 319 | elif event in {"string", "number", "boolean"}: 320 | if curkey is None: 321 | current.append(value) 322 | else: 323 | current[curkey] = value # pylint: disable=E1137 324 | curkey = None 325 | elif event == "null": 326 | if curkey is None: 327 | current.append(None) 328 | else: 329 | current[curkey] = None # pylint: disable=E1137 330 | curkey = None 331 | else: 332 | raise ValueError(f"Unknown event '{event}'") # pragma: no cover 333 | 334 | 335 | class JsonIterator2Stream: 336 | """ 337 | Transforms an iterator on :epkg:`JSON` items 338 | into a stream which returns an items as a string every time 339 | method *read* is called. 340 | The iterator could be one returned by @see fn enumerate_json_items. 341 | 342 | :param it: iterator 343 | :param kwargs: arguments to :class:`json.dumps` 344 | 345 | .. exref:: 346 | :title: Reshape a json file 347 | 348 | The function @see fn enumerate_json_items reads any 349 | :epkg:`json` even if every record is split over 350 | multiple lines. Class @see cl JsonIterator2Stream 351 | mocks this iterator as a stream. Each row is a single item. 352 | 353 | .. runpython:: 354 | :showcode: 355 | 356 | from pandas_streaming.df.dataframe_io_helpers import enumerate_json_items, JsonIterator2Stream 357 | 358 | text_json = b''' 359 | [ 360 | { 361 | "glossary": { 362 | "title": "example glossary", 363 | "GlossDiv": { 364 | "title": "S", 365 | "GlossList": [{ 366 | "GlossEntry": { 367 | "ID": "SGML", 368 | "SortAs": "SGML", 369 | "GlossTerm": "Standard Generalized Markup Language", 370 | "Acronym": "SGML", 371 | "Abbrev": "ISO 8879:1986", 372 | "GlossDef": { 373 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 374 | "GlossSeeAlso": ["GML", "XML"] 375 | }, 376 | "GlossSee": "markup" 377 | } 378 | }] 379 | } 380 | } 381 | }, 382 | { 383 | "glossary": { 384 | "title": "example glossary", 385 | "GlossDiv": { 386 | "title": "S", 387 | "GlossList": { 388 | "GlossEntry": [{ 389 | "ID": "SGML", 390 | "SortAs": "SGML", 391 | "GlossTerm": "Standard Generalized Markup Language", 392 | "Acronym": "SGML", 393 | "Abbrev": "ISO 8879:1986", 394 | "GlossDef": { 395 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 396 | "GlossSeeAlso": ["GML", "XML"] 397 | }, 398 | "GlossSee": "markup" 399 | }] 400 | } 401 | } 402 | } 403 | } 404 | ] 405 | ''' 406 | 407 | for item in JsonIterator2Stream(lambda: enumerate_json_items(text_json)): 408 | print(item) 409 | 410 | .. versionchanged:: 0.3 411 | The class takes a function which outputs an iterator and not an iterator. 412 | `JsonIterator2Stream(enumerate_json_items(text_json))` needs to be rewritten 413 | into JsonIterator2Stream(lambda: enumerate_json_items(text_json)). 
414 | """ 415 | 416 | def __init__(self, it, **kwargs): 417 | self.it = it 418 | self.kwargs = kwargs 419 | self.it0 = it() 420 | 421 | def seek(self, offset): 422 | """ 423 | Change the stream position to the given byte offset. 424 | 425 | :param offset: offset, only 0 is implemented 426 | """ 427 | if offset != 0: 428 | raise NotImplementedError("The iterator can only return at the beginning.") 429 | self.it0 = self.it() 430 | 431 | def write(self): 432 | """ 433 | The class does not write. 434 | """ 435 | raise NotImplementedError() 436 | 437 | def read(self): 438 | """ 439 | Reads the next item and returns it as a string. 440 | """ 441 | try: 442 | value = next(self.it0) 443 | return dumps(value, **self.kwargs) 444 | except StopIteration: 445 | return None 446 | 447 | def __iter__(self): 448 | """ 449 | Iterates on each row. The behaviour is a bit tricky. 450 | It is implemented to be swalled by :func:`pandas.read_json` which 451 | uses :func:`itertools.islice` to go through the items. 452 | It calls multiple times `__iter__` but does expect the 453 | iterator to continue from where it stopped last time. 454 | """ 455 | for value in self.it0: 456 | yield dumps(value, **self.kwargs) 457 | -------------------------------------------------------------------------------- /pandas_streaming/df/dataframe_split.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import pickle 3 | import random 4 | import warnings 5 | from io import StringIO 6 | import pandas 7 | 8 | 9 | def sklearn_train_test_split( 10 | self, path_or_buf=None, export_method="to_csv", names=None, **kwargs 11 | ): 12 | """ 13 | Randomly splits a dataframe into smaller pieces. 14 | The function returns streams of file names. 15 | The function relies on :func:`sklearn.model_selection.train_test_split`. 16 | It does not handle stratified version of it. 17 | 18 | :param self: see :class:`StreamingDataFrame 19 | ` 20 | :param path_or_buf: a string, a list of strings or buffers, if it is a 21 | string, it must contain ``{}`` like ``partition{}.txt`` 22 | :param export_method: method used to store the partitions, by default 23 | :meth:`pandas.DataFrame.to_csv` 24 | :param names: partitions names, by default ``('train', 'test')`` 25 | :param kwargs: parameters for the export function and 26 | :func:`sklearn.model_selection.train_test_split`. 27 | :return: outputs of the exports functions 28 | 29 | The function cannot return two iterators or two 30 | see :class:`StreamingDataFrame 31 | ` 32 | because running through one 33 | means running through the other. We can assume both 34 | splits do not hold in memory and we cannot run through 35 | the same iterator again as random draws would be different. 36 | We need to store the results into files or buffers. 37 | 38 | .. warning:: 39 | The method *export_method* must write the data in 40 | mode *append* and allows stream. 41 | """ 42 | if kwargs.get("stratify") is not None: 43 | raise NotImplementedError( # pragma: no cover 44 | "No implementation yet for the stratified version." 
45 | ) 46 | with warnings.catch_warnings(): 47 | warnings.filterwarnings("ignore", category=ImportWarning) 48 | from sklearn.model_selection import train_test_split 49 | 50 | opts = ["test_size", "train_size", "random_state", "shuffle", "stratify"] 51 | split_ops = {} 52 | for o in opts: 53 | if o in kwargs: 54 | split_ops[o] = kwargs[o] 55 | del kwargs[o] 56 | 57 | exportf_ = getattr(pandas.DataFrame, export_method) 58 | if export_method == "to_csv" and "mode" not in kwargs: 59 | exportf = lambda *a, **kw: exportf_(*a, mode="a", **kw) # noqa: E731 60 | else: 61 | exportf = exportf_ 62 | 63 | if isinstance(path_or_buf, str): 64 | if "{}" not in path_or_buf: 65 | raise ValueError("path_or_buf must contain {} to insert the partition name") 66 | if names is None: 67 | names = ["train", "test"] 68 | elif len(names) != len(path_or_buf): 69 | raise ValueError( # pragma: no cover 70 | "names and path_or_buf must have the same length" 71 | ) 72 | path_or_buf = [path_or_buf.format(n) for n in names] 73 | elif path_or_buf is None: 74 | path_or_buf = [None, None] 75 | else: 76 | if not isinstance(path_or_buf, list): 77 | raise TypeError( # pragma: no cover 78 | "path_or_buf must be a list or a string" 79 | ) 80 | 81 | bufs = [] 82 | close = [] 83 | for p in path_or_buf: 84 | if p is None: 85 | st = StringIO() 86 | cl = False 87 | elif isinstance(p, str): 88 | st = open(p, "w", encoding=kwargs.get("encoding")) # noqa: SIM115 89 | cl = True 90 | else: 91 | st = p 92 | cl = False 93 | bufs.append(st) 94 | close.append(cl) 95 | 96 | for df in self: 97 | train, test = train_test_split(df, **split_ops) 98 | exportf(train, bufs[0], **kwargs) 99 | exportf(test, bufs[1], **kwargs) 100 | kwargs["header"] = False 101 | 102 | for b, c in zip(bufs, close): 103 | if c: 104 | b.close() 105 | return [ 106 | st.getvalue() if isinstance(st, StringIO) else p 107 | for st, p in zip(bufs, path_or_buf) 108 | ] 109 | 110 | 111 | def sklearn_train_test_split_streaming( 112 | self, test_size=0.25, train_size=None, stratify=None, hash_size=9, unique_rows=False 113 | ): 114 | """ 115 | Randomly splits a dataframe into smaller pieces. 116 | The function returns streams of file names. 117 | The function relies on :func:`sklearn.model_selection.train_test_split`. 118 | It handles the stratified version of it. 119 | 120 | :param self: see :class:`StreamingDataFrame 121 | ` 122 | :param test_size: ratio for the test partition 123 | (if *train_size* is not specified) 124 | :param train_size: ratio for the train partition 125 | :param stratify: column holding the stratification 126 | :param hash_size: size of the hash to cache information about partition 127 | :param unique_rows: ensures that rows are unique 128 | :return: Two see :class:`StreamingDataFrame 129 | `, 130 | one for train, one for test. 131 | 132 | The function returns two iterators or two 133 | see :class:`StreamingDataFrame 134 | `. It 135 | tries to do everything without writing anything on disk 136 | but it requires to store the repartition somehow. 137 | This function hashes every row and maps the hash with a part 138 | (train or test). This cache must hold in memory otherwise the 139 | function fails. The two returned iterators must not be used 140 | for the first time in the same time. The first time is used to 141 | build the cache. The function changes the order of rows if 142 | the parameter *stratify* is not null. The cache has a side effect: 143 | every exact same row will be put in the same partition. 
144 |     If that is not what you want, you should add an index column
145 |     or a random one.
146 |     """
147 |     p = (1 - test_size) if test_size else None
148 |     if train_size is not None:
149 |         p = train_size
150 |     n = 2 * max(1 / p, 1 / (1 - p))  # minimal number of buffered rows before a split is drawn
151 | 
152 |     static_schema = []
153 | 
154 |     def iterator_rows():
155 |         "iterates over rows"
156 |         counts = {}
157 |         memory = {}
158 |         pos_col = None
159 |         for df in self:
160 |             if pos_col is None:
161 |                 static_schema.append(list(df.columns))
162 |                 static_schema.append(list(df.dtypes))
163 |                 static_schema.append(df.shape[0])
164 |                 if stratify is not None:
165 |                     pos_col = list(df.columns).index(stratify)
166 |                 else:
167 |                     pos_col = -1
168 | 
169 |             for obs in df.itertuples(index=False, name=None):
170 |                 strat = 0 if stratify is None else obs[pos_col]
171 |                 if strat not in memory:
172 |                     memory[strat] = []
173 |                 memory[strat].append(obs)
174 | 
175 |             for k, v in memory.items():
176 |                 if len(v) >= n + random.randint(0, 10):  # randomized buffer threshold
177 |                     vr = list(range(len(v)))
178 |                     # shuffles the rows randomly
179 |                     random.shuffle(vr)
180 |                     if (0, k) in counts:
181 |                         tt = counts[1, k] + counts[0, k]
182 |                         delta = -int(counts[0, k] - tt * p + 0.5)
183 |                     else:
184 |                         delta = 0
185 |                     i = int(len(v) * p + 0.5)
186 |                     i += delta
187 |                     i = max(0, min(len(v), i))
188 |                     one = set(vr[:i])
189 |                     for d, obs_ in enumerate(v):
190 |                         yield obs_, 0 if d in one else 1
191 |                     if (0, k) not in counts:
192 |                         counts[0, k] = i
193 |                         counts[1, k] = len(v) - i
194 |                     else:
195 |                         counts[0, k] += i
196 |                         counts[1, k] += len(v) - i
197 |                     # removes the dispatched rows from memory
198 |                     v.clear()
199 | 
200 |         # Once the stream is exhausted, the remaining buffered
201 |         # observations still need to be dispatched.
202 |         for k, v in memory.items():
203 |             vr = list(range(len(v)))
204 |             # shuffles the rows randomly
205 |             random.shuffle(vr)
206 |             if (0, k) in counts:
207 |                 tt = counts[1, k] + counts[0, k]
208 |                 delta = -int(counts[0, k] - tt * p + 0.5)
209 |             else:
210 |                 delta = 0
211 |             i = int(len(v) * p + 0.5)
212 |             i += delta
213 |             i = max(0, min(len(v), i))
214 |             one = set(vr[:i])
215 |             for d, obs in enumerate(v):
216 |                 yield obs, 0 if d in one else 1
217 |             if (0, k) not in counts:
218 |                 counts[0, k] = i
219 |                 counts[1, k] = len(v) - i
220 |             else:
221 |                 counts[0, k] += i
222 |                 counts[1, k] += len(v) - i
223 | 
224 |     def h11(w):
225 |         "pickle and hash"
226 |         b = pickle.dumps(w)
227 |         return hashlib.md5(b).hexdigest()[:hash_size]
228 | 
229 |     # We store the partition of every row in a cache.
230 |     cache = {}
231 | 
232 |     def iterator_internal(part_requested):
233 |         "internal iterator on dataframes"
234 |         iy = 0
235 |         accumul = []
236 |         if len(cache) == 0:
237 |             for obs, part in iterator_rows():
238 |                 h = h11(obs)
239 |                 if unique_rows and h in cache:
240 |                     raise ValueError(
241 |                         "A row or at least its hash is already cached. "  # noqa: UP030
242 |                         "Increase hash_size or check for duplicates "
243 |                         "('{0}')\n{1}.".format(h, obs)
244 |                     )
245 |                 if h not in cache:
246 |                     cache[h] = part
247 |                 else:
248 |                     part = cache[h]
249 |                 if part == part_requested:
250 |                     accumul.append(obs)
251 |                     if len(accumul) >= static_schema[2]:
252 |                         dfo = pandas.DataFrame(accumul, columns=static_schema[0])
253 |                         self.ensure_dtype(dfo, static_schema[1])
254 |                         iy += dfo.shape[0]
255 |                         accumul.clear()
256 |                         yield dfo
257 |         else:
258 |             for df in self:
259 |                 for obs in df.itertuples(index=False, name=None):
260 |                     h = h11(obs)
261 |                     part = cache.get(h)
262 |                     if part is None:
263 |                         raise ValueError(  # pragma: no cover
264 |                             f"Second iteration. A row was "
265 |                             f"never seen during the first one\n{obs}"
266 |                         )
267 |                     if part == part_requested:
268 |                         accumul.append(obs)
269 |                         if len(accumul) >= static_schema[2]:
270 |                             dfo = pandas.DataFrame(accumul, columns=static_schema[0])
271 |                             self.ensure_dtype(dfo, static_schema[1])
272 |                             iy += dfo.shape[0]
273 |                             accumul.clear()
274 |                             yield dfo
275 |         if len(accumul) > 0:
276 |             dfo = pandas.DataFrame(accumul, columns=static_schema[0])
277 |             self.ensure_dtype(dfo, static_schema[1])
278 |             iy += dfo.shape[0]
279 |             yield dfo
280 | 
281 |     return (
282 |         self.__class__(lambda: iterator_internal(0)),
283 |         self.__class__(lambda: iterator_internal(1)),
284 |     )
285 | 
--------------------------------------------------------------------------------
/pandas_streaming/exc/__init__.py:
--------------------------------------------------------------------------------
1 | from .exc_streaming import StreamingInefficientException  # noqa: F401
2 | 
--------------------------------------------------------------------------------
/pandas_streaming/exc/exc_streaming.py:
--------------------------------------------------------------------------------
 1 | class StreamingInefficientException(Exception):
 2 |     """
 3 |     Kind of operations doable with a :epkg:`pandas:DataFrame`
 4 |     but which should not be done in streaming mode.
 5 |     """
 6 | 
 7 |     def __init__(self, meth):
 8 |         """
 9 |         This method is inefficient in streaming mode
10 |         and not implemented.
11 | 
12 |         :param meth: inefficient method
13 |         """
14 |         Exception.__init__(self, f"{meth} should not be done in streaming mode.")
15 | 
--------------------------------------------------------------------------------
/pandas_streaming/ext_test_case.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import unittest
  4 | import warnings
  5 | from contextlib import redirect_stderr, redirect_stdout
  6 | from io import StringIO
  7 | from typing import Any, Callable, List, Optional
  8 | 
  9 | import numpy
 10 | from numpy.testing import assert_allclose
 11 | 
 12 | 
 13 | def unit_test_going():
 14 |     """
 15 |     Enables a flag telling the script it is running inside a unit test.
 16 |     It helps keep unit tests short.
 17 |     """
 18 |     going = int(os.environ.get("UNITTEST_GOING", 0))
 19 |     return going == 1
 20 | 
 21 | 
 22 | def ignore_warnings(warns: List[Warning]) -> Callable:
 23 |     """
 24 |     Catches warnings.
 25 | 
 26 |     :param warns: warnings to ignore
 27 |     """
 28 | 
 29 |     def wrapper(fct):
 30 |         if warns is None:
 31 |             raise AssertionError(f"warns cannot be None for '{fct}'.")
 32 | 
 33 |         def call_f(self):
 34 |             with warnings.catch_warnings():
 35 |                 warnings.simplefilter("ignore", warns)
 36 |                 return fct(self)
 37 | 
 38 |         return call_f
 39 | 
 40 |     return wrapper
 41 | 
 42 | 
 43 | class sys_path_append:
 44 |     """
 45 |     Stores the content of :epkg:`*py:sys:path` and
 46 |     restores it afterwards.
 47 |     """
 48 | 
 49 |     def __init__(self, paths, position=-1):
 50 |         """
 51 |         :param paths: paths to add
 52 |         :param position: where to add it
 53 |         """
 54 |         self.to_add = paths if isinstance(paths, list) else [paths]
 55 |         self.position = position
 56 | 
 57 |     def __enter__(self):
 58 |         """
 59 |         Modifies ``sys.path``.
 60 |         """
 61 |         self.store = sys.path.copy()
 62 |         if self.position == -1:
 63 |             sys.path.extend(self.to_add)
 64 |         else:
 65 |             for p in reversed(self.to_add):
 66 |                 sys.path.insert(self.position, p)
 67 | 
 68 |     def __exit__(self, exc_type, exc_value, traceback):
 69 |         """
 70 |         Restores ``sys.path``.
 71 |         """
 72 |         sys.path = self.store
 73 | 
 74 | 
 75 | class ExtTestCase(unittest.TestCase):
 76 |     _warns = []
 77 | 
 78 |     def assertExists(self, name):
 79 |         if not os.path.exists(name):
 80 |             raise AssertionError(f"File or folder {name!r} does not exist.")
 81 | 
 82 |     def assertEqualArray(
 83 |         self,
 84 |         expected: numpy.ndarray,
 85 |         value: numpy.ndarray,
 86 |         atol: float = 0,
 87 |         rtol: float = 0,
 88 |     ):
 89 |         self.assertEqual(expected.dtype, value.dtype)
 90 |         self.assertEqual(expected.shape, value.shape)
 91 |         assert_allclose(expected, value, atol=atol, rtol=rtol)
 92 | 
 93 |     def assertEqualDataFrame(self, d1, d2, **kwargs):
 94 |         """
 95 |         Checks that two dataframes are equal.
 96 |         Calls :func:`pandas.testing.assert_frame_equal`.
 97 |         """
 98 |         from pandas.testing import assert_frame_equal
 99 | 
100 |         assert_frame_equal(d1, d2, **kwargs)
101 | 
102 |     def assertAlmostEqual(
103 |         self,
104 |         expected: numpy.ndarray,
105 |         value: numpy.ndarray,
106 |         atol: float = 0,
107 |         rtol: float = 0,
108 |     ):
109 |         if not isinstance(expected, numpy.ndarray):
110 |             expected = numpy.array(expected)
111 |         if not isinstance(value, numpy.ndarray):
112 |             value = numpy.array(value).astype(expected.dtype)
113 |         self.assertEqualArray(expected, value, atol=atol, rtol=rtol)
114 | 
115 |     def assertRaise(
116 |         self, fct: Callable, exc_type: Exception, msg: Optional[str] = None
117 |     ):
118 |         try:
119 |             fct()
120 |         except exc_type as e:
121 |             if not isinstance(e, exc_type):
122 |                 raise AssertionError(f"Unexpected exception {type(e)!r}.") from e
123 |             if msg is None:
124 |                 return
125 |             if msg not in str(e):
126 |                 raise AssertionError(f"Unexpected error message {e!r}.") from e
127 |             return
128 |         raise AssertionError("No exception was raised.")
129 | 
130 |     def assertEmpty(self, value: Any):
131 |         if value is None:
132 |             return
133 |         if len(value) == 0:
134 |             return
135 |         raise AssertionError(f"value is not empty: {value!r}.")
136 | 
137 |     def assertNotEmpty(self, value: Any):
138 |         if value is None:
139 |             raise AssertionError(f"value is empty: {value!r}.")
140 |         if isinstance(value, (list, dict, tuple, set)):
141 |             if len(value) == 0:
142 |                 raise AssertionError(f"value is empty: {value!r}.")
143 | 
144 |     def assertStartsWith(self, prefix: str, full: str):
145 |         if not full.startswith(prefix):
146 |             raise AssertionError(f"prefix={prefix!r} does not start string {full!r}.")
147 | 
148 |     def assertLesser(self, x, y, strict=False):
149 |         """
150 |         Checks that ``x <= y`` (or ``x < y`` if *strict* is True).
151 |         """
152 |         if x > y or (strict and x == y):
153 |             raise AssertionError(
154 |                 "x >{2} y with x={0} and y={1}".format(  # noqa: UP030
155 |                     x,
156 |                     y,
157 |                     "=" if strict else "",
158 |                 )
159 |             )
160 | 
161 |     @staticmethod
162 |     def abs_path_join(filename: str, *args: List[str]):
163 |         """
164 |         Returns an absolute and normalized path from this location.
165 | 
166 |         :param filename: filename, the folder which contains it
167 |             is used as the base
168 |         :param args: list of subpaths to the previous path
169 |         :return: absolute and normalized path
170 |         """
171 |         dirname = os.path.join(os.path.dirname(filename), *args)
172 |         return os.path.normpath(os.path.abspath(dirname))
173 | 
174 |     @classmethod
175 |     def tearDownClass(cls):
176 |         for name, line, w in cls._warns:
177 |             warnings.warn(f"\n{name}:{line}: {type(w)}\n {str(w)}", stacklevel=0)
178 | 
179 |     def capture(self, fct: Callable):
180 |         """
181 |         Runs a function and captures standard output and error.
182 | 183 | :param fct: function to run 184 | :return: result of *fct*, output, error 185 | """ 186 | sout = StringIO() 187 | serr = StringIO() 188 | with redirect_stdout(sout), redirect_stderr(serr): 189 | res = fct() 190 | return res, sout.getvalue(), serr.getvalue() 191 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | 3 | # Exclude a variety of commonly ignored directories. 4 | exclude = [ 5 | ".eggs", 6 | ".git", 7 | "build", 8 | "dist", 9 | ] 10 | 11 | line-length = 88 12 | 13 | [tool.ruff.lint] 14 | select = [ 15 | "B", # flake8-bugbear 16 | "C4", # flake8-comprehensions 17 | #"D", # pydocstyle 18 | "E", # pycodestyle 19 | "F", # Pyflakes 20 | "G", # flake8-logging-format 21 | #"I", # isort 22 | "ISC", # flake8-implicit-str-concat 23 | "LOG", # flake8-logging 24 | #"N", # pep8-naming 25 | #"NPY", # modern numpy 26 | #"PERF", # Perflint 27 | "PIE", # flake8-pie 28 | "PYI", # flake8-pyi 29 | "RUF", # Ruff-specific rules 30 | "SIM", # flake8-simplify 31 | "SLOT", # flake8-slot 32 | "T10", # flake8-debugger 33 | #"TID", # Disallow relative imports 34 | #"TRY", # flake8-try-except-raise 35 | "UP", # pyupgrade 36 | "W", # pycodestyle 37 | "YTT", # flake8-2020 38 | ] 39 | 40 | [tool.ruff.lint.per-file-ignores] 41 | "**" = ["B905", "C401", "C408", "C413", "RUF012", "RUF100", "RUF010", "SIM108", "SIM910", "SIM110", "SIM102", "SIM114", "SIM103", "UP015", "UP027", "UP031", "UP034", "UP032", "UP006", "UP035", "UP007", "UP038"] 42 | "**/plot*.py" = ["B018"] 43 | "_doc/examples/**.py" = ["E402", "F811", "B018"] 44 | "_unittests/ut_df/test_dataframe_io_helpers.py" = ["E501"] 45 | "pandas_streaming/data/__init__.py" = ["F401"] 46 | "pandas_streaming/df/__init__.py" = ["F401"] 47 | "pandas_streaming/df/dataframe_io_helpers.py" = ["E501"] 48 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | autopep8 2 | black 3 | coverage 4 | furo 5 | ijson 6 | jupyter_sphinx 7 | jyquickhelper 8 | matplotlib 9 | nbsphinx 10 | pandas>=1.1.0 11 | pandocfilters 12 | Pillow 13 | pycodestyle 14 | pylint>=2.14.0 15 | pytest 16 | pytest-cov 17 | ruff 18 | scikit-learn 19 | scipy 20 | sphinx 21 | sphinx-issues 22 | git+https://github.com/sdpython/sphinx-runpython.git 23 | sphinx_gallery 24 | ujson 25 | wheel 26 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [options] 2 | packages = find: 3 | 4 | [options.packages.find] 5 | include = pandas_streaming* 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import setup 4 | 5 | ###################### 6 | # beginning of setup 7 | ###################### 8 | 9 | 10 | here = os.path.dirname(__file__) 11 | if here == "": 12 | here = "." 
13 | package_data = {"pandas_streaming.validation": ["*.css", "*.js"]}
14 | 
15 | try:
16 |     with open(os.path.join(here, "requirements.txt"), "r") as f:
17 |         requirements = f.read().strip(" \n\r\t").split("\n")
18 | except FileNotFoundError:
19 |     requirements = []
20 | if len(requirements) == 0 or requirements == [""]:
21 |     requirements = ["pandas"]
22 | 
23 | try:
24 |     with open(os.path.join(here, "README.rst"), "r", encoding="utf-8") as f:
25 |         long_description = "pandas-streaming:" + f.read().split("pandas-streaming:")[1]
26 | except FileNotFoundError:
27 |     long_description = ""
28 | 
29 | version_str = "0.1.0"
30 | with open(os.path.join(here, "pandas_streaming/__init__.py"), "r") as f:
31 |     line = [
32 |         _
33 |         for _ in [_.strip("\r\n ") for _ in f.readlines()]
34 |         if _.startswith("__version__")
35 |     ]
36 |     if len(line) > 0:
37 |         version_str = line[0].split("=")[1].strip('" ')
38 | 
39 | 
40 | setup(
41 |     name="pandas-streaming",
42 |     version=version_str,
43 |     description="Streaming operations on large dataframes with pandas",
44 |     long_description=long_description,
45 |     author="Xavier Dupré",
46 |     author_email="xavier.dupre@gmail.com",
47 |     url="https://github.com/sdpython/pandas-streaming",
48 |     package_data=package_data,
49 |     setup_requires=["numpy", "scipy"],
50 |     install_requires=requirements,
51 |     classifiers=[
52 |         "Intended Audience :: Science/Research",
53 |         "Intended Audience :: Developers",
54 |         "License :: OSI Approved :: MIT License",
55 |         "Programming Language :: C",
56 |         "Programming Language :: Python",
57 |         "Topic :: Software Development",
58 |         "Topic :: Scientific/Engineering",
59 |         "Development Status :: 5 - Production/Stable",
60 |         "Operating System :: Microsoft :: Windows",
61 |         "Operating System :: POSIX",
62 |         "Operating System :: Unix",
63 |         "Operating System :: MacOS",
64 |         "Programming Language :: Python :: 3",
65 |         "Programming Language :: Python :: 3.8",
66 |         "Programming Language :: Python :: 3.9",
67 |         "Programming Language :: Python :: 3.10",
68 |         "Programming Language :: Python :: 3.11",
69 |     ],
70 | )
71 | 
--------------------------------------------------------------------------------
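
Usage sketch for the two split functions defined in pandas_streaming/df/dataframe_split.py. This is a minimal, non-authoritative example: it assumes the import paths suggested by the repository layout, that StreamingDataFrame.read_df wraps an in-memory pandas.DataFrame into a streaming dataframe replayed chunk by chunk, and that to_dataframe() materializes a streaming dataframe; the file pattern "split_{}.csv" and the toy data are illustrative only.

    import pandas
    from pandas_streaming.df.dataframe import StreamingDataFrame
    from pandas_streaming.df.dataframe_split import (
        sklearn_train_test_split,
        sklearn_train_test_split_streaming,
    )

    # A small dataframe turned into a streaming one (assumption: read_df
    # replays the in-memory dataframe in chunks of the given size).
    df = pandas.DataFrame({"x": range(20), "y": [i % 3 for i in range(20)]})
    sdf = StreamingDataFrame.read_df(df, chunksize=5)

    # Buffered variant: each chunk is split and appended to the two files
    # split_train.csv and split_test.csv (names built from the {} pattern).
    sklearn_train_test_split(sdf, path_or_buf="split_{}.csv", index=False)

    # Streaming variant: returns two StreamingDataFrame objects; the first
    # full pass over either one builds the hash cache that maps every row
    # to its partition (train or test).
    train_sdf, test_sdf = sklearn_train_test_split_streaming(sdf, test_size=0.25)
    print(train_sdf.to_dataframe().shape, test_sdf.to_dataframe().shape)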