├── .github └── workflows │ ├── black-ruff.yml │ ├── check-urls.yml │ ├── codeql.yml │ ├── documentation.yml │ └── wheels-any.yml ├── .gitignore ├── .local.jenkins.lin.yml ├── CHANGELOGS.rst ├── CODE_OF_CONDUCT.md ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── _doc ├── _static │ ├── git_logo.png │ ├── project_ico.ico │ └── project_ico.png ├── api │ ├── connex_split.rst │ ├── dataframe.rst │ ├── dataframe_io.rst │ ├── dataframe_split.rst │ ├── index.rst │ ├── rdata.rst │ ├── rdf.rst │ ├── rexc.rst │ └── rio.rst ├── conf.py ├── examples │ ├── README.txt │ └── first_step.py ├── i_ex.rst ├── index.rst ├── license.rst ├── sg_execution_times.rst └── tutorial │ └── index.rst ├── _unittests ├── ut_df │ ├── data │ │ ├── buggy_hash.csv │ │ ├── buggy_hash2.csv │ │ ├── classic.json │ │ ├── example.json │ │ └── example2.json │ ├── test_connex_split.py │ ├── test_connex_split_big.py │ ├── test_connex_split_cat.py │ ├── test_dataframe_helpers.py │ ├── test_dataframe_helpers_simple.py │ ├── test_dataframe_io.py │ ├── test_dataframe_io_helpers.py │ ├── test_dataframe_sort.py │ ├── test_pandas_groupbynan.py │ └── test_streaming_dataframe.py └── ut_module │ └── test_sklearn.py ├── appveyor.yml ├── azure-pipelines.yml ├── pandas_streaming ├── __init__.py ├── data │ ├── __init__.py │ └── dummy.py ├── df │ ├── __init__.py │ ├── connex_split.py │ ├── dataframe.py │ ├── dataframe_helpers.py │ ├── dataframe_io.py │ ├── dataframe_io_helpers.py │ └── dataframe_split.py ├── exc │ ├── __init__.py │ └── exc_streaming.py └── ext_test_case.py ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg └── setup.py /.github/workflows/black-ruff.yml: -------------------------------------------------------------------------------- 1 | name: Black + Ruff Format Checker 2 | on: [push, pull_request] 3 | jobs: 4 | black-format-check: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v2 8 | - uses: psf/black@stable 9 | with: 10 | options: "--diff --check" 11 | src: "." 
12 | ruff-format-check: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - uses: chartboost/ruff-action@v1 17 | -------------------------------------------------------------------------------- /.github/workflows/check-urls.yml: -------------------------------------------------------------------------------- 1 | name: Check URLs 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | schedule: 7 | # ┌───────────── minute (0 - 59) 8 | # │ ┌───────────── hour (0 - 23) 9 | # │ │ ┌───────────── day of the month (1 - 31) 10 | # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) 11 | # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) 12 | # │ │ │ │ │ 13 | # │ │ │ │ │ 14 | # │ │ │ │ │ 15 | # * * * * * 16 | - cron: '30 1 * * 0' 17 | 18 | jobs: 19 | build: 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | 25 | - name: urls-checker-code 26 | uses: urlstechie/urlchecker-action@master 27 | with: 28 | subfolder: pandas_streaming 29 | file_types: .md,.py,.rst,.ipynb 30 | print_all: false 31 | timeout: 2 32 | retry_count# : 2 33 | # exclude_urls: https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz,https://dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz 34 | # exclude_patterns: https://dumps.wikimedia.org/ 35 | # force_pass : true 36 | 37 | - name: urls-checker-docs 38 | uses: urlstechie/urlchecker-action@master 39 | with: 40 | subfolder: _doc 41 | file_types: .md,.py,.rst,.ipynb 42 | print_all: false 43 | timeout: 2 44 | retry_count# : 2 45 | # exclude_urls: https://hal.archives-ouvertes.fr/hal-00990252/document 46 | exclude_patterns: https://circleci.com/gh/sdpython/pandas_streaming/ 47 | # force_pass : true 48 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "Code Scanning - Action" 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | schedule: 9 | # ┌───────────── minute (0 - 59) 10 | # │ ┌───────────── hour (0 - 23) 11 | # │ │ ┌───────────── day of the month (1 - 31) 12 | # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) 13 | # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) 14 | # │ │ │ │ │ 15 | # │ │ │ │ │ 16 | # │ │ │ │ │ 17 | # * * * * * 18 | - cron: '30 1 * * 0' 19 | 20 | jobs: 21 | CodeQL-Build: 22 | # CodeQL runs on ubuntu-latest, windows-latest, and macos-latest 23 | runs-on: ubuntu-latest 24 | 25 | permissions: 26 | # required for all workflows 27 | security-events: write 28 | 29 | # only required for workflows in private repositories 30 | actions: read 31 | contents: read 32 | 33 | steps: 34 | - name: Checkout repository 35 | uses: actions/checkout@v3 36 | 37 | # Initializes the CodeQL tools for scanning. 38 | - name: Initialize CodeQL 39 | uses: github/codeql-action/init@v2 40 | # Override language selection by uncommenting this and choosing your languages 41 | # with: 42 | # languages: go, javascript, csharp, python, cpp, java, ruby 43 | 44 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). 45 | # If this step fails, then you should remove it and run the build manually (see below). 46 | - name: Autobuild 47 | uses: github/codeql-action/autobuild@v2 48 | 49 | # ℹ️ Command-line programs to run using the OS shell. 
50 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 51 | 52 | # ✏️ If the Autobuild fails above, remove it and uncomment the following 53 | # three lines and modify them (or add more) to build your code if your 54 | # project uses a compiled language 55 | 56 | #- run: | 57 | # make bootstrap 58 | # make release 59 | 60 | - name: Perform CodeQL Analysis 61 | uses: github/codeql-action/analyze@v2 62 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: Documentation and Code Coverage 2 | 3 | on: 4 | push: 5 | pull_request: 6 | types: 7 | - closed 8 | branches: 9 | - main 10 | 11 | jobs: 12 | run: 13 | name: Build documentation on ${{ matrix.os }} 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | os: [ubuntu-latest] 18 | 19 | steps: 20 | - uses: actions/checkout@v3 21 | 22 | - uses: actions/setup-python@v4 23 | with: 24 | python-version: '3.11' 25 | 26 | - uses: tlylt/install-graphviz@v1 27 | 28 | - name: Install pandoc 29 | run: sudo apt-get install -y pandoc 30 | 31 | - name: Install requirements 32 | run: python -m pip install -r requirements.txt 33 | 34 | - name: Install requirements dev 35 | run: python -m pip install -r requirements-dev.txt 36 | 37 | - name: Cache pip 38 | uses: actions/cache@v2 39 | with: 40 | path: ~/.cache/pip 41 | key: ${{ runner.os }}-pip-${{ hashFiles('requirements-dev.txt') }} 42 | restore-keys: | 43 | ${{ runner.os }}-pip- 44 | ${{ runner.os }}- 45 | 46 | - name: Generate coverage report 47 | run: | 48 | pip install pytest 49 | pip install pytest-cov 50 | export PYTHONPATH=. 51 | pytest --cov=./pandas_streaming/ --cov-report=xml --durations=10 --ignore-glob=**LONG*.py --ignore-glob=**notebook*.py 52 | export PYTHONPATH= 53 | 54 | - name: Upload coverage reports to Codecov 55 | uses: codecov/codecov-action@v3 56 | env: 57 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 58 | 59 | - name: Install 60 | run: python setup.py install 61 | 62 | - name: Copy license, changelogs 63 | run: | 64 | cp LICENSE* ./_doc 65 | cp CHANGELOGS* ./_doc 66 | 67 | - name: Documentation 68 | run: python -m sphinx ./_doc ./dist/html -n -w doc.txt 69 | 70 | - name: Summary 71 | run: cat doc.txt 72 | 73 | - name: Check for errors and warnings 74 | run: | 75 | if [[ $(grep ERROR doc.txt) ]]; then 76 | echo "Documentation produces errors." 77 | grep ERROR doc.txt 78 | exit 1 79 | fi 80 | if [[ $(grep WARNING doc.txt | grep -v 'std:term:y') ]]; then 81 | echo "Documentation produces warnings." 82 | grep WARNING doc.txt 83 | exit 1 84 | fi 85 | 86 | - uses: actions/upload-artifact@v3 87 | with: 88 | path: ./dist/html/** 89 | -------------------------------------------------------------------------------- /.github/workflows/wheels-any.yml: -------------------------------------------------------------------------------- 1 | name: Build Any Wheel 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - 'releases/**' 8 | 9 | jobs: 10 | build_wheels: 11 | name: Build wheels on ${{ matrix.os }} 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest] 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: actions/setup-python@v4 21 | with: 22 | python-version: '3.11' 23 | 24 | - name: build wheel 25 | run: python -m pip wheel . 
26 | 27 | - uses: actions/upload-artifact@v3 28 | with: 29 | path: ./pandas_streaming*.whl 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyd 3 | *.dylib 4 | *.so 5 | *.whl 6 | *.csv 7 | *.zip 8 | coverage.html/* 9 | _cache/* 10 | .coverage 11 | dist/* 12 | build/* 13 | .eggs/* 14 | .hypothesis/* 15 | *egg-info/* 16 | prof 17 | _doc/CHANGELOGS.rst 18 | _doc/LICENSE.txt 19 | _doc/auto_examples/* 20 | _doc/examples/_cache/* 21 | _doc/examples/plot_*.png 22 | _doc/examples/plot_*.xlsx 23 | _doc/examples/*.html 24 | _doc/_static/require.js 25 | _doc/_static/viz.js 26 | _unittests/ut__main/*.png 27 | _unittests/ut__main/_cache/* 28 | _unittests/ut__main/*.html 29 | _unittests/.hypothesis/* 30 | -------------------------------------------------------------------------------- /.local.jenkins.lin.yml: -------------------------------------------------------------------------------- 1 | 2 | language: python 3 | 4 | python: 5 | - { PATH: "{{Python39}}", VERSION: 3.9, DIST: std, PYINT: python3.9 } 6 | 7 | virtualenv: 8 | - path: {{ospathjoin(root_path, pickname("$NAME_JENKINS", project_name + "_$VERSION_$DIST_$NAME"), "_venv")}} 9 | 10 | install: 11 | - $PYINT -m pip install --upgrade pip 12 | - $PYINT -m pip install --upgrade --no-cache-dir --no-deps --index http://localhost:8067/simple/ jyquickhelper pandas_streaming --extra-index-url=https://pypi.python.org/simple/ 13 | - $PYINT -m pip install -r requirements.txt 14 | - $PYINT -m pip install -r requirements-dev.txt 15 | - $PYINT --version 16 | - $PYINT -m pip freeze 17 | 18 | script: 19 | - { CMD: "$PYINT -u setup.py unittests --covtoken=14c7930a-a5c0-405d-a22f-3f9c6feaf0bc", NAME: "UT" } 20 | 21 | after_script: 22 | - $PYINT -u setup.py bdist_wheel 23 | - if [ ${NAME} == "UT" ] then cp dist/*.whl {{root_path}}/../local_pypi/local_pypi_server fi 24 | 25 | documentation: 26 | - if [ ${NAME} == "UT" ] then $PYINT -u setup.py build_sphinx --layout=html fi 27 | - if [ ${NAME} == "UT" ] then cp -R -f _doc/sphinxdoc/build/html dist/html fi 28 | -------------------------------------------------------------------------------- /CHANGELOGS.rst: -------------------------------------------------------------------------------- 1 | 2 | Change Logs 3 | =========== 4 | 5 | 0.5.1 6 | +++++ 7 | 8 | * :pr:`43`: improves reproducibility of function train_test_apart_stratify 9 | 10 | 0.5.0 11 | +++++ 12 | 13 | * :pr:`33`: removes pyquickhelper dependency 14 | * :pr:`30`: fix compatiblity with pandas 2.0 15 | 16 | 0.3.239 17 | +++++++ 18 | 19 | * :pr:`27`: Fixes json parser when input is a stream (2021-10-26) 20 | * :pr:`26`: Fixes bug while reading json (iterator failed to be created twice) (2021-10-26) 21 | * :pr:`25`: Fixes documentation (2021-10-18) 22 | * :pr:`24`: Implements a first version of sort_values. (2021-10-18) 23 | * :pr:`23`: First version of operator __setitem__ (2021-10-16) 24 | * :pr:`22`: Fixes nan values after pandas update, add documentation example to the unit tests (2021-07-11) 25 | * :pr:`21`: Fixes grouping by nan values after update pandas to 1.3.0 (2021-07-10) 26 | * :pr:`17`: Implements method describe (2021-04-08) 27 | 28 | 0.2.175 29 | +++++++ 30 | 31 | * :pr:`16`: Unit tests failing with pandas 1.1.0. 
(2020-08-06) 32 | * :pr:`15`: implements parameter lines, flatten for read_json (2018-11-21) 33 | * :pr:`14`: implements fillna (2018-10-29) 34 | * :pr:`13`: implement concat for axis=0,1 (2018-10-26) 35 | * :pr:`12`: add groupby_streaming (2018-10-26) 36 | * :pr:`11`: add method add_column (2018-10-26) 37 | * :pr:`10`: plan B to bypass a bug in pandas about read_csv when iterator=True --> closed, pandas has a weird behaviour when names is too small compare to the number of columns (2018-10-26) 38 | * :pr:`9`: head is very slow (2018-10-26) 39 | * :pr:`8`: fix pandas_streaming for pandas 0.23.1 (2018-07-31) 40 | * :pr:`7`: implement read_json (2018-05-17) 41 | * :pr:`6`: add pandas_groupby_nan from pyensae (2018-05-17) 42 | * :pr:`5`: add random_state parameter to splitting functions (2018-02-04) 43 | * :pr:`2`: add method sample, resevoir sampling (2017-11-05) 44 | * :pr:`3`: method train_test_split for out-of-memory datasets (2017-10-21) 45 | * :pr:`1`: Excited for your project (2017-10-10) 46 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | We are a community based on openness, as well as friendly and didactic discussions. 4 | 5 | We aspire to treat everybody equally, and value their contributions. 6 | 7 | Decisions are made based on technical merit and consensus. 8 | 9 | Code is not the only way to help the project. Reviewing pull requests, 10 | answering questions to help others on mailing lists or issues, organizing and 11 | teaching tutorials, working on the website, improving the documentation, are 12 | all priceless contributions. 13 | 14 | We abide by the principles of openness, respect, and consideration of others of 15 | the Python Software Foundation: https://www.python.org/psf/codeofconduct/ 16 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017-2024, Xavier Dupré 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune _doc 2 | prune _unittests 3 | exclude *.bat 4 | exclude *.yml 5 | exclude *.git* 6 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | pandas-streaming: streaming API over pandas 2 | =========================================== 3 | 4 | .. image:: https://ci.appveyor.com/api/projects/status/4te066r8ne1ymmhy?svg=true 5 | :target: https://ci.appveyor.com/project/sdpython/pandas-streaming 6 | :alt: Build Status Windows 7 | 8 | .. image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming 9 | :target: https://dev.azure.com/xavierdupre3/pandas_streaming/ 10 | 11 | .. image:: https://badge.fury.io/py/pandas_streaming.svg 12 | :target: http://badge.fury.io/py/pandas_streaming 13 | 14 | .. image:: https://img.shields.io/badge/license-MIT-blue.svg 15 | :alt: MIT License 16 | :target: https://opensource.org/license/MIT/ 17 | 18 | .. image:: https://codecov.io/gh/sdpython/pandas-streaming/branch/main/graph/badge.svg?token=0caHX1rhr8 19 | :target: https://codecov.io/gh/sdpython/pandas-streaming 20 | 21 | .. image:: http://img.shields.io/github/issues/sdpython/pandas_streaming.png 22 | :alt: GitHub Issues 23 | :target: https://github.com/sdpython/pandas_streaming/issues 24 | 25 | .. image:: https://pepy.tech/badge/pandas_streaming/month 26 | :target: https://pepy.tech/project/pandas_streaming/month 27 | :alt: Downloads 28 | 29 | .. image:: https://img.shields.io/github/forks/sdpython/pandas_streaming.svg 30 | :target: https://github.com/sdpython/pandas_streaming/ 31 | :alt: Forks 32 | 33 | .. image:: https://img.shields.io/github/stars/sdpython/pandas_streaming.svg 34 | :target: https://github.com/sdpython/pandas_streaming/ 35 | :alt: Stars 36 | 37 | .. image:: https://img.shields.io/github/repo-size/sdpython/pandas_streaming 38 | :target: https://github.com/sdpython/pandas_streaming/ 39 | :alt: size 40 | 41 | `pandas-streaming `_ 42 | aims at processing big files with `pandas `_, 43 | too big to hold in memory, too small to be parallelized with a significant gain. 44 | The module replicates a subset of *pandas* API 45 | and implements other functionalities for machine learning. 46 | 47 | .. code-block:: python 48 | 49 | from pandas_streaming.df import StreamingDataFrame 50 | sdf = StreamingDataFrame.read_csv("filename", sep="\t", encoding="utf-8") 51 | 52 | for df in sdf: 53 | # process this chunk of data 54 | # df is a dataframe 55 | print(df) 56 | 57 | The module can also stream an existing dataframe. 58 | 59 | .. code-block:: python 60 | 61 | import pandas 62 | df = pandas.DataFrame([dict(cf=0, cint=0, cstr="0"), 63 | dict(cf=1, cint=1, cstr="1"), 64 | dict(cf=3, cint=3, cstr="3")]) 65 | 66 | from pandas_streaming.df import StreamingDataFrame 67 | sdf = StreamingDataFrame.read_df(df) 68 | 69 | for df in sdf: 70 | # process this chunk of data 71 | # df is a dataframe 72 | print(df) 73 | 74 | It contains other helpers to split datasets into 75 | train and test with some weird constraints. 
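For example, the ``train_test_connex_split`` function keeps all rows sharing the same ids (user, product, ...) on the same side of the split. The snippet below is a minimal sketch taken from the unit tests; the column names and parameters are only illustrative.

.. code-block:: python

    import pandas
    from pandas_streaming.df import train_test_connex_split

    df = pandas.DataFrame([dict(user="UA", prod="PA", card="C1"),
                           dict(user="UA", prod="PB", card="C1"),
                           dict(user="UB", prod="PC", card="C2"),
                           dict(user="UB", prod="PD", card="C2"),
                           dict(user="UC", prod="PE", card="C3"),
                           dict(user="UC", prod="PF", card="C4"),
                           dict(user="UD", prod="PG", card="C5")])

    # rows sharing a user, product or card end up on the same side of the split
    train, test = train_test_connex_split(
        df, test_size=0.5, groups=["user", "prod", "card"], fail_imbalanced=0.4)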
76 | -------------------------------------------------------------------------------- /_doc/_static/git_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/pandas-streaming/4a2927bbc960c8f73f4de188a3c43ddf97015eac/_doc/_static/git_logo.png -------------------------------------------------------------------------------- /_doc/_static/project_ico.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/pandas-streaming/4a2927bbc960c8f73f4de188a3c43ddf97015eac/_doc/_static/project_ico.ico -------------------------------------------------------------------------------- /_doc/_static/project_ico.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/pandas-streaming/4a2927bbc960c8f73f4de188a3c43ddf97015eac/_doc/_static/project_ico.png -------------------------------------------------------------------------------- /_doc/api/connex_split.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.df.connex_split 3 | ================================ 4 | 5 | .. automodule:: pandas_streaming.df.connex_split 6 | :members: 7 | -------------------------------------------------------------------------------- /_doc/api/dataframe.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.df.dataframe 3 | ============================= 4 | 5 | StreamingDataFrameSchemaError 6 | +++++++++++++++++++++++++++++ 7 | 8 | .. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrameSchemaError 9 | :members: 10 | 11 | StreamingDataFrame 12 | ++++++++++++++++++ 13 | 14 | .. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrame 15 | :members: 16 | :special-members: 17 | 18 | StreamingSeries 19 | +++++++++++++++ 20 | 21 | .. autoclass:: pandas_streaming.df.dataframe.StreamingSeries 22 | :members: 23 | -------------------------------------------------------------------------------- /_doc/api/dataframe_io.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.df.dataframe_io 3 | ================================ 4 | 5 | .. automodule:: pandas_streaming.df.dataframe_io 6 | :members: 7 | -------------------------------------------------------------------------------- /_doc/api/dataframe_split.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.df.dataframe_split 3 | =================================== 4 | 5 | .. automodule:: pandas_streaming.df.dataframe_split 6 | :members: 7 | -------------------------------------------------------------------------------- /_doc/api/index.rst: -------------------------------------------------------------------------------- 1 | 2 | API 3 | === 4 | 5 | .. toctree:: 6 | 7 | rdata 8 | rdf 9 | rexc 10 | rio 11 | -------------------------------------------------------------------------------- /_doc/api/rdata.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.data 3 | ===================== 4 | 5 | Collection of functions which produces 6 | :class:`StreamingDataFrame `. 7 | 8 | .. 
autofunction:: pandas_streaming.data.dummy.dummy_streaming_dataframe 9 | -------------------------------------------------------------------------------- /_doc/api/rdf.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.df 3 | =================== 4 | 5 | Streaming 6 | +++++++++ 7 | 8 | The main class is an interface which mimics the 9 | :class:`pandas.DataFrame` interface to offer 10 | a short list of methods which apply to an 11 | iterator of dataframes. This provides, in effect, 12 | a streaming version of it. As a result, the creation 13 | of an instance is fast as long as the data is not 14 | processed. Iterators can be chained, as many map-reduce 15 | frameworks do. 16 | 17 | .. toctree:: 18 | :maxdepth: 2 19 | 20 | dataframe 21 | 22 | The module implements additional useful functions 23 | which are not necessarily specific to the streaming version of the dataframes. 24 | Many methods have been rewritten to support 25 | streaming. Among them, IO methods: 26 | :meth:`read_csv `, 27 | :meth:`read_df `, 28 | :meth:`read_json `. 29 | 30 | Data Manipulation 31 | +++++++++++++++++ 32 | 33 | .. autofunction:: pandas_streaming.df.dataframe_helpers.dataframe_hash_columns 34 | 35 | .. autofunction:: pandas_streaming.df.connex_split.dataframe_shuffle 36 | 37 | .. autofunction:: pandas_streaming.df.dataframe_helpers.dataframe_unfold 38 | 39 | .. autofunction:: pandas_streaming.df.dataframe_helpers.pandas_groupby_nan 40 | 41 | Complex splits 42 | ++++++++++++++ 43 | 44 | Splitting a database into train and test is usually simple except 45 | if rows are not independent and share some ids. In that case, 46 | the following functions will try to build two partitions keeping 47 | ids separate, or as separate as possible: 48 | :func:`train_test_apart_stratify `, 49 | :func:`train_test_connex_split `, 50 | :func:`train_test_split_weights `. 51 | 52 | Extensions 53 | ++++++++++ 54 | 55 | .. toctree:: 56 | :maxdepth: 1 57 | 58 | connex_split 59 | dataframe_io 60 | dataframe_split 61 | -------------------------------------------------------------------------------- /_doc/api/rexc.rst: -------------------------------------------------------------------------------- 1 | 2 | pandas_streaming.exc 3 | ==================== 4 | 5 | Exceptions. 6 | 7 | .. autoclass:: pandas_streaming.exc.exc_streaming.StreamingInefficientException 8 | -------------------------------------------------------------------------------- /_doc/api/rio.rst: -------------------------------------------------------------------------------- 1 | 2 | Inputs / Outputs 3 | ================ 4 | 5 | Dataframes / Numpy arrays 6 | +++++++++++++++++++++++++ 7 | 8 | `HDF5 `_ 9 | is easy to manipulate in the :epkg:`Python` world but difficult 10 | to exchange with other people and other environments. 11 | The two following functions make it easier to collapse many dataframes 12 | or numpy arrays into a single file. The data can be unzipped afterwards, 13 | see :func:`read_zip `, 14 | :func:`to_zip `.
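A minimal usage sketch, mirrored from the unit tests (the file name below is arbitrary):

.. code-block:: python

    import pandas
    from pandas_streaming.df import to_zip, read_zip

    df = pandas.DataFrame([dict(a=1, b="e"), dict(a=2, b="f")])

    # collapse the dataframe into a single zip file, then read it back
    to_zip(df, "df.zip", encoding="utf-8", index=False)
    df2 = read_zip("df.zip", encoding="utf-8")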
15 | -------------------------------------------------------------------------------- /_doc/conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from sphinx_runpython.github_link import make_linkcode_resolve 4 | from sphinx_runpython.conf_helper import has_dvipng, has_dvisvgm 5 | from pandas_streaming import __version__ 6 | 7 | 8 | extensions = [ 9 | "nbsphinx", 10 | "sphinx.ext.autodoc", 11 | "sphinx.ext.coverage", 12 | "sphinx.ext.githubpages", 13 | "sphinx.ext.ifconfig", 14 | "sphinx.ext.intersphinx", 15 | "sphinx.ext.linkcode", 16 | "sphinx.ext.viewcode", 17 | "sphinx.ext.napoleon", 18 | "sphinx.ext.todo", 19 | "sphinx_gallery.gen_gallery", 20 | "sphinx_issues", 21 | "sphinx_runpython.blocdefs.sphinx_exref_extension", 22 | "sphinx_runpython.blocdefs.sphinx_mathdef_extension", 23 | "sphinx_runpython.epkg", 24 | "sphinx_runpython.gdot", 25 | "sphinx_runpython.runpython", 26 | "matplotlib.sphinxext.plot_directive", 27 | ] 28 | 29 | if has_dvisvgm(): 30 | extensions.append("sphinx.ext.imgmath") 31 | imgmath_image_format = "svg" 32 | elif has_dvipng(): 33 | extensions.append("sphinx.ext.pngmath") 34 | imgmath_image_format = "png" 35 | else: 36 | extensions.append("sphinx.ext.mathjax") 37 | 38 | templates_path = ["_templates"] 39 | html_logo = "_static/project_ico.png" 40 | source_suffix = ".rst" 41 | master_doc = "index" 42 | project = "pandas-streaming" 43 | copyright = "2017-2024, Xavier Dupré" 44 | author = "Xavier Dupré" 45 | version = __version__ 46 | release = __version__ 47 | language = "en" 48 | exclude_patterns = ["auto_examples/*.ipynb"] 49 | pygments_style = "sphinx" 50 | todo_include_todos = True 51 | nbsphinx_execute = "never" 52 | 53 | html_theme = "furo" 54 | html_theme_path = ["_static"] 55 | html_theme_options = {} 56 | html_sourcelink_suffix = "" 57 | html_static_path = ["_static"] 58 | 59 | issues_github_path = "sdpython/pandas-streaming" 60 | 61 | # The following is used by sphinx.ext.linkcode to provide links to github 62 | linkcode_resolve = make_linkcode_resolve( 63 | "pandas_streaming", 64 | ( 65 | "https://github.com/sdpython/pandas-streaming/" 66 | "blob/{revision}/{package}/" 67 | "{path}#L{lineno}" 68 | ), 69 | ) 70 | 71 | latex_elements = { 72 | "papersize": "a4", 73 | "pointsize": "10pt", 74 | "title": project, 75 | } 76 | 77 | mathjax3_config = {"chtml": {"displayAlign": "left"}} 78 | 79 | intersphinx_mapping = { 80 | "onnx": ("https://onnx.ai/onnx/", None), 81 | "matplotlib": ("https://matplotlib.org/", None), 82 | "numpy": ("https://numpy.org/doc/stable", None), 83 | "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), 84 | "python": (f"https://docs.python.org/{sys.version_info.major}", None), 85 | "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), 86 | "sklearn": ("https://scikit-learn.org/stable/", None), 87 | "sklearn-onnx": ("https://onnx.ai/sklearn-onnx/", None), 88 | "torch": ("https://pytorch.org/docs/stable/", None), 89 | } 90 | 91 | # Check intersphinx reference targets exist 92 | nitpicky = True 93 | # See also scikit-learn/scikit-learn#26761 94 | nitpick_ignore = [ 95 | ("py:class", "False"), 96 | ("py:class", "True"), 97 | ("py:class", "pipeline.Pipeline"), 98 | ("py:class", "default=sklearn.utils.metadata_routing.UNCHANGED"), 99 | ] 100 | 101 | sphinx_gallery_conf = { 102 | # path to your examples scripts 103 | "examples_dirs": os.path.join(os.path.dirname(__file__), "examples"), 104 | # path where to save gallery generated examples 105 | 
"gallery_dirs": "auto_examples", 106 | } 107 | 108 | # next 109 | 110 | preamble = """ 111 | \\usepackage{etex} 112 | \\usepackage{fixltx2e} % LaTeX patches, \\textsubscript 113 | \\usepackage{cmap} % fix search and cut-and-paste in Acrobat 114 | \\usepackage[raccourcis]{fast-diagram} 115 | \\usepackage{titlesec} 116 | \\usepackage{amsmath} 117 | \\usepackage{amssymb} 118 | \\usepackage{amsfonts} 119 | \\usepackage{graphics} 120 | \\usepackage{epic} 121 | \\usepackage{eepic} 122 | %\\usepackage{pict2e} 123 | %%% Redefined titleformat 124 | \\setlength{\\parindent}{0cm} 125 | \\setlength{\\parskip}{1ex plus 0.5ex minus 0.2ex} 126 | \\newcommand{\\hsp}{\\hspace{20pt}} 127 | \\newcommand{\\acc}[1]{\\left\\{#1\\right\\}} 128 | \\newcommand{\\cro}[1]{\\left[#1\\right]} 129 | \\newcommand{\\pa}[1]{\\left(#1\\right)} 130 | \\newcommand{\\R}{\\mathbb{R}} 131 | \\newcommand{\\HRule}{\\rule{\\linewidth}{0.5mm}} 132 | %\\titleformat{\\chapter}[hang]{\\Huge\\bfseries\\sffamily}{\\thechapter\\hsp}{0pt}{\\Huge\\bfseries\\sffamily} 133 | 134 | \\usepackage[all]{xy} 135 | \\newcommand{\\vecteur}[2]{\\pa{#1,\\dots,#2}} 136 | \\newcommand{\\N}[0]{\\mathbb{N}} 137 | \\newcommand{\\indicatrice}[1]{ {1\\!\\!1}_{\\acc{#1}} } 138 | \\newcommand{\\infegal}[0]{\\leqslant} 139 | \\newcommand{\\supegal}[0]{\\geqslant} 140 | \\newcommand{\\ensemble}[2]{\\acc{#1,\\dots,#2}} 141 | \\newcommand{\\fleche}[1]{\\overrightarrow{ #1 }} 142 | \\newcommand{\\intervalle}[2]{\\left\\{#1,\\cdots,#2\\right\\}} 143 | \\newcommand{\\independant}[0]{\\perp \\!\\!\\! \\perp} 144 | \\newcommand{\\esp}{\\mathbb{E}} 145 | \\newcommand{\\espf}[2]{\\mathbb{E}_{#1}\\pa{#2}} 146 | \\newcommand{\\var}{\\mathbb{V}} 147 | \\newcommand{\\pr}[1]{\\mathbb{P}\\pa{#1}} 148 | \\newcommand{\\loi}[0]{{\\cal L}} 149 | \\newcommand{\\vecteurno}[2]{#1,\\dots,#2} 150 | \\newcommand{\\norm}[1]{\\left\\Vert#1\\right\\Vert} 151 | \\newcommand{\\norme}[1]{\\left\\Vert#1\\right\\Vert} 152 | \\newcommand{\\scal}[2]{\\left<#1,#2\\right>} 153 | \\newcommand{\\dans}[0]{\\rightarrow} 154 | \\newcommand{\\partialfrac}[2]{\\frac{\\partial #1}{\\partial #2}} 155 | \\newcommand{\\partialdfrac}[2]{\\dfrac{\\partial #1}{\\partial #2}} 156 | \\newcommand{\\trace}[1]{tr\\pa{#1}} 157 | \\newcommand{\\sac}[0]{|} 158 | \\newcommand{\\abs}[1]{\\left|#1\\right|} 159 | \\newcommand{\\loinormale}[2]{{\\cal N} \\pa{#1,#2}} 160 | \\newcommand{\\loibinomialea}[1]{{\\cal B} \\pa{#1}} 161 | \\newcommand{\\loibinomiale}[2]{{\\cal B} \\pa{#1,#2}} 162 | \\newcommand{\\loimultinomiale}[1]{{\\cal M} \\pa{#1}} 163 | \\newcommand{\\variance}[1]{\\mathbb{V}\\pa{#1}} 164 | \\newcommand{\\intf}[1]{\\left\\lfloor #1 \\right\\rfloor} 165 | """ 166 | 167 | imgmath_latex_preamble = preamble 168 | latex_elements["preamble"] = imgmath_latex_preamble 169 | 170 | 171 | epkg_dictionary = { 172 | "csv": "https://en.wikipedia.org/wiki/Comma-separated_values", 173 | "dask": "https://dask.pydata.org/en/latest/", 174 | "dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", 175 | "Dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", 176 | "DataFrame": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", 177 | "dataframes": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", 178 | "dill": "https://dill.readthedocs.io/en/latest/dill.html", 179 | "groupby and missing values": "https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html", 180 | "Jupyter": 
"https://jupyter.org/", 181 | "Hadoop": "http://hadoop.apache.org/", 182 | "ijson": "https://github.com/ICRAR/ijson", 183 | "json": "https://docs.python.org/3/library/json.html", 184 | "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", 185 | "numpy": "https://numpy.org/", 186 | "pandas": ( 187 | "http://pandas.pydata.org/pandas-docs/stable/", 188 | ( 189 | "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.html", 190 | 1, 191 | ), 192 | ( 193 | "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.{1}.html", 194 | 2, 195 | ), 196 | ), 197 | "pyarrow": "https://arrow.apache.org/docs/python/", 198 | "pyspark": "http://spark.apache.org/docs/2.1.1/api/python/index.html", 199 | "Python": "https://www.python.org/", 200 | "scikit-learn": "https://scikit-learn.org/stable/", 201 | "scikit-multiflow": "https://scikit-multiflow.github.io/", 202 | "sklearn": ( 203 | "https://scikit-learn.org/stable/", 204 | ("https://scikit-learn.org/stable/modules/generated/{0}.html", 1), 205 | ("https://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), 206 | ), 207 | "streamz": "https://streamz.readthedocs.io/en/latest/index.html", 208 | "tornado": "https://www.tornadoweb.org/en/stable/", 209 | "zip": "https://en.wikipedia.org/wiki/ZIP_(file_format)", 210 | } 211 | -------------------------------------------------------------------------------- /_doc/examples/README.txt: -------------------------------------------------------------------------------- 1 | Gallery of Examples 2 | =================== 3 | 4 | -------------------------------------------------------------------------------- /_doc/examples/first_step.py: -------------------------------------------------------------------------------- 1 | """ 2 | First steps with pandas_streaming 3 | ================================= 4 | 5 | A few difference between :epkg:`pandas` and *pandas_streaming*. 6 | 7 | pandas to pandas_streaming 8 | ++++++++++++++++++++++++++ 9 | """ 10 | 11 | import glob 12 | from pandas import DataFrame 13 | from pandas_streaming.df import StreamingDataFrame 14 | 15 | 16 | df = DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"])) 17 | df 18 | 19 | 20 | ############################# 21 | # We create a streaming dataframe: 22 | 23 | 24 | sdf = StreamingDataFrame.read_df(df) 25 | sdf 26 | 27 | 28 | ################################ 29 | # 30 | 31 | sdf.to_dataframe() 32 | 33 | 34 | ######################################## 35 | # Internally, StreamingDataFrame implements an iterator on 36 | # dataframes and then tries to replicate the same interface as 37 | # :class:`pandas.DataFrame` possibly wherever it is possible to 38 | # manipulate data without loading everything into memory. 39 | 40 | 41 | sdf2 = sdf.concat(sdf) 42 | sdf2.to_dataframe() 43 | 44 | 45 | ############################### 46 | # 47 | 48 | m = DataFrame(dict(Y=["a", "b"], Z=[10, 20])) 49 | m 50 | 51 | 52 | ########################################## 53 | # 54 | 55 | sdf3 = sdf2.merge(m, left_on="Y", right_on="Y", how="outer") 56 | sdf3.to_dataframe() 57 | 58 | 59 | ############################################ 60 | # 61 | 62 | sdf2.to_dataframe().merge(m, left_on="Y", right_on="Y", how="outer") 63 | 64 | 65 | ############################################ 66 | # The order might be different. 
67 | 68 | 69 | sdftr, sdfte = sdf2.train_test_split(test_size=0.5) 70 | sdfte.head() 71 | 72 | 73 | ############################################ 74 | # 75 | 76 | 77 | sdftr.head() 78 | 79 | 80 | ############################################ 81 | # split a big file 82 | # ++++++++++++++++ 83 | 84 | 85 | sdf2.to_csv("example.txt") 86 | 87 | 88 | ############################################ 89 | # 90 | 91 | 92 | new_sdf = StreamingDataFrame.read_csv("example.txt") 93 | new_sdf.train_test_split("example.{}.txt", streaming=False) 94 | 95 | 96 | ############################################ 97 | # 98 | 99 | glob.glob("ex*.txt") 100 | -------------------------------------------------------------------------------- /_doc/i_ex.rst: -------------------------------------------------------------------------------- 1 | 2 | Examples 3 | ======== 4 | 5 | About array 6 | +++++++++++ 7 | 8 | .. exreflist:: 9 | :contents: 10 | :tag: array 11 | 12 | About DataFrame 13 | +++++++++++++++ 14 | 15 | .. exreflist:: 16 | :contents: 17 | :tag: dataframe 18 | 19 | About StreamingDataFrame 20 | ++++++++++++++++++++++++ 21 | 22 | .. exreflist:: 23 | :contents: 24 | :tag: streaming 25 | -------------------------------------------------------------------------------- /_doc/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. |gitlogo| image:: _static/git_logo.png 3 | :height: 20 4 | 5 | pandas-streaming: streaming API over pandas 6 | =========================================== 7 | 8 | .. image:: https://ci.appveyor.com/api/projects/status/4te066r8ne1ymmhy?svg=true 9 | :target: https://ci.appveyor.com/project/sdpython/pandas-streaming 10 | :alt: Build Status Windows 11 | 12 | .. image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming 13 | :target: https://dev.azure.com/xavierdupre3/pandas_streaming/ 14 | 15 | .. image:: https://badge.fury.io/py/pandas_streaming.svg 16 | :target: http://badge.fury.io/py/pandas-streaming 17 | 18 | .. image:: https://img.shields.io/badge/license-MIT-blue.svg 19 | :alt: MIT License 20 | :target: https://opensource.org/license/MIT/ 21 | 22 | .. image:: https://codecov.io/gh/sdpython/pandas-streaming/branch/main/graph/badge.svg?token=0caHX1rhr8 23 | :target: https://codecov.io/gh/sdpython/pandas-streaming 24 | 25 | .. image:: http://img.shields.io/github/issues/sdpython/pandas_streaming.png 26 | :alt: GitHub Issues 27 | :target: https://github.com/sdpython/pandas_streaming/issues 28 | 29 | .. image:: https://pepy.tech/badge/pandas_streaming 30 | :target: https://pypi.org/project/pandas_streaming/ 31 | :alt: Downloads 32 | 33 | .. image:: https://img.shields.io/github/forks/sdpython/pandas_streaming.svg 34 | :target: https://github.com/sdpython/pandas_streaming/ 35 | :alt: Forks 36 | 37 | .. image:: https://img.shields.io/github/stars/sdpython/pandas_streaming.svg 38 | :target: https://github.com/sdpython/pandas_streaming/ 39 | :alt: Stars 40 | 41 | .. image:: https://img.shields.io/github/repo-size/sdpython/pandas_streaming 42 | :target: https://github.com/sdpython/pandas_streaming/ 43 | :alt: size 44 | 45 | *pandas_streaming* aims at processing big files with :epkg:`pandas`, 46 | too big to hold in memory, too small to be parallelized with a significant gain. 47 | The module replicates a subset of :epkg:`pandas` API 48 | and implements other functionalities for machine learning. 49 | 50 | .. 
toctree:: 51 | :maxdepth: 1 52 | :caption: Contents 53 | 54 | tutorial/index 55 | auto_examples/index 56 | api/index 57 | i_ex 58 | 59 | .. toctree:: 60 | :maxdepth: 1 61 | :caption: More 62 | 63 | CHANGELOGS 64 | license 65 | 66 | Sources are available at `sdpython/pandas_streaming `_. 67 | 68 | Older versions 69 | ++++++++++++++ 70 | 71 | * `0.5.1 <../v0.5.1/index.html>`_ 72 | * `0.5.0 <../v0.5.0/index.html>`_ 73 | -------------------------------------------------------------------------------- /_doc/license.rst: -------------------------------------------------------------------------------- 1 | .. _l-license: 2 | 3 | License 4 | ======= 5 | 6 | .. include:: LICENSE.txt 7 | :literal: 8 | -------------------------------------------------------------------------------- /_doc/sg_execution_times.rst: -------------------------------------------------------------------------------- 1 | 2 | :orphan: 3 | 4 | .. _sphx_glr_sg_execution_times: 5 | 6 | 7 | Computation times 8 | ================= 9 | **00:00.000** total execution time for 1 file **from all galleries**: 10 | 11 | .. container:: 12 | 13 | .. raw:: html 14 | 15 | 19 | 20 | 21 | 22 | 27 | 28 | .. list-table:: 29 | :header-rows: 1 30 | :class: table table-striped sg-datatable 31 | 32 | * - Example 33 | - Time 34 | - Mem (MB) 35 | * - :ref:`sphx_glr_auto_examples_first_step.py` (``examples/first_step.py``) 36 | - 00:00.000 37 | - 0.0 38 | -------------------------------------------------------------------------------- /_doc/tutorial/index.rst: -------------------------------------------------------------------------------- 1 | 2 | Tutorial 3 | ======== 4 | 5 | The main class :class:`StreamingDataFrame ` 6 | is basically an iterator on dataframes. Altogether, it is a 7 | single dataframe which does not have to fit in memory. 8 | It implements a subset of the functionalities :epkg:`pandas` provides 9 | related to map reduce such as 10 | :meth:`concat `, 11 | :meth:`join `. 12 | Both return a :class:`StreamingDataFrame ` 13 | as opposed to :meth:`groupby ` 14 | which does not. 15 | 16 | The beginning is always the same: we create such an object with one 17 | of the methods :meth:`read_csv `, 18 | :meth:`read_df `, 19 | :meth:`read_str `. 20 | The module was initially created to easily split a dataset into train/test 21 | when it does not fit into memory. 22 | 23 | :: 24 | 25 | from pandas_streaming.df import StreamingDataFrame 26 | sdf = StreamingDataFrame.read_csv("", sep="\t") 27 | sdf.train_test_split("dataset_split_{}.txt", sep="\t") 28 | 29 | >>> ['dataset_split_train.txt', 'dataset_split_test.txt'] 30 | 31 | Objectives and Competitors 32 | ++++++++++++++++++++++++++ 33 | 34 | The first objective is speed. 35 | :class:`StreamingDataFrame ` 36 | is useful when the user needs to process a large data set which does not 37 | hold in memory (*out-of-memory dataset*) or when the user needs to quickly 38 | check an algorithm on the beginning of a big dataset without paying the 39 | cost of loading the data. 40 | 41 | The second objective is simplicity. The proposed interface 42 | tries to follow the same syntax as :epkg:`pandas`. 43 | That is one of the directions followed by :epkg:`dask`. 44 | 45 | :epkg:`dask` tries to address these two objectives 46 | and also offers parallelization. Based on my experience, 47 | :epkg:`dask` is efficient but tends to be slow for simple things 48 | on medium datasets (a couple of gigabytes). The API is not exactly 49 | the same either. The parser does not behave exactly the same.
50 | :epkg:`pyspark` carries a bit of overhead, is more difficult 51 | to install and still slow if it is used locally. 52 | :epkg:`pyarrow` is supposed to be the next :epkg:`pandas` but its 53 | scope is larger (it handles streaming datasets from :epkg:`Hadoop`) 54 | and does not work yet with :epkg:`scikit-learn`. 55 | I expect this module to remain relevant until 56 | :epkg:`scikit-learn` updates its code to handle 57 | a streaming container. That one will probably be 58 | the winner. 59 | :epkg:`streamz` follows a different direction. 60 | It offers parallelisation and relies on :epkg:`tornado` but not 61 | on :epkg:`pandas`, meaning that using it for machine learning 62 | might hide some unexpected loopholes. 63 | :epkg:`scikit-multiflow` does not only implement streaming 64 | containers but also streaming machine learning training. 65 | 66 | One element of design to remember 67 | +++++++++++++++++++++++++++++++++ 68 | 69 | The class :class:`StreamingDataFrame ` 70 | does not hold an iterator but a function which creates an iterator. 71 | Every time the user writes the following loop, the function is called 72 | to create an iterator which is then used to walk through the data. 73 | 74 | .. runpython:: 75 | :showcode: 76 | 77 | import pandas 78 | df = pandas.DataFrame([dict(cf=0, cint=0, cstr="0"), dict(cf=1, cint=1, cstr="1"), 79 | dict(cf=3, cint=3, cstr="3")]) 80 | 81 | from pandas_streaming.df import StreamingDataFrame 82 | sdf = StreamingDataFrame.read_df(df, chunksize=2) 83 | 84 | print("First time:") 85 | 86 | for df in sdf: 87 | # process this chunk of data 88 | print(df) 89 | 90 | print("\nSecond time:\n") 91 | 92 | for df in sdf: 93 | # process this chunk of data a second time 94 | print(df) 95 | 96 | The reason why the class cannot directly hold an iterator is that 97 | it is not possible to pickle an iterator. An iterator is meant to 98 | be used only once: a second loop would not be possible and would 99 | be quite surprising to most users. 100 | 101 | A :class:`StreamingDataFrame ` 102 | is also supposed to be *stable*: the two loops in the previous example 103 | should produce the exact same chunks. However, in some cases, the user can choose 104 | not to abide by this constraint. Drawing a sample is one of the reasons. 105 | A user can either choose to draw the same sample every time he goes 106 | through the data, or choose that a different sample should be 107 | drawn each time. The following method indicates which kind of sample 108 | the :class:`StreamingDataFrame ` 109 | is producing. 110 | 111 | Check the schema consistency of a large file 112 | ++++++++++++++++++++++++++++++++++++++++++++ 113 | 114 | Large files usually come from an export of a database and, 115 | for some reason, this export failed for a couple of lines. 116 | It can be an *end of line* character not removed from a comment, 117 | or a separator also present in the data. When that happens, :epkg:`pandas` 118 | takes the least strict type as the column type. Sometimes, we prefer to get 119 | an idea of where we could find the error. 120 | 121 | ..
runpython:: 122 | :showcode: 123 | 124 | import pandas 125 | df = pandas.DataFrame([dict(cf=0, cint=0, cstr="0"), dict(cf=1, cint=1, cstr="1"), 126 | dict(cf=2, cint="s2", cstr="2"), dict(cf=3, cint=3, cstr="3")]) 127 | name = "temp_df.csv" 128 | df.to_csv(name, index=False) 129 | 130 | from pandas_streaming.df import StreamingDataFrame 131 | try: 132 | sdf = StreamingDataFrame.read_csv(name, chunksize=2) 133 | for df in sdf: 134 | print(df.dtypes) 135 | except Exception as e: 136 | print("ERROR:", e) 137 | 138 | The method :meth:`__iter__ 139 | ` 140 | checks that the schema does not change between two iterations. 141 | It can be disabled by adding *check_schema=False* when 142 | the constructor is called. 143 | -------------------------------------------------------------------------------- /_unittests/ut_df/data/buggy_hash2.csv: -------------------------------------------------------------------------------- 1 | 1092397418290.0 a181248367 366498568522.0 2 | 138742792720.0 516e2e745c 73810952621.0 3 | 108082559849.0 1601fecc7f 79402822525.0 4 | 251797282335.0 29d56f63ec 530980115159.0 5 | 651822622544.0 67be9eb2e5 618639148003.0 6 | 817909238810.0 3a24c42894 441595633456.0 7 | 427513930052.0 42fbf1e0a9 759755785197.0 8 | 409652918460.0 e0e09bcb7b 487633962255.0 9 | 126536040328.0 a2c6f80ea6 325262414951.0 10 | 195809963606.0 7d67e33166 58693978128.0 11 | 426363751898.0 4f67c53e66 1037516316531.0 12 | 51702292002.0 37c64b233a 206747200377.0 13 | 945246123121.0 8739a9cebb 639796038157.0 14 | -------------------------------------------------------------------------------- /_unittests/ut_df/data/classic.json: -------------------------------------------------------------------------------- 1 | [{"name":"cudaGetDeviceCount", 2 | "ph":"X", 3 | "cat":"cuda", 4 | "ts":1634290065724226794, 5 | "dur":800, 6 | "tid":"Thread 2080429824: Runtime API", 7 | "pid":"[89792] Process", 8 | "args":{}}, 9 | {"name":"_Z25full_reduce_tensor_kernelIfLi256ELi1ELi1ELi256EL21cudnnReduceTensorOp_t0EL21cudnnNanPropagation_t0EEv17cudnnTensorStructPjS3_PT_S5_S4_bii", 10 | "ph":"X", 11 | "cat":"cuda", 12 | "ts":1634290112071305413, 13 | "dur":1888, 14 | "tid":"_Z25full_reduce_tensor_kernelIfLi256ELi1ELi1ELi256EL21cudnnReduceTensorOp_t0EL21cudnnNanPropagation_t0EEv17cudnnTensorStructPjS3_PT_S5_S4_bii", 15 | "pid":"[0:1] Compute", 16 | "args":{"Grid size":"[ 1, 1, 1 ]", 17 | "Block size":"[ 256, 1, 1 ]"}}, 18 | {"name":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E", 19 | "ph":"X", 20 | "cat":"cuda", 21 | "ts":1634290112071308133, 22 | "dur":1440, 23 | "tid":"Compute", 24 | "pid":"[0:1] Overview", 25 | "args":{"Grid size":"[ 1, 1, 1 ]", 26 | "Block size":"[ 1, 256, 1 ]"}}, 27 | {"name":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E", 28 | "ph":"X", 29 | "cat":"cuda", 30 | "ts":1634290112071308133, 31 | "dur":1440, 32 | "tid":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E", 33 | "pid":"[0:1] Compute", 34 | "args":{"Grid size":"[ 1, 1, 1 ]", 35 | "Block size":"[ 1, 256, 1 ]"}}] 36 | -------------------------------------------------------------------------------- /_unittests/ut_df/data/example.json: -------------------------------------------------------------------------------- 1 | {"a": 1, "b": 2} 2 | {"a": 3, "b": 4} -------------------------------------------------------------------------------- 
/_unittests/ut_df/data/example2.json: -------------------------------------------------------------------------------- 1 | [{"a":1,"b":2},{"a":3,"b":4}] -------------------------------------------------------------------------------- /_unittests/ut_df/test_connex_split.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas 3 | from pandas_streaming.ext_test_case import ExtTestCase 4 | from pandas_streaming.df import ( 5 | dataframe_shuffle, 6 | train_test_split_weights, 7 | train_test_connex_split, 8 | ) 9 | 10 | 11 | class TestConnexSplit(ExtTestCase): 12 | def test_shuffle(self): 13 | df = pandas.DataFrame( 14 | [ 15 | dict(a=1, b="e", c=5.6, ind="a1"), 16 | dict(a=2, b="f", c=5.7, ind="a2"), 17 | dict(a=4, b="g", c=5.8, ind="a3"), 18 | dict(a=8, b="h", c=5.9, ind="a4"), 19 | dict(a=16, b="i", c=6.2, ind="a5"), 20 | ] 21 | ) 22 | shuffled = dataframe_shuffle(df, random_state=0) 23 | sorted_ = shuffled.sort_values("a") 24 | self.assertEqualDataFrame(df, sorted_) 25 | 26 | df2 = df.set_index("ind") 27 | shuffled = dataframe_shuffle(df2, random_state=0) 28 | sorted_ = shuffled.sort_values("a") 29 | self.assertEqualDataFrame(df2, sorted_) 30 | 31 | df2 = df.set_index(["ind", "c"]) 32 | shuffled = dataframe_shuffle(df2, random_state=0) 33 | sorted_ = shuffled.sort_values("a") 34 | self.assertEqualDataFrame(df2, sorted_) 35 | 36 | def test_split_weights_errors(self): 37 | df = pandas.DataFrame( 38 | [ 39 | dict(a=1, b="e", c=1), 40 | dict(a=2, b="f", c=1), 41 | dict(a=4, b="g", c=1), 42 | dict(a=8, b="h", c=1), 43 | dict(a=12, b="h", c=1), 44 | dict(a=16, b="i", c=1), 45 | ] 46 | ) 47 | 48 | train, test = train_test_split_weights(df, train_size=0.5, weights="c") 49 | self.assertTrue(train is not None) 50 | self.assertTrue(test is not None) 51 | self.assertRaise( 52 | lambda: train_test_split_weights(df, test_size=0.5, weights=[0.5, 0.5]), 53 | ValueError, 54 | "Dimension", 55 | ) 56 | self.assertRaise( 57 | lambda: train_test_split_weights(df, test_size=0), ValueError, "null" 58 | ) 59 | self.assertRaise( 60 | lambda: train_test_split_weights(df, test_size=0, weights="c"), 61 | ValueError, 62 | "null", 63 | ) 64 | 65 | def test_split_weights(self): 66 | df = pandas.DataFrame( 67 | [ 68 | dict(a=1, b="e", c=1), 69 | dict(a=2, b="f", c=1), 70 | dict(a=4, b="g", c=1), 71 | dict(a=8, b="h", c=1), 72 | dict(a=12, b="h", c=1), 73 | dict(a=16, b="i", c=1), 74 | ] 75 | ) 76 | 77 | train, test = train_test_split_weights(df, test_size=0.5) 78 | self.assertEqual(train.shape[1], test.shape[1]) 79 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 80 | 81 | train, test = train_test_split_weights(df, test_size=0.5, weights="c") 82 | self.assertEqual(train.shape[1], test.shape[1]) 83 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 84 | 85 | train, test = train_test_split_weights(df, test_size=0.5, weights=df["c"]) 86 | self.assertEqual(train.shape[1], test.shape[1]) 87 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 88 | 89 | df = pandas.DataFrame( 90 | [ 91 | dict(a=1, b="e", c=1), 92 | dict(a=2, b="f", c=2), 93 | dict(a=4, b="g", c=3), 94 | dict(a=8, b="h", c=1), 95 | dict(a=12, b="h", c=2), 96 | dict(a=16, b="i", c=3), 97 | ] 98 | ) 99 | 100 | train, test = train_test_split_weights( 101 | df, test_size=0.5, weights="c", fail_imbalanced=0.4 102 | ) 103 | self.assertEqual(train.shape[1], test.shape[1]) 104 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 105 | w1, w2 = 
train["c"].sum(), test["c"].sum() 106 | delta = abs(w1 - w2) / (w1 + w2) 107 | self.assertGreater(0.4, delta) 108 | 109 | def test_split_connex(self): 110 | df = pandas.DataFrame( 111 | [ 112 | dict(user="UA", prod="PA", card="C1"), 113 | dict(user="UA", prod="PB", card="C1"), 114 | dict(user="UB", prod="PC", card="C2"), 115 | dict(user="UB", prod="PD", card="C2"), 116 | dict(user="UC", prod="PE", card="C3"), 117 | dict(user="UC", prod="PF", card="C4"), 118 | dict(user="UD", prod="PG", card="C5"), 119 | ] 120 | ) 121 | 122 | train, test = train_test_connex_split( # pylint: disable=W0632 123 | df, test_size=0.5, groups=["user", "prod", "card"], fail_imbalanced=0.4 124 | ) 125 | 126 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 127 | for col in ["user", "prod", "card"]: 128 | s1 = set(train[col]) 129 | s2 = set(test[col]) 130 | if s1 & s2: 131 | raise AssertionError( 132 | f"Non empty intersection {s1} & {s2}\n{train}\n{test}" 133 | ) 134 | 135 | df["connex"] = "ole" 136 | train, test = train_test_connex_split( # pylint: disable=W0632 137 | df, test_size=0.5, groups=["user", "prod", "card"], fail_imbalanced=0.4 138 | ) 139 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 140 | 141 | def test_split_connex2(self): 142 | df = pandas.DataFrame( 143 | [ 144 | dict(user="UA", prod="PAA", card="C1"), 145 | dict(user="UA", prod="PB", card="C1"), 146 | dict(user="UB", prod="PC", card="C2"), 147 | dict(user="UB", prod="PD", card="C2"), 148 | dict(user="UC", prod="PAA", card="C3"), 149 | dict(user="UC", prod="PF", card="C4"), 150 | dict(user="UD", prod="PG", card="C5"), 151 | ] 152 | ) 153 | 154 | train_test_connex_split( 155 | df, 156 | test_size=0.5, 157 | groups=["user", "prod", "card"], 158 | fail_imbalanced=0.5, 159 | return_cnx=True, 160 | ) 161 | train, test, stats = train_test_connex_split( 162 | df, 163 | test_size=0.5, 164 | groups=["user", "prod", "card"], 165 | fail_imbalanced=0.5, 166 | return_cnx=True, 167 | random_state=0, 168 | ) 169 | 170 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 171 | for col in ["user", "prod", "card"]: 172 | s1 = set(train[col]) 173 | s2 = set(test[col]) 174 | if s1 & s2: 175 | rows = [] 176 | for k, v in sorted(stats[0].items()): 177 | rows.append(f"{k}={v}") 178 | raise AssertionError( 179 | "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format( # noqa: UP030 180 | s1, s2, train, test, "\n".join(rows) 181 | ) 182 | ) 183 | 184 | def test_split_connex_missing(self): 185 | df = pandas.DataFrame( 186 | [ 187 | dict(user="UA", prod="PAA", card="C1"), 188 | dict(user="UA", prod="PB", card="C1"), 189 | dict(user="UB", prod="PC", card="C2"), 190 | dict(user="UB", prod="PD", card="C2"), 191 | dict(user="UC", prod="PAA", card="C3"), 192 | dict(user="UC", card="C4"), 193 | dict(user="UD", prod="PG"), 194 | ] 195 | ) 196 | 197 | train, test, stats = train_test_connex_split( 198 | df, 199 | test_size=0.5, 200 | groups=["user", "prod", "card"], 201 | fail_imbalanced=0.4, 202 | return_cnx=True, 203 | random_state=0, 204 | ) 205 | 206 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 207 | for col in ["user", "prod", "card"]: 208 | s1 = set(train[col]) 209 | s2 = set(test[col]) 210 | if s1 & s2: 211 | rows = [] 212 | for k, v in sorted(stats[0].items()): 213 | rows.append(f"{k}={v}") 214 | raise AssertionError( 215 | "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format( # noqa: UP030 216 | s1, s2, train, test, "\n".join(rows) 217 | ) 218 | ) 219 | 220 | 221 | if __name__ == "__main__": 222 | 
unittest.main() 223 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_connex_split_big.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from collections import Counter 4 | import pandas 5 | from pandas_streaming.ext_test_case import ExtTestCase 6 | from pandas_streaming.df import train_test_connex_split 7 | 8 | 9 | class TestConnexSplitBig(ExtTestCase): 10 | def test_connex_big(self): 11 | data = os.path.join(os.path.dirname(__file__), "data") 12 | name = os.path.join(data, "buggy_hash.csv") 13 | df = pandas.read_csv(name, sep="\t", encoding="utf-8") 14 | train, test, stats = train_test_connex_split( 15 | df, 16 | groups=["cart_id", "mail", "product_id"], 17 | fail_imbalanced=0.9, 18 | return_cnx=True, 19 | ) 20 | self.assertGreater(train.shape[0], 0) 21 | self.assertGreater(test.shape[0], 0) 22 | elements = stats[1]["connex"] 23 | counts = Counter(elements) 24 | nbc = len(counts) 25 | maxi = max(counts.values()) 26 | self.assertEqual(nbc, 5376) 27 | self.assertEqual(maxi, 14181) 28 | 29 | def test_connex_big_approx(self): 30 | data = os.path.join(os.path.dirname(__file__), "data") 31 | name = os.path.join(data, "buggy_hash.csv") 32 | df = pandas.read_csv(name, sep="\t", encoding="utf-8") 33 | train, test, stats = train_test_connex_split( 34 | df, 35 | groups=["cart_id", "mail", "product_id"], 36 | stop_if_bigger=0.05, 37 | return_cnx=True, 38 | keep_balance=0.8, 39 | ) 40 | self.assertGreater(train.shape[0], 0) 41 | self.assertGreater(test.shape[0], 0) 42 | elements = stats[1]["connex"] 43 | counts = Counter(elements) 44 | nbc = len(counts) 45 | maxi = max(counts.values()) 46 | self.assertGreater(nbc, 5376) 47 | self.assertLesser(maxi, 14181) 48 | 49 | def test_connex_big_approx_must(self): 50 | data = os.path.join(os.path.dirname(__file__), "data") 51 | name = os.path.join(data, "buggy_hash.csv") 52 | df = pandas.read_csv(name, sep="\t", encoding="utf-8") 53 | train, test, stats = train_test_connex_split( 54 | df, 55 | groups=["cart_id", "mail", "product_id"], 56 | stop_if_bigger=0.05, 57 | return_cnx=True, 58 | keep_balance=0.8, 59 | must_groups=["product_id"], 60 | ) 61 | self.assertGreater(train.shape[0], 0) 62 | self.assertGreater(test.shape[0], 0) 63 | elements = stats[1]["connex"] 64 | counts = Counter(elements) 65 | nbc = len(counts) 66 | maxi = max(counts.values()) 67 | self.assertGreater(nbc, 5376) 68 | self.assertLesser(maxi, 14181) 69 | train_ids = set(train.product_id) 70 | test_ids = set(test.product_id) 71 | inter = train_ids & test_ids 72 | self.assertEqual(len(inter), 0) 73 | 74 | 75 | if __name__ == "__main__": 76 | unittest.main() 77 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_connex_split_cat.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from collections import Counter 3 | import pandas 4 | from pandas_streaming.ext_test_case import ExtTestCase 5 | from pandas_streaming.df import train_test_apart_stratify 6 | 7 | 8 | class TestConnexSplitCat(ExtTestCase): 9 | def test_cat_strat(self): 10 | df = pandas.DataFrame( 11 | [ 12 | dict(a=1, b="e"), 13 | dict(a=2, b="e"), 14 | dict(a=4, b="f"), 15 | dict(a=8, b="f"), 16 | dict(a=32, b="f"), 17 | dict(a=16, b="f"), 18 | ] 19 | ) 20 | 21 | train, test = train_test_apart_stratify( 22 | df, group="a", stratify="b", test_size=0.5 23 | ) 24 | self.assertEqual(train.shape[1], 
test.shape[1]) 25 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 26 | c1 = Counter(train["b"]) 27 | c2 = Counter(train["b"]) 28 | self.assertEqual(c1, c2) 29 | 30 | self.assertRaise( 31 | lambda: train_test_apart_stratify( 32 | df, group=None, stratify="b", test_size=0.5 33 | ), 34 | ValueError, 35 | ) 36 | self.assertRaise( 37 | lambda: train_test_apart_stratify(df, group="b", test_size=0.5), ValueError 38 | ) 39 | 40 | def test_cat_strat_sorted(self): 41 | df = pandas.DataFrame( 42 | [ 43 | dict(a=1, b="e"), 44 | dict(a=2, b="e"), 45 | dict(a=4, b="f"), 46 | dict(a=8, b="f"), 47 | dict(a=32, b="f"), 48 | dict(a=16, b="f"), 49 | ] 50 | ) 51 | 52 | train, test = train_test_apart_stratify( 53 | df, group="a", stratify="b", test_size=0.5, sorted_indices=True 54 | ) 55 | self.assertEqual(train.shape[1], test.shape[1]) 56 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 57 | c1 = Counter(train["b"]) 58 | c2 = Counter(train["b"]) 59 | self.assertEqual(c1, c2) 60 | 61 | self.assertRaise( 62 | lambda: train_test_apart_stratify( 63 | df, group=None, stratify="b", test_size=0.5, sorted_indices=True 64 | ), 65 | ValueError, 66 | ) 67 | self.assertRaise( 68 | lambda: train_test_apart_stratify(df, group="b", test_size=0.5), ValueError 69 | ) 70 | 71 | def test_cat_strat_multi(self): 72 | df = pandas.DataFrame( 73 | [ 74 | dict(a=1, b="e"), 75 | dict(a=1, b="f"), 76 | dict(a=2, b="e"), 77 | dict(a=2, b="f"), 78 | ] 79 | ) 80 | 81 | train, test = train_test_apart_stratify( 82 | df, group="a", stratify="b", test_size=0.5 83 | ) 84 | self.assertEqual(train.shape[1], test.shape[1]) 85 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 86 | c1 = Counter(train["b"]) 87 | c2 = Counter(train["b"]) 88 | self.assertEqual(c1, c2) 89 | self.assertEqual(len(set(train["a"])), 1) 90 | self.assertEqual(len(set(test["a"])), 1) 91 | self.assertTrue(set(train["a"]) != set(test["a"])) 92 | 93 | def test_cat_strat_multi_force(self): 94 | df = pandas.DataFrame( 95 | [ 96 | dict(a=1, b="e"), 97 | dict(a=1, b="f"), 98 | dict(a=2, b="e"), 99 | dict(a=2, b="f"), 100 | ] 101 | ) 102 | 103 | train, test = train_test_apart_stratify( 104 | df, group="a", stratify="b", test_size=0.1, force=True 105 | ) 106 | self.assertEqual(train.shape[1], test.shape[1]) 107 | self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) 108 | c1 = Counter(train["b"]) 109 | c2 = Counter(train["b"]) 110 | self.assertEqual(c1, c2) 111 | self.assertEqual(len(set(train["a"])), 1) 112 | self.assertEqual(len(set(test["a"])), 1) 113 | self.assertTrue(set(train["a"]) != set(test["a"])) 114 | 115 | 116 | if __name__ == "__main__": 117 | unittest.main() 118 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_dataframe_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import numpy 4 | import pandas 5 | from pandas_streaming.ext_test_case import ExtTestCase 6 | from pandas_streaming.df import dataframe_hash_columns 7 | 8 | 9 | class TestDataFrameHelpers(ExtTestCase): 10 | def test_hash_columns(self): 11 | df = pandas.DataFrame( 12 | [ 13 | dict(a=1, b="e", c=5.6, ind="a1", ai=1), 14 | dict(b="f", c=5.7, ind="a2", ai=2), 15 | dict(a=4, b="g", ind="a3", ai=3), 16 | dict(a=8, b="h", c=5.9, ai=4), 17 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 18 | ] 19 | ) 20 | df2 = dataframe_hash_columns(df) 21 | self.assertEqual(df2.shape, df.shape) 22 | for j in range(df.shape[1]): 23 | 
self.assertEqual(df.columns[j], df2.columns[j]) 24 | self.assertEqual(df.dtypes[j], df2.dtypes[j]) 25 | for i in range(df.shape[0]): 26 | v1 = df.iloc[i, j] 27 | v2 = df2.iloc[i, j] 28 | if isinstance(v1, float): 29 | if numpy.isnan(v1): 30 | self.assertTrue(numpy.isnan(v2)) 31 | else: 32 | self.assertEqual(type(v1), type(v2)) 33 | else: 34 | self.assertEqual(type(v1), type(v2)) 35 | 36 | def test_hash_columns_bigger(self): 37 | data = os.path.join(os.path.dirname(__file__), "data") 38 | name = os.path.join(data, "buggy_hash.csv") 39 | df = pandas.read_csv(name, sep="\t", encoding="utf-8") 40 | df2 = dataframe_hash_columns(df) 41 | self.assertEqual(df.shape, df2.shape) 42 | 43 | 44 | if __name__ == "__main__": 45 | unittest.main() 46 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_dataframe_helpers_simple.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas 3 | import numpy 4 | from pandas_streaming.ext_test_case import ExtTestCase 5 | from pandas_streaming.df import dataframe_unfold 6 | from pandas_streaming.df.dataframe_helpers import hash_int, hash_str, hash_float 7 | 8 | 9 | class TestDataFrameHelpersSimple(ExtTestCase): 10 | def test_unfold(self): 11 | df = pandas.DataFrame([dict(a=1, b="e,f"), dict(a=2, b="g"), dict(a=3)]) 12 | df2 = dataframe_unfold(df, "b") 13 | 14 | exp = pandas.DataFrame( 15 | [ 16 | dict(a=1, b="e,f", b_unfold="e"), 17 | dict(a=1, b="e,f", b_unfold="f"), 18 | dict(a=2, b="g", b_unfold="g"), 19 | dict(a=3), 20 | ] 21 | ) 22 | self.assertEqualDataFrame(df2, exp) 23 | 24 | # fold 25 | folded = df2.groupby("a").apply( 26 | lambda row: ( 27 | ",".join(row["b_unfold"].dropna()) 28 | if len(row["b_unfold"].dropna()) > 0 29 | else numpy.nan 30 | ) 31 | ) 32 | bf = folded.reset_index(drop=False) 33 | bf.columns = ["a", "b"] 34 | self.assertEqualDataFrame(df, bf) 35 | 36 | def test_hash_except(self): 37 | self.assertRaise(lambda: hash_int(0.1, 3), ValueError, "numpy.nan expected") 38 | r = hash_int(numpy.nan, 3) 39 | self.assertTrue(numpy.isnan(r)) 40 | 41 | self.assertRaise(lambda: hash_str(0.1, 3), ValueError, "numpy.nan expected") 42 | r = hash_str(numpy.nan, 3) 43 | self.assertTrue(numpy.isnan(r)) 44 | 45 | self.assertRaise(lambda: hash_float("0.1", 3), TypeError, "isnan") 46 | r = hash_float(numpy.nan, 3) 47 | self.assertTrue(numpy.isnan(r)) 48 | r = hash_str("3", 100) 49 | self.assertLess(len(r), 100) 50 | 51 | 52 | if __name__ == "__main__": 53 | unittest.main() 54 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_dataframe_io.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import unittest 4 | import io 5 | import zipfile 6 | import numpy 7 | import pandas 8 | from pandas_streaming.ext_test_case import ExtTestCase 9 | from pandas_streaming.df import to_zip, read_zip 10 | 11 | 12 | class TestDataFrameIO(ExtTestCase): 13 | def test_zip_dataframe(self): 14 | df = pandas.DataFrame( 15 | [ 16 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1), 17 | dict(b="f", c=5.7, ind="a2", ai=2), 18 | dict(a=4, b="g", ind="a3", ai=3), 19 | dict(a=8, b="h", c=5.9, ai=4), 20 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 21 | ] 22 | ) 23 | 24 | with tempfile.TemporaryDirectory() as temp: 25 | name = os.path.join(temp, "df.zip") 26 | to_zip(df, name, encoding="utf-8", index=False) 27 | df2 = read_zip(name, encoding="utf-8") 28 | 
self.assertEqualDataFrame(df, df2) 29 | 30 | st = io.BytesIO() 31 | zp = zipfile.ZipFile(st, "w") 32 | to_zip(df, zp, encoding="utf-8", index=False) 33 | zp.close() 34 | 35 | st = io.BytesIO(st.getvalue()) 36 | zp = zipfile.ZipFile(st, "r") 37 | df3 = read_zip(zp, encoding="utf-8") 38 | zp.close() 39 | self.assertEqualDataFrame(df, df3) 40 | 41 | def test_zip_numpy(self): 42 | df = numpy.zeros((3, 4)) 43 | df[2, 3] = 1 44 | 45 | with tempfile.TemporaryDirectory() as temp: 46 | name = os.path.join(temp, "df.zip") 47 | to_zip(df, name, "arr.npy") 48 | df2 = read_zip(name, "arr.npy") 49 | self.assertEqualArray(df, df2) 50 | 51 | st = io.BytesIO() 52 | zp = zipfile.ZipFile(st, "w") 53 | to_zip(df, zp, "arr.npy") 54 | zp.close() 55 | 56 | st = io.BytesIO(st.getvalue()) 57 | zp = zipfile.ZipFile(st, "r") 58 | df3 = read_zip(zp, "arr.npy") 59 | zp.close() 60 | self.assertEqualArray(df, df3) 61 | 62 | 63 | if __name__ == "__main__": 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_dataframe_io_helpers.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from io import StringIO, BytesIO 3 | from json import loads 4 | import pandas 5 | from pandas_streaming.ext_test_case import ExtTestCase 6 | from pandas_streaming.df.dataframe_io_helpers import ( 7 | enumerate_json_items, 8 | JsonPerRowsStream, 9 | JsonIterator2Stream, 10 | ) 11 | from pandas_streaming.df import StreamingDataFrame 12 | 13 | 14 | class TestDataFrameIOHelpers(ExtTestCase): 15 | text_json = b""" 16 | [ 17 | { 18 | "glossary": { 19 | "title": "example glossary", 20 | "GlossDiv": { 21 | "title": "S", 22 | "GlossList": [{ 23 | "GlossEntry": { 24 | "ID": "SGML", 25 | "SortAs": "SGML", 26 | "GlossTerm": "Standard Generalized Markup Language", 27 | "Acronym": "SGML", 28 | "Abbrev": "ISO 8879:1986", 29 | "GlossDef": { 30 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 31 | "GlossSeeAlso": ["GML", "XML"] 32 | }, 33 | "GlossSee": "markup" 34 | } 35 | }] 36 | } 37 | } 38 | }, 39 | { 40 | "glossary": { 41 | "title": "example glossary", 42 | "GlossDiv": { 43 | "title": "X", 44 | "GlossList": { 45 | "GlossEntry": [{ 46 | "ID": "SGML", 47 | "SortAs": "SGML", 48 | "GlossTerm": "Standard Generalized Markup Language", 49 | "Acronym": "SGML", 50 | "Abbrev": "ISO 8879:1986", 51 | "GlossDef": { 52 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 53 | "GlossSeeAlso": ["GML", "XML"] 54 | }, 55 | "GlossSee": "markup" 56 | }] 57 | } 58 | } 59 | } 60 | } 61 | ] 62 | """ 63 | text_json_exp = [ 64 | { 65 | "glossary": { 66 | "title": "example glossary", 67 | "GlossDiv": { 68 | "title": "S", 69 | "GlossList": [ 70 | { 71 | "GlossEntry": { 72 | "ID": "SGML", 73 | "SortAs": "SGML", 74 | "GlossTerm": "Standard Generalized Markup Language", 75 | "Acronym": "SGML", 76 | "Abbrev": "ISO 8879:1986", 77 | "GlossDef": { 78 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 79 | "GlossSeeAlso": ["GML", "XML"], 80 | }, 81 | "GlossSee": "markup", 82 | } 83 | } 84 | ], 85 | }, 86 | } 87 | }, 88 | { 89 | "glossary": { 90 | "title": "example glossary", 91 | "GlossDiv": { 92 | "title": "X", 93 | "GlossList": { 94 | "GlossEntry": [ 95 | { 96 | "ID": "SGML", 97 | "SortAs": "SGML", 98 | "GlossTerm": "Standard Generalized Markup Language", 99 | "Acronym": "SGML", 100 | "Abbrev": "ISO 8879:1986", 101 | "GlossDef": { 102 | "para": "A 
meta-markup language, used to create markup languages such as DocBook.", 103 | "GlossSeeAlso": ["GML", "XML"], 104 | }, 105 | "GlossSee": "markup", 106 | } 107 | ] 108 | }, 109 | }, 110 | } 111 | }, 112 | ] 113 | 114 | def test_enumerate_json_items(self): 115 | items = list(enumerate_json_items(TestDataFrameIOHelpers.text_json)) 116 | self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items) 117 | items = list(enumerate_json_items(BytesIO(TestDataFrameIOHelpers.text_json))) 118 | self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items) 119 | items = list(enumerate_json_items(BytesIO(TestDataFrameIOHelpers.text_json))) 120 | self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items) 121 | 122 | def test_read_json_raw(self): 123 | data = [ 124 | {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, 125 | {"name": {"given": "Mose", "family": "Regner"}}, 126 | {"id": 2, "name": "FayeRaker"}, 127 | ] 128 | exp = """[{"id":1.0,"name":null,"name.family":null,"name.first":"Coleen","name.given":null,"name.last":"Volk"}, 129 | {"id":null,"name":null,"name.family":"Regner","name.first":null,"name.given":"Mose","name.last":null}, 130 | {"id":2.0,"name":"FayeRaker","name.family":null,"name.first":null, 131 | "name.given":null,"name.last":null}]""".replace( 132 | " ", "" 133 | ).replace( 134 | "\n", "" 135 | ) 136 | self.assertRaise( 137 | lambda: StreamingDataFrame.read_json(data), NotImplementedError 138 | ) 139 | it = StreamingDataFrame.read_json(data, flatten=True) 140 | dfs = list(it) 141 | self.assertEqual(len(dfs), 1) 142 | js = dfs[0].to_json(orient="records") 143 | js_read = loads(js) 144 | js_exp = loads(exp) 145 | self.assertEqual(js_exp, js_read) 146 | 147 | def test_read_json_raw_head(self): 148 | data = [ 149 | {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, 150 | {"name": {"given": "Mose", "family": "Regner"}}, 151 | {"id": 2, "name": "FayeRaker"}, 152 | ] 153 | it = StreamingDataFrame.read_json(data, flatten=True, chunksize=1) 154 | h1 = it.head() 155 | h2 = it.head() 156 | self.assertEqualDataFrame(h1, h2) 157 | self.assertGreater(h1.shape[0], 1) 158 | self.assertGreater(h2.shape[0], 1) 159 | 160 | def test_pandas_json_chunksize(self): 161 | jsonl = """{"a": 1, "b": 2} 162 | {"a": 3, "b": 4}""" 163 | df = pandas.read_json(jsonl, lines=True) 164 | idf = pandas.read_json(jsonl, lines=True, chunksize=2) 165 | ldf = list(idf) 166 | self.assertEqualDataFrame(df, ldf[0]) 167 | 168 | def test_read_json_rows(self): 169 | data = """{"a": 1, "b": 2} 170 | {"a": 3, "b": 4}""" 171 | it = StreamingDataFrame.read_json(StringIO(data), lines=True) 172 | dfs = list(it) 173 | self.assertEqual(len(dfs), 1) 174 | js = dfs[0].to_json(orient="records") 175 | self.assertEqual(js, '[{"a":1,"b":2},{"a":3,"b":4}]') 176 | 177 | def test_read_json_rows2(self): 178 | data = b"""{"a": 1, "b": 2} 179 | {"a": 3, "b": 4}""" 180 | dfs = pandas.read_json(BytesIO(data), lines=True) 181 | self.assertEqual(dfs.shape, (2, 2)) 182 | it = StreamingDataFrame.read_json(BytesIO(data), lines="stream") 183 | dfs = list(it) 184 | self.assertEqual(len(dfs), 1) 185 | js = dfs[0].to_json(orient="records") 186 | self.assertEqual('[{"a":1,"b":2},{"a":3,"b":4}]', js) 187 | 188 | def test_read_json_rows2_head(self): 189 | data = b"""{"a": 1, "b": 2} 190 | {"a": 3, "b": 4}""" 191 | dfs = pandas.read_json(BytesIO(data), lines=True) 192 | self.assertEqual(dfs.shape, (2, 2)) 193 | it = StreamingDataFrame.read_json(BytesIO(data), lines="stream") 194 | h1 = it.head() 195 | h2 = it.head() 196 | self.assertNotEmpty(h1) 197 | 
self.assertNotEmpty(h2) 198 | self.assertEqualDataFrame(h1, h2) 199 | 200 | def test_read_json_rows_file_head(self): 201 | data = self.abs_path_join(__file__, "data", "example2.json") 202 | dfs = pandas.read_json(data, orient="records") 203 | self.assertEqual(dfs.shape, (2, 2)) 204 | it = StreamingDataFrame.read_json(data) 205 | h1 = it.head() 206 | h2 = it.head() 207 | self.assertNotEmpty(h1) 208 | self.assertNotEmpty(h2) 209 | self.assertEqualDataFrame(h1, h2) 210 | 211 | def test_read_json_rows_file_lines_head(self): 212 | data = self.abs_path_join(__file__, "data", "example.json") 213 | dfs = pandas.read_json(data, orient="records", lines=True) 214 | self.assertEqual(dfs.shape, (2, 2)) 215 | it = StreamingDataFrame.read_json(data, lines="stream") 216 | h1 = it.head() 217 | h2 = it.head() 218 | self.assertNotEmpty(h1) 219 | self.assertNotEmpty(h2) 220 | self.assertEqualDataFrame(h1, h2) 221 | 222 | def test_read_json_ijson(self): 223 | it = StreamingDataFrame.read_json(BytesIO(TestDataFrameIOHelpers.text_json)) 224 | dfs = list(it) 225 | self.assertEqual(len(dfs), 1) 226 | js = dfs[0].to_json(orient="records", lines=True) 227 | jsjson = loads("[" + js.replace("\n", ",").strip(",") + "]") 228 | self.assertEqual(jsjson, TestDataFrameIOHelpers.text_json_exp) 229 | 230 | def test_read_json_stream(self): 231 | text = """{'a': 1} 232 | {'b': 1, 'a', 'r'}""" 233 | st = JsonPerRowsStream(StringIO(text)) 234 | val = st.getvalue().replace(" ", "").replace("\n", "") 235 | exp = "[{'a':1},{'b':1,'a','r'}]" 236 | self.assertEqual(val, exp) 237 | 238 | st = JsonPerRowsStream(StringIO(text)) 239 | t = st.read(0) 240 | t = st.read(1) 241 | c = "" 242 | while t: 243 | c += t 244 | t = st.read(1) 245 | val = c.replace(" ", "").replace("\n", "") 246 | self.assertEqual(val, exp) 247 | 248 | def test_enumerate_json_items_lines(self): 249 | data = b"""{"a": 1, "b": 2} 250 | {"a": 3, "b": 4}""" 251 | items = list(enumerate_json_items(data, lines=True)) 252 | self.assertEqual(items, [{"a": 1, "b": 2}, {"a": 3, "b": 4}]) 253 | 254 | def test_read_json_file2(self): 255 | data = b"""{"a": {"c": 1}, "b": [2, 3]} 256 | {"a": {"a": 3}, "b": [4, 5, "r"]}""" 257 | 258 | obj1 = list(enumerate_json_items(BytesIO(data), flatten=False, lines=True)) 259 | obj2 = list(enumerate_json_items(BytesIO(data), flatten=True, lines=True)) 260 | self.assertNotEqual(obj1, obj2) 261 | self.assertEqual( 262 | obj2, 263 | [ 264 | {"a_c": 1, "b_0": 2, "b_1": 3}, 265 | {"a_a": 3, "b_0": 4, "b_1": 5, "b_2": "r"}, 266 | ], 267 | ) 268 | 269 | it = StreamingDataFrame.read_json(BytesIO(data), lines="stream", flatten=True) 270 | dfs = list(it) 271 | self.assertEqual( 272 | ["a_a", "a_c", "b_0", "b_1", "b_2"], 273 | list(sorted(dfs[0].columns)), 274 | ) 275 | self.assertEqual(len(dfs), 1) 276 | js = dfs[0].to_json(orient="records", lines=True) 277 | jsjson = loads("[" + js.replace("\n", ",").strip(",") + "]") 278 | exp = [ 279 | {"a_a": None, "a_c": 1.0, "b_0": 2, "b_1": 3, "b_2": None}, 280 | {"a_a": 3.0, "a_c": None, "b_0": 4, "b_1": 5, "b_2": "r"}, 281 | ] 282 | self.assertEqual(exp, jsjson) 283 | 284 | def test_read_json_item(self): 285 | text = TestDataFrameIOHelpers.text_json 286 | st = JsonPerRowsStream(BytesIO(text)) 287 | res = [] 288 | while True: 289 | n = st.read() 290 | if not n: 291 | break 292 | res.append(n) 293 | self.assertGreater(len(res), 1) 294 | 295 | def test_bug_documentation(self): 296 | items = [] 297 | for item in JsonIterator2Stream( 298 | lambda: enumerate_json_items(TestDataFrameIOHelpers.text_json) 299 | ): 300 | 
items.append(item) 301 | self.assertEqual(len(items), 2) 302 | 303 | def test_read_json_classic(self): 304 | data = self.abs_path_join(__file__, "data", "classic.json") 305 | dfs = pandas.read_json(data, orient="records") 306 | dfs["ts2"] = dfs["ts"].apply(lambda t: t / 1e9) 307 | self.assertEqual(dfs.shape[1], 9) 308 | self.assertGreater(dfs.shape[0], 2) 309 | it = StreamingDataFrame.read_json(data) 310 | it["ts2"] = it["ts"].apply(lambda t: t / 1e9) 311 | h1 = it.to_df() 312 | h2 = it.to_df() 313 | self.assertNotEmpty(h1) 314 | self.assertNotEmpty(h2) 315 | self.assertEqualDataFrame(h1, h2) 316 | self.assertEqual(h1.shape[1], 9) 317 | 318 | def test_read_json_classic_file(self): 319 | data = self.abs_path_join(__file__, "data", "classic.json") 320 | dfs = pandas.read_json(data, orient="records") 321 | self.assertEqual(dfs.shape[1], 8) 322 | self.assertGreater(dfs.shape[0], 2) 323 | with open(data, "r", encoding="utf-8") as f: 324 | it = StreamingDataFrame.read_json(f, orient="records") 325 | h1 = it.to_df() 326 | h2 = it.to_df() 327 | self.assertNotEmpty(h1) 328 | self.assertNotEmpty(h2) 329 | self.assertEqualDataFrame(h1, h2) 330 | self.assertEqual(h1.shape[1], 8) 331 | 332 | def test_read_json_classic_file_formula(self): 333 | data = self.abs_path_join(__file__, "data", "classic.json") 334 | dfs = pandas.read_json(data, orient="records") 335 | dfs["ts2"] = dfs["ts"].apply(lambda t: t / 1e9) 336 | self.assertEqual(dfs.shape[1], 9) 337 | self.assertGreater(dfs.shape[0], 2) 338 | with open(data, "r", encoding="utf-8") as f: 339 | it = StreamingDataFrame.read_json(f) 340 | it["ts2"] = it["ts"].apply(lambda t: t / 1e9) 341 | h1 = it.to_df() 342 | h2 = it.to_df() 343 | self.assertNotEmpty(h1) 344 | self.assertNotEmpty(h2) 345 | self.assertEqualDataFrame(h1, h2) 346 | self.assertEqual(h1.shape[1], 9) 347 | 348 | 349 | if __name__ == "__main__": 350 | unittest.main() 351 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_dataframe_sort.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import unittest 4 | import pandas 5 | from pandas_streaming.ext_test_case import ExtTestCase 6 | from pandas_streaming.df import StreamingDataFrame 7 | 8 | 9 | class TestDataFrameSort(ExtTestCase): 10 | def test_sort_values(self): 11 | with tempfile.TemporaryDirectory() as temp: 12 | name = os.path.join(temp, "_data_") 13 | df = pandas.DataFrame( 14 | [ 15 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1), 16 | dict(a=5, b="f", c=5.7, ind="a2", ai=2), 17 | dict(a=4, b="g", ind="a3", ai=3), 18 | dict(a=8, b="h", c=5.9, ai=4), 19 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 20 | ] 21 | ) 22 | sdf = StreamingDataFrame.read_df(df, chunksize=2) 23 | sorted_df = df.sort_values(by="a") 24 | res = sdf.sort_values(by="a", temp_file=name) 25 | res_df = res.to_df() 26 | self.assertEqualDataFrame(sorted_df, res_df) 27 | 28 | def test_sort_values_twice(self): 29 | with tempfile.TemporaryDirectory() as temp: 30 | name = os.path.join(temp, "_data_") 31 | df = pandas.DataFrame( 32 | [ 33 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1), 34 | dict(a=5, b="f", c=5.7, ind="a2", ai=2), 35 | dict(a=4, b="g", ind="a3", ai=3), 36 | dict(a=8, b="h", c=5.9, ai=4), 37 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 38 | ] 39 | ) 40 | sdf = StreamingDataFrame.read_df(df, chunksize=2) 41 | sorted_df = df.sort_values(by="a") 42 | res = sdf.sort_values(by="a", temp_file=name) 43 | res_df = res.to_df() 44 | 
self.assertEqualDataFrame(sorted_df, res_df) 45 | res_df = res.to_df() 46 | self.assertEqualDataFrame(sorted_df, res_df) 47 | 48 | def test_sort_values_reverse(self): 49 | with tempfile.TemporaryDirectory() as temp: 50 | name = os.path.join(temp, "_data_") 51 | df = pandas.DataFrame( 52 | [ 53 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1), 54 | dict(a=5, b="f", c=5.7, ind="a2", ai=2), 55 | dict(a=4, b="g", ind="a3", ai=3), 56 | dict(a=8, b="h", c=5.9, ai=4), 57 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 58 | ] 59 | ) 60 | sdf = StreamingDataFrame.read_df(df, chunksize=2) 61 | sorted_df = df.sort_values(by="a", ascending=False) 62 | res = sdf.sort_values(by="a", temp_file=name, ascending=False) 63 | res_df = res.to_df() 64 | self.assertEqualDataFrame(sorted_df, res_df) 65 | 66 | def test_sort_values_nan_last(self): 67 | with tempfile.TemporaryDirectory() as temp: 68 | name = os.path.join(temp, "_data_") 69 | df = pandas.DataFrame( 70 | [ 71 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1), 72 | dict(b="f", c=5.7, ind="a2", ai=2), 73 | dict(b="f", c=5.8, ind="a2", ai=2), 74 | dict(a=4, b="g", ind="a3", ai=3), 75 | dict(a=8, b="h", c=5.9, ai=4), 76 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 77 | ] 78 | ) 79 | sdf = StreamingDataFrame.read_df(df, chunksize=2) 80 | sorted_df = df.sort_values(by="a", na_position="last") 81 | res = sdf.sort_values(by="a", temp_file=name, na_position="last") 82 | res_df = res.to_df() 83 | self.assertEqualDataFrame(sorted_df, res_df) 84 | 85 | def test_sort_values_nan_first(self): 86 | with tempfile.TemporaryDirectory() as temp: 87 | name = os.path.join(temp, "_data_") 88 | df = pandas.DataFrame( 89 | [ 90 | dict(a=1, b="eé", c=5.6, ind="a1", ai=1), 91 | dict(b="f", c=5.7, ind="a2", ai=2), 92 | dict(b="f", c=5.8, ind="a2", ai=2), 93 | dict(a=4, b="g", ind="a3", ai=3), 94 | dict(a=8, b="h", c=5.9, ai=4), 95 | dict(a=16, b="i", c=6.2, ind="a5", ai=5), 96 | ] 97 | ) 98 | sdf = StreamingDataFrame.read_df(df, chunksize=2) 99 | sorted_df = df.sort_values(by="a", na_position="first") 100 | res = sdf.sort_values(by="a", temp_file=name, na_position="first") 101 | res_df = res.to_df() 102 | self.assertEqualDataFrame(sorted_df, res_df) 103 | 104 | 105 | if __name__ == "__main__": 106 | unittest.main() 107 | -------------------------------------------------------------------------------- /_unittests/ut_df/test_pandas_groupbynan.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas 3 | import numpy 4 | from scipy.sparse.linalg import lsqr as sparse_lsqr 5 | from pandas_streaming.ext_test_case import ExtTestCase, ignore_warnings 6 | from pandas_streaming.df import pandas_groupby_nan, numpy_types 7 | 8 | 9 | class TestPandasHelper(ExtTestCase): 10 | def test_pandas_groupbynan(self): 11 | self.assertTrue(sparse_lsqr is not None) 12 | types = [(int, -10), (float, -20.2), (str, "e"), (bytes, bytes("a", "ascii"))] 13 | skip = (numpy.bool_, numpy.complex64, numpy.complex128) 14 | types += [(_, _(5)) for _ in numpy_types() if _ not in skip] 15 | 16 | for ty in types: 17 | data = [ 18 | {"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]}, 19 | {"this": "cst", "type": "tt2=" + str(ty[0]), "value": ty[1]}, 20 | {"this": "cst", "type": "row_for_nan"}, 21 | ] 22 | df = pandas.DataFrame(data) 23 | gr = pandas_groupby_nan(df, "value") 24 | co = gr.sum() 25 | li = list(co["value"]) 26 | try: 27 | self.assertIsInstance(li[-1], float) 28 | except AssertionError as e: 29 | raise AssertionError(f"Issue with {ty}") from e 30 | 
try: 31 | self.assertTrue(numpy.isnan(li[-1])) 32 | except AssertionError as e: 33 | raise AssertionError( 34 | "Issue with value {}\n--df--\n{}\n--gr--\n{}\n--co--\n{}".format( 35 | li, df, gr.count(), co 36 | ) 37 | ) from e 38 | 39 | for ty in types: 40 | data = [ 41 | {"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]}, 42 | {"this": "cst", "type": "tt2=" + str(ty[0]), "value": ty[1]}, 43 | {"this": "cst", "type": "row_for_nan"}, 44 | ] 45 | df = pandas.DataFrame(data) 46 | try: 47 | gr = pandas_groupby_nan(df, ("value", "this")) 48 | t = True 49 | raise AssertionError("---") 50 | except (TypeError, KeyError): 51 | t = False 52 | if t: 53 | co = gr.sum() 54 | li = list(co["value"]) 55 | self.assertIsInstance(li[-1], float) 56 | self.assertTrue(numpy.isnan(li[-1])) 57 | try: 58 | gr = pandas_groupby_nan(df, ["value", "this"]) 59 | t = True 60 | except (TypeError, NotImplementedError): 61 | t = False 62 | 63 | if t: 64 | co = gr.sum() 65 | li = list(co["value"]) 66 | self.assertEqual(len(li), 2) 67 | 68 | def test_pandas_groupbynan_tuple(self): 69 | data = [ 70 | dict(a="a", b="b", c="c", n=1), 71 | dict(b="b", n=2), 72 | dict(a="a", n=3), 73 | dict(c="c", n=4), 74 | ] 75 | df = pandas.DataFrame(data) 76 | gr = df.groupby(["a", "b", "c"]).sum() 77 | self.assertEqual(gr.shape, (1, 1)) 78 | 79 | for nanback in [True, False]: 80 | try: 81 | gr2_ = pandas_groupby_nan( 82 | df, ["a", "b", "c"], nanback=nanback, suffix="NAN" 83 | ) 84 | except NotImplementedError: 85 | continue 86 | gr2 = gr2_.sum().sort_values("n") 87 | self.assertEqual(gr2.shape, (4, 4)) 88 | d = gr2.to_dict("records") 89 | self.assertEqual(d[0]["a"], "a") 90 | self.assertEqual(d[0]["b"], "b") 91 | self.assertEqual(d[0]["c"], "c") 92 | self.assertEqual(d[0]["n"], 1) 93 | self.assertEqual(d[1]["a"], "NAN") 94 | 95 | def test_pandas_groupbynan_regular(self): 96 | df = pandas.DataFrame([dict(a="a", b=1), dict(a="a", b=2)]) 97 | gr = df.groupby(["a"], as_index=False).sum() 98 | gr2_ = pandas_groupby_nan(df, ["a"]).sum() 99 | self.assertEqualDataFrame(gr, gr2_) 100 | 101 | def test_pandas_groupbynan_regular_nanback(self): 102 | df = pandas.DataFrame([dict(a="a", b=1, cc=0), dict(a="a", b=2)]) 103 | gr = df.groupby(["a", "cc"]).sum() 104 | self.assertEqual(len(gr), 1) 105 | 106 | def test_pandas_groupbynan_doc(self): 107 | data = [ 108 | dict(a=2, ind="a", n=1), 109 | dict(a=2, ind="a"), 110 | dict(a=3, ind="b"), 111 | dict(a=30), 112 | ] 113 | df = pandas.DataFrame(data) 114 | gr2 = pandas_groupby_nan(df, ["ind"]).sum() 115 | ind = list(gr2["ind"]) 116 | self.assertTrue(numpy.isnan(ind[-1])) 117 | val = list(gr2["a"]) 118 | self.assertEqual(val[-1], 30) 119 | 120 | @ignore_warnings(UserWarning) 121 | def test_pandas_groupbynan_doc2(self): 122 | data = [ 123 | dict(a=2, ind="a", n=1), 124 | dict(a=2, ind="a"), 125 | dict(a=3, ind="b"), 126 | dict(a=30), 127 | ] 128 | df = pandas.DataFrame(data) 129 | gr2 = pandas_groupby_nan(df, ["ind", "a"], nanback=False).sum() 130 | ind = list(gr2["ind"]) 131 | self.assertEqual(ind[-1], "²nan") 132 | 133 | def test_pandas_groupbynan_doc3(self): 134 | data = [ 135 | dict(a=2, ind="a", n=1), 136 | dict(a=2, ind="a"), 137 | dict(a=3, ind="b"), 138 | dict(a=30), 139 | ] 140 | df = pandas.DataFrame(data) 141 | gr2 = pandas_groupby_nan(df, ["ind", "n"]).sum() 142 | ind = list(gr2["ind"]) 143 | self.assertTrue(numpy.isnan(ind[-1])) 144 | 145 | 146 | if __name__ == "__main__": 147 | unittest.main() 148 | -------------------------------------------------------------------------------- 
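[Editor's aside — illustrative sketch, not a file of the repository] The tests above exercise pandas_groupby_nan which, unlike pandas' own groupby, keeps rows whose group key is NaN as a group of their own. A minimal usage sketch, assuming the package is installed and relying only on the call pattern the tests themselves show (pandas_groupby_nan(df, keys).sum()):

    import pandas
    from pandas_streaming.df import pandas_groupby_nan

    # three rows, the last one has no value for "ind" (its group key is NaN)
    df = pandas.DataFrame([dict(a=2, ind="a"), dict(a=3, ind="b"), dict(a=30)])

    dropped = df.groupby(["ind"]).sum()           # pandas drops the NaN key: 2 groups
    kept = pandas_groupby_nan(df, ["ind"]).sum()  # the NaN key is kept: 3 groups
    print(dropped.shape[0], kept.shape[0])        # expected to print: 2 3

The tests above also pass optional parameters such as nanback and suffix, which appear to control how the NaN placeholder is represented in the result; see test_pandas_groupbynan_tuple and test_pandas_groupbynan_doc2 for the behaviour actually asserted.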
/_unittests/ut_df/test_streaming_dataframe.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import unittest 4 | from io import StringIO 5 | import pandas 6 | import numpy 7 | from pandas_streaming.ext_test_case import ExtTestCase, ignore_warnings 8 | from pandas_streaming.data import dummy_streaming_dataframe 9 | from pandas_streaming.df import StreamingDataFrame 10 | from pandas_streaming.df.dataframe import StreamingDataFrameSchemaError 11 | 12 | 13 | class TestStreamingDataFrame(ExtTestCase): 14 | def test_shape(self): 15 | sdf = dummy_streaming_dataframe(100) 16 | dfs = list(sdf) 17 | self.assertEqual(len(dfs), 10) 18 | self.assertEqual(len(dfs), 10) 19 | shape = sdf.shape 20 | self.assertEqual(shape, (100, 2)) 21 | 22 | def test_init(self): 23 | sdf = dummy_streaming_dataframe(100) 24 | df1 = sdf.to_df() 25 | sdf2 = StreamingDataFrame(sdf) 26 | df2 = sdf2.to_df() 27 | self.assertEqualDataFrame(df1, df2) 28 | 29 | def test_to_csv(self): 30 | sdf = dummy_streaming_dataframe(100) 31 | st = sdf.to_csv() 32 | self.assertStartsWith(",cint,cstr\n0,0,s0", st.replace("\r", "")) 33 | st = sdf.to_csv() 34 | self.assertStartsWith(",cint,cstr\n0,0,s0", st.replace("\r", "")) 35 | 36 | def test_iterrows(self): 37 | sdf = dummy_streaming_dataframe(100) 38 | rows = list(sdf.iterrows()) 39 | self.assertEqual(sdf.shape[0], len(rows)) 40 | rows = list(sdf.iterrows()) 41 | self.assertEqual(sdf.shape[0], len(rows)) 42 | 43 | def test_head(self): 44 | sdf = dummy_streaming_dataframe(100) 45 | st = sdf.head() 46 | self.assertEqual(st.shape, (5, 2)) 47 | st = sdf.head(n=20) 48 | self.assertEqual(st.shape, (20, 2)) 49 | st = sdf.head(n=20) 50 | self.assertEqual(st.shape, (20, 2)) 51 | 52 | def test_tail(self): 53 | sdf = dummy_streaming_dataframe(100) 54 | st = sdf.tail() 55 | self.assertEqual(st.shape, (5, 2)) 56 | st = sdf.tail(n=20) 57 | self.assertEqual(st.shape, (10, 2)) 58 | 59 | def test_read_csv(self): 60 | with tempfile.TemporaryDirectory() as temp: 61 | df = pandas.DataFrame(data=dict(a=[5, 6], b=["er", "r"])) 62 | name = os.path.join(temp, "df.csv") 63 | name2 = os.path.join(temp, "df2.csv") 64 | name3 = os.path.join(temp, "df3.csv") 65 | df.to_csv(name, index=False) 66 | df.to_csv(name2, index=True) 67 | sdf = StreamingDataFrame.read_csv(name) 68 | text = sdf.to_csv(index=False) 69 | self.assertRaise( 70 | lambda: StreamingDataFrame.read_csv(name2, index_col=0, chunksize=None), 71 | ValueError, 72 | ) 73 | self.assertRaise( 74 | lambda: StreamingDataFrame.read_csv(name2, index_col=0, iterator=False), 75 | ValueError, 76 | ) 77 | sdf2 = StreamingDataFrame.read_csv(name2, index_col=0) 78 | text2 = sdf2.to_csv(index=True) 79 | sdf2.to_csv(name3, index=True) 80 | with open(name, "r", encoding="utf-8") as f: 81 | exp = f.read() 82 | with open(name2, "r", encoding="utf-8") as f: 83 | exp2 = f.read() 84 | with open(name3, "r", encoding="utf-8") as f: 85 | text3 = f.read() 86 | self.assertEqual(text.replace("\r", ""), exp) 87 | sdf2 = StreamingDataFrame.read_df(df) 88 | self.assertEqualDataFrame(sdf.to_dataframe(), sdf2.to_dataframe()) 89 | self.assertEqual(text2.replace("\r", ""), exp2) 90 | self.assertEqual( 91 | text3.replace("\r", "").replace("\n\n", "\n"), exp2.replace("\r", "") 92 | ) 93 | 94 | def test_where(self): 95 | sdf = dummy_streaming_dataframe(100) 96 | cols = sdf.columns 97 | self.assertEqual(list(cols), ["cint", "cstr"]) 98 | dts = sdf.dtypes 99 | self.assertEqual(len(dts), 2) 100 | res = sdf.where(lambda row: 
row["cint"] == 1) 101 | st = res.to_csv() 102 | self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", st.replace("\r", "")) 103 | res = sdf.where(lambda row: row["cint"] == 1) 104 | st = res.to_csv() 105 | self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", st.replace("\r", "")) 106 | 107 | def test_dataframe(self): 108 | sdf = dummy_streaming_dataframe(100) 109 | df = sdf.to_dataframe() 110 | self.assertEqual(df.shape, (100, 2)) 111 | 112 | def test_sample(self): 113 | sdf = dummy_streaming_dataframe(100) 114 | res = sdf.sample(frac=0.1) 115 | self.assertLesser(res.shape[0], 30) 116 | self.assertRaise(lambda: sdf.sample(n=5), ValueError) 117 | res = sdf.sample(frac=0.1) 118 | self.assertLesser(res.shape[0], 30) 119 | self.assertRaise(lambda: sdf.sample(n=5), ValueError) 120 | 121 | def test_sample_cache(self): 122 | sdf = dummy_streaming_dataframe(100) 123 | res = sdf.sample(frac=0.1, cache=True) 124 | df1 = res.to_df() 125 | df2 = res.to_df() 126 | self.assertEqualDataFrame(df1, df2) 127 | self.assertTrue(res.is_stable(n=df1.shape[0], do_check=True)) 128 | self.assertTrue(res.is_stable(n=df1.shape[0], do_check=False)) 129 | res = sdf.sample(frac=0.1, cache=False) 130 | self.assertFalse(res.is_stable(n=df1.shape[0], do_check=False)) 131 | 132 | def test_sample_reservoir_cache(self): 133 | sdf = dummy_streaming_dataframe(100) 134 | res = sdf.sample(n=10, cache=True, reservoir=True) 135 | df1 = res.to_df() 136 | df2 = res.to_df() 137 | self.assertEqualDataFrame(df1, df2) 138 | self.assertEqual(df1.shape, (10, res.shape[1])) 139 | self.assertRaise( 140 | lambda: sdf.sample(n=10, cache=False, reservoir=True), ValueError 141 | ) 142 | self.assertRaise( 143 | lambda: sdf.sample(frac=0.1, cache=True, reservoir=True), ValueError 144 | ) 145 | 146 | def test_apply(self): 147 | sdf = dummy_streaming_dataframe(100) 148 | self.assertNotEmpty(list(sdf)) 149 | sdf = sdf.applymap(str) 150 | self.assertNotEmpty(list(sdf)) 151 | sdf = sdf.apply(lambda row: row[["cint"]] + "r", axis=1) 152 | self.assertNotEmpty(list(sdf)) 153 | text = sdf.to_csv(header=False) 154 | self.assertStartsWith("0,0r\n1,1r\n2,2r\n3,3r", text.replace("\r", "")) 155 | 156 | def test_train_test_split(self): 157 | sdf = dummy_streaming_dataframe(100) 158 | tr, te = sdf.train_test_split(index=False, streaming=False) 159 | self.assertRaise( 160 | lambda: StreamingDataFrame.read_str(tr, chunksize=None), ValueError 161 | ) 162 | self.assertRaise( 163 | lambda: StreamingDataFrame.read_str(tr, iterator=False), ValueError 164 | ) 165 | StreamingDataFrame.read_str(tr.encode("utf-8")) 166 | trsdf = StreamingDataFrame.read_str(tr) 167 | tesdf = StreamingDataFrame.read_str(te) 168 | trdf = trsdf.to_dataframe() 169 | tedf = tesdf.to_dataframe() 170 | df_exp = sdf.to_dataframe() 171 | df_val = pandas.concat([trdf, tedf]) 172 | self.assertEqual(df_exp.shape, df_val.shape) 173 | df_val = df_val.sort_values("cint").reset_index(drop=True) 174 | self.assertEqualDataFrame(df_val, df_exp) 175 | 176 | def test_train_test_split_streaming(self): 177 | sdf = dummy_streaming_dataframe(100, asfloat=True) 178 | trsdf, tesdf = sdf.train_test_split( 179 | streaming=True, unique_rows=True, partitions=[0.7, 0.3] 180 | ) 181 | trdf = trsdf.to_dataframe() 182 | tedf = tesdf.to_dataframe() 183 | df_exp = sdf.to_dataframe() 184 | df_val = pandas.concat([trdf, tedf]) 185 | self.assertEqual(df_exp.shape, df_val.shape) 186 | df_val = df_val.sort_values("cfloat").reset_index(drop=True) 187 | self.assertEqualDataFrame(df_val, df_exp) 188 | trdf2 = trsdf.to_dataframe() 189 | 
tedf2 = tesdf.to_dataframe() 190 | df_val = pandas.concat([trdf2, tedf2]) 191 | self.assertEqual(df_exp.shape, df_val.shape) 192 | df_val = df_val.sort_values("cfloat").reset_index(drop=True) 193 | self.assertEqualDataFrame(df_val, df_exp) 194 | self.assertEqual(trdf.shape, trdf2.shape) 195 | self.assertEqual(tedf.shape, tedf2.shape) 196 | self.assertGreater(trdf.shape[0], tedf.shape[0]) 197 | self.assertGreater(trdf2.shape[0], tedf2.shape[0]) 198 | 199 | def test_train_test_split_streaming_tiny(self): 200 | df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"])) 201 | 202 | sdf2 = StreamingDataFrame.read_df(pandas.concat([df, df])) 203 | sdftr, sdfte = sdf2.train_test_split(test_size=0.5) 204 | df1 = sdfte.head() 205 | df2 = sdfte.head() 206 | if df1 is not None or df2 is not None: 207 | self.assertEqualDataFrame(df1, df2) 208 | df1 = sdftr.head() 209 | df2 = sdftr.head() 210 | if df1 is not None or df2 is not None: 211 | self.assertEqualDataFrame(df1, df2) 212 | sdf = StreamingDataFrame.read_df(df) 213 | sdf2 = sdf.concat(sdf, axis=0) 214 | sdftr, sdfte = sdf2.train_test_split(test_size=0.5) 215 | df1 = sdfte.head() 216 | df2 = sdfte.head() 217 | if df1 is not None or df2 is not None: 218 | self.assertEqualDataFrame(df1, df2) 219 | df1 = sdftr.head() 220 | df2 = sdftr.head() 221 | if df1 is not None or df2 is not None: 222 | self.assertEqualDataFrame(df1, df2) 223 | 224 | def test_train_test_split_streaming_strat(self): 225 | sdf = dummy_streaming_dataframe( 226 | 100, asfloat=True, tify=["t1" if i % 3 else "t0" for i in range(100)] 227 | ) 228 | trsdf, tesdf = sdf.train_test_split( 229 | streaming=True, unique_rows=True, stratify="tify" 230 | ) 231 | trdf = trsdf.to_dataframe() 232 | tedf = tesdf.to_dataframe() 233 | df_exp = sdf.to_dataframe() 234 | df_val = pandas.concat([trdf, tedf]) 235 | self.assertEqual(df_exp.shape, df_val.shape) 236 | df_val = df_val.sort_values("cfloat").reset_index(drop=True) 237 | self.assertEqualDataFrame(df_val, df_exp) 238 | trdf = trsdf.to_dataframe() 239 | tedf = tesdf.to_dataframe() 240 | df_val = pandas.concat([trdf, tedf]) 241 | self.assertEqual(df_exp.shape, df_val.shape) 242 | df_val = df_val.sort_values("cfloat").reset_index(drop=True) 243 | self.assertEqualDataFrame(df_val, df_exp) 244 | trgr = trdf.groupby("tify").count() 245 | trgr["part"] = 0 246 | tegr = tedf.groupby("tify").count() 247 | tegr["part"] = 1 248 | gr = pandas.concat([trgr, tegr]) 249 | self.assertGreater(gr["cfloat"].min(), 4) 250 | 251 | def test_train_test_split_file(self): 252 | with tempfile.TemporaryDirectory() as temp: 253 | names = [os.path.join(temp, "train.txt"), os.path.join(temp, "test.txt")] 254 | sdf = dummy_streaming_dataframe(100) 255 | sdf.train_test_split(names, index=False, streaming=False) 256 | trsdf = StreamingDataFrame.read_csv(names[0]) 257 | tesdf = StreamingDataFrame.read_csv(names[1]) 258 | self.assertGreater(trsdf.shape[0], 20) 259 | self.assertGreater(tesdf.shape[0], 20) 260 | trdf = trsdf.to_dataframe() 261 | tedf = tesdf.to_dataframe() 262 | self.assertGreater(trdf.shape[0], 20) 263 | self.assertGreater(tedf.shape[0], 20) 264 | df_exp = sdf.to_dataframe() 265 | df_val = pandas.concat([trdf, tedf]) 266 | self.assertEqual(df_exp.shape, df_val.shape) 267 | df_val = df_val.sort_values("cint").reset_index(drop=True) 268 | self.assertEqualDataFrame(df_val, df_exp) 269 | 270 | def test_train_test_split_file_pattern(self): 271 | with tempfile.TemporaryDirectory() as temp: 272 | sdf = dummy_streaming_dataframe(100) 273 | names = os.path.join(temp, 
"spl_{0}.txt") 274 | self.assertRaise( 275 | lambda: sdf.train_test_split(names, index=False, streaming=False), 276 | ValueError, 277 | ) 278 | names = os.path.join(temp, "spl_{}.txt") 279 | tr, te = sdf.train_test_split(names, index=False, streaming=False) 280 | trsdf = StreamingDataFrame.read_csv(tr) 281 | tesdf = StreamingDataFrame.read_csv(te) 282 | trdf = trsdf.to_dataframe() 283 | tedf = tesdf.to_dataframe() 284 | df_exp = sdf.to_dataframe() 285 | df_val = pandas.concat([trdf, tedf]) 286 | self.assertEqual(df_exp.shape, df_val.shape) 287 | df_val = df_val.sort_values("cint").reset_index(drop=True) 288 | self.assertEqualDataFrame(df_val, df_exp) 289 | 290 | def test_merge(self): 291 | def compares(a, b, how): 292 | m = a.merge(b, on="cint", indicator=True) 293 | dm = m.to_dataframe() 294 | da = a.to_dataframe() 295 | db = b.to_dataframe() 296 | exp = da.merge(db, on="cint", indicator=True) 297 | self.assertEqualDataFrame( 298 | dm.reset_index(drop=True), exp.reset_index(drop=True) 299 | ) 300 | 301 | sdf20 = dummy_streaming_dataframe(20) 302 | sdf30 = dummy_streaming_dataframe(30) 303 | # itself 304 | hows = "inner left right outer".split() 305 | for how in hows: 306 | compares(sdf20, sdf20, how) 307 | compares(sdf20, sdf20, how) 308 | for how in hows: 309 | compares(sdf20, sdf30, how) 310 | compares(sdf20, sdf30, how) 311 | for how in hows: 312 | compares(sdf30, sdf20, how) 313 | compares(sdf30, sdf20, how) 314 | sdf20.merge(sdf20.to_dataframe(), on="cint", indicator=True) 315 | 316 | def test_concatv(self): 317 | sdf20 = dummy_streaming_dataframe(20) 318 | sdf30 = dummy_streaming_dataframe(30) 319 | df20 = sdf20.to_dataframe() 320 | df30 = sdf30.to_dataframe() 321 | df = pandas.concat([df20, df30], axis=0) 322 | 323 | m1 = sdf20.concat(sdf30, axis=0) 324 | self.assertEqualDataFrame(m1.to_dataframe(), df) 325 | m1 = sdf20.concat(df30, axis=0) 326 | self.assertEqualDataFrame(m1.to_dataframe(), df) 327 | m1 = sdf20.concat(map(lambda x: x, [df30]), axis=0) # noqa: C417 328 | self.assertEqualDataFrame(m1.to_dataframe(), df) 329 | m1 = sdf20.concat(map(lambda x: x, [df30]), axis=0) # noqa: C417 330 | self.assertEqualDataFrame(m1.to_dataframe(), df) 331 | 332 | df20["cint"] = df20["cint"].astype(float) 333 | self.assertRaise( 334 | lambda: sdf20.concat(df20).to_dataframe(), 335 | ValueError, 336 | "Frame others[0] do not have the same column types", 337 | ) 338 | df30["g"] = 4 339 | self.assertRaise( 340 | lambda: sdf20.concat(df30).to_dataframe(), 341 | ValueError, 342 | "Frame others[0] do not have the same column names", 343 | ) 344 | 345 | def test_concath(self): 346 | sdf20 = dummy_streaming_dataframe(20) 347 | sdf30 = dummy_streaming_dataframe(20) 348 | df20 = sdf20.to_dataframe() 349 | df30 = sdf30.to_dataframe() 350 | df = pandas.concat([df20, df30], axis=1) 351 | 352 | m1 = sdf20.concat(sdf30, axis=1) 353 | self.assertEqualDataFrame(m1.to_dataframe(), df) 354 | sdf22 = dummy_streaming_dataframe(22) 355 | sdf25 = dummy_streaming_dataframe(25) 356 | self.assertRaise( 357 | lambda: sdf22.concat(sdf25, axis=1).to_dataframe(), RuntimeError 358 | ) 359 | 360 | def test_groupby(self): 361 | df20 = dummy_streaming_dataframe(20).to_dataframe() 362 | df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) 363 | sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) 364 | gr = sdf20.groupby("key", lambda gr: gr.sum()) 365 | gr2 = df20.groupby("key").sum() 366 | self.assertEqualDataFrame(gr, gr2) 367 | self.assertRaise( 368 | lambda: sdf20.groupby("key", in_memory=False), NotImplementedError 
369 | ) 370 | 371 | # Do not replace lambda c:sum(c) by sum or... 372 | # pandas.core.base.SpecificationError: Function names 373 | # must be unique, found multiple named sum 374 | gr2 = ( 375 | df20.drop("cstr", axis=1).groupby("key").agg([numpy.sum, lambda c: sum(c)]) 376 | ) 377 | gr = sdf20.drop("cstr", axis=1).groupby( 378 | "key", lambda gr: gr.agg([numpy.sum, lambda c: sum(c)]) 379 | ) 380 | self.assertEqualDataFrame(gr, gr2) 381 | 382 | gr = sdf20.groupby("key", lambda gr: gr.count()) 383 | gr2 = df20.groupby("key").count() 384 | self.assertEqualDataFrame(gr, gr2) 385 | 386 | df = pandas.DataFrame(dict(A=[3, 4, 3], B=[5, 6, 7])) 387 | sdf = StreamingDataFrame.read_df(df) 388 | gr = sdf.groupby("A") 389 | gr2 = df.groupby("A").sum() 390 | self.assertEqualDataFrame(gr, gr2) 391 | 392 | def test_groupby_cum(self): 393 | df20 = dummy_streaming_dataframe(20).to_dataframe() 394 | df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) 395 | sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) 396 | sgr = sdf20.groupby_streaming( 397 | "key", lambda gr: gr.sum(), strategy="cum", as_index=False 398 | ) 399 | gr2 = df20.groupby("key", as_index=False).sum() 400 | lastgr = None 401 | for gr in sgr: 402 | self.assertEqual(list(gr.columns), list(gr2.columns)) 403 | lastgr = gr 404 | self.assertEqualDataFrame(lastgr, gr2) 405 | 406 | def test_groupby_streaming(self): 407 | df20 = dummy_streaming_dataframe(20).to_dataframe() 408 | df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) 409 | sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) 410 | sgr = sdf20.groupby_streaming( 411 | "key", lambda gr: gr.sum(), strategy="streaming", as_index=False 412 | ) 413 | gr2 = df20.groupby("key", as_index=False).sum() 414 | grs = list(sgr) 415 | gr = pandas.concat(grs).groupby("key", as_index=False).sum() 416 | self.assertEqualDataFrame(gr, gr2) 417 | 418 | def test_groupby_cum_asindex(self): 419 | df20 = dummy_streaming_dataframe(20).to_dataframe() 420 | df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) 421 | sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) 422 | sgr = sdf20.groupby_streaming( 423 | "key", lambda gr: gr.sum(), strategy="cum", as_index=True 424 | ) 425 | gr2 = df20.groupby("key", as_index=True).sum() 426 | lastgr = None 427 | for gr in sgr: 428 | self.assertEqual(list(gr.columns), list(gr2.columns)) 429 | lastgr = gr 430 | self.assertEqualDataFrame(lastgr, gr2) 431 | 432 | def test_merge_2(self): 433 | df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"])) 434 | df2 = pandas.concat([df, df]) 435 | sdf = StreamingDataFrame.read_df(df) 436 | sdf2 = sdf.concat(sdf, axis=0) 437 | self.assertEqualDataFrame(df2, sdf2.to_dataframe()) 438 | self.assertEqualDataFrame(df2, sdf2.to_dataframe()) 439 | m = pandas.DataFrame(dict(Y=["a", "b"], Z=[10, 20])) 440 | jm = df2.merge(m, left_on="Y", right_on="Y", how="outer") 441 | sjm = sdf2.merge(m, left_on="Y", right_on="Y", how="outer") 442 | self.assertEqualDataFrame( 443 | jm.sort_values(["X", "Y"]).reset_index(drop=True), 444 | sjm.to_dataframe().sort_values(["X", "Y"]).reset_index(drop=True), 445 | ) 446 | 447 | @ignore_warnings(ResourceWarning) 448 | def test_schema_consistent(self): 449 | df = pandas.DataFrame( 450 | [ 451 | dict(cf=0, cint=0, cstr="0"), 452 | dict(cf=1, cint=1, cstr="1"), 453 | dict(cf=2, cint="s2", cstr="2"), 454 | dict(cf=3, cint=3, cstr="3"), 455 | ] 456 | ) 457 | with tempfile.TemporaryDirectory() as temp: 458 | name = os.path.join(temp, "df.csv") 459 | stio = StringIO() 460 | df.to_csv(stio, index=False) 
461 | self.assertNotEmpty(stio.getvalue()) 462 | df.to_csv(name, index=False) 463 | self.assertEqual(df.shape, (4, 3)) 464 | sdf = StreamingDataFrame.read_csv(name, chunksize=2) 465 | self.assertRaise(lambda: list(sdf), StreamingDataFrameSchemaError) 466 | sdf = StreamingDataFrame.read_csv(name, chunksize=2, check_schema=False) 467 | pieces = list(sdf) 468 | self.assertEqual(len(pieces), 2) 469 | 470 | def test_getitem(self): 471 | sdf = dummy_streaming_dataframe(100) 472 | sdf2 = sdf[["cint"]] 473 | self.assertEqual(sdf2.shape, (100, 1)) 474 | df1 = sdf.to_df() 475 | df2 = sdf2.to_df() 476 | self.assertEqualDataFrame(df1[["cint"]], df2) 477 | self.assertRaise(lambda: sdf[:, "cint"], NotImplementedError) 478 | 479 | @ignore_warnings(ResourceWarning) 480 | def test_read_csv_names(self): 481 | this = os.path.abspath(os.path.dirname(__file__)) 482 | data = os.path.join(this, "data", "buggy_hash2.csv") 483 | df = pandas.read_csv(data, sep="\t", names=["A", "B", "C"], header=None) 484 | sdf = StreamingDataFrame.read_csv( 485 | data, sep="\t", names=["A", "B", "C"], chunksize=2, header=None 486 | ) 487 | head = sdf.head(n=1) 488 | self.assertEqualDataFrame(df.head(n=1), head) 489 | 490 | def test_add_column(self): 491 | df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"])) 492 | sdf = StreamingDataFrame.read_df(df) 493 | sdf2 = sdf.add_column("d", lambda _row: 1) 494 | df2 = sdf2.to_dataframe() 495 | df["d"] = 1 496 | self.assertEqualDataFrame(df, df2) 497 | 498 | sdf3 = StreamingDataFrame.read_df(df) 499 | sdf4 = sdf3.add_column("dd", 2) 500 | df4 = sdf4.to_dataframe() 501 | df["dd"] = 2 502 | self.assertEqualDataFrame(df, df4) 503 | 504 | sdfA = StreamingDataFrame.read_df(df) 505 | sdfB = sdfA.add_column("dd12", lambda row: row["dd"] + 10) 506 | dfB = sdfB.to_dataframe() 507 | df["dd12"] = 12 508 | self.assertEqualDataFrame(df, dfB) 509 | 510 | def test_fillna(self): 511 | df = pandas.DataFrame(data=dict(X=[4.5, numpy.nan, 7], Y=["a", "b", numpy.nan])) 512 | sdf = StreamingDataFrame.read_df(df) 513 | 514 | df2 = pandas.DataFrame(data=dict(X=[4.5, 10.0, 7], Y=["a", "b", "NAN"])) 515 | na = sdf.fillna(value=dict(X=10.0, Y="NAN")) 516 | ndf = na.to_df() 517 | self.assertEqualDataFrame(ndf, df2) 518 | 519 | df3 = pandas.DataFrame(data=dict(X=[4.5, 10.0, 7], Y=["a", "b", numpy.nan])) 520 | na = sdf.fillna(value=dict(X=10.0)) 521 | ndf = na.to_df() 522 | self.assertEqualDataFrame(ndf, df3) 523 | 524 | def test_describe(self): 525 | x = numpy.arange(100001).astype(numpy.float64) / 100000 - 0.5 526 | y = numpy.arange(100001).astype(numpy.int64) 527 | z = numpy.array([chr(65 + j % 45) for j in y]) 528 | df = pandas.DataFrame(data=dict(X=x, Y=y, Z=z)) 529 | sdf = StreamingDataFrame.read_df(df) 530 | 531 | desc = sdf.describe() 532 | self.assertEqual(["X", "Y"], list(desc.columns)) 533 | self.assertEqual(desc.loc["min", :].tolist(), [-0.5, 0]) 534 | self.assertEqual(desc.loc["max", :].tolist(), [0.5, 100000]) 535 | self.assertEqualArray( 536 | desc.loc["mean", :], numpy.array([0, 50000], dtype=numpy.float64), atol=1e-8 537 | ) 538 | self.assertEqualArray(desc.loc["25%", :], numpy.array([-0.25, 25000])) 539 | self.assertEqualArray(desc.loc["50%", :], numpy.array([0.0, 50000])) 540 | self.assertEqualArray(desc.loc["75%", :], numpy.array([0.25, 75000])) 541 | self.assertEqualArray( 542 | desc.loc["std", :], numpy.array([2.886795e-01, 28867.946472]), atol=1e-4 543 | ) 544 | 545 | def test_set_item(self): 546 | df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7])) 547 | self.assertRaise(lambda: 
StreamingDataFrame(df), TypeError) 548 | sdf = StreamingDataFrame.read_df(df) 549 | 550 | def f(): 551 | sdf[["a"]] = 10 552 | 553 | self.assertRaise(f, ValueError) 554 | 555 | def g(): 556 | sdf["a"] = [10] 557 | 558 | self.assertRaise(g, NotImplementedError) 559 | 560 | sdf["aa"] = 10 561 | df = sdf.to_df() 562 | ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], aa=[10])) 563 | self.assertEqualDataFrame(df, ddf) 564 | sdf["bb"] = sdf["b"] + 10 565 | df = sdf.to_df() 566 | ddf = ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], aa=[10], bb=[16])) 567 | self.assertEqualDataFrame(df, ddf) 568 | 569 | def test_set_item_function(self): 570 | df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7])) 571 | self.assertRaise(lambda: StreamingDataFrame(df), TypeError) 572 | sdf = StreamingDataFrame.read_df(df) 573 | sdf["bb"] = sdf["b"].apply(lambda x: x + 11) 574 | df = sdf.to_df() 575 | ddf = ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], bb=[17])) 576 | self.assertEqualDataFrame(df, ddf) 577 | 578 | 579 | if __name__ == "__main__": 580 | unittest.main(verbosity=2) 581 | -------------------------------------------------------------------------------- /_unittests/ut_module/test_sklearn.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy 3 | import pandas 4 | from sklearn.linear_model import LogisticRegression 5 | from pandas_streaming.ext_test_case import ExtTestCase 6 | 7 | 8 | class TestScikitLearn(ExtTestCase): 9 | def test_logistic_regression_check(self): 10 | X = pandas.DataFrame(numpy.array([[0.1, 0.2], [-0.2, 0.3]])) 11 | Y = numpy.array([0, 1]) 12 | clq = LogisticRegression( 13 | fit_intercept=False, solver="liblinear", random_state=42 14 | ) 15 | clq.fit(X, Y) 16 | pred2 = clq.predict(X) 17 | self.assertEqualArray(numpy.array([0, 1]), pred2) 18 | 19 | 20 | if __name__ == "__main__": 21 | unittest.main() 22 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | image: 2 | - Visual Studio 2019 3 | environment: 4 | matrix: 5 | - PYTHON: "C:\\Python310-x64" 6 | PYTHON_VERSION: "3.10.x" 7 | PYTHON_ARCH: "64" 8 | init: 9 | - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%" 10 | 11 | install: 12 | - "%PYTHON%\\python -m pip install --upgrade pip" 13 | - "%PYTHON%\\Scripts\\pip install -r requirements-dev.txt" 14 | build: off 15 | 16 | before_test: 17 | - "%PYTHON%\\python -u setup.py build_ext --inplace" 18 | 19 | test_script: 20 | - "%PYTHON%\\python -u setup.py unittests" 21 | 22 | after_test: 23 | - "%PYTHON%\\python -u setup.py bdist_wheel" 24 | 25 | artifacts: 26 | - path: dist 27 | name: pandas_streaming 28 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | jobs: 2 | - job: 'TestLinuxWheelPip' 3 | pool: 4 | vmImage: 'ubuntu-latest' 5 | strategy: 6 | matrix: 7 | Python311-Linux: 8 | python.version: '3.11' 9 | maxParallel: 3 10 | 11 | steps: 12 | - task: UsePythonVersion@0 13 | inputs: 14 | versionSpec: '$(python.version)' 15 | architecture: 'x64' 16 | - script: sudo apt-get update 17 | displayName: 'AptGet Update' 18 | - script: sudo apt-get install -y graphviz 19 | displayName: 'Install Graphviz' 20 | - script: python -m pip install --upgrade pip setuptools wheel 21 | displayName: 'Install tools' 22 | - script: pip install -r 
requirements.txt 23 | displayName: 'Install Requirements' 24 | - script: pip install -r requirements-dev.txt 25 | displayName: 'Install Requirements dev' 26 | - script: | 27 | ruff check . 28 | displayName: 'Ruff' 29 | - script: | 30 | black --diff . 31 | displayName: 'Black' 32 | - script: | 33 | python -m pip wheel . --wheel-dir dist -v -v -v 34 | displayName: 'build wheel' 35 | - script: | 36 | python -m pip install . -v -v -v 37 | displayName: 'install wheel' 38 | - script: | 39 | python -m pytest 40 | displayName: 'Runs Unit Tests' 41 | - task: PublishPipelineArtifact@0 42 | inputs: 43 | artifactName: 'wheel-linux-wheel-$(python.version)' 44 | targetPath: 'dist' 45 | 46 | - job: 'TestLinuxNightly' 47 | pool: 48 | vmImage: 'ubuntu-latest' 49 | strategy: 50 | matrix: 51 | Python311-Linux: 52 | python.version: '3.11' 53 | maxParallel: 3 54 | 55 | steps: 56 | - task: UsePythonVersion@0 57 | inputs: 58 | versionSpec: '$(python.version)' 59 | architecture: 'x64' 60 | - script: sudo apt-get update 61 | displayName: 'AptGet Update' 62 | - script: sudo apt-get install -y pandoc 63 | displayName: 'Install Pandoc' 64 | - script: sudo apt-get install -y inkscape 65 | displayName: 'Install Inkscape' 66 | - script: sudo apt-get install -y graphviz 67 | displayName: 'Install Graphviz' 68 | - script: python -m pip install --upgrade pip setuptools wheel 69 | displayName: 'Install tools' 70 | - script: pip install -r requirements.txt 71 | displayName: 'Install Requirements' 72 | - script: pip install -r requirements-dev.txt 73 | displayName: 'Install Requirements dev' 74 | - script: pip uninstall -y scikit-learn 75 | displayName: 'Uninstall scikit-learn' 76 | - script: pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn 77 | displayName: 'Install scikit-learn nightly' 78 | - script: | 79 | ruff check . 80 | displayName: 'Ruff' 81 | - script: | 82 | black --diff . 83 | displayName: 'Black' 84 | - script: | 85 | python -m pytest 86 | displayName: 'Runs Unit Tests' 87 | 88 | - job: 'TestLinux' 89 | pool: 90 | vmImage: 'ubuntu-latest' 91 | strategy: 92 | matrix: 93 | Python311-Linux: 94 | python.version: '3.11' 95 | maxParallel: 3 96 | 97 | steps: 98 | - task: UsePythonVersion@0 99 | inputs: 100 | versionSpec: '$(python.version)' 101 | architecture: 'x64' 102 | - script: sudo apt-get update 103 | displayName: 'AptGet Update' 104 | - script: sudo apt-get install -y pandoc 105 | displayName: 'Install Pandoc' 106 | - script: sudo apt-get install -y inkscape 107 | displayName: 'Install Inkscape' 108 | - script: sudo apt-get install -y graphviz 109 | displayName: 'Install Graphviz' 110 | - script: python -m pip install --upgrade pip setuptools wheel 111 | displayName: 'Install tools' 112 | - script: pip install -r requirements.txt 113 | displayName: 'Install Requirements' 114 | - script: pip install -r requirements-dev.txt 115 | displayName: 'Install Requirements dev' 116 | - script: | 117 | ruff check . 118 | displayName: 'Ruff' 119 | - script: | 120 | black --diff . 
121 | displayName: 'Black' 122 | - script: | 123 | python -m pytest --cov 124 | displayName: 'Runs Unit Tests' 125 | - script: | 126 | python -u setup.py bdist_wheel 127 | displayName: 'Build Package' 128 | #- script: | 129 | # python -m sphinx _doc dist/html 130 | # displayName: 'Builds Documentation' 131 | - task: PublishPipelineArtifact@0 132 | inputs: 133 | artifactName: 'wheel-linux-$(python.version)' 134 | targetPath: 'dist' 135 | 136 | - job: 'TestWindows' 137 | pool: 138 | vmImage: 'windows-latest' 139 | strategy: 140 | matrix: 141 | Python311-Windows: 142 | python.version: '3.11' 143 | maxParallel: 3 144 | 145 | steps: 146 | - task: UsePythonVersion@0 147 | inputs: 148 | versionSpec: '$(python.version)' 149 | architecture: 'x64' 150 | - script: python -m pip install --upgrade pip setuptools wheel 151 | displayName: 'Install tools' 152 | - script: pip install -r requirements.txt 153 | displayName: 'Install Requirements' 154 | - script: pip install -r requirements-dev.txt 155 | displayName: 'Install Requirements dev' 156 | - script: | 157 | python -m pytest 158 | displayName: 'Runs Unit Tests' 159 | - script: | 160 | python -u setup.py bdist_wheel 161 | displayName: 'Build Package' 162 | - task: PublishPipelineArtifact@0 163 | inputs: 164 | artifactName: 'wheel-windows-$(python.version)' 165 | targetPath: 'dist' 166 | 167 | - job: 'TestMac' 168 | pool: 169 | vmImage: 'macOS-latest' 170 | strategy: 171 | matrix: 172 | Python311-Mac: 173 | python.version: '3.11' 174 | maxParallel: 3 175 | 176 | steps: 177 | - task: UsePythonVersion@0 178 | inputs: 179 | versionSpec: '$(python.version)' 180 | architecture: 'x64' 181 | - script: gcc --version 182 | displayName: 'gcc version' 183 | #- script: brew upgrade 184 | # displayName: 'brew upgrade' 185 | #- script: brew update 186 | # displayName: 'brew update' 187 | - script: export 188 | displayName: 'export' 189 | - script: gcc --version 190 | displayName: 'gcc version' 191 | - script: python -m pip install --upgrade pip setuptools wheel 192 | displayName: 'Install tools' 193 | - script: pip install -r requirements.txt 194 | displayName: 'Install Requirements' 195 | - script: pip install -r requirements-dev.txt 196 | displayName: 'Install Requirements dev' 197 | - script: | 198 | python -m pytest 199 | displayName: 'Runs Unit Tests' 200 | - script: | 201 | python -u setup.py bdist_wheel 202 | displayName: 'Build Package' 203 | - task: PublishPipelineArtifact@0 204 | inputs: 205 | artifactName: 'wheel-mac-$(python.version)' 206 | targetPath: 'dist' 207 | 208 | -------------------------------------------------------------------------------- /pandas_streaming/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.5.1" 2 | __author__ = "Xavier Dupré" 3 | __github__ = "https://github.com/sdpython/pandas_streaming" 4 | __url__ = "https://sdpython.github.io/doc/pandas-streaming/dev/" 5 | __license__ = "MIT License" 6 | -------------------------------------------------------------------------------- /pandas_streaming/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dummy import dummy_streaming_dataframe 2 | -------------------------------------------------------------------------------- /pandas_streaming/data/dummy.py: -------------------------------------------------------------------------------- 1 | from pandas import DataFrame 2 | from ..df import StreamingDataFrame 3 | 4 | 5 | def dummy_streaming_dataframe(n, chunksize=10, 
asfloat=False, **cols): 6 | """ 7 | Returns a dummy streaming dataframe 8 | mostly for unit test purposes. 9 | 10 | :param n: number of rows 11 | :param chunksize: chunk size 12 | :param asfloat: use random float and not random int 13 | :param cols: additional columns 14 | :return: a @see cl StreamingDataFrame 15 | """ 16 | if asfloat: 17 | df = DataFrame( 18 | dict( 19 | cfloat=[_ + 0.1 for _ in range(n)], 20 | cstr=[f"s{i}" for i in range(n)], 21 | ) 22 | ) 23 | else: 24 | df = DataFrame(dict(cint=list(range(n)), cstr=[f"s{i}" for i in range(n)])) 25 | for k, v in cols.items(): 26 | df[k] = v 27 | return StreamingDataFrame.read_df(df, chunksize=chunksize) 28 | -------------------------------------------------------------------------------- /pandas_streaming/df/__init__.py: -------------------------------------------------------------------------------- 1 | from .connex_split import ( 2 | train_test_split_weights, 3 | train_test_connex_split, 4 | train_test_apart_stratify, 5 | ) 6 | from .dataframe import StreamingDataFrame 7 | from .dataframe_helpers import ( 8 | dataframe_hash_columns, 9 | dataframe_unfold, 10 | dataframe_shuffle, 11 | ) 12 | from .dataframe_helpers import pandas_groupby_nan, numpy_types 13 | from .dataframe_io import to_zip, read_zip 14 | -------------------------------------------------------------------------------- /pandas_streaming/df/connex_split.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from logging import getLogger 3 | from typing import Optional, Tuple 4 | import pandas 5 | import numpy 6 | from .dataframe_helpers import dataframe_shuffle 7 | 8 | logger = getLogger("pandas-streaming") 9 | 10 | 11 | class ImbalancedSplitException(Exception): 12 | """ 13 | Raised when an imbalanced split is detected. 14 | """ 15 | 16 | 17 | def train_test_split_weights( 18 | df, 19 | weights=None, 20 | test_size=0.25, 21 | train_size=None, 22 | shuffle=True, 23 | fail_imbalanced=0.05, 24 | random_state=None, 25 | ): 26 | """ 27 | Splits a database in train/test given, every row 28 | can have a different weight. 29 | 30 | :param df: :class:`pandas.DataFrame` or see 31 | :class:`StreamingDataFrame ` 32 | :param weights: None or weights or weights column name 33 | :param test_size: ratio for the test partition 34 | (if *train_size* is not specified) 35 | :param train_size: ratio for the train partition 36 | :param shuffle: shuffles before the split 37 | :param fail_imbalanced: raises an exception if relative weights 38 | difference is higher than this value 39 | :param random_state: seed for random generators 40 | :return: train and test :class:`pandas.DataFrame` 41 | 42 | If the dataframe is not shuffled first, the function 43 | will produce two datasets which are unlikely to be randomized 44 | as the function tries to keep equal weights among both paths 45 | without using randomness. 46 | """ 47 | if hasattr(df, "iter_creation"): 48 | raise NotImplementedError( # pragma: no cover 49 | "Not implemented yet for StreamingDataFrame." 50 | ) 51 | if isinstance(df, numpy.ndarray): 52 | raise NotImplementedError( # pragma: no cover 53 | "Not implemented on numpy arrays." 54 | ) 55 | if shuffle: 56 | df = dataframe_shuffle(df, random_state=random_state) 57 | if weights is None: 58 | if test_size == 0 or train_size == 0: 59 | raise ValueError( 60 | f"test_size={test_size} or train_size={train_size} cannot be null (1)." 
61 | ) 62 | from sklearn.model_selection import train_test_split 63 | 64 | return train_test_split( 65 | df, test_size=test_size, train_size=train_size, random_state=random_state 66 | ) 67 | 68 | if isinstance(weights, pandas.Series): 69 | weights = list(weights) 70 | elif isinstance(weights, str): 71 | weights = list(df[weights]) 72 | if len(weights) != df.shape[0]: 73 | raise ValueError( 74 | "Dimension mismatch between weights and dataframe " # noqa: UP030 75 | "{0} != {1}".format(df.shape[0], len(weights)) 76 | ) 77 | 78 | p = (1 - test_size) if test_size else None 79 | if train_size is not None: 80 | p = train_size 81 | test_size = 1 - p 82 | if p is None or min(test_size, p) <= 0: 83 | raise ValueError( 84 | f"test_size={test_size} or train_size={train_size} cannot be null (2)." 85 | ) 86 | ratio = test_size / p 87 | 88 | if random_state is None: 89 | randint = numpy.random.randint 90 | else: 91 | state = numpy.random.RandomState(random_state) 92 | randint = state.randint 93 | 94 | balance = 0 95 | train_ids = [] 96 | test_ids = [] 97 | test_weights = 0 98 | train_weights = 0 99 | for i in range(df.shape[0]): 100 | w = weights[i] 101 | if balance == 0: 102 | h = randint(0, 1) 103 | totest = h == 0 104 | else: 105 | totest = balance < 0 106 | if totest: 107 | test_ids.append(i) 108 | balance += w 109 | test_weights += w 110 | else: 111 | train_ids.append(i) 112 | balance -= w * ratio 113 | train_weights += w * ratio 114 | 115 | r = abs(train_weights - test_weights) / (1.0 * (train_weights + test_weights)) 116 | if r >= fail_imbalanced: 117 | raise ImbalancedSplitException( # pragma: no cover 118 | "Split is imbalanced: train_weights={0} test_weights={1} r={2}." # noqa: UP030 119 | "".format(train_weights, test_weights, r) 120 | ) 121 | 122 | return df.iloc[train_ids, :], df.iloc[test_ids, :] 123 | 124 | 125 | def train_test_connex_split( 126 | df, 127 | groups, 128 | test_size=0.25, 129 | train_size=None, 130 | stratify=None, 131 | hash_size=9, 132 | unique_rows=False, 133 | shuffle=True, 134 | fail_imbalanced=0.05, 135 | keep_balance=None, 136 | stop_if_bigger=None, 137 | return_cnx=False, 138 | must_groups=None, 139 | random_state=None, 140 | verbose=0, 141 | ): 142 | """ 143 | This split is for a specific case where data is linked 144 | in many ways. Let's assume we have three ids as we have 145 | for online sales: *(product id, user id, card id)*. 146 | As we may need to compute aggregated features, 147 | we need every id not to be present in both train and 148 | test set. The function computes the connected components 149 | and breaks each of them in two parts for train and test. 
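    For instance, rows sharing any id always end up on the same side.
    A minimal sketch (the ids below are made up for the illustration)::

        from pandas import DataFrame
        from pandas_streaming.df import train_test_connex_split

        # The first two rows share card C1, so they belong to the same
        # connected component and can never be separated by the split.
        df = DataFrame([dict(user="UA", prod="PA", card="C1"),
                        dict(user="UB", prod="PB", card="C1"),
                        dict(user="UC", prod="PC", card="C2")])
        train, test = train_test_connex_split(
            df, groups=["user", "prod", "card"], test_size=0.5,
            fail_imbalanced=1.0)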
150 | 151 | :param df: :epkg:`pandas:DataFrame` 152 | :param groups: column names for the ids 153 | :param test_size: ratio for the test partition 154 | (if *train_size* is not specified) 155 | :param train_size: ratio for the train partition 156 | :param stratify: column holding the stratification 157 | :param hash_size: size of the hash to cache information about partition 158 | :param unique_rows: ensures that rows are unique 159 | :param shuffle: shuffles before the split 160 | :param fail_imbalanced: raises an exception if relative weights difference 161 | is higher than this value 162 | :param stop_if_bigger: (float) stops a connected component from growing 163 | bigger than this ratio of elements, this should not be used 164 | unless a big component emerges, the algorithm stops merging 165 | but does not guarantee it returns the best cut, 166 | the value should be close to 0 167 | :param keep_balance: (float), if not None, does not merge connected components 168 | if their relative sizes are too different, 169 | the value should be close to 1 170 | :param return_cnx: returns connected components as a third result 171 | :param must_groups: column name for ids which must not be shared by 172 | train/test partitions 173 | :param random_state: seed for random generator 174 | :param verbose: verbosity (uses logging) 175 | :return: Two see :class:`StreamingDataFrame 176 | `, one 177 | for train, one for test. 178 | 179 | The list of ids must hold in memory. 180 | There is no streaming implementation for the ids. 181 | 182 | .. exref:: 183 | :title: Splits a dataframe, keeps ids in separate partitions 184 | :tag: dataframe 185 | 186 | In some data science problems, rows are not independent 187 | and share common values, most of the time ids. In some 188 | specific cases, multiple ids from different columns are 189 | connected and must appear in the same partition. 190 | Testing that each id column is evenly split and does not 191 | appear in both sets is not enough. Connected components 192 | are needed. 193 | 194 | .. runpython:: 195 | :showcode: 196 | 197 | from pandas import DataFrame 198 | from pandas_streaming.df import train_test_connex_split 199 | 200 | df = DataFrame([dict(user="UA", prod="PAA", card="C1"), 201 | dict(user="UA", prod="PB", card="C1"), 202 | dict(user="UB", prod="PC", card="C2"), 203 | dict(user="UB", prod="PD", card="C2"), 204 | dict(user="UC", prod="PAA", card="C3"), 205 | dict(user="UC", prod="PF", card="C4"), 206 | dict(user="UD", prod="PG", card="C5"), 207 | ]) 208 | 209 | train, test = train_test_connex_split( 210 | df, test_size=0.5, groups=['user', 'prod', 'card'], 211 | fail_imbalanced=0.6) 212 | 213 | print(train) 214 | print(test) 215 | 216 | If *return_cnx* is True, the third result contains: 217 | 218 | * connected components for each id 219 | * the dataframe with connected components as a new column 220 | 221 | ..
runpython:: 222 | :showcode: 223 | 224 | from pandas import DataFrame 225 | from pandas_streaming.df import train_test_connex_split 226 | 227 | df = DataFrame([dict(user="UA", prod="PAA", card="C1"), 228 | dict(user="UA", prod="PB", card="C1"), 229 | dict(user="UB", prod="PC", card="C2"), 230 | dict(user="UB", prod="PD", card="C2"), 231 | dict(user="UC", prod="PAA", card="C3"), 232 | dict(user="UC", prod="PF", card="C4"), 233 | dict(user="UD", prod="PG", card="C5"), 234 | ]) 235 | 236 | train, test, cnx = train_test_connex_split( 237 | df, test_size=0.5, groups=['user', 'prod', 'card'], 238 | fail_imbalanced=0.6, return_cnx=True) 239 | 240 | print(cnx[0]) 241 | print(cnx[1]) 242 | """ 243 | if stratify is not None: 244 | raise NotImplementedError( # pragma: no cover 245 | "Option stratify is not implemented." 246 | ) 247 | if groups is None or len(groups) == 0: 248 | raise ValueError( # pragma: no cover 249 | "groups is empty. Use regular train_test_split." 250 | ) 251 | if hasattr(df, "iter_creation"): 252 | raise NotImplementedError( # pragma: no cover 253 | "Not implemented yet for StreamingDataFrame." 254 | ) 255 | if isinstance(df, numpy.ndarray): 256 | raise NotImplementedError( # pragma: no cover 257 | "Not implemented on numpy arrays." 258 | ) 259 | if shuffle: 260 | df = dataframe_shuffle(df, random_state=random_state) 261 | 262 | dfids = df[groups].copy() 263 | if must_groups is not None: 264 | dfids_must = df[must_groups].copy() 265 | 266 | name = "connex" 267 | while name in dfids.columns: 268 | name += "_" 269 | one = "weight" 270 | while one in dfids.columns: 271 | one += "_" 272 | 273 | # Connected components. 274 | elements = list(range(dfids.shape[0])) 275 | counts_cnx = {i: {i} for i in elements} 276 | connex = {} 277 | avoids_merge = {} 278 | 279 | def do_connex_components(dfrows, local_groups, kb, sib): 280 | "run connected components algorithms" 281 | itern = 0 282 | modif = 1 283 | 284 | while modif > 0 and itern < len(elements): 285 | if df.shape[0] > 10000: 286 | logger.info( 287 | "[train_test_connex_split] iteration=%d-#nb connect=%d - " 288 | "modif=%s", 289 | itern, 290 | len(set(elements)), 291 | modif, 292 | ) 293 | 294 | modif = 0 295 | itern += 1 296 | for i, row in enumerate(dfrows.itertuples(index=False, name=None)): 297 | vals = [ 298 | val 299 | for val in zip(local_groups, row) 300 | if not isinstance(val[1], float) or not numpy.isnan(val[1]) 301 | ] 302 | 303 | c = elements[i] 304 | 305 | for val in vals: 306 | if val not in connex: 307 | connex[val] = c 308 | modif += 1 309 | 310 | set_c = set(connex[val] for val in vals) 311 | set_c.add(c) 312 | new_c = min(set_c) 313 | 314 | add_pair_c = [] 315 | for c in set_c: 316 | if c == new_c or (new_c, c) in avoids_merge: 317 | continue 318 | if kb is not None: 319 | maxi = min(len(counts_cnx[new_c]), len(counts_cnx[c])) 320 | if maxi > 5: 321 | diff = len(counts_cnx[new_c]) + len(counts_cnx[c]) - maxi 322 | r = diff / float(maxi) 323 | if r > kb: 324 | if verbose: # pragma: no cover 325 | logger.info( 326 | "[train_test_connex_split] balance " 327 | "r=%1.4f>%1.2f, #[%d]=%d, #[%d]=%d", 328 | r, 329 | kb, 330 | new_c, 331 | len(counts_cnx[new_c]), 332 | c, 333 | len(counts_cnx[c]), 334 | ) 335 | 336 | continue 337 | 338 | if sib is not None: 339 | r = (len(counts_cnx[new_c]) + len(counts_cnx[c])) / float( 340 | len(elements) 341 | ) 342 | if r > sib: 343 | logger.info( 344 | "[train_test_connex_split] " 345 | "no merge r=%1.4f>%1.2f, #[%d]=%d, #[%d]=%d", 346 | r, 347 | sib, 348 | new_c, 349 | 
len(counts_cnx[new_c]), 350 | c, 351 | len(counts_cnx[c]), 352 | ) 353 | avoids_merge[new_c, c] = i 354 | continue 355 | 356 | add_pair_c.append(c) 357 | 358 | if len(add_pair_c) > 0: 359 | for c in add_pair_c: 360 | modif += len(counts_cnx[c]) 361 | for ii in counts_cnx[c]: 362 | elements[ii] = new_c 363 | counts_cnx[new_c] = counts_cnx[new_c].union(counts_cnx[c]) 364 | counts_cnx[c] = set() 365 | 366 | keys = list(vals) 367 | for val in keys: 368 | if connex[val] == c: 369 | connex[val] = new_c 370 | modif += 1 371 | 372 | if must_groups: 373 | do_connex_components(dfids_must, must_groups, None, None) 374 | do_connex_components(dfids, groups, keep_balance, stop_if_bigger) 375 | 376 | # final 377 | dfids[name] = elements 378 | dfids[one] = 1 379 | grsum = dfids[[name, one]].groupby(name, as_index=False).sum() 380 | for g in groups: 381 | logger.info("[train_test_connex_split] #nb in '%d':", len(set(dfids[g]))) 382 | logger.info( 383 | "[train_test_connex_split] #connex %d/%d", grsum.shape[0], dfids.shape[0] 384 | ) 385 | if grsum.shape[0] <= 1: 386 | raise ValueError( # pragma: no cover 387 | "Every element is in the same connected components." 388 | ) 389 | 390 | # Statistics: top connected components 391 | if verbose: 392 | # Global statistics 393 | counts = Counter(elements) 394 | cl = [(v, k) for k, v in counts.items()] 395 | cum = 0 396 | maxc = None 397 | logger.info( 398 | "[train_test_connex_split] number of connected components: %d", 399 | len(set(elements)), 400 | ) 401 | for i, (v, k) in enumerate(sorted(cl, reverse=True)): 402 | if i == 0: 403 | maxc = k, v 404 | if i >= 10: 405 | break 406 | cum += v 407 | logger.info( 408 | "[train_test_connex_split] c=%s #elements=%s cumulated=%d/%d", 409 | k, 410 | v, 411 | cum, 412 | len(elements), 413 | ) 414 | 415 | # Most important component 416 | logger.info( 417 | "[train_test_connex_split] first row of the biggest component %d", maxc 418 | ) 419 | tdf = dfids[dfids[name] == maxc[0]] 420 | logger.info("[train_test_connex_split] % s", tdf.head(n=10)) 421 | 422 | # Splits. 423 | train, test = train_test_split_weights( 424 | grsum, 425 | weights=one, 426 | test_size=test_size, 427 | train_size=train_size, 428 | shuffle=shuffle, 429 | fail_imbalanced=fail_imbalanced, 430 | random_state=random_state, 431 | ) 432 | train.drop(one, inplace=True, axis=1) 433 | test.drop(one, inplace=True, axis=1) 434 | 435 | # We compute the final dataframe. 436 | def double_merge(d): 437 | "merge twice" 438 | merge1 = dfids.merge(d, left_on=name, right_on=name) 439 | merge2 = df.merge(merge1, left_on=groups, right_on=groups) 440 | return merge2 441 | 442 | train_f = double_merge(train) 443 | test_f = double_merge(test) 444 | if return_cnx: 445 | return train_f, test_f, (connex, dfids) 446 | else: 447 | return train_f, test_f 448 | 449 | 450 | def train_test_apart_stratify( 451 | df: pandas.DataFrame, 452 | group, 453 | test_size: Optional[float] = 0.25, 454 | train_size: Optional[float] = None, 455 | stratify: Optional[str] = None, 456 | force: bool = False, 457 | random_state: Optional[int] = None, 458 | sorted_indices: bool = False, 459 | ) -> Tuple["StreamingDataFrame", "StreamingDataFrame"]: # noqa: F821 460 | """ 461 | This split is for a specific case where data is linked 462 | in one way. Let's assume we have two ids as we have 463 | for online sales: *(product id, category id)*. 464 | A product can have multiple categories. We need to have 465 | distinct products on train and test but common categories 466 | on both sides. 
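    For instance (a small sketch with hypothetical product/category pairs),
    every product id lands on exactly one side while a category may appear
    on both::

        import pandas
        from pandas_streaming.df import train_test_apart_stratify

        df = pandas.DataFrame([dict(prod="P1", cat="c1"),
                               dict(prod="P1", cat="c2"),
                               dict(prod="P2", cat="c1"),
                               dict(prod="P3", cat="c2")])
        train, test = train_test_apart_stratify(
            df, group="prod", stratify="cat", test_size=0.5)
        # product ids never overlap between the two partitions
        assert not (set(train["prod"]) & set(test["prod"]))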
467 | 468 | :param df: :epkg:`pandas:DataFrame` 469 | :param group: columns name for the ids 470 | :param test_size: ratio for the test partition 471 | (if *train_size* is not specified) 472 | :param train_size: ratio for the train partition 473 | :param stratify: column holding the stratification 474 | :param force: if True, tries to get at least one example on the test side 475 | for each value of the column *stratify* 476 | :param random_state: seed for random generators 477 | :param sorted_indices: sort index first, 478 | see issue `41 ` 479 | :return: Two see :class:`StreamingDataFrame 480 | `, one 481 | for train, one for test. 482 | 483 | The list of ids must hold in memory. 484 | There is no streaming implementation for the ids. 485 | This split was implemented for a case of a multi-label 486 | classification. A category (*stratify*) is not exclusive 487 | and an observation can be assigned to multiple 488 | categories. In that particular case, the method 489 | :func:`sklearn.model_selection.train_test_split` 490 | can not directly be used. 491 | 492 | .. runpython:: 493 | :showcode: 494 | 495 | import pandas 496 | from pandas_streaming.df import train_test_apart_stratify 497 | 498 | df = pandas.DataFrame([dict(a=1, b="e"), 499 | dict(a=1, b="f"), 500 | dict(a=2, b="e"), 501 | dict(a=2, b="f")]) 502 | 503 | train, test = train_test_apart_stratify( 504 | df, group="a", stratify="b", test_size=0.5) 505 | print(train) 506 | print('-----------') 507 | print(test) 508 | 509 | """ 510 | if stratify is None: 511 | raise ValueError("stratify must be specified.") # pragma: no cover 512 | if group is None: 513 | raise ValueError("group must be specified.") # pragma: no cover 514 | if hasattr(df, "iter_creation"): 515 | raise NotImplementedError("Not implemented yet for StreamingDataFrame.") 516 | if isinstance(df, numpy.ndarray): 517 | raise NotImplementedError("Not implemented on numpy arrays.") 518 | 519 | p = (1 - test_size) if test_size else None 520 | if train_size is not None: 521 | p = train_size 522 | test_size = 1 - p 523 | if p is None or min(test_size, p) <= 0: 524 | raise ValueError( # pragma: no cover 525 | f"test_size={test_size} or train_size={train_size} cannot be null" 526 | ) 527 | 528 | couples = df[[group, stratify]].itertuples(name=None, index=False) 529 | hist = Counter(df[stratify]) 530 | sorted_hist = [(v, k) for k, v in hist.items()] 531 | sorted_hist.sort() 532 | ids = {c: set() for c in hist} 533 | 534 | for g, s in couples: 535 | ids[s].add(g) 536 | 537 | if random_state is None: 538 | permutation = numpy.random.permutation 539 | else: 540 | state = numpy.random.RandomState(random_state) 541 | permutation = state.permutation 542 | 543 | split = {} 544 | for _, k in sorted_hist: 545 | indices = sorted(ids[k]) if sorted_indices else ids[k] 546 | not_assigned, assigned = [], [] 547 | for c in indices: 548 | if c in split: 549 | assigned.append(c) 550 | else: 551 | not_assigned.append(c) 552 | if len(not_assigned) == 0: 553 | continue 554 | nb_test = sum(split[c] for c in assigned) 555 | expected = min(len(ids[k]), int(test_size * len(ids[k]) + 0.5)) - nb_test 556 | if force and expected == 0 and nb_test == 0: 557 | nb_train = len(assigned) - nb_test 558 | if nb_train > 0 or len(not_assigned) > 1: 559 | expected = min(1, len(not_assigned)) 560 | if expected > 0: 561 | permutation(not_assigned) 562 | for e in not_assigned[:expected]: 563 | split[e] = 1 564 | for e in not_assigned[expected:]: 565 | split[e] = 0 566 | else: 567 | for c in not_assigned: 568 | split[c] = 0 
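    # At this point ``split`` maps every group id to 0 (train) or 1 (test);
    # the original rows are recovered below with ``isin`` on that mapping.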
569 | 570 | train_set = set(k for k, v in split.items() if v == 0) 571 | test_set = set(k for k, v in split.items() if v == 1) 572 | train_df = df[df[group].isin(train_set)] 573 | test_df = df[df[group].isin(test_set)] 574 | return train_df, test_df 575 | -------------------------------------------------------------------------------- /pandas_streaming/df/dataframe_helpers.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import struct 3 | import warnings 4 | import numpy 5 | from pandas import DataFrame, Index, Series 6 | 7 | 8 | def numpy_types(): 9 | """ 10 | Returns the list of :epkg:`numpy` available types. 11 | 12 | :return: list of types 13 | """ 14 | 15 | return [ 16 | numpy.bool_, 17 | numpy.int_, 18 | numpy.intc, 19 | numpy.intp, 20 | numpy.int8, 21 | numpy.int16, 22 | numpy.int32, 23 | numpy.int64, 24 | numpy.uint8, 25 | numpy.uint16, 26 | numpy.uint32, 27 | numpy.uint64, 28 | numpy.float16, 29 | numpy.float32, 30 | numpy.float64, 31 | numpy.complex64, 32 | numpy.complex128, 33 | ] 34 | 35 | 36 | def hash_str(c, hash_length): 37 | """ 38 | Hashes a string. 39 | 40 | @param c value to hash 41 | @param hash_length hash_length 42 | @return string 43 | """ 44 | if isinstance(c, float): 45 | if numpy.isnan(c): 46 | return c 47 | raise ValueError(f"numpy.nan expected, not {c}") 48 | m = hashlib.sha256() 49 | m.update(c.encode("utf-8")) 50 | r = m.hexdigest() 51 | if len(r) >= hash_length: 52 | return r[:hash_length] 53 | return r 54 | 55 | 56 | def hash_int(c, hash_length): 57 | """ 58 | Hashes an integer into an integer. 59 | 60 | @param c value to hash 61 | @param hash_length hash_length 62 | @return int 63 | """ 64 | if isinstance(c, float): 65 | if numpy.isnan(c): 66 | return c 67 | else: 68 | raise ValueError(f"numpy.nan expected, not {c}") 69 | else: 70 | b = struct.pack("i", c) 71 | m = hashlib.sha256() 72 | m.update(b) 73 | r = m.hexdigest() 74 | if len(r) >= hash_length: 75 | r = r[:hash_length] 76 | return int(r, 16) % (10**8) 77 | 78 | 79 | def hash_float(c, hash_length): 80 | """ 81 | Hashes a float into a float. 82 | 83 | @param c value to hash 84 | @param hash_length hash_length 85 | @return int 86 | """ 87 | if numpy.isnan(c): 88 | return c 89 | else: 90 | b = struct.pack("d", c) 91 | m = hashlib.sha256() 92 | m.update(b) 93 | r = m.hexdigest() 94 | if len(r) >= hash_length: 95 | r = r[:hash_length] 96 | i = int(r, 16) % (2**53) 97 | return float(i) 98 | 99 | 100 | def dataframe_hash_columns(df, cols=None, hash_length=10, inplace=False): 101 | """ 102 | Hashes a set of columns in a dataframe. 103 | Keeps the same type. Skips missing values. 104 | 105 | @param df dataframe 106 | @param cols columns to hash or None for alls. 107 | @param hash_length for strings only, length of the hash 108 | @param inplace modifies inplace 109 | @return new dataframe 110 | 111 | This might be useful to anonimized data before 112 | making it public. 113 | 114 | .. exref:: 115 | :title: Hashes a set of columns in a dataframe 116 | :tag: dataframe 117 | 118 | .. 
runpython:: 119 | :showcode: 120 | 121 | import pandas 122 | from pandas_streaming.df import dataframe_hash_columns 123 | df = pandas.DataFrame([dict(a=1, b="e", c=5.6, ind="a1", ai=1), 124 | dict(b="f", c=5.7, ind="a2", ai=2), 125 | dict(a=4, b="g", ind="a3", ai=3), 126 | dict(a=8, b="h", c=5.9, ai=4), 127 | dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) 128 | print(df) 129 | print('--------------') 130 | df2 = dataframe_hash_columns(df) 131 | print(df2) 132 | """ 133 | if cols is None: 134 | cols = list(df.columns) 135 | 136 | if not inplace: 137 | df = df.copy() 138 | 139 | def hash_intl(c): 140 | "hash int" 141 | return hash_int(c, hash_length) 142 | 143 | def hash_strl(c): 144 | "hash string" 145 | return hash_str(c, hash_length) 146 | 147 | def hash_floatl(c): 148 | "hash float" 149 | return hash_float(c, hash_length) 150 | 151 | coltype = dict(zip(df.columns, df.dtypes)) 152 | for c in cols: 153 | t = coltype[c] 154 | if t == int: # noqa: E721 155 | df[c] = df[c].apply(hash_intl) 156 | elif t == numpy.int64: 157 | df[c] = df[c].apply(lambda x: numpy.int64(hash_intl(x))) 158 | elif t == float: # noqa: E721 159 | df[c] = df[c].apply(hash_floatl) 160 | elif t == object: # noqa: E721 161 | df[c] = df[c].apply(hash_strl) 162 | else: 163 | raise NotImplementedError( # pragma: no cover 164 | f"Conversion of type {t} in column '{c}' is not implemented" 165 | ) 166 | 167 | return df 168 | 169 | 170 | def dataframe_unfold(df, col, new_col=None, sep=","): 171 | """ 172 | One column may contain concatenated values. 173 | This function splits these values and multiplies the 174 | rows for each split value. 175 | 176 | @param df dataframe 177 | @param col column with the concatenated values (strings) 178 | @param new_col new column name, if None, use default value. 179 | @param sep separator 180 | @return a new dataframe 181 | 182 | .. exref:: 183 | :title: Unfolds a column of a dataframe. 184 | :tag: dataframe 185 | 186 | .. runpython:: 187 | :showcode: 188 | 189 | import pandas 190 | import numpy 191 | from pandas_streaming.df import dataframe_unfold 192 | 193 | df = pandas.DataFrame([dict(a=1, b="e,f"), 194 | dict(a=2, b="g"), 195 | dict(a=3)]) 196 | print(df) 197 | df2 = dataframe_unfold(df, "b") 198 | print('----------') 199 | print(df2) 200 | 201 | # To fold: 202 | folded = df2.groupby('a').apply( 203 | lambda row: ','.join(row['b_unfold'].dropna()) 204 | if len(row['b_unfold'].dropna()) > 0 else numpy.nan) 205 | print('----------') 206 | print(folded) 207 | """ 208 | if new_col is None: 209 | col_name = col + "_unfold" 210 | else: 211 | col_name = new_col 212 | temp_col = "__index__" 213 | while temp_col in df.columns: 214 | temp_col += "_" 215 | rows = [] 216 | for i, v in enumerate(df[col]): 217 | if isinstance(v, str): 218 | spl = v.split(sep) 219 | for vs in spl: 220 | rows.append({col: v, col_name: vs, temp_col: i}) 221 | else: 222 | rows.append({col: v, col_name: v, temp_col: i}) 223 | df = df.copy() 224 | df[temp_col] = list(range(df.shape[0])) 225 | dfj = DataFrame(rows) 226 | res = df.merge(dfj, on=[col, temp_col]) 227 | return res.drop(temp_col, axis=1).copy() 228 | 229 | 230 | def dataframe_shuffle(df, random_state=None): 231 | """ 232 | Shuffles a dataframe. 233 | 234 | :param df: :epkg:`pandas:DataFrame` 235 | :param random_state: seed 236 | :return: new :epkg:`pandas:DataFrame` 237 | 238 | .. exref:: 239 | :title: Shuffles the rows of a dataframe 240 | :tag: dataframe 241 | 242 | .. 
runpython:: 243 | :showcode: 244 | 245 | import pandas 246 | from pandas_streaming.df import dataframe_shuffle 247 | 248 | df = pandas.DataFrame([dict(a=1, b="e", c=5.6, ind="a1"), 249 | dict(a=2, b="f", c=5.7, ind="a2"), 250 | dict(a=4, b="g", c=5.8, ind="a3"), 251 | dict(a=8, b="h", c=5.9, ind="a4"), 252 | dict(a=16, b="i", c=6.2, ind="a5")]) 253 | print(df) 254 | print('----------') 255 | 256 | shuffled = dataframe_shuffle(df, random_state=0) 257 | print(shuffled) 258 | """ 259 | if random_state is not None: 260 | state = numpy.random.RandomState(random_state) 261 | permutation = state.permutation 262 | else: 263 | permutation = numpy.random.permutation 264 | ori_cols = list(df.columns) 265 | scols = set(ori_cols) 266 | 267 | no_index = df.reset_index(drop=False) 268 | keep_cols = [_ for _ in no_index.columns if _ not in scols] 269 | index = no_index.index 270 | index = permutation(index) 271 | shuffled = no_index.iloc[index, :] 272 | res = shuffled.set_index(keep_cols)[ori_cols] 273 | res.index.names = df.index.names 274 | return res 275 | 276 | 277 | def pandas_fillna(df, by, hasna=None, suffix=None): 278 | """ 279 | Replaces the :epkg:`nan` values for something not :epkg:`nan`. 280 | Mostly used by @see fn pandas_groupby_nan. 281 | 282 | :param df: dataframe 283 | :param by: list of columns for which we need to replace nan 284 | :param hasna: None or list of columns for which we need to replace NaN 285 | :param suffix: use a prefix for the NaN value 286 | :return: list of values chosen for each column, new dataframe (new copy) 287 | """ 288 | suffix = suffix if suffix else "²nan" 289 | df = df.copy() 290 | rep = {} 291 | for c in by: 292 | if hasna is not None and c not in hasna: 293 | continue 294 | if df[c].dtype in (str, bytes, object): 295 | se = set(df[c].dropna()) 296 | val = se.pop() 297 | if isinstance(val, str): 298 | cst = suffix 299 | val = "" 300 | elif isinstance(val, bytes): 301 | cst = b"_" 302 | else: 303 | raise TypeError( # pragma: no cover 304 | "Unable to determine a constant for type='{0}' dtype='{1}'".format( # noqa: UP030 305 | val, df[c].dtype 306 | ) 307 | ) 308 | val += cst 309 | while val in se: 310 | val += suffix 311 | df[c].fillna(val, inplace=True) 312 | rep[c] = val 313 | else: 314 | dr = df[c].dropna() 315 | mi = abs(dr.min()) 316 | ma = abs(dr.max()) 317 | val = ma + mi 318 | if val == ma and not isinstance(val, str): 319 | val += ma + 1.0 320 | if val <= ma: 321 | raise ValueError( # pragma: no cover 322 | "Unable to find a different value for column '{}' v='{}: " 323 | "min={} max={}".format(c, val, mi, ma) 324 | ) 325 | df[c].fillna(val, inplace=True) 326 | rep[c] = val 327 | return rep, df 328 | 329 | 330 | def pandas_groupby_nan( 331 | df, by, axis=0, as_index=False, suffix=None, nanback=True, **kwargs 332 | ): 333 | """ 334 | Does a *groupby* including keeping missing values (:epkg:`nan`). 335 | 336 | :param df: dataframe 337 | :param by: column or list of columns 338 | :param axis: only 0 is allowed 339 | :param as_index: should be False 340 | :param suffix: None or a string 341 | :param nanback: put :epkg:`nan` back in the index, 342 | otherwise it leaves a replacement for :epkg:`nan`. 343 | (does not work when grouping by multiple columns) 344 | :param kwargs: other parameters sent to 345 | `groupby `_ 347 | :return: groupby results 348 | 349 | See :epkg:`groupby and missing values`. 350 | If no :epkg:`nan` is detected, the function falls back in regular 351 | :epkg:`pandas:DataFrame:groupby` which has the following 352 | behavior. 
353 | 354 | .. exref:: 355 | :title: Group a dataframe by one column including nan values 356 | :tag: dataframe 357 | 358 | The regular :epkg:`pandas:dataframe:GroupBy` of a 359 | :epkg:`pandas:DataFrame` removes every :epkg:`nan` 360 | values from the index. 361 | 362 | .. runpython:: 363 | :showcode: 364 | 365 | from pandas import DataFrame 366 | 367 | data = [dict(a=2, ind="a", n=1), 368 | dict(a=2, ind="a"), 369 | dict(a=3, ind="b"), 370 | dict(a=30)] 371 | df = DataFrame(data) 372 | print(df) 373 | gr = df.groupby(["ind"]).sum() 374 | print(gr) 375 | 376 | Function @see fn pandas_groupby_nan modifies keeps them. 377 | 378 | .. runpython:: 379 | :showcode: 380 | 381 | from pandas import DataFrame 382 | from pandas_streaming.df import pandas_groupby_nan 383 | 384 | data = [dict(a=2, ind="a", n=1), 385 | dict(a=2, ind="a"), 386 | dict(a=3, ind="b"), 387 | dict(a=30)] 388 | df = DataFrame(data) 389 | gr2 = pandas_groupby_nan(df, ["ind"]).sum() 390 | print(gr2) 391 | """ 392 | if nanback and suffix is None: 393 | try: 394 | res = df.groupby(by, axis=axis, as_index=as_index, dropna=False, **kwargs) 395 | except TypeError: 396 | # old version of pandas 397 | res = None 398 | if res is not None: 399 | if suffix is None: 400 | return res 401 | res.index = Series(res.index).replace(numpy.nan, suffix) 402 | return res 403 | if axis != 0: 404 | raise NotImplementedError("axis should be 0") 405 | if as_index: 406 | raise NotImplementedError("as_index must be False") 407 | if isinstance(by, tuple): 408 | raise TypeError("by should be of list not tuple") 409 | if not isinstance(by, list): 410 | by = [by] 411 | hasna = {} 412 | for b in by: 413 | h = df[b].isnull().values.any() 414 | if h: 415 | hasna[b] = True 416 | if len(hasna) > 0: 417 | rep, df_copy = pandas_fillna(df, by, hasna, suffix=suffix) 418 | res = df_copy.groupby(by, axis=axis, as_index=as_index, **kwargs) 419 | if len(by) == 1: 420 | if not nanback: 421 | dummy = DataFrame([{"a": "a"}]) 422 | do = dummy.dtypes[0] 423 | typ = dict(zip(df.columns, df.dtypes)) 424 | if typ[by[0]] != do: 425 | warnings.warn( # pragma: no cover 426 | f"[pandas_groupby_nan] NaN value: {rep}", stacklevel=0 427 | ) 428 | return res 429 | for b in by: 430 | fnan = rep[b] 431 | if fnan in res.grouper.groups: 432 | res.grouper.groups[numpy.nan] = res.grouper.groups[fnan] 433 | del res.grouper.groups[fnan] 434 | new_val = [ 435 | (numpy.nan if b == fnan else b) for b in res.grouper.result_index 436 | ] 437 | res.grouper.groupings[0]._group_index = Index(new_val) 438 | res.grouper.groupings[0].obj[b].replace(fnan, numpy.nan, inplace=True) 439 | if hasattr(res.grouper, "grouping"): 440 | if isinstance(res.grouper.groupings[0].grouper, numpy.ndarray): 441 | arr = numpy.array(new_val) 442 | res.grouper.groupings[0].grouper = arr 443 | if ( 444 | hasattr(res.grouper.groupings[0], "_cache") 445 | and "result_index" in res.grouper.groupings[0]._cache 446 | ): 447 | del res.grouper.groupings[0]._cache["result_index"] 448 | else: 449 | raise NotImplementedError( 450 | "Not implemented for type: {0}".format( # noqa: UP030 451 | type(res.grouper.groupings[0].grouper) 452 | ) 453 | ) 454 | else: 455 | grouper = res.grouper._get_grouper() 456 | if isinstance(grouper, numpy.ndarray): 457 | arr = numpy.array(new_val) 458 | res.grouper.groupings[0].grouping_vector = arr 459 | if ( 460 | hasattr(res.grouper.groupings[0], "_cache") 461 | and "result_index" in res.grouper.groupings[0]._cache 462 | ): 463 | index = res.grouper.groupings[0]._cache["result_index"] 464 | if len(rep) == 
1: 465 | key = list(rep.values())[0] # noqa: RUF015 466 | new_index = numpy.array(index) 467 | for i in range(len(new_index)): 468 | if new_index[i] == key: 469 | new_index[i] = numpy.nan 470 | res.grouper.groupings[0]._cache["result_index"] = ( 471 | index.__class__(new_index) 472 | ) 473 | else: 474 | raise NotImplementedError( # pragma: no cover 475 | "NaN values not implemented for multiindex." 476 | ) 477 | else: 478 | raise NotImplementedError( # pragma: no cover 479 | "Not implemented for type: {0}".format( # noqa: UP030 480 | type(res.grouper.groupings[0].grouper) 481 | ) 482 | ) 483 | res.grouper._cache["result_index"] = res.grouper.groupings[ 484 | 0 485 | ]._group_index 486 | else: 487 | if not nanback: 488 | dummy = DataFrame([{"a": "a"}]) 489 | do = dummy.dtypes[0] 490 | typ = dict(zip(df.columns, df.dtypes)) 491 | for b in by: 492 | if typ[b] != do: 493 | warnings.warn( # pragma: no cover 494 | f"[pandas_groupby_nan] NaN values: {rep}", stacklevel=0 495 | ) 496 | break 497 | return res 498 | raise NotImplementedError( 499 | "Not yet implemented. Replacing pseudo nan values by real nan " 500 | "values is not as easy as it looks. Use nanback=False" 501 | ) 502 | 503 | # keys = list(res.grouper.groups.keys()) 504 | # didit = False 505 | # mapping = {} 506 | # for key in keys: 507 | # new_key = list(key) 508 | # mod = False 509 | # for k, b in enumerate(by): 510 | # if b not in rep: 511 | # continue 512 | # fnan = rep[b] 513 | # if key[k] == fnan: 514 | # new_key[k] = numpy.nan 515 | # mod = True 516 | # didit = True 517 | # mapping[fnan] = numpy.nan 518 | # if mod: 519 | # new_key = tuple(new_key) 520 | # mapping[key] = new_key 521 | # res.grouper.groups[new_key] = res.grouper.groups[key] 522 | # del res.grouper.groups[key] 523 | # if didit: 524 | # # this code deos not work 525 | # vnan = numpy.nan 526 | # new_index = list(mapping.get(v, v) 527 | # for v in res.grouper.result_index) 528 | # names = res.grouper.result_index.names 529 | # # index = MultiIndex.from_tuples(tuples=new_index, names=names) 530 | # # res.grouper.result_index = index # does not work cannot set 531 | # # values for [result_index] 532 | # for k in range(len(res.grouper.groupings)): 533 | # grou = res.grouper.groupings[k] 534 | # new_val = list(mapping.get(v, v) for v in grou) 535 | # grou._group_index = Index(new_val) 536 | # b = names[k] 537 | # if b in rep: 538 | # vv = rep[b] 539 | # grou.obj[b].replace(vv, vnan, inplace=True) 540 | # if isinstance(grou.grouper, numpy.ndarray): 541 | # grou.grouper = numpy.array(new_val) 542 | # else: 543 | # raise NotImplementedError( 544 | # "Not implemented for type: {0}".format( 545 | # type(grou.grouper))) 546 | # del res.grouper._cache 547 | return res 548 | return df.groupby(by, axis=axis, **kwargs) 549 | -------------------------------------------------------------------------------- /pandas_streaming/df/dataframe_io.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import zipfile 4 | import pandas 5 | import numpy 6 | 7 | 8 | def to_zip(df, zipfilename, zname="df.csv", **kwargs): 9 | """ 10 | Saves a :epkg:`Dataframe` into a :epkg:`zip` file. 11 | It can be read by :meth:`read_zip`. 12 | 13 | :param df: :epkg:`dataframe` or :class:`numpy.ndarray` 14 | :param zipfilename: a :class:`zipfile.ZipFile` or a filename 15 | :param zname: a filename in the zipfile 16 | :param kwargs: parameters for :meth:`pandas.DataFrame.to_csv` or 17 | :func:`numpy.save` 18 | :return: zipfilename 19 | 20 | .. 
exref:: 21 | :title: Saves and reads a dataframe in a zip file 22 | :tag: dataframe 23 | 24 | This shows an example on how to save and read a 25 | :class:`pandas.DataFrame` directly into a zip file. 26 | 27 | .. runpython:: 28 | :showcode: 29 | 30 | import pandas 31 | from pandas_streaming.df import to_zip, read_zip 32 | 33 | df = pandas.DataFrame([dict(a=1, b="e"), 34 | dict(b="f", a=5.7)]) 35 | 36 | name = "dfs.zip" 37 | to_zip(df, name, encoding="utf-8", index=False) 38 | df2 = read_zip(name, encoding="utf-8") 39 | print(df2) 40 | 41 | .. exref:: 42 | :title: Saves and reads a numpy array in a zip file 43 | :tag: array 44 | 45 | This shows an example on how to save and read a 46 | :class:`numpy.ndarray` directly into a zip file. 47 | 48 | .. runpython:: 49 | :showcode: 50 | 51 | import numpy 52 | from pandas_streaming.df import to_zip, read_zip 53 | 54 | arr = numpy.array([[0.5, 1.5], [0.4, 1.6]]) 55 | 56 | name = "dfsa.zip" 57 | to_zip(arr, name, 'arr.npy') 58 | arr2 = read_zip(name, 'arr.npy') 59 | print(arr2) 60 | """ 61 | if isinstance(df, pandas.DataFrame): 62 | stb = io.StringIO() 63 | ext = os.path.splitext(zname)[-1] 64 | if ext == ".npy": 65 | raise ValueError( # pragma: no cover 66 | "Extension '.npy' cannot be used to save a dataframe." 67 | ) 68 | df.to_csv(stb, **kwargs) 69 | elif isinstance(df, numpy.ndarray): 70 | stb = io.BytesIO() 71 | ext = os.path.splitext(zname)[-1] 72 | if ext != ".npy": 73 | raise ValueError( # pragma: no cover 74 | "Extension '.npy' is required when saving a numpy array." 75 | ) 76 | numpy.save(stb, df, **kwargs) 77 | else: 78 | raise TypeError(f"Type not handled {type(df)}") # pragma: no cover 79 | text = stb.getvalue() 80 | 81 | if isinstance(zipfilename, str): 82 | ext = os.path.splitext(zipfilename)[-1] 83 | if ext != ".zip": 84 | raise NotImplementedError( # pragma: no cover 85 | f"Only zip file are implemented not '{ext}'." 86 | ) 87 | zf = zipfile.ZipFile(zipfilename, "w") # pylint: disable=R1732 88 | close = True 89 | elif isinstance(zipfilename, zipfile.ZipFile): 90 | zf = zipfilename 91 | close = False 92 | else: 93 | raise TypeError( # pragma: no cover 94 | f"No implementation for type '{type(zipfilename)}'" 95 | ) 96 | 97 | zf.writestr(zname, text) 98 | if close: 99 | zf.close() 100 | 101 | 102 | def read_zip(zipfilename, zname=None, **kwargs): 103 | """ 104 | Reads a :epkg:`dataframe` from a :epkg:`zip` file. 105 | It can be saved by :meth:`to_zip`. 106 | 107 | :param zipfilename: a :class:`zipfile.ZipFile` or a filename 108 | :param zname: a filename in zipfile, if None, takes the first one 109 | :param kwargs: parameters for :func:`pandas.read_csv` 110 | :return: :class:`pandas.DataFrame` or :class:`numpy.ndarray` 111 | """ 112 | if isinstance(zipfilename, str): 113 | ext = os.path.splitext(zipfilename)[-1] 114 | if ext != ".zip": 115 | raise NotImplementedError( # pragma: no cover 116 | f"Only zip files are supported not '{ext}'." 
117 | ) 118 | zf = zipfile.ZipFile(zipfilename, "r") # pylint: disable=R1732 119 | close = True 120 | elif isinstance(zipfilename, zipfile.ZipFile): 121 | zf = zipfilename 122 | close = False 123 | else: 124 | raise TypeError( # pragma: no cover 125 | f"No implementation for type '{type(zipfilename)}'" 126 | ) 127 | 128 | if zname is None: 129 | zname = zf.namelist()[0] 130 | content = zf.read(zname) 131 | stb = io.BytesIO(content) 132 | ext = os.path.splitext(zname)[-1] 133 | if ext == ".npy": 134 | df = numpy.load(stb, **kwargs) 135 | else: 136 | df = pandas.read_csv(stb, **kwargs) 137 | 138 | if close: 139 | zf.close() 140 | 141 | return df 142 | -------------------------------------------------------------------------------- /pandas_streaming/df/dataframe_io_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | from io import StringIO, BytesIO 3 | 4 | try: 5 | from ujson import dumps 6 | except ImportError: # pragma: no cover 7 | from json import dumps 8 | 9 | 10 | class JsonPerRowsStream: 11 | """ 12 | Reads a :epkg:`json` streams and adds 13 | ``,``, ``[``, ``]`` to convert a stream containing 14 | one :epkg:`json` object per row into one single :epkg:`json` object. 15 | It only implements method *readline*. 16 | 17 | :param st: stream 18 | """ 19 | 20 | def __init__(self, st): 21 | self.st = st 22 | self.begin = True 23 | self.newline = False 24 | self.end = True 25 | 26 | def seek(self, offset): 27 | """ 28 | Change the stream position to the given byte offset. 29 | 30 | :param offset: offset, only 0 is implemented 31 | """ 32 | self.st.seek(offset) 33 | 34 | def readline(self, size=-1): 35 | """ 36 | Reads a line, adds ``,``, ``[``, ``]`` if needed. 37 | So the number of read characters is not recessarily 38 | the requested one but could be greater. 39 | """ 40 | text = self.st.readline(size) 41 | if size == 0: 42 | return text 43 | if self.newline: 44 | text = "," + text 45 | self.newline = False 46 | elif self.begin: 47 | text = "[" + text 48 | self.begin = False 49 | 50 | if text.endswith("\n"): 51 | self.newline = True 52 | return text 53 | if len(text) == 0 or len(text) < size: 54 | if self.end: 55 | self.end = False 56 | return text + "]" 57 | return text 58 | return text 59 | 60 | def read(self, size=-1): 61 | """ 62 | Reads characters, adds ``,``, ``[``, ``]`` if needed. 63 | So the number of read characters is not recessarily 64 | the requested one but could be greater. 65 | """ 66 | text = self.st.read(size) 67 | if isinstance(text, bytes): 68 | cst = b"\n", b"\n,", b",", b"[", b"]" 69 | else: 70 | cst = "\n", "\n,", ",", "[", "]" 71 | if size == 0: 72 | return text 73 | if len(text) > 1: 74 | t1, t2 = text[: len(text) - 1], text[len(text) - 1 :] 75 | t1 = t1.replace(cst[0], cst[1]) 76 | text = t1 + t2 77 | 78 | if self.newline: 79 | text = cst[2] + text 80 | self.newline = False 81 | elif self.begin: 82 | text = cst[3] + text 83 | self.begin = False 84 | 85 | if text.endswith(cst[0]): 86 | self.newline = True 87 | return text 88 | if len(text) == 0 or len(text) < size: 89 | if self.end: 90 | self.end = False 91 | return text + cst[4] 92 | return text 93 | return text 94 | 95 | def getvalue(self): 96 | """ 97 | Returns the whole stream content. 
98 | """ 99 | 100 | def byline(): 101 | line = self.readline() 102 | while line: 103 | yield line 104 | line = self.readline() 105 | 106 | return "".join(byline()) 107 | 108 | 109 | def flatten_dictionary(dico, sep="_"): 110 | """ 111 | Flattens a dictionary with nested structure to a dictionary with no 112 | hierarchy. 113 | 114 | :param dico: dictionary to flatten 115 | :param sep: string to separate dictionary keys by 116 | :return: flattened dictionary 117 | 118 | Inspired from `flatten_json 119 | `_. 120 | """ 121 | flattened_dict = {} 122 | 123 | def _flatten(obj, key): 124 | if obj is None: 125 | flattened_dict[key] = obj 126 | elif isinstance(obj, dict): 127 | for k, v in obj.items(): 128 | if not isinstance(k, str): 129 | raise TypeError("All keys must a string.") # pragma: no cover 130 | k2 = k if key is None else f"{key}{sep}{k}" 131 | _flatten(v, k2) 132 | elif isinstance(obj, (list, set)): 133 | for index, item in enumerate(obj): 134 | k2 = k if key is None else f"{key}{sep}{index}" 135 | _flatten(item, k2) 136 | else: 137 | flattened_dict[key] = obj 138 | 139 | _flatten(dico, None) 140 | return flattened_dict 141 | 142 | 143 | def enumerate_json_items( 144 | filename, encoding=None, lines=False, flatten=False, verbose=0 145 | ): 146 | """ 147 | Enumerates items from a :epkg:`JSON` file or string. 148 | 149 | :param filename: filename or string or stream to parse 150 | :param encoding: encoding 151 | :param lines: one record per row 152 | :param flatten: call @see fn flatten_dictionary 153 | :param verbose: verbosity (based on :epkg:`tqdm`) 154 | :return: iterator on records at first level. 155 | 156 | It assumes the syntax follows the format: ``[ {"id":1, ...}, {"id": 2, ...}, ...]``. 157 | However, if option *lines* if true, the function considers that the 158 | stream or file does have one record per row as follows: 159 | 160 | {"id":1, ...} 161 | {"id": 2, ...} 162 | 163 | .. exref:: 164 | :title: Processes a json file by streaming. 165 | 166 | The module :epkg:`ijson` can read a :epkg:`JSON` file by streaming. 167 | This module is needed because a record can be written on multiple lines. 168 | This function leverages it produces the following results. 169 | 170 | .. 
runpython:: 171 | :showcode: 172 | 173 | from pandas_streaming.df.dataframe_io_helpers import enumerate_json_items 174 | 175 | text_json = b''' 176 | [ 177 | { 178 | "glossary": { 179 | "title": "example glossary", 180 | "GlossDiv": { 181 | "title": "S", 182 | "GlossList": [{ 183 | "GlossEntry": { 184 | "ID": "SGML", 185 | "SortAs": "SGML", 186 | "GlossTerm": "Standard Generalized Markup Language", 187 | "Acronym": "SGML", 188 | "Abbrev": "ISO 8879:1986", 189 | "GlossDef": { 190 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 191 | "GlossSeeAlso": ["GML", "XML"] 192 | }, 193 | "GlossSee": "markup" 194 | } 195 | }] 196 | } 197 | } 198 | }, 199 | { 200 | "glossary": { 201 | "title": "example glossary", 202 | "GlossDiv": { 203 | "title": "S", 204 | "GlossList": { 205 | "GlossEntry": [{ 206 | "ID": "SGML", 207 | "SortAs": "SGML", 208 | "GlossTerm": "Standard Generalized Markup Language", 209 | "Acronym": "SGML", 210 | "Abbrev": "ISO 8879:1986", 211 | "GlossDef": { 212 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 213 | "GlossSeeAlso": ["GML", "XML"] 214 | }, 215 | "GlossSee": "markup" 216 | }] 217 | } 218 | } 219 | } 220 | } 221 | ] 222 | ''' 223 | 224 | for item in enumerate_json_items(text_json): 225 | print(item) 226 | 227 | The parsed json must have an empty line at the end otherwise 228 | the following exception is raised: 229 | `ijson.common.IncompleteJSONError: ` 230 | `parse error: unallowed token at this point in JSON text`. 231 | """ 232 | if isinstance(filename, str): 233 | if "{" not in filename and os.path.exists(filename): 234 | with open(filename, "r", encoding=encoding) as f: 235 | for el in enumerate_json_items( 236 | f, encoding=encoding, lines=lines, flatten=flatten 237 | ): 238 | yield el 239 | else: 240 | st = StringIO(filename) 241 | for el in enumerate_json_items( 242 | st, encoding=encoding, lines=lines, flatten=flatten 243 | ): 244 | yield el 245 | elif isinstance(filename, bytes): 246 | st = BytesIO(filename) 247 | for el in enumerate_json_items( 248 | st, encoding=encoding, lines=lines, flatten=flatten 249 | ): 250 | yield el 251 | elif lines: 252 | for el in enumerate_json_items( 253 | JsonPerRowsStream(filename), encoding=encoding, lines=False, flatten=flatten 254 | ): 255 | yield el 256 | else: 257 | if hasattr(filename, "seek"): 258 | filename.seek(0) 259 | import ijson 260 | 261 | parser = ijson.parse(filename) 262 | current = None 263 | curkey = None 264 | stack = [] 265 | nbyield = 0 266 | if verbose: 267 | from tqdm import tqdm 268 | 269 | loop = tqdm(enumerate(parser)) 270 | else: 271 | loop = enumerate(parser) 272 | for i, (_, event, value) in loop: 273 | if verbose: 274 | loop.set_description(f"process row {i}-event={event!r}") 275 | if event == "start_array": 276 | if curkey is None: 277 | current = [] 278 | else: 279 | if not isinstance(current, dict): 280 | raise RuntimeError( # pragma: no cover 281 | f"Type issue {type(current)}" 282 | ) 283 | c = [] 284 | current[curkey] = c # pylint: disable=E1137 285 | current = c 286 | curkey = None 287 | stack.append(current) 288 | elif event == "end_array": 289 | stack.pop() 290 | if len(stack) == 0: 291 | # We should be done. 
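                    # The outermost array has been closed: the stack is empty
                    # and there is nothing left to accumulate.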
292 | current = None 293 | else: 294 | current = stack[-1] 295 | elif event == "start_map": 296 | c = {} 297 | if curkey is None: 298 | if current is None: 299 | current = [] 300 | current.append(c) 301 | else: 302 | current[curkey] = c # pylint: disable=E1137 303 | stack.append(c) 304 | current = c 305 | curkey = None 306 | elif event == "end_map": 307 | stack.pop() 308 | current = stack[-1] 309 | if len(stack) == 1: 310 | nbyield += 1 311 | if flatten: 312 | yield flatten_dictionary(current[-1]) 313 | else: 314 | yield current[-1] 315 | # We clear the memory. 316 | current.clear() 317 | elif event == "map_key": 318 | curkey = value 319 | elif event in {"string", "number", "boolean"}: 320 | if curkey is None: 321 | current.append(value) 322 | else: 323 | current[curkey] = value # pylint: disable=E1137 324 | curkey = None 325 | elif event == "null": 326 | if curkey is None: 327 | current.append(None) 328 | else: 329 | current[curkey] = None # pylint: disable=E1137 330 | curkey = None 331 | else: 332 | raise ValueError(f"Unknown event '{event}'") # pragma: no cover 333 | 334 | 335 | class JsonIterator2Stream: 336 | """ 337 | Transforms an iterator on :epkg:`JSON` items 338 | into a stream which returns an items as a string every time 339 | method *read* is called. 340 | The iterator could be one returned by @see fn enumerate_json_items. 341 | 342 | :param it: iterator 343 | :param kwargs: arguments to :class:`json.dumps` 344 | 345 | .. exref:: 346 | :title: Reshape a json file 347 | 348 | The function @see fn enumerate_json_items reads any 349 | :epkg:`json` even if every record is split over 350 | multiple lines. Class @see cl JsonIterator2Stream 351 | mocks this iterator as a stream. Each row is a single item. 352 | 353 | .. runpython:: 354 | :showcode: 355 | 356 | from pandas_streaming.df.dataframe_io_helpers import enumerate_json_items, JsonIterator2Stream 357 | 358 | text_json = b''' 359 | [ 360 | { 361 | "glossary": { 362 | "title": "example glossary", 363 | "GlossDiv": { 364 | "title": "S", 365 | "GlossList": [{ 366 | "GlossEntry": { 367 | "ID": "SGML", 368 | "SortAs": "SGML", 369 | "GlossTerm": "Standard Generalized Markup Language", 370 | "Acronym": "SGML", 371 | "Abbrev": "ISO 8879:1986", 372 | "GlossDef": { 373 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 374 | "GlossSeeAlso": ["GML", "XML"] 375 | }, 376 | "GlossSee": "markup" 377 | } 378 | }] 379 | } 380 | } 381 | }, 382 | { 383 | "glossary": { 384 | "title": "example glossary", 385 | "GlossDiv": { 386 | "title": "S", 387 | "GlossList": { 388 | "GlossEntry": [{ 389 | "ID": "SGML", 390 | "SortAs": "SGML", 391 | "GlossTerm": "Standard Generalized Markup Language", 392 | "Acronym": "SGML", 393 | "Abbrev": "ISO 8879:1986", 394 | "GlossDef": { 395 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 396 | "GlossSeeAlso": ["GML", "XML"] 397 | }, 398 | "GlossSee": "markup" 399 | }] 400 | } 401 | } 402 | } 403 | } 404 | ] 405 | ''' 406 | 407 | for item in JsonIterator2Stream(lambda: enumerate_json_items(text_json)): 408 | print(item) 409 | 410 | .. versionchanged:: 0.3 411 | The class takes a function which outputs an iterator and not an iterator. 412 | `JsonIterator2Stream(enumerate_json_items(text_json))` needs to be rewritten 413 | into JsonIterator2Stream(lambda: enumerate_json_items(text_json)). 
414 | """ 415 | 416 | def __init__(self, it, **kwargs): 417 | self.it = it 418 | self.kwargs = kwargs 419 | self.it0 = it() 420 | 421 | def seek(self, offset): 422 | """ 423 | Change the stream position to the given byte offset. 424 | 425 | :param offset: offset, only 0 is implemented 426 | """ 427 | if offset != 0: 428 | raise NotImplementedError("The iterator can only return at the beginning.") 429 | self.it0 = self.it() 430 | 431 | def write(self): 432 | """ 433 | The class does not write. 434 | """ 435 | raise NotImplementedError() 436 | 437 | def read(self): 438 | """ 439 | Reads the next item and returns it as a string. 440 | """ 441 | try: 442 | value = next(self.it0) 443 | return dumps(value, **self.kwargs) 444 | except StopIteration: 445 | return None 446 | 447 | def __iter__(self): 448 | """ 449 | Iterates on each row. The behaviour is a bit tricky. 450 | It is implemented to be swalled by :func:`pandas.read_json` which 451 | uses :func:`itertools.islice` to go through the items. 452 | It calls multiple times `__iter__` but does expect the 453 | iterator to continue from where it stopped last time. 454 | """ 455 | for value in self.it0: 456 | yield dumps(value, **self.kwargs) 457 | -------------------------------------------------------------------------------- /pandas_streaming/df/dataframe_split.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import pickle 3 | import random 4 | import warnings 5 | from io import StringIO 6 | import pandas 7 | 8 | 9 | def sklearn_train_test_split( 10 | self, path_or_buf=None, export_method="to_csv", names=None, **kwargs 11 | ): 12 | """ 13 | Randomly splits a dataframe into smaller pieces. 14 | The function returns streams of file names. 15 | The function relies on :func:`sklearn.model_selection.train_test_split`. 16 | It does not handle stratified version of it. 17 | 18 | :param self: see :class:`StreamingDataFrame 19 | ` 20 | :param path_or_buf: a string, a list of strings or buffers, if it is a 21 | string, it must contain ``{}`` like ``partition{}.txt`` 22 | :param export_method: method used to store the partitions, by default 23 | :meth:`pandas.DataFrame.to_csv` 24 | :param names: partitions names, by default ``('train', 'test')`` 25 | :param kwargs: parameters for the export function and 26 | :func:`sklearn.model_selection.train_test_split`. 27 | :return: outputs of the exports functions 28 | 29 | The function cannot return two iterators or two 30 | see :class:`StreamingDataFrame 31 | ` 32 | because running through one 33 | means running through the other. We can assume both 34 | splits do not hold in memory and we cannot run through 35 | the same iterator again as random draws would be different. 36 | We need to store the results into files or buffers. 37 | 38 | .. warning:: 39 | The method *export_method* must write the data in 40 | mode *append* and allows stream. 41 | """ 42 | if kwargs.get("stratify") is not None: 43 | raise NotImplementedError( # pragma: no cover 44 | "No implementation yet for the stratified version." 
45 | ) 46 | with warnings.catch_warnings(): 47 | warnings.filterwarnings("ignore", category=ImportWarning) 48 | from sklearn.model_selection import train_test_split 49 | 50 | opts = ["test_size", "train_size", "random_state", "shuffle", "stratify"] 51 | split_ops = {} 52 | for o in opts: 53 | if o in kwargs: 54 | split_ops[o] = kwargs[o] 55 | del kwargs[o] 56 | 57 | exportf_ = getattr(pandas.DataFrame, export_method) 58 | if export_method == "to_csv" and "mode" not in kwargs: 59 | exportf = lambda *a, **kw: exportf_(*a, mode="a", **kw) # noqa: E731 60 | else: 61 | exportf = exportf_ 62 | 63 | if isinstance(path_or_buf, str): 64 | if "{}" not in path_or_buf: 65 | raise ValueError("path_or_buf must contain {} to insert the partition name") 66 | if names is None: 67 | names = ["train", "test"] 68 | elif len(names) != len(path_or_buf): 69 | raise ValueError( # pragma: no cover 70 | "names and path_or_buf must have the same length" 71 | ) 72 | path_or_buf = [path_or_buf.format(n) for n in names] 73 | elif path_or_buf is None: 74 | path_or_buf = [None, None] 75 | else: 76 | if not isinstance(path_or_buf, list): 77 | raise TypeError( # pragma: no cover 78 | "path_or_buf must be a list or a string" 79 | ) 80 | 81 | bufs = [] 82 | close = [] 83 | for p in path_or_buf: 84 | if p is None: 85 | st = StringIO() 86 | cl = False 87 | elif isinstance(p, str): 88 | st = open(p, "w", encoding=kwargs.get("encoding")) # noqa: SIM115 89 | cl = True 90 | else: 91 | st = p 92 | cl = False 93 | bufs.append(st) 94 | close.append(cl) 95 | 96 | for df in self: 97 | train, test = train_test_split(df, **split_ops) 98 | exportf(train, bufs[0], **kwargs) 99 | exportf(test, bufs[1], **kwargs) 100 | kwargs["header"] = False 101 | 102 | for b, c in zip(bufs, close): 103 | if c: 104 | b.close() 105 | return [ 106 | st.getvalue() if isinstance(st, StringIO) else p 107 | for st, p in zip(bufs, path_or_buf) 108 | ] 109 | 110 | 111 | def sklearn_train_test_split_streaming( 112 | self, test_size=0.25, train_size=None, stratify=None, hash_size=9, unique_rows=False 113 | ): 114 | """ 115 | Randomly splits a dataframe into smaller pieces. 116 | The function returns streams of file names. 117 | The function relies on :func:`sklearn.model_selection.train_test_split`. 118 | It handles the stratified version of it. 119 | 120 | :param self: see :class:`StreamingDataFrame 121 | ` 122 | :param test_size: ratio for the test partition 123 | (if *train_size* is not specified) 124 | :param train_size: ratio for the train partition 125 | :param stratify: column holding the stratification 126 | :param hash_size: size of the hash to cache information about partition 127 | :param unique_rows: ensures that rows are unique 128 | :return: Two see :class:`StreamingDataFrame 129 | `, 130 | one for train, one for test. 131 | 132 | The function returns two iterators or two 133 | see :class:`StreamingDataFrame 134 | `. It 135 | tries to do everything without writing anything on disk 136 | but it requires to store the repartition somehow. 137 | This function hashes every row and maps the hash with a part 138 | (train or test). This cache must hold in memory otherwise the 139 | function fails. The two returned iterators must not be used 140 | for the first time in the same time. The first time is used to 141 | build the cache. The function changes the order of rows if 142 | the parameter *stratify* is not null. The cache has a side effect: 143 | every exact same row will be put in the same partition. 
144 |     If that is not what you want, you should add an index column
145 |     or a random one.
146 |     """
147 |     p = (1 - test_size) if test_size else None
148 |     if train_size is not None:
149 |         p = train_size
150 |     n = 2 * max(1 / p, 1 / (1 - p))  # minimal number of buffered rows before a split is drawn
151 | 
152 |     static_schema = []
153 | 
154 |     def iterator_rows():
155 |         "iterates over rows"
156 |         counts = {}
157 |         memory = {}
158 |         pos_col = None
159 |         for df in self:
160 |             if pos_col is None:
161 |                 static_schema.append(list(df.columns))
162 |                 static_schema.append(list(df.dtypes))
163 |                 static_schema.append(df.shape[0])
164 |                 if stratify is not None:
165 |                     pos_col = list(df.columns).index(stratify)
166 |                 else:
167 |                     pos_col = -1
168 | 
169 |             for obs in df.itertuples(index=False, name=None):
170 |                 strat = 0 if stratify is None else obs[pos_col]
171 |                 if strat not in memory:
172 |                     memory[strat] = []
173 |                 memory[strat].append(obs)
174 | 
175 |             for k, v in memory.items():
176 |                 if len(v) >= n + random.randint(0, 10):  # randomized buffer threshold
177 |                     vr = list(range(len(v)))
178 |                     # shuffles the rows randomly
179 |                     random.shuffle(vr)
180 |                     if (0, k) in counts:
181 |                         tt = counts[1, k] + counts[0, k]
182 |                         delta = -int(counts[0, k] - tt * p + 0.5)
183 |                     else:
184 |                         delta = 0
185 |                     i = int(len(v) * p + 0.5)
186 |                     i += delta
187 |                     i = max(0, min(len(v), i))
188 |                     one = set(vr[:i])
189 |                     for d, obs_ in enumerate(v):
190 |                         yield obs_, 0 if d in one else 1
191 |                     if (0, k) not in counts:
192 |                         counts[0, k] = i
193 |                         counts[1, k] = len(v) - i
194 |                     else:
195 |                         counts[0, k] += i
196 |                         counts[1, k] += len(v) - i
197 |                     # removes the dispatched rows from memory
198 |                     v.clear()
199 | 
200 |         # Once the stream is exhausted, the remaining buffered
201 |         # observations still need to be dispatched.
202 |         for k, v in memory.items():
203 |             vr = list(range(len(v)))
204 |             # shuffles the rows randomly
205 |             random.shuffle(vr)
206 |             if (0, k) in counts:
207 |                 tt = counts[1, k] + counts[0, k]
208 |                 delta = -int(counts[0, k] - tt * p + 0.5)
209 |             else:
210 |                 delta = 0
211 |             i = int(len(v) * p + 0.5)
212 |             i += delta
213 |             i = max(0, min(len(v), i))
214 |             one = set(vr[:i])
215 |             for d, obs in enumerate(v):
216 |                 yield obs, 0 if d in one else 1
217 |             if (0, k) not in counts:
218 |                 counts[0, k] = i
219 |                 counts[1, k] = len(v) - i
220 |             else:
221 |                 counts[0, k] += i
222 |                 counts[1, k] += len(v) - i
223 | 
224 |     def h11(w):
225 |         "pickle and hash"
226 |         b = pickle.dumps(w)
227 |         return hashlib.md5(b).hexdigest()[:hash_size]
228 | 
229 |     # We store the partition of every row in a cache.
230 |     cache = {}
231 | 
232 |     def iterator_internal(part_requested):
233 |         "internal iterator on dataframes"
234 |         iy = 0
235 |         accumul = []
236 |         if len(cache) == 0:
237 |             for obs, part in iterator_rows():
238 |                 h = h11(obs)
239 |                 if unique_rows and h in cache:
240 |                     raise ValueError(
241 |                         "A row or at least its hash is already cached. "  # noqa: UP030
242 |                         "Increase hash_size or check for duplicates "
243 |                         "('{0}')\n{1}.".format(h, obs)
244 |                     )
245 |                 if h not in cache:
246 |                     cache[h] = part
247 |                 else:
248 |                     part = cache[h]
249 |                 if part == part_requested:
250 |                     accumul.append(obs)
251 |                     if len(accumul) >= static_schema[2]:
252 |                         dfo = pandas.DataFrame(accumul, columns=static_schema[0])
253 |                         self.ensure_dtype(dfo, static_schema[1])
254 |                         iy += dfo.shape[0]
255 |                         accumul.clear()
256 |                         yield dfo
257 |         else:
258 |             for df in self:
259 |                 for obs in df.itertuples(index=False, name=None):
260 |                     h = h11(obs)
261 |                     part = cache.get(h)
262 |                     if part is None:
263 |                         raise ValueError(  # pragma: no cover
264 |                             f"Second iteration. A row was "
265 |                             f"never seen during the first one\n{obs}"
266 |                         )
267 |                     if part == part_requested:
268 |                         accumul.append(obs)
269 |                         if len(accumul) >= static_schema[2]:
270 |                             dfo = pandas.DataFrame(accumul, columns=static_schema[0])
271 |                             self.ensure_dtype(dfo, static_schema[1])
272 |                             iy += dfo.shape[0]
273 |                             accumul.clear()
274 |                             yield dfo
275 |         if len(accumul) > 0:
276 |             dfo = pandas.DataFrame(accumul, columns=static_schema[0])
277 |             self.ensure_dtype(dfo, static_schema[1])
278 |             iy += dfo.shape[0]
279 |             yield dfo
280 | 
281 |     return (
282 |         self.__class__(lambda: iterator_internal(0)),
283 |         self.__class__(lambda: iterator_internal(1)),
284 |     )
285 | 
--------------------------------------------------------------------------------
/pandas_streaming/exc/__init__.py:
--------------------------------------------------------------------------------
1 | from .exc_streaming import StreamingInefficientException  # noqa: F401
2 | 
--------------------------------------------------------------------------------
/pandas_streaming/exc/exc_streaming.py:
--------------------------------------------------------------------------------
 1 | class StreamingInefficientException(Exception):
 2 |     """
 3 |     Kind of operations doable with a :epkg:`pandas:DataFrame`
 4 |     but which should not be done in streaming mode.
 5 |     """
 6 | 
 7 |     def __init__(self, meth):
 8 |         """
 9 |         This method is inefficient in streaming mode
10 |         and not implemented.
11 | 
12 |         :param meth: inefficient method
13 |         """
14 |         Exception.__init__(self, f"{meth} should not be done in streaming mode.")
15 | 
--------------------------------------------------------------------------------
/pandas_streaming/ext_test_case.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import unittest
  4 | import warnings
  5 | from contextlib import redirect_stderr, redirect_stdout
  6 | from io import StringIO
  7 | from typing import Any, Callable, List, Optional
  8 | 
  9 | import numpy
 10 | from numpy.testing import assert_allclose
 11 | 
 12 | 
 13 | def unit_test_going():
 14 |     """
 15 |     Enables a flag telling the script it is running inside a unit test.
 16 |     It helps keep unit tests short.
 17 |     """
 18 |     going = int(os.environ.get("UNITTEST_GOING", 0))
 19 |     return going == 1
 20 | 
 21 | 
 22 | def ignore_warnings(warns: List[Warning]) -> Callable:
 23 |     """
 24 |     Catches warnings.
 25 | 
 26 |     :param warns: warnings to ignore
 27 |     """
 28 | 
 29 |     def wrapper(fct):
 30 |         if warns is None:
 31 |             raise AssertionError(f"warns cannot be None for '{fct}'.")
 32 | 
 33 |         def call_f(self):
 34 |             with warnings.catch_warnings():
 35 |                 warnings.simplefilter("ignore", warns)
 36 |                 return fct(self)
 37 | 
 38 |         return call_f
 39 | 
 40 |     return wrapper
 41 | 
 42 | 
 43 | class sys_path_append:
 44 |     """
 45 |     Stores the content of :epkg:`*py:sys:path` and
 46 |     restores it afterwards.
 47 |     """
 48 | 
 49 |     def __init__(self, paths, position=-1):
 50 |         """
 51 |         :param paths: paths to add
 52 |         :param position: where to add it
 53 |         """
 54 |         self.to_add = paths if isinstance(paths, list) else [paths]
 55 |         self.position = position
 56 | 
 57 |     def __enter__(self):
 58 |         """
 59 |         Modifies ``sys.path``.
 60 |         """
 61 |         self.store = sys.path.copy()
 62 |         if self.position == -1:
 63 |             sys.path.extend(self.to_add)
 64 |         else:
 65 |             for p in reversed(self.to_add):
 66 |                 sys.path.insert(self.position, p)
 67 | 
 68 |     def __exit__(self, exc_type, exc_value, traceback):
 69 |         """
 70 |         Restores ``sys.path``.
 71 |         """
 72 |         sys.path = self.store
 73 | 
 74 | 
 75 | class ExtTestCase(unittest.TestCase):
 76 |     _warns = []
 77 | 
 78 |     def assertExists(self, name):
 79 |         if not os.path.exists(name):
 80 |             raise AssertionError(f"File or folder {name!r} does not exist.")
 81 | 
 82 |     def assertEqualArray(
 83 |         self,
 84 |         expected: numpy.ndarray,
 85 |         value: numpy.ndarray,
 86 |         atol: float = 0,
 87 |         rtol: float = 0,
 88 |     ):
 89 |         self.assertEqual(expected.dtype, value.dtype)
 90 |         self.assertEqual(expected.shape, value.shape)
 91 |         assert_allclose(expected, value, atol=atol, rtol=rtol)
 92 | 
 93 |     def assertEqualDataFrame(self, d1, d2, **kwargs):
 94 |         """
 95 |         Checks that two dataframes are equal.
 96 |         Calls :func:`pandas.testing.assert_frame_equal`.
 97 |         """
 98 |         from pandas.testing import assert_frame_equal
 99 | 
100 |         assert_frame_equal(d1, d2, **kwargs)
101 | 
102 |     def assertAlmostEqual(
103 |         self,
104 |         expected: numpy.ndarray,
105 |         value: numpy.ndarray,
106 |         atol: float = 0,
107 |         rtol: float = 0,
108 |     ):
109 |         if not isinstance(expected, numpy.ndarray):
110 |             expected = numpy.array(expected)
111 |         if not isinstance(value, numpy.ndarray):
112 |             value = numpy.array(value).astype(expected.dtype)
113 |         self.assertEqualArray(expected, value, atol=atol, rtol=rtol)
114 | 
115 |     def assertRaise(
116 |         self, fct: Callable, exc_type: Exception, msg: Optional[str] = None
117 |     ):
118 |         try:
119 |             fct()
120 |         except exc_type as e:
121 |             if not isinstance(e, exc_type):
122 |                 raise AssertionError(f"Unexpected exception {type(e)!r}.") from e
123 |             if msg is None:
124 |                 return
125 |             if msg not in str(e):
126 |                 raise AssertionError(f"Unexpected error message {e!r}.") from e
127 |             return
128 |         raise AssertionError("No exception was raised.")
129 | 
130 |     def assertEmpty(self, value: Any):
131 |         if value is None:
132 |             return
133 |         if len(value) == 0:
134 |             return
135 |         raise AssertionError(f"value is not empty: {value!r}.")
136 | 
137 |     def assertNotEmpty(self, value: Any):
138 |         if value is None:
139 |             raise AssertionError(f"value is empty: {value!r}.")
140 |         if isinstance(value, (list, dict, tuple, set)):
141 |             if len(value) == 0:
142 |                 raise AssertionError(f"value is empty: {value!r}.")
143 | 
144 |     def assertStartsWith(self, prefix: str, full: str):
145 |         if not full.startswith(prefix):
146 |             raise AssertionError(f"prefix={prefix!r} does not start string {full!r}.")
147 | 
148 |     def assertLesser(self, x, y, strict=False):
149 |         """
150 |         Checks that ``x <= y`` (or ``x < y`` if *strict* is True).
151 |         """
152 |         if x > y or (strict and x == y):
153 |             raise AssertionError(
154 |                 "x >{2} y with x={0} and y={1}".format(  # noqa: UP030
155 |                     x,
156 |                     y,
157 |                     "=" if strict else "",
158 |                 )
159 |             )
160 | 
161 |     @staticmethod
162 |     def abs_path_join(filename: str, *args: List[str]):
163 |         """
164 |         Returns an absolute and normalized path from this location.
165 | 
166 |         :param filename: filename, the folder which contains it
167 |             is used as the base
168 |         :param args: list of subpaths to the previous path
169 |         :return: absolute and normalized path
170 |         """
171 |         dirname = os.path.join(os.path.dirname(filename), *args)
172 |         return os.path.normpath(os.path.abspath(dirname))
173 | 
174 |     @classmethod
175 |     def tearDownClass(cls):
176 |         for name, line, w in cls._warns:
177 |             warnings.warn(f"\n{name}:{line}: {type(w)}\n {str(w)}", stacklevel=0)
178 | 
179 |     def capture(self, fct: Callable):
180 |         """
181 |         Runs a function and captures standard output and error.
182 | 183 | :param fct: function to run 184 | :return: result of *fct*, output, error 185 | """ 186 | sout = StringIO() 187 | serr = StringIO() 188 | with redirect_stdout(sout), redirect_stderr(serr): 189 | res = fct() 190 | return res, sout.getvalue(), serr.getvalue() 191 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | 3 | # Exclude a variety of commonly ignored directories. 4 | exclude = [ 5 | ".eggs", 6 | ".git", 7 | "build", 8 | "dist", 9 | ] 10 | 11 | line-length = 88 12 | 13 | [tool.ruff.lint] 14 | select = [ 15 | "B", # flake8-bugbear 16 | "C4", # flake8-comprehensions 17 | #"D", # pydocstyle 18 | "E", # pycodestyle 19 | "F", # Pyflakes 20 | "G", # flake8-logging-format 21 | #"I", # isort 22 | "ISC", # flake8-implicit-str-concat 23 | "LOG", # flake8-logging 24 | #"N", # pep8-naming 25 | #"NPY", # modern numpy 26 | #"PERF", # Perflint 27 | "PIE", # flake8-pie 28 | "PYI", # flake8-pyi 29 | "RUF", # Ruff-specific rules 30 | "SIM", # flake8-simplify 31 | "SLOT", # flake8-slot 32 | "T10", # flake8-debugger 33 | #"TID", # Disallow relative imports 34 | #"TRY", # flake8-try-except-raise 35 | "UP", # pyupgrade 36 | "W", # pycodestyle 37 | "YTT", # flake8-2020 38 | ] 39 | 40 | [tool.ruff.lint.per-file-ignores] 41 | "**" = ["B905", "C401", "C408", "C413", "RUF012", "RUF100", "RUF010", "SIM108", "SIM910", "SIM110", "SIM102", "SIM114", "SIM103", "UP015", "UP027", "UP031", "UP034", "UP032", "UP006", "UP035", "UP007", "UP038"] 42 | "**/plot*.py" = ["B018"] 43 | "_doc/examples/**.py" = ["E402", "F811", "B018"] 44 | "_unittests/ut_df/test_dataframe_io_helpers.py" = ["E501"] 45 | "pandas_streaming/data/__init__.py" = ["F401"] 46 | "pandas_streaming/df/__init__.py" = ["F401"] 47 | "pandas_streaming/df/dataframe_io_helpers.py" = ["E501"] 48 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | autopep8 2 | black 3 | coverage 4 | furo 5 | ijson 6 | jupyter_sphinx 7 | jyquickhelper 8 | matplotlib 9 | nbsphinx 10 | pandas>=1.1.0 11 | pandocfilters 12 | Pillow 13 | pycodestyle 14 | pylint>=2.14.0 15 | pytest 16 | pytest-cov 17 | ruff 18 | scikit-learn 19 | scipy 20 | sphinx 21 | sphinx-issues 22 | git+https://github.com/sdpython/sphinx-runpython.git 23 | sphinx_gallery 24 | ujson 25 | wheel 26 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [options] 2 | packages = find: 3 | 4 | [options.packages.find] 5 | include = pandas_streaming* 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import setup 4 | 5 | ###################### 6 | # beginning of setup 7 | ###################### 8 | 9 | 10 | here = os.path.dirname(__file__) 11 | if here == "": 12 | here = "." 
13 | package_data = {"pandas_streaming.validation": ["*.css", "*.js"]}
14 | 
15 | try:
16 |     with open(os.path.join(here, "requirements.txt"), "r") as f:
17 |         requirements = f.read().strip(" \n\r\t").split("\n")
18 | except FileNotFoundError:
19 |     requirements = []
20 | if len(requirements) == 0 or requirements == [""]:
21 |     requirements = ["pandas"]
22 | 
23 | try:
24 |     with open(os.path.join(here, "README.rst"), "r", encoding="utf-8") as f:
25 |         long_description = "pandas-streaming:" + f.read().split("pandas-streaming:")[1]
26 | except FileNotFoundError:
27 |     long_description = ""
28 | 
29 | version_str = "0.1.0"
30 | with open(os.path.join(here, "pandas_streaming/__init__.py"), "r") as f:
31 |     line = [
32 |         _
33 |         for _ in [_.strip("\r\n ") for _ in f.readlines()]
34 |         if _.startswith("__version__")
35 |     ]
36 |     if len(line) > 0:
37 |         version_str = line[0].split("=")[1].strip('" ')
38 | 
39 | 
40 | setup(
41 |     name="pandas-streaming",
42 |     version=version_str,
43 |     description="Streaming operations on large dataframes with pandas",
44 |     long_description=long_description,
45 |     author="Xavier Dupré",
46 |     author_email="xavier.dupre@gmail.com",
47 |     url="https://github.com/sdpython/pandas-streaming",
48 |     package_data=package_data,
49 |     setup_requires=["numpy", "scipy"],
50 |     install_requires=requirements,
51 |     classifiers=[
52 |         "Intended Audience :: Science/Research",
53 |         "Intended Audience :: Developers",
54 |         "License :: OSI Approved :: MIT License",
55 |         "Programming Language :: C",
56 |         "Programming Language :: Python",
57 |         "Topic :: Software Development",
58 |         "Topic :: Scientific/Engineering",
59 |         "Development Status :: 5 - Production/Stable",
60 |         "Operating System :: Microsoft :: Windows",
61 |         "Operating System :: POSIX",
62 |         "Operating System :: Unix",
63 |         "Operating System :: MacOS",
64 |         "Programming Language :: Python :: 3",
65 |         "Programming Language :: Python :: 3.8",
66 |         "Programming Language :: Python :: 3.9",
67 |         "Programming Language :: Python :: 3.10",
68 |         "Programming Language :: Python :: 3.11",
69 |     ],
70 | )
71 | 
--------------------------------------------------------------------------------
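
Usage sketch for the two split functions defined in pandas_streaming/df/dataframe_split.py. This is a minimal, non-authoritative example: it assumes the import paths suggested by the repository layout, that StreamingDataFrame.read_df wraps an in-memory pandas.DataFrame into a streaming dataframe replayed chunk by chunk, and that to_dataframe() materializes a streaming dataframe; the file pattern "split_{}.csv" and the toy data are illustrative only.

    import pandas
    from pandas_streaming.df.dataframe import StreamingDataFrame
    from pandas_streaming.df.dataframe_split import (
        sklearn_train_test_split,
        sklearn_train_test_split_streaming,
    )

    # A small dataframe turned into a streaming one (assumption: read_df
    # replays the in-memory dataframe in chunks of the given size).
    df = pandas.DataFrame({"x": range(20), "y": [i % 3 for i in range(20)]})
    sdf = StreamingDataFrame.read_df(df, chunksize=5)

    # Buffered variant: each chunk is split and appended to the two files
    # split_train.csv and split_test.csv (names built from the {} pattern).
    sklearn_train_test_split(sdf, path_or_buf="split_{}.csv", index=False)

    # Streaming variant: returns two StreamingDataFrame objects; the first
    # full pass over either one builds the hash cache that maps every row
    # to its partition (train or test).
    train_sdf, test_sdf = sklearn_train_test_split_streaming(sdf, test_size=0.25)
    print(train_sdf.to_dataframe().shape, test_sdf.to_dataframe().shape)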