├── TAGS ├── debian ├── compat ├── rules ├── changelog └── control ├── requirements.txt ├── ph ├── _version.py └── __init__.py ├── assets ├── cases.png ├── lifeexp.png ├── polyfit.png ├── scatter.png └── covid-plot.png ├── tests ├── test_data │ ├── headless.csv │ ├── a.csv │ ├── broken.csv │ ├── inf.csv │ ├── sheet.xlsx │ ├── left.csv │ ├── right.csv │ ├── f.csv │ ├── group.csv │ ├── mergel.csv │ ├── merger.csv │ ├── t.tsv │ ├── d.csv │ ├── d.scsv │ ├── slugit.csv │ ├── derr.csv │ ├── date-fmt.csv │ ├── date-utc.csv │ ├── padded_decimals.csv │ ├── strip.csv │ ├── covid.csv │ ├── usa.csv │ └── iris.csv └── test_ph.py ├── .github └── workflows │ └── pythonapp.yml ├── LICENSE ├── setup.py ├── .gitignore └── README.md /TAGS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 10 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | -------------------------------------------------------------------------------- /ph/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1.5" 2 | -------------------------------------------------------------------------------- /assets/cases.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgdr/ph/HEAD/assets/cases.png -------------------------------------------------------------------------------- /tests/test_data/headless.csv: -------------------------------------------------------------------------------- 1 | 12,76 2 | 13,74 3 | 14,75 4 | 15,79 5 | 16,77 6 | -------------------------------------------------------------------------------- /assets/lifeexp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgdr/ph/HEAD/assets/lifeexp.png -------------------------------------------------------------------------------- /assets/polyfit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgdr/ph/HEAD/assets/polyfit.png -------------------------------------------------------------------------------- /assets/scatter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgdr/ph/HEAD/assets/scatter.png -------------------------------------------------------------------------------- /tests/test_data/a.csv: -------------------------------------------------------------------------------- 1 | x,y 2 | 3,8 3 | 4,9 4 | 5,10 5 | 6,11 6 | 7,12 7 | 8,13 8 | -------------------------------------------------------------------------------- /assets/covid-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgdr/ph/HEAD/assets/covid-plot.png -------------------------------------------------------------------------------- /tests/test_data/broken.csv: -------------------------------------------------------------------------------- 1 | x,y 2 | 3,8,,,3 3 | 4,9, 4 | 5,10,2 5 | 6,11 6 | 7,12 7 | 8,13 8 | -------------------------------------------------------------------------------- /tests/test_data/inf.csv: 
-------------------------------------------------------------------------------- 1 | x,y 2 | nan,8 3 | nan,9 4 | nan,10 5 | inf,11 6 | 7,12 7 | 8,13 8 | -------------------------------------------------------------------------------- /tests/test_data/sheet.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgdr/ph/HEAD/tests/test_data/sheet.xlsx -------------------------------------------------------------------------------- /tests/test_data/left.csv: -------------------------------------------------------------------------------- 1 | key1,key2,A,B 2 | K0,K0,A0,B0 3 | K0,K1,A1,B1 4 | K1,K0,A2,B2 5 | K2,K1,A3,B3 6 | -------------------------------------------------------------------------------- /tests/test_data/right.csv: -------------------------------------------------------------------------------- 1 | key1,key2,C,D 2 | K0,K0,C0,D0 3 | K1,K0,C1,D1 4 | K1,K0,C2,D2 5 | K2,K0,C3,D3 6 | -------------------------------------------------------------------------------- /tests/test_data/f.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | 2,1 3 | 4,1 4 | 6,2 5 | 8,3 6 | 10,5 7 | 12,8 8 | 14,13 9 | 16,21 10 | -------------------------------------------------------------------------------- /tests/test_data/group.csv: -------------------------------------------------------------------------------- 1 | Animal,Max Speed 2 | Falcon,380.0 3 | Falcon,370.0 4 | Parrot,24.0 5 | Parrot,26.0 6 | -------------------------------------------------------------------------------- /tests/test_data/mergel.csv: -------------------------------------------------------------------------------- 1 | lk1,lk2,lk3,lk4 2 | K0,K5,A0,B0 3 | K1,K4,A1,B1 4 | K2,K3,A2,B2 5 | K3,K2,A3,B3 6 | -------------------------------------------------------------------------------- /tests/test_data/merger.csv: -------------------------------------------------------------------------------- 1 | rk1,rk2,rk3,rk4 2 | K2,K3,A0,B0 3 | K3,K4,A1,B1 4 | K4,K5,A2,B2 5 | K5,K6,A3,B3 6 | -------------------------------------------------------------------------------- /tests/test_data/t.tsv: -------------------------------------------------------------------------------- 1 | a b 2 | 1 0 3 | 10 1 4 | 100 2 5 | 1,000 3 6 | 10,000 4 7 | 100,000 5 8 | 1,000,000 6 9 | -------------------------------------------------------------------------------- /tests/test_data/d.csv: -------------------------------------------------------------------------------- 1 | year,month,day 2 | 2003,3,8 3 | 2004,4,9 4 | 2005,5,10 5 | 2006,6,11 6 | 2007,7,12 7 | 2008,8,13 8 | -------------------------------------------------------------------------------- /tests/test_data/d.scsv: -------------------------------------------------------------------------------- 1 | year;month;day 2 | 2003;3;8 3 | 2004;4;9 4 | 2005;5;10 5 | 2006;6;11 6 | 2007;7;12 7 | 2008;8;13 8 | -------------------------------------------------------------------------------- /tests/test_data/slugit.csv: -------------------------------------------------------------------------------- 1 | Stupid column 1, Jerky-column No. 
2 2 | 3,8 3 | 4,9 4 | 5,10 5 | 6,11 6 | 7,12 7 | 8,13 8 | -------------------------------------------------------------------------------- /tests/test_data/derr.csv: -------------------------------------------------------------------------------- 1 | year,month,day 2 | 2003-01-01,3,8 3 | 2004-01-01,4,9 4 | 2005-01-01,5,10 5 | 200-01,6,11 6 | 2007,7,12 7 | 2008,8,13 8 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #! /usr/bin/make -f 2 | 3 | #export DH_VERBOSE = 1 4 | export PYBUILD_NAME=ph 5 | 6 | %: 7 | dh $@ --with python3 --buildsystem=pybuild -------------------------------------------------------------------------------- /tests/test_data/date-fmt.csv: -------------------------------------------------------------------------------- 1 | date,x,y 2 | 2020_02/02,3,8 3 | 2020_02/03,4,9 4 | 2020_02/04,5,10 5 | 2020_02/05,6,11 6 | 2020_02/06,7,12 7 | 2020_02/07,8,13 8 | -------------------------------------------------------------------------------- /tests/test_data/date-utc.csv: -------------------------------------------------------------------------------- 1 | date,x,y 2 | 1580601600,3,8 3 | 1580688000,4,9 4 | 1580774400,5,10 5 | 1580860800,6,11 6 | 1580947200,7,12 7 | 1581033600,8,13 8 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | python3-ph (1.1.2) bionic; urgency=medium 2 | 3 | * First deb release 4 | 5 | -- PG Drange Thu Jul 21 11:42:31 CEST 2022 6 | -------------------------------------------------------------------------------- /tests/test_data/padded_decimals.csv: -------------------------------------------------------------------------------- 1 | idx,paddecim 2 | 0," 502,50" 3 | 1," 172,50" 4 | 2," 7,50" 5 | 3," 142,50" 6 | 4," 157,50" 7 | 5," 487,50" 8 | 6," 1.470,00" 9 | -------------------------------------------------------------------------------- /tests/test_data/strip.csv: -------------------------------------------------------------------------------- 1 | idx,date,x,y 2 | 1, 2020-05-12 ,3,8 3 | 2, 2020-05-13 ,4,9 4 | 3, 2020-05-14 ,5,10 5 | 4, 2020-05-15 ,6,11 6 | 5, 2020-05-16 ,7,12 7 | 6, 2020-05-17 ,8,13 8 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: python3-ph 2 | Maintainer: PG Drange 3 | Build-Depends: debhelper,dh-python,python3-all,python3-setuptools 4 | Section: devel 5 | Priority: optional 6 | Standards-Version: 3.9.6 7 | X-Python3-Version: >= 3.6 8 | 9 | Package: python3-ph 10 | Architecture: all 11 | Description: Tabular data shell tool 12 | Depends: ${python3:Depends},python3-pandas,python3-matplotlib -------------------------------------------------------------------------------- /.github/workflows/pythonapp.yml: -------------------------------------------------------------------------------- 1 | name: ph tests 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python 3.7 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: 3.7 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | pip 
install . 25 | - name: Lint with flake8 26 | run: | 27 | pip install flake8 28 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 29 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=99 --statistics 30 | - name: Test with pytest 31 | run: | 32 | pip install pytest 33 | pytest 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Pål Grønås Drange 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/test_data/covid.csv: -------------------------------------------------------------------------------- 1 | China,S. 
Korea,Italy,Iran,France,Germany,Spain,USA,UK,Canada 2 | 571,58,79,95,57,79,84,75,87,96 3 | 830,111,157,139,100,130,120,100,116,158 4 | 1287,209,229,245,130,165,165,124,164,190 5 | 1975,436,323,388,191,203,228,158,209,249 6 | 2744,602,470,593,212,262,282,221,278,300 7 | 4515,833,655,978,285,545,401,319,321, 8 | 5974,977,889,1501,423,670,525,435,383, 9 | 7711,1261,1128,2336,653,800,674,541,456, 10 | 9692,1766,1701,2922,949,1040,1231,704,590, 11 | 11791,2337,2036,3513,1209,1224,1695,994,798, 12 | 14380,3150,2502,4747,1412,1565,2277,1329,1140, 13 | 17205,3736,3089,5823,1784,1966,3146,1762,1140, 14 | 20440,4335,3858,6566,2281,2745,5232,2247,, 15 | 24324,5186,4636,7161,2876,3675,6391,2943,, 16 | 28018,5621,5883,8042,3661,4599,7753,3046,, 17 | 31161,6284,7375,9000,4469,5381,,,, 18 | 34546,6593,9172,10075,4499,,,,, 19 | 37198,7041,10149,11364,,,,,, 20 | 40171,7313,12462,12729,,,,,, 21 | 42638,7478,15113,13938,,,,,, 22 | 44653,7513,17660,,,,,,, 23 | 58761,7755,21157,,,,,,, 24 | 63851,7869,21157,,,,,,, 25 | 66492,7979,,,,,,,, 26 | 68500,8086,,,,,,,, 27 | 70548,8162,,,,,,,, 28 | 72436,,,,,,,,, 29 | 74185,,,,,,,,, 30 | 74576,,,,,,,,, 31 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import re 5 | from setuptools import setup 6 | 7 | 8 | __pgdr = "PG Drange " 9 | __source = "https://github.com/pgdr/ph" 10 | __webpage = __source 11 | __description = "ph - the tabular data shell tool" 12 | 13 | _min_req = ["pandas"] 14 | requirements = { 15 | "minimum": _min_req, 16 | "parquet": _min_req + ["pyarrow"], 17 | "xls": _min_req + ["xlrd"], 18 | "xlsw": _min_req + ["xlrd", "xlwt"], 19 | "plot": _min_req + ["matplotlib"], 20 | "data": _min_req + ["scikit-learn"], 21 | "math": _min_req + ["numpy"], 22 | "iplot": _min_req + ["cufflinks"], 23 | "gpx": _min_req + ["gpxpy"], 24 | } 25 | requirements["complete"] = sorted(set(sum(requirements.values(), []))) 26 | 27 | 28 | def _src(x): 29 | root = os.path.dirname(__file__) 30 | return os.path.abspath(os.path.join(root, x)) 31 | 32 | 33 | def _read_file(fname, op): 34 | with open(_src(fname), "r") as fin: 35 | return op(fin.readlines()) 36 | 37 | 38 | def readme(): 39 | try: 40 | return _read_file("README.md", lambda lines: "".join(lines)) 41 | except Exception: 42 | return __description 43 | 44 | 45 | VERSIONFILE = "ph/_version.py" 46 | verstrline = open(VERSIONFILE, "rt").read() 47 | VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]" 48 | mo = re.search(VSRE, verstrline, re.M) 49 | if mo: 50 | verstr = mo.group(1) 51 | else: 52 | raise RuntimeError("Unable to find version string in %s." 
% (VERSIONFILE,)) 53 | 54 | setup( 55 | version=verstr, 56 | name="ph", 57 | packages=["ph"], 58 | description=__description, 59 | long_description=readme(), 60 | long_description_content_type="text/markdown", 61 | author="PG Drange", 62 | author_email="Pal.Drange@uib.no", 63 | maintainer=__pgdr, 64 | url=__webpage, 65 | project_urls={ 66 | "Bug Tracker": "{}/issues".format(__source), 67 | "Documentation": "{}/blob/master/README.md".format(__source), 68 | "Source Code": __source, 69 | }, 70 | license="MIT", 71 | keywords="tabular data, pandas, csv, pipeline, unix, command line tool", 72 | install_requires=requirements["minimum"], 73 | entry_points={ 74 | "console_scripts": [ 75 | "ph = ph:main", 76 | ], 77 | }, 78 | test_suite="tests", 79 | tests_require=["pytest"], 80 | extras_require=requirements, 81 | ) 82 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | TAGS 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /tests/test_data/usa.csv: -------------------------------------------------------------------------------- 1 | dateRep,day,month,year,cases,deaths,geoId 2 | 01/04/2020,1,4,2020,24998,909,US 3 | 31/03/2020,31,3,2020,21595,661,US 4 | 30/03/2020,30,3,2020,18360,318,US 5 | 29/03/2020,29,3,2020,19979,484,US 6 | 28/03/2020,28,3,2020,18695,411,US 7 | 27/03/2020,27,3,2020,16797,246,US 8 | 26/03/2020,26,3,2020,13963,249,US 9 | 25/03/2020,25,3,2020,8789,211,US 10 | 24/03/2020,24,3,2020,11236,119,US 11 | 23/03/2020,23,3,2020,8459,131,US 12 | 22/03/2020,22,3,2020,7123,80,US 13 | 21/03/2020,21,3,2020,5374,110,US 14 | 20/03/2020,20,3,2020,4835,0,US 15 | 19/03/2020,19,3,2020,2988,42,US 16 | 18/03/2020,18,3,2020,1766,23,US 17 | 17/03/2020,17,3,2020,887,16,US 18 | 16/03/2020,16,3,2020,823,12,US 19 | 15/03/2020,15,3,2020,777,10,US 20 | 14/03/2020,14,3,2020,511,7,US 21 | 13/03/2020,13,3,2020,351,10,US 22 | 12/03/2020,12,3,2020,287,2,US 23 | 11/03/2020,11,3,2020,271,2,US 24 | 10/03/2020,10,3,2020,200,5,US 25 | 09/03/2020,9,3,2020,121,4,US 26 | 08/03/2020,8,3,2020,95,3,US 27 | 07/03/2020,7,3,2020,105,2,US 28 | 06/03/2020,6,3,2020,74,1,US 29 | 05/03/2020,5,3,2020,34,2,US 30 | 04/03/2020,4,3,2020,22,3,US 31 | 03/03/2020,3,3,2020,14,4,US 32 | 02/03/2020,2,3,2020,20,1,US 33 | 01/03/2020,1,3,2020,3,1,US 34 | 29/02/2020,29,2,2020,6,0,US 35 | 28/02/2020,28,2,2020,1,0,US 36 | 27/02/2020,27,2,2020,6,0,US 37 | 26/02/2020,26,2,2020,0,0,US 38 | 25/02/2020,25,2,2020,18,0,US 39 | 24/02/2020,24,2,2020,0,0,US 40 | 23/02/2020,23,2,2020,0,0,US 41 | 22/02/2020,22,2,2020,19,0,US 42 | 21/02/2020,21,2,2020,1,0,US 43 | 20/02/2020,20,2,2020,0,0,US 44 | 19/02/2020,19,2,2020,0,0,US 45 | 18/02/2020,18,2,2020,0,0,US 46 | 17/02/2020,17,2,2020,0,0,US 47 | 16/02/2020,16,2,2020,0,0,US 48 | 15/02/2020,15,2,2020,0,0,US 49 | 14/02/2020,14,2,2020,1,0,US 50 | 13/02/2020,13,2,2020,1,0,US 51 | 12/02/2020,12,2,2020,0,0,US 52 | 11/02/2020,11,2,2020,1,0,US 53 | 10/02/2020,10,2,2020,0,0,US 54 | 09/02/2020,9,2,2020,0,0,US 55 | 08/02/2020,8,2,2020,0,0,US 56 | 07/02/2020,7,2,2020,0,0,US 57 | 06/02/2020,6,2,2020,1,0,US 58 | 05/02/2020,5,2,2020,0,0,US 59 | 04/02/2020,4,2,2020,0,0,US 60 | 03/02/2020,3,2,2020,3,0,US 61 | 02/02/2020,2,2,2020,1,0,US 62 | 01/02/2020,1,2,2020,1,0,US 63 | 31/01/2020,31,1,2020,1,0,US 64 | 30/01/2020,30,1,2020,0,0,US 65 | 29/01/2020,29,1,2020,0,0,US 66 | 28/01/2020,28,1,2020,0,0,US 67 | 27/01/2020,27,1,2020,3,0,US 68 | 26/01/2020,26,1,2020,0,0,US 69 | 25/01/2020,25,1,2020,1,0,US 70 | 24/01/2020,24,1,2020,0,0,US 71 | 23/01/2020,23,1,2020,0,0,US 72 | 22/01/2020,22,1,2020,0,0,US 73 | 21/01/2020,21,1,2020,1,0,US 74 | 20/01/2020,20,1,2020,0,0,US 75 | 19/01/2020,19,1,2020,0,0,US 76 | 18/01/2020,18,1,2020,0,0,US 77 | 17/01/2020,17,1,2020,0,0,US 78 | 16/01/2020,16,1,2020,0,0,US 79 | 15/01/2020,15,1,2020,0,0,US 80 | 14/01/2020,14,1,2020,0,0,US 81 | 13/01/2020,13,1,2020,0,0,US 82 
| 12/01/2020,12,1,2020,0,0,US 83 | 11/01/2020,11,1,2020,0,0,US 84 | 10/01/2020,10,1,2020,0,0,US 85 | 09/01/2020,9,1,2020,0,0,US 86 | 08/01/2020,8,1,2020,0,0,US 87 | 07/01/2020,7,1,2020,0,0,US 88 | 06/01/2020,6,1,2020,0,0,US 89 | 05/01/2020,5,1,2020,0,0,US 90 | 04/01/2020,4,1,2020,0,0,US 91 | 03/01/2020,3,1,2020,0,0,US 92 | 02/01/2020,2,1,2020,0,0,US 93 | 01/01/2020,1,1,2020,0,0,US 94 | 31/12/2019,31,12,2019,0,0,US 95 | -------------------------------------------------------------------------------- /tests/test_data/iris.csv: -------------------------------------------------------------------------------- 1 | 150,4,setosa,versicolor,virginica 2 | 5.1,3.5,1.4,0.2,0 3 | 4.9,3.0,1.4,0.2,0 4 | 4.7,3.2,1.3,0.2,0 5 | 4.6,3.1,1.5,0.2,0 6 | 5.0,3.6,1.4,0.2,0 7 | 5.4,3.9,1.7,0.4,0 8 | 4.6,3.4,1.4,0.3,0 9 | 5.0,3.4,1.5,0.2,0 10 | 4.4,2.9,1.4,0.2,0 11 | 4.9,3.1,1.5,0.1,0 12 | 5.4,3.7,1.5,0.2,0 13 | 4.8,3.4,1.6,0.2,0 14 | 4.8,3.0,1.4,0.1,0 15 | 4.3,3.0,1.1,0.1,0 16 | 5.8,4.0,1.2,0.2,0 17 | 5.7,4.4,1.5,0.4,0 18 | 5.4,3.9,1.3,0.4,0 19 | 5.1,3.5,1.4,0.3,0 20 | 5.7,3.8,1.7,0.3,0 21 | 5.1,3.8,1.5,0.3,0 22 | 5.4,3.4,1.7,0.2,0 23 | 5.1,3.7,1.5,0.4,0 24 | 4.6,3.6,1.0,0.2,0 25 | 5.1,3.3,1.7,0.5,0 26 | 4.8,3.4,1.9,0.2,0 27 | 5.0,3.0,1.6,0.2,0 28 | 5.0,3.4,1.6,0.4,0 29 | 5.2,3.5,1.5,0.2,0 30 | 5.2,3.4,1.4,0.2,0 31 | 4.7,3.2,1.6,0.2,0 32 | 4.8,3.1,1.6,0.2,0 33 | 5.4,3.4,1.5,0.4,0 34 | 5.2,4.1,1.5,0.1,0 35 | 5.5,4.2,1.4,0.2,0 36 | 4.9,3.1,1.5,0.2,0 37 | 5.0,3.2,1.2,0.2,0 38 | 5.5,3.5,1.3,0.2,0 39 | 4.9,3.6,1.4,0.1,0 40 | 4.4,3.0,1.3,0.2,0 41 | 5.1,3.4,1.5,0.2,0 42 | 5.0,3.5,1.3,0.3,0 43 | 4.5,2.3,1.3,0.3,0 44 | 4.4,3.2,1.3,0.2,0 45 | 5.0,3.5,1.6,0.6,0 46 | 5.1,3.8,1.9,0.4,0 47 | 4.8,3.0,1.4,0.3,0 48 | 5.1,3.8,1.6,0.2,0 49 | 4.6,3.2,1.4,0.2,0 50 | 5.3,3.7,1.5,0.2,0 51 | 5.0,3.3,1.4,0.2,0 52 | 7.0,3.2,4.7,1.4,1 53 | 6.4,3.2,4.5,1.5,1 54 | 6.9,3.1,4.9,1.5,1 55 | 5.5,2.3,4.0,1.3,1 56 | 6.5,2.8,4.6,1.5,1 57 | 5.7,2.8,4.5,1.3,1 58 | 6.3,3.3,4.7,1.6,1 59 | 4.9,2.4,3.3,1.0,1 60 | 6.6,2.9,4.6,1.3,1 61 | 5.2,2.7,3.9,1.4,1 62 | 5.0,2.0,3.5,1.0,1 63 | 5.9,3.0,4.2,1.5,1 64 | 6.0,2.2,4.0,1.0,1 65 | 6.1,2.9,4.7,1.4,1 66 | 5.6,2.9,3.6,1.3,1 67 | 6.7,3.1,4.4,1.4,1 68 | 5.6,3.0,4.5,1.5,1 69 | 5.8,2.7,4.1,1.0,1 70 | 6.2,2.2,4.5,1.5,1 71 | 5.6,2.5,3.9,1.1,1 72 | 5.9,3.2,4.8,1.8,1 73 | 6.1,2.8,4.0,1.3,1 74 | 6.3,2.5,4.9,1.5,1 75 | 6.1,2.8,4.7,1.2,1 76 | 6.4,2.9,4.3,1.3,1 77 | 6.6,3.0,4.4,1.4,1 78 | 6.8,2.8,4.8,1.4,1 79 | 6.7,3.0,5.0,1.7,1 80 | 6.0,2.9,4.5,1.5,1 81 | 5.7,2.6,3.5,1.0,1 82 | 5.5,2.4,3.8,1.1,1 83 | 5.5,2.4,3.7,1.0,1 84 | 5.8,2.7,3.9,1.2,1 85 | 6.0,2.7,5.1,1.6,1 86 | 5.4,3.0,4.5,1.5,1 87 | 6.0,3.4,4.5,1.6,1 88 | 6.7,3.1,4.7,1.5,1 89 | 6.3,2.3,4.4,1.3,1 90 | 5.6,3.0,4.1,1.3,1 91 | 5.5,2.5,4.0,1.3,1 92 | 5.5,2.6,4.4,1.2,1 93 | 6.1,3.0,4.6,1.4,1 94 | 5.8,2.6,4.0,1.2,1 95 | 5.0,2.3,3.3,1.0,1 96 | 5.6,2.7,4.2,1.3,1 97 | 5.7,3.0,4.2,1.2,1 98 | 5.7,2.9,4.2,1.3,1 99 | 6.2,2.9,4.3,1.3,1 100 | 5.1,2.5,3.0,1.1,1 101 | 5.7,2.8,4.1,1.3,1 102 | 6.3,3.3,6.0,2.5,2 103 | 5.8,2.7,5.1,1.9,2 104 | 7.1,3.0,5.9,2.1,2 105 | 6.3,2.9,5.6,1.8,2 106 | 6.5,3.0,5.8,2.2,2 107 | 7.6,3.0,6.6,2.1,2 108 | 4.9,2.5,4.5,1.7,2 109 | 7.3,2.9,6.3,1.8,2 110 | 6.7,2.5,5.8,1.8,2 111 | 7.2,3.6,6.1,2.5,2 112 | 6.5,3.2,5.1,2.0,2 113 | 6.4,2.7,5.3,1.9,2 114 | 6.8,3.0,5.5,2.1,2 115 | 5.7,2.5,5.0,2.0,2 116 | 5.8,2.8,5.1,2.4,2 117 | 6.4,3.2,5.3,2.3,2 118 | 6.5,3.0,5.5,1.8,2 119 | 7.7,3.8,6.7,2.2,2 120 | 7.7,2.6,6.9,2.3,2 121 | 6.0,2.2,5.0,1.5,2 122 | 6.9,3.2,5.7,2.3,2 123 | 5.6,2.8,4.9,2.0,2 124 | 7.7,2.8,6.7,2.0,2 125 | 6.3,2.7,4.9,1.8,2 126 | 6.7,3.3,5.7,2.1,2 127 | 
7.2,3.2,6.0,1.8,2 128 | 6.2,2.8,4.8,1.8,2 129 | 6.1,3.0,4.9,1.8,2 130 | 6.4,2.8,5.6,2.1,2 131 | 7.2,3.0,5.8,1.6,2 132 | 7.4,2.8,6.1,1.9,2 133 | 7.9,3.8,6.4,2.0,2 134 | 6.4,2.8,5.6,2.2,2 135 | 6.3,2.8,5.1,1.5,2 136 | 6.1,2.6,5.6,1.4,2 137 | 7.7,3.0,6.1,2.3,2 138 | 6.3,3.4,5.6,2.4,2 139 | 6.4,3.1,5.5,1.8,2 140 | 6.0,3.0,4.8,1.8,2 141 | 6.9,3.1,5.4,2.1,2 142 | 6.7,3.1,5.6,2.4,2 143 | 6.9,3.1,5.1,2.3,2 144 | 5.8,2.7,5.1,1.9,2 145 | 6.8,3.2,5.9,2.3,2 146 | 6.7,3.3,5.7,2.5,2 147 | 6.7,3.0,5.2,2.3,2 148 | 6.3,2.5,5.0,1.9,2 149 | 6.5,3.0,5.2,2.0,2 150 | 6.2,3.4,5.4,2.3,2 151 | 5.9,3.0,5.1,1.8,2 152 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ph (pronounced _φ_) - the tabular data shell tool ![ph tests](https://github.com/pgdr/ph/workflows/ph%20tests/badge.svg?branch=master) 2 | 3 | 4 | Spoiler: Working with tabular data (csv) in the command line is difficult. 5 | 6 | `ph` makes it easy: 7 | 8 | ```bash 9 | $ pip install ph 10 | $ cat iris.csv | ph columns 11 | 150 12 | 4 13 | setosa 14 | versicolor 15 | virginica 16 | $ cat iris.csv | ph columns setosa versicolor | ph head 15 | ph tail 5 | ph show 17 | setosa versicolor 18 | -- -------- ------------ 19 | 0 1.5 0.2 20 | 1 1.6 0.2 21 | 2 1.4 0.1 22 | 3 1.1 0.1 23 | 4 1.2 0.2 24 | ``` 25 | 26 | ```bash 27 | $ cat iris.csv | ph describe 28 | 150 4 setosa versicolor virginica 29 | count 150.000000 150.000000 150.000000 150.000000 150.000000 30 | mean 5.843333 3.057333 3.758000 1.199333 1.000000 31 | std 0.828066 0.435866 1.765298 0.762238 0.819232 32 | min 4.300000 2.000000 1.000000 0.100000 0.000000 33 | 25% 5.100000 2.800000 1.600000 0.300000 0.000000 34 | 50% 5.800000 3.000000 4.350000 1.300000 1.000000 35 | 75% 6.400000 3.300000 5.100000 1.800000 2.000000 36 | max 7.900000 4.400000 6.900000 2.500000 2.000000 37 | ``` 38 | 39 | Occasionally you would like to plot a CSV file real quick, in which case you can 40 | simply pipe it to `ph plot`: 41 | 42 | Suppose you have a dataset `covid.csv` 43 | 44 | ```csv 45 | SK,Italy,Iran,France,Spain,US 46 | 51,79,95,57,84,85 47 | 104,150,139,100,125,111 48 | 204,227,245,130,169,176 49 | 433,320,388,191,228,252 50 | 602,445,593,212,282,352 51 | 833,650,978,285,365,495 52 | 977,888,1501,423,430,640 53 | 1261,1128,2336,613,674,926 54 | 1766,1694,2922,949,1231,NaN 55 | 2337,2036,3513,1126,1696,NaN 56 | 3150,2502,4747,1412,NaN,NaN 57 | 4212,3089,5823,1748,NaN,NaN 58 | 4812,3858,6566,NaN,NaN,NaN 59 | 5328,4638,7161,NaN,NaN,NaN 60 | 5766,5883,8042,NaN,NaN,NaN 61 | 6284,7375,NaN,NaN,NaN,NaN 62 | 6767,9172,NaN,NaN,NaN,NaN 63 | 7134,10149,NaN,NaN,NaN,NaN 64 | 7382,NaN,NaN,NaN,NaN,NaN 65 | 7513,NaN,NaN,NaN,NaN,NaN 66 | ``` 67 | 68 | With this simple command, you get a certified _"So fancy" plot_. 69 | 70 | ```bash 71 | $ cat covid.csv | ph plot 72 | ``` 73 | 74 | ![So fancy covid plot](https://raw.githubusercontent.com/pgdr/ph/master/assets/covid-plot.png) 75 | 76 | 77 | _(Notice that this needs [matplotlib](https://matplotlib.org/): `pip install ph[plot]`)_ 78 | 79 | 80 | --- 81 | 82 | ## Raison d'être 83 | 84 | Using the _pipeline_ in Linux is nothing short of a dream in the life of the 85 | computer super user. 86 | 87 | However the pipe is clearly most suited for a stream of lines of textual data, 88 | and not when the stream is actually tabular data. 
89 | 90 | Tabular data is much more complex to work with due to its dual indexing and the 91 | fact that we often read horizontally and often read vertically. 92 | 93 | The defacto format for tabular data is `csv` 94 | ([comma-separated values](https://en.wikipedia.org/wiki/Comma-separated_values), 95 | which is not perfect in any sense 96 | of the word), and the defacto tool for working with tabular data in Python is 97 | Pandas. 98 | 99 | This is a shell utility `ph` (pronounced _phi_) 100 | that reads tabular data from 101 | [_standard in_](https://en.wikipedia.org/wiki/Standard_streams#Standard_input_(stdin)) 102 | and allows 103 | you to perform a pandas function on the data, before writing it to standard out 104 | in `csv` format. 105 | 106 | The goal is to create a tool which makes it nicer to work with tabular data in a 107 | pipeline. 108 | 109 | To achieve the goal, `ph` then reads csv data, does some manipulation, 110 | and prints out csv data. With csv as the invariant, `ph` can be used in 111 | a pipeline. 112 | 113 | --- 114 | 115 | A very quick introduction to what `ph` can do for you, 116 | run this in your shell: 117 | 118 | ```bash 119 | ph open csv https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/archived/ecdc/total_cases.csv \ 120 | | ph slugify \ 121 | | ph columns date norway sweden denmark \ 122 | | ph diff norway sweden denmark \ 123 | | ph spencer norway sweden denmark \ 124 | | ph rolling 7 norway sweden denmark --how=mean \ 125 | | ph dropna \ 126 | | ph slice 50: \ 127 | | ph plot --linewidth=3 --savefig=cases.svg --index=date 128 | ``` 129 | 130 | ![cases](https://raw.githubusercontent.com/pgdr/ph/master/assets/cases.png) 131 | 132 | --- 133 | 134 | ## Table of contents 135 | 136 | 1. [Getting started](#getting-started) 137 | 1. [Example usage](#example-usage) 138 | 1. [The tools](#the-tools) 139 | 1. [Concatenating, merging, filtering](#concatenating-merging-filtering) 140 | 1. [`cat`, `open`, `from`](#cat-open-from) 141 | 1. [`dropna` and `fillna`](#dropna-and-fillna) 142 | 1. [`head` and `tail`](#head-and-tail) 143 | 1. [`date`](#date) 144 | 1. [`merge`](#merge) 145 | 1. [Editing the csv](#editing-the-csv) 146 | 1. [`columns`, listing, selecting and re-ordering of](#columns-listing-selecting-and-re-ordering-of) 147 | 1. [`rename`](#rename) 148 | 1. [`replace`](#replace) 149 | 1. [`slice`](#slice) 150 | 1. [`eval`; Mathematipulating and creating new columns](#eval-mathematipulating-and-creating-new-columns) 151 | 1. [`normalize`](#normalize) 152 | 1. [`query`](#query) 153 | 1. [`grep`](#grep) 154 | 1. [`strip`](#strip) 155 | 1. [`removeprefix` and `removesuffix`](#removeprefix-and-removesuffix) 156 | 1. [Analyzing the csv file](#analyzing-the-csv-file) 157 | 1. [`describe`](#describe) 158 | 1. [`show`](#show) 159 | 1. [`tabulate`](#tabulate) 160 | 1. [`sort` values by column](#sort-values-by-column) 161 | 1. [`plot`](#plot) 162 | 1. [`groupby`](#groupby) 163 | 1. [`rolling`, `ewm`, `expanding`](#rolling-ewm-expanding) 164 | 1. [`index`](#index) 165 | 1. [`polyfit`](#polyfit) 166 | 1. [Working with different formats](#working-with-different-formats) 167 | 1. [`open`](#open) 168 | 1. [`to` and `from`; Exporting and importing](#to-and-from-exporting-and-importing) 169 | 1. [Supported formats](#supported-formats) 170 | 171 | 172 | --- 173 | 174 | 175 | ## Getting started 176 | 177 | If you have installed `ph[data]`, you can experiment using `ph dataset` if you 178 | don't have an appropriate csv file available. 
179 | 180 | 181 | ```bash 182 | ph dataset boston | ph describe 183 | ``` 184 | 185 | Available datasets are from 186 | [scikit-learn.datasets](https://scikit-learn.org/stable/datasets/index.html) 187 | 188 | Toy datasets: 189 | 190 | * `boston` 191 | * `iris` 192 | * `diabetes` 193 | * `digits` 194 | * `linnerud` 195 | * `wine` 196 | * `breast_cancer` 197 | 198 | 199 | Real world: 200 | 201 | * `olivetti_faces` 202 | * `lfw_people` 203 | * `lfw_pairs` 204 | * `rcv1` 205 | * `kddcup99` 206 | * `california_housing` 207 | 208 | 209 | ## Example usage 210 | 211 | Suppose you have a csv file `a.csv` that looks like this: 212 | 213 | ```csv 214 | x,y 215 | 3,8 216 | 4,9 217 | 5,10 218 | 6,11 219 | 7,12 220 | 8,13 221 | ``` 222 | 223 | Transpose: 224 | 225 | ```bash 226 | $ cat a.csv | ph transpose 227 | 0,1,2,3,4,5 228 | 3,4,5,6,7,8 229 | 8,9,10,11,12,13 230 | ``` 231 | 232 | `median` (as well as many others, e.g. `abs`, `corr`, `count`, `cov`, `cummax`, 233 | `cumsum`, `diff`, `max`, `product`, `quantile`, `rank`, `round`, `sum`, `std`, 234 | `var` etc.): 235 | 236 | ```bash 237 | $ cat a.csv | ph median 238 | x,y 239 | 5.5,10.5 240 | ``` 241 | 242 | **_Use `ph help` to list all commands_** 243 | 244 | 245 | ## The tools 246 | 247 | ### Concatenating, merging, filtering 248 | 249 | #### `cat`, `open`, `from` 250 | 251 | **cat** 252 | 253 | It is possible to _concatenate_ (`cat`) multiple csv-files with `ph cat`: 254 | 255 | ```bash 256 | $ ph cat a.csv b.csv --axis=index 257 | ``` 258 | 259 | ```bash 260 | $ ph cat a.csv b.csv --axis=columns 261 | ``` 262 | 263 | The functionality is described in 264 | [`pandas.concat`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html). 265 | 266 | 267 | **open** 268 | 269 | You can open a csv, json, excel, gpx (etc., see [_supported 270 | formats_](#supported-formats)) using `ph open type file`: 271 | 272 | ```bash 273 | $ ph open excel a.xlsx 274 | ``` 275 | 276 | ```bash 277 | $ ph open excel a.xlsx --sheet_name=0 --skiprows=3 278 | ``` 279 | 280 | 281 | ```bash 282 | $ ph open tsv a.tsv 283 | ``` 284 | 285 | ```bash 286 | $ ph open csv a.csv 287 | ``` 288 | 289 | In the event that the csv data starts on the first line (i.e. no 290 | header is present), use `--header=None`: 291 | 292 | ```bash 293 | $ ph open csv a.csv --header=None 294 | ``` 295 | 296 | 297 | 298 | **from** 299 | 300 | The `ph from` command works similarly to `ph open` but reads from stdin 301 | instead of opening a file. It therefore does not take a filename 302 | argument: 303 | 304 | ```bash 305 | $ cat /etc/passwd | ph from csv --sep=':' --header=None 306 | ``` 307 | 308 | 309 | #### `dropna` and `fillna` 310 | 311 | 312 | Consider again the `covid.csv` file from above. 313 | 314 | ```bash 315 | $ cat covid.csv | ph dropna 316 | ``` 317 | 318 | will remove all rows that contain N/A (`nan`) values. If we want to keep all 319 | rows with at least 5 non-N/A values, we can use 320 | 321 | ```bash 322 | $ cat covid.csv | ph dropna --thresh=5 323 | ``` 324 | 325 | If we want to drop all _columns_ with N/A values instead of all _rows_, we use 326 | `--axis=1`. 327 | 328 | If we want to drop only columns (resp. rows) with _all n/a_ values, we use 329 | `--how=all`. 
330 | 331 | 332 | To _replace_ N/A values with other values, we can simply run 333 | 334 | ```bash 335 | cat covid.csv | ph fillna 999.75 336 | ``` 337 | 338 | If we instead want to _pad_ the N/A values, we use `--method=pad` 339 | 340 | ```bash 341 | cat covid.csv | ph fillna --method=pad 342 | ``` 343 | 344 | We can limit the number of consecutive N/A values that are filled by using 345 | (e.g.) `--limit=7`. 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | #### `head` and `tail` 355 | 356 | Using `head` and `tail` works approximately as the normal shell equivalents, 357 | however they will preserve the header if there is one, e.g. 358 | 359 | ```bash 360 | $ cat a.csv | ph head 7 | ph tail 3 361 | x,y 362 | 6,11 363 | 7,12 364 | 8,13 365 | ``` 366 | 367 | #### `date` 368 | 369 | If the `csv` file contains a column, e.g. named `x` containing 370 | timestamps, it can be parsed as such with `ph date x`: 371 | 372 | ```bash 373 | $ cat a.csv | ph date x 374 | x,y 375 | 1970-01-04,8 376 | 1970-01-05,9 377 | 1970-01-06,10 378 | 1970-01-07,11 379 | 1970-01-08,12 380 | 1970-01-09,13 381 | ``` 382 | 383 | If your column is formatted with _freedom units_, `mm/dd/yyyy`, you can 384 | use the flag `--dayfirst=True`: 385 | 386 | ```csv 387 | dateRep,geoId 388 | 01/04/2020,US 389 | 31/03/2020,US 390 | 30/03/2020,US 391 | 29/03/2020,US 392 | 28/03/2020,US 393 | ``` 394 | 395 | ```bash 396 | $ cat ~/cov.csv | ph date dateRep --dayfirst=True 397 | dateRep,geoId 398 | 2020-04-01,US 399 | 2020-03-31,US 400 | 2020-03-30,US 401 | 2020-03-29,US 402 | 2020-03-28,US 403 | ``` 404 | 405 | 406 | 407 | To get a column with integers (e.g. 3-8) parsed as, e.g. 2003 - 2008, some 408 | amount of hacking is necessary. We will go into details later on the `eval` and 409 | `appendstr`. 410 | 411 | ```bash 412 | $ cat a.csv | ph eval "x = 2000 + x" | ph appendstr x - | ph date x 413 | x,y 414 | 2003-01-01,8 415 | 2004-01-01,9 416 | 2005-01-01,10 417 | 2006-01-01,11 418 | 2007-01-01,12 419 | 2008-01-01,13 420 | ``` 421 | 422 | However, it is possible to provide a `--format` instruction to `date`: 423 | 424 | ```bash 425 | $ cat a.csv | ph eval "x = 2000 + x" | ph date x --format="%Y" 426 | x,y 427 | 2003-01-01,8 428 | 2004-01-01,9 429 | 2005-01-01,10 430 | 2006-01-01,11 431 | 2007-01-01,12 432 | 2008-01-01,13 433 | ``` 434 | 435 | Under some very special circumstances, we may have a `unix timestamp` in 436 | a column, in which the `--utc=True` handle becomes useful: 437 | 438 | Consider `utc.csv`: 439 | 440 | ```csv 441 | date,x,y 442 | 1580601600,3,8 443 | 1580688000,4,9 444 | 1580774400,5,10 445 | 1580860800,6,11 446 | 1580947200,7,12 447 | 1581033600,8,13 448 | ``` 449 | 450 | where you get the correct dates: 451 | 452 | ```bash 453 | $ cat utc.csv | ph date date --utc=True 454 | date,x,y 455 | 2020-02-02,3,8 456 | 2020-02-03,4,9 457 | 2020-02-04,5,10 458 | 2020-02-05,6,11 459 | 2020-02-06,7,12 460 | 2020-02-07,8,13 461 | ``` 462 | 463 | 464 | #### `merge` 465 | 466 | Merging two csv files is made available through `ph merge f1 f2`. 
467 | 468 | Consider `left.csv` 469 | 470 | ```csv 471 | key1,key2,A,B 472 | K0,K0,A0,B0 473 | K0,K1,A1,B1 474 | K1,K0,A2,B2 475 | K2,K1,A3,B3 476 | ``` 477 | 478 | and `right.csv` 479 | 480 | ```csv 481 | key1,key2,C,D 482 | K0,K0,C0,D0 483 | K1,K0,C1,D1 484 | K1,K0,C2,D2 485 | K2,K0,C3,D3 486 | ``` 487 | 488 | We can merge them using (default to `--how=inner`) 489 | 490 | ```bash 491 | $ ph merge left.csv right.csv 492 | key1,key2,A,B,C,D 493 | K0,K0,A0,B0,C0,D0 494 | K1,K0,A2,B2,C1,D1 495 | K1,K0,A2,B2,C2,D2 496 | ``` 497 | 498 | or using an _outer_ join: 499 | 500 | ```bash 501 | $ ph merge left.csv right.csv --how=outer 502 | key1,key2,A,B,C,D 503 | K0,K0,A0,B0,C0,D0 504 | K0,K1,A1,B1,, 505 | K1,K0,A2,B2,C1,D1 506 | K1,K0,A2,B2,C2,D2 507 | K2,K1,A3,B3,, 508 | K2,K0,,,C3,D3 509 | ``` 510 | 511 | and we can specify on which column to join: 512 | 513 | ```bash 514 | $ ph merge left.csv right.csv --on=key1 --how=outer 515 | key1,key2_x,A,B,key2_y,C,D 516 | K0,K0,A0,B0,K0,C0,D0 517 | K0,K1,A1,B1,K0,C0,D0 518 | K1,K0,A2,B2,K0,C1,D1 519 | K1,K0,A2,B2,K0,C2,D2 520 | K2,K1,A3,B3,K0,C3,D3 521 | ``` 522 | 523 | 524 | In the case when the two files do not share a common column key, we can 525 | join them on key1 from the left file and key2 from the right file by specifying 526 | 527 | ```bash 528 | $ ph merge mergel.csv merger.csv --left=key1 --right=key2 529 | ``` 530 | 531 | 532 | 533 | ### Editing the csv 534 | 535 | #### `columns`, listing, selecting and re-ordering of 536 | 537 | Consider `c.csv`: 538 | 539 | ```csv 540 | it,fr,de 541 | 79,57,79 542 | 157,100,130 543 | 229,130,165 544 | 323,191,203 545 | 470,212,262 546 | 655,285,545 547 | 889,423,670 548 | 1128,653,800 549 | 1701,949,1040 550 | 2036,1209,1224 551 | 2502,1412,1565 552 | 3089,1784,1966 553 | 3858,2281,2745 554 | 4636,2876,3675 555 | 5883,3661,4181 556 | ``` 557 | 558 | Print the column names: 559 | 560 | ```bash 561 | $ cat c.csv | ph columns 562 | it 563 | fr 564 | de 565 | ``` 566 | 567 | Selecting only certain columns, e.g. `de` and `it` 568 | 569 | ```bash 570 | $ cat c.csv | ph columns de it | ph tail 3 571 | de,it 572 | 2745,3858 573 | 3675,4636 574 | 4181,5883 575 | ``` 576 | 577 | 578 | #### `rename` 579 | 580 | ```bash 581 | $ cat c.csv | ph rename de Germany | ph rename it Italy | ph columns Italy Germany 582 | Italy,Germany 583 | 79,79 584 | 157,130 585 | 229,165 586 | 323,203 587 | 470,262 588 | 655,545 589 | 889,670 590 | 1128,800 591 | 1701,1040 592 | 2036,1224 593 | 2502,1565 594 | 3089,1966 595 | 3858,2745 596 | 4636,3675 597 | 5883,4181 598 | ``` 599 | 600 | In addition to `rename` there is an auxiliary function `slugify` that 601 | lets you _slugify_ the column names. Consider `slugit.csv` 602 | 603 | ```csv 604 | Stupid column 1, Jerky-column No. 2 605 | 3,8 606 | 4,9 607 | 5,10 608 | 6,11 609 | 7,12 610 | 8,13 611 | ``` 612 | 613 | ```bash 614 | $ cat slugit.csv | ph slugify 615 | stupid_column_1,jerky_column_no_2 616 | 3,8 617 | 4,9 618 | 5,10 619 | 6,11 620 | 7,12 621 | 8,13 622 | ``` 623 | 624 | Then you can do 625 | 626 | ```bash 627 | $ cat slugit.csv | ph slugify | ph rename stupid_column_1 first | ph rename jerky_column_no_2 second 628 | first,second 629 | 3,8 630 | 4,9 631 | 5,10 632 | 6,11 633 | 7,12 634 | 8,13 635 | ``` 636 | 637 | 638 | #### `replace` 639 | 640 | We can replace values in the data (or in a single column) using `ph 641 | replace`. 
The syntax is 642 | `ph replace old new [--column=x [--newcolumn=xp]]`: 643 | 644 | ```bash 645 | $ cat a.csv| ph replace 8 100 646 | x,y 647 | 3,100 648 | 4,9 649 | 5,10 650 | 6,11 651 | 7,12 652 | 100,13 653 | ``` 654 | 655 | ```bash 656 | $ cat a.csv| ph replace 8 100 --column=x 657 | x,y 658 | 3,8 659 | 4,9 660 | 5,10 661 | 6,11 662 | 7,12 663 | 100,13 664 | ``` 665 | 666 | ```bash 667 | $ cat a.csv| ph replace 8 100 --column=x --newcolumn=xp 668 | x,y,xp 669 | 3,8,3 670 | 4,9,4 671 | 5,10,5 672 | 6,11,6 673 | 7,12,7 674 | 8,13,100 675 | ``` 676 | 677 | 678 | 679 | #### `slice` 680 | 681 | Slicing in Python is essential, and occasionally, we want to slice 682 | tabular data, e.g. look at only the 100 first, or 100 last rows, or 683 | perhaps we want to look at only every 10th row. All of this is achieved 684 | using `ph slice start:end:step` with standard Python slice syntax. 685 | 686 | ```bash 687 | $ cat a.csv | ph slice 1:9:2 688 | x,y 689 | 4,9 690 | 6,11 691 | 8,13 692 | ``` 693 | 694 | Reversing: 695 | 696 | ``` 697 | $ cat a.csv|ph slice ::-1 698 | x,y 699 | 8,13 700 | 7,12 701 | 6,11 702 | 5,10 703 | 4,9 704 | 3,8 705 | ``` 706 | 707 | See also `ph head` and `ph tail`. 708 | 709 | ```bash 710 | $ cat a.csv | ph slice :3 711 | x,y 712 | 3,8 713 | 4,9 714 | 5,10 715 | ``` 716 | 717 | equivalent to 718 | 719 | ```bash 720 | $ cat a.csv | ph head 3 721 | x,y 722 | 3,8 723 | 4,9 724 | 5,10 725 | ``` 726 | 727 | 728 | 729 | #### `eval`; Mathematipulating and creating new columns 730 | 731 | You can sum columns and place the result in a new column using 732 | `eval` (from 733 | [`pandas.DataFrame.eval`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.eval.html#pandas.DataFrame.eval)). 734 | 735 | ```bash 736 | $ cat c.csv | ph eval "total = it + fr + de" | ph tail 3 737 | it,fr,de,total 738 | 3858,2281,2745,8884 739 | 4636,2876,3675,11187 740 | 5883,3661,4181,13725 741 | ``` 742 | 743 | 744 | ```bash 745 | $ cat a.csv | ph eval "z = x**2 + y" 746 | x,y,z 747 | 3,8,17 748 | 4,9,25 749 | 5,10,35 750 | 6,11,47 751 | 7,12,61 752 | 8,13,77 753 | ``` 754 | 755 | 756 | If you only want the result, you leave the `eval` expression without assignment 757 | 758 | ```bash 759 | $ cat a.csv | ph eval "x**2" 760 | x 761 | 9 762 | 16 763 | 25 764 | 36 765 | 49 766 | 64 767 | ``` 768 | 769 | 770 | #### `normalize` 771 | 772 | You can normalize a column using `ph normalize col`. 773 | 774 | ```bash 775 | $ cat a.csv | ph eval "z = x * y" | ph normalize z 776 | x,y,z 777 | 3,8,0.0 778 | 4,9,0.15 779 | 5,10,0.325 780 | 6,11,0.525 781 | 7,12,0.75 782 | 8,13,1.0 783 | ``` 784 | 785 | 786 | 787 | #### `query` 788 | 789 | We can [query](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html) data using `ph query expr`. 
790 | 791 | ```bash 792 | $ cat a.csv | ph query "x > 5" 793 | x,y 794 | 6,11 795 | 7,12 796 | 8,13 797 | ``` 798 | 799 | 800 | ```bash 801 | $ ph open csv 'http://bit.ly/2cLzoxH' | ph query "country == 'Norway'" | ph tabulate --headers 802 | country year pop continent lifeExp gdpPercap 803 | -- --------- ------ ----------- ----------- --------- ----------- 804 | 0 Norway 1952 3.32773e+06 Europe 72.67 10095.4 805 | 1 Norway 1957 3.49194e+06 Europe 73.44 11654 806 | 2 Norway 1962 3.63892e+06 Europe 73.47 13450.4 807 | 3 Norway 1967 3.78602e+06 Europe 74.08 16361.9 808 | 4 Norway 1972 3.933e+06 Europe 74.34 18965.1 809 | 5 Norway 1977 4.04320e+06 Europe 75.37 23311.3 810 | 6 Norway 1982 4.11479e+06 Europe 75.97 26298.6 811 | 7 Norway 1987 4.18615e+06 Europe 75.89 31541 812 | 8 Norway 1992 4.28636e+06 Europe 77.32 33965.7 813 | 9 Norway 1997 4.40567e+06 Europe 78.32 41283.2 814 | 10 Norway 2002 4.53559e+06 Europe 79.05 44684 815 | 11 Norway 2007 4.62793e+06 Europe 80.196 49357.2 816 | ``` 817 | 818 | 819 | 820 | #### `grep` 821 | 822 | The powerful `grep` is one of the most used command line tools, and it 823 | would be silly to not ship a version of it ourselves. Using `ph grep` 824 | is rarely necessary, but helps when you want to ensure the header is 825 | kept. 826 | 827 | ```bash 828 | $ cat txtfile.csv | ph grep "a|b" --case=False --column=Text_Column --regex=False 829 | ``` 830 | 831 | The arguments denote 832 | 833 | * `--case` should be case sensitive? 834 | * `--column` grep only in given column 835 | * `--regex` use regex for pattern? 836 | 837 | 838 | 839 | #### `strip` 840 | 841 | Occasionally csv files come with additional spaces which can lead to 842 | difficulties in parsing the cells' contents. A csv file should be 843 | formatted without spaces after the comma `42,17` over `42, 17`. But 844 | since we are human, we sometimes make mistakes. 845 | 846 | If we want to _strip_, or _trim_, the contents of a column, we use `ph 847 | strip`: 848 | 849 | ```bash 850 | $ cat txtfile.csv | ph strip col1 col2 851 | ``` 852 | 853 | 854 | 855 | #### `removeprefix` and `removesuffix` 856 | 857 | If `strip` is not sufficiently powerful, it is possible to 858 | `removeprefix` or `removesuffix` using 859 | 860 | ```bash 861 | $cat txtfile.csv | ph removeprefix col1 pattern 862 | ``` 863 | 864 | and similarly for `removesuffix`. 865 | 866 | 867 | 868 | 869 | 870 | ### Analyzing the csv file 871 | 872 | 873 | #### `describe` 874 | 875 | The normal Pandas `describe` is of course available: 876 | 877 | ```bash 878 | $ cat a.csv | ph describe 879 | x y 880 | count 6.000000 6.000000 881 | mean 5.500000 10.500000 882 | std 1.870829 1.870829 883 | min 3.000000 8.000000 884 | 25% 4.250000 9.250000 885 | 50% 5.500000 10.500000 886 | 75% 6.750000 11.750000 887 | max 8.000000 13.000000 888 | ``` 889 | 890 | 891 | #### `show` 892 | 893 | The shorthand `ph show` simply calls the below `ph tabulate --headers`. 894 | 895 | ```bash 896 | $ cat a.csv | ph show 897 | x y 898 | -- --- --- 899 | 0 3 8 900 | 1 4 9 901 | 2 5 10 902 | 3 6 11 903 | 4 7 12 904 | 5 8 13 905 | ``` 906 | 907 | #### `tabulate` 908 | 909 | The amazing _tabulate_ tool comes from the Python package 910 | [tabulate on PyPI](https://pypi.org/project/tabulate/). 911 | 912 | The `tabulate` command takes arguments `--headers` to toggle printing of header 913 | row, `--format=[grid,...]` to modify the table style and `--noindex` to remove 914 | the running index (leftmost column in the example above). 
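For example, combining these flags (an illustrative sketch using `a.csv` from above):

```bash
$ cat a.csv | ph tabulate --headers --format=grid --noindex
```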
915 | 916 | Among the supported format styles are 917 | 918 | * `plain`, `simple`, 919 | * `grid`, `fancy_grid`, `pretty`, 920 | * `github`, `rst`, `mediawiki`, `html`, `latex`, 921 | * ... (See full list at the project homepage at 922 | [python-tabulate](https://github.com/astanin/python-tabulate).) 923 | 924 | 925 | #### `sort` values by column 926 | 927 | You can the columns in the csv data by a certain column: 928 | 929 | ```bash 930 | $ cat iris.csv | ph sort setosa | ph tail 5 931 | 150,4,setosa,versicolor,virginica 932 | 7.9,3.8,6.4,2.0,2 933 | 7.6,3.0,6.6,2.1,2 934 | 7.7,3.8,6.7,2.2,2 935 | 7.7,2.8,6.7,2.0,2 936 | 7.7,2.6,6.9,2.3,2 937 | ``` 938 | 939 | #### `plot` 940 | 941 | You can plot data using `ph plot [--index=col]`. 942 | 943 | ```bash 944 | $ ph open parquet 1A_2019.parquet | ph columns Time Value | ph plot --index=Time 945 | ``` 946 | 947 | This will take the columns `Time` and `Value` from the timeseries provided by 948 | the given `parquet` file and plot the `Value` series using `Time` as _index_. 949 | 950 | 951 | The following example plots the life expectancy in Norway using `year` as _index_: 952 | 953 | ```bash 954 | $ ph open csv http://bit.ly/2cLzoxH | ph query "country == 'Norway'" | ph appendstr year -01-01 | ph columns year lifeExp | ph plot --index=year 955 | ``` 956 | 957 | ![life-expectancy over time](https://raw.githubusercontent.com/pgdr/ph/master/assets/lifeexp.png) 958 | 959 | > _Note:_ The strange `ph appendstr year -01-01` turns the items `1956` into 960 | > `"1956-01-01"` and `2005` into `"2005-01-01"`. These are necessary to make 961 | > pandas to interpret `1956` as a _year_ and not as a _millisecond_. 962 | > 963 | > The command `ph appendstr col str [newcol]` takes a string and appends it to a 964 | > column, overwriting the original column, or writing it to `newcol` if provided. 965 | 966 | **Advanced plotting** 967 | 968 | You can choose the _kind_ of plotting ( ‘line’, ‘bar’, ‘barh’, ‘hist’, ‘box’, 969 | ‘kde’, ‘density’, ‘area’, ‘pie’, ‘scatter’, ‘hexbin’), the _style_ of plotting 970 | (e.g. `--style=o`), and in case of scatter plot, you need to specify `--x=col1` 971 | and `--y=col2`, e.g.: 972 | 973 | ```bash 974 | $ ph open csv http://bit.ly/2cLzoxH | ph query "continent == 'Europe'" | ph plot --kind=scatter --x=lifeExp --y=gdpPercap 975 | ``` 976 | 977 | ![life-expectancy vs gdp](https://raw.githubusercontent.com/pgdr/ph/master/assets/scatter.png) 978 | 979 | 980 | 981 | 982 | 983 | To specify the styling `k--` gives a black dashed line: 984 | 985 | ```bash 986 | $ ph open csv http://bit.ly/2cLzoxH | ph query "country == 'Norway'" | ph appendstr year -01-01 | ph columns year lifeExp | ph plot --index=year --style=k-- 987 | ``` 988 | 989 | 990 | **Using `plot` headless** 991 | 992 | Occasionally we would like to generate a plot to an image(-like) file on 993 | the command line or in a script, without necessarily launching any 994 | graphic user interface. 995 | 996 | Calling `ph plot` with the argument `--savefig=myfile.png` will create a 997 | PNG file called `myfile.png` instead of opening the matplotlib window. 998 | It is also possible to get other formats by using different extensions, 999 | like `eps`, `pdf`, `pgf`, `png`, `ps`, `raw`, `rgba`, `svg`, `svgz`. 1000 | 1001 | 1002 | **_`iplot`_ with `plotly` and `cufflinks`** 1003 | 1004 | Instead of using the `matplotlib` backend, there is an option for using `plotly` 1005 | and [`cufflinks`](https://github.com/santosjorge/cufflinks) to generate 1006 | interactive plots. 
1007 | This depends on `cufflinks`, and can be installed with `pip install ph[iplot]`. 1008 | 1009 | ```bash 1010 | $ cat a.csv | ph iplot --kind=bar --barmode=stack 1011 | ``` 1012 | 1013 | ```bash 1014 | $ cat a.csv | ph iplot --kind=scatter --mode=markers 1015 | ``` 1016 | 1017 | 1018 | #### `groupby` 1019 | 1020 | Suppose you have a csv file 1021 | 1022 | ```csv 1023 | Animal,Max Speed 1024 | Falcon,380.0 1025 | Falcon,370.0 1026 | Parrot,24.0 1027 | Parrot,26.0 1028 | ``` 1029 | 1030 | You can use Pandas' `groupby` functionality to get the aggregated `sum`, 1031 | `mean`, or `first` value: 1032 | 1033 | ```bash 1034 | $ cat group.csv | ph groupby Animal --how=mean 1035 | Max Speed 1036 | 375.0 1037 | 25.0 1038 | ``` 1039 | 1040 | If you want to retain the index column, 1041 | 1042 | ```bash 1043 | $ cat group.csv | ph groupby Animal --how=mean --as_index=False 1044 | Animal,Max Speed 1045 | Falcon,375.0 1046 | Parrot,25.0 1047 | ``` 1048 | 1049 | 1050 | 1051 | #### `rolling`, `ewm`, `expanding` 1052 | 1053 | **rolling** 1054 | 1055 | Compute rolling averages/sums using `ph rolling 3 --how=mean` 1056 | 1057 | Consider again `a.csv`: 1058 | 1059 | ```csv 1060 | x,y 1061 | 3,8 1062 | 4,9 1063 | 5,10 1064 | 6,11 1065 | 7,12 1066 | 8,13 1067 | ``` 1068 | 1069 | Moving average with window size 3: 1070 | 1071 | ```bash 1072 | $ cat a.csv|ph rolling 3 --how=mean | ph dropna 1073 | x,y 1074 | 4.0,9.0 1075 | 5.0,10.0 1076 | 6.0,11.0 1077 | 7.0,12.0 1078 | ``` 1079 | 1080 | 1081 | Rolling sum with window size 2: 1082 | 1083 | ```bash 1084 | $ cat a.csv|ph rolling 2 --how=sum | ph dropna 1085 | x,y 1086 | 7.0,17.0 1087 | 9.0,19.0 1088 | 11.0,21.0 1089 | 13.0,23.0 1090 | 15.0,25.0 1091 | ``` 1092 | 1093 | 1094 | **ewm — exponentially weighted methods** 1095 | 1096 | ```bash 1097 | $ cat a.csv | ph ewm --com=0.5 --how=mean | ph show 1098 | x y 1099 | -- ------- -------- 1100 | 0 3 8 1101 | 1 3.75 8.75 1102 | 2 4.61538 9.61538 1103 | 3 5.55 10.55 1104 | 4 6.52066 11.5207 1105 | 5 7.50824 12.5082 1106 | ``` 1107 | 1108 | Use either `com` (center of mass), `span`, `halflife`, or `alpha`, 1109 | together with `--how=mean`, `--how=std`, `--how=var`, etc. 1110 | 1111 | 1112 | **expanding — expanding window** 1113 | 1114 | > A common alternative to rolling statistics is to use an expanding 1115 | > window, which yields the value of the statistic with all the data 1116 | > available up to that point in time. 1117 | 1118 | ```bash 1119 | $ cat a.csv | ph expanding 3 1120 | x,y 1121 | , 1122 | , 1123 | 12.0,27.0 1124 | 18.0,38.0 1125 | 25.0,50.0 1126 | 33.0,63.0 1127 | ``` 1128 | 1129 | 1130 | **Spencer's 15-weight average** 1131 | 1132 | We also support an experimental and slow version of Spencer's 15-weight 1133 | average. This method takes a window of size 15, and pointwise multiply 1134 | with the following vector (normalized) 1135 | 1136 | ``` 1137 | (-3, -6, -5, 3, 21, 46, 67, 74, 67, 46, 21, 3, -5, -6, -3) 1138 | ``` 1139 | 1140 | and then takes the sum of the resulting vector. 1141 | 1142 | Spencer's 15-weight average is an interesting (impulse response) filter 1143 | that preserves all up to cubic polynomial functions. 
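As in the introductory pipeline, `spencer` takes the columns to smooth, so an illustrative sketch on `covid.csv` could look like:

```bash
$ cat covid.csv | ph columns Italy Iran | ph spencer Italy Iran | ph dropna
```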
1144 | 1145 | 1146 | #### `index` 1147 | 1148 | Occasionally you need to have an index, in which case `ph index` is your tool: 1149 | 1150 | ```bash 1151 | $ cat a.csv | ph index 1152 | index,x,y 1153 | 0,3,8 1154 | 1,4,9 1155 | 2,5,10 1156 | 3,6,11 1157 | 4,7,12 1158 | 5,8,13 1159 | ``` 1160 | 1161 | #### `polyfit` 1162 | 1163 | You can perform **linear regression** and **polynomial regression** on a certain 1164 | index column `x` and a `y = f(x)` column using `ph polyfit`. It takes two 1165 | arguments, the `x` column name, the `y` column name and an optional 1166 | `--deg=`, the degree of the polynomial. The default option is `--deg=1` 1167 | which corresponds to a linear regression. 1168 | 1169 | Suppose you have a csv file `lr.csv` with content 1170 | 1171 | ```csv 1172 | x,y 1173 | 4,12 1174 | 5,19 1175 | 6,17 1176 | 7,24 1177 | 8,28 1178 | 9,34 1179 | ``` 1180 | 1181 | With linear (polynomial) regression, you get an extra column, `polyfit_{deg}`: 1182 | 1183 | ```bash 1184 | $ cat lr.csv | ph polyfit x y | ph astype int 1185 | x,y,polyfit_1 1186 | 4,12,12 1187 | 5,19,16 1188 | 6,17,20 1189 | 7,24,24 1190 | 8,28,28 1191 | 9,34,32 1192 | ``` 1193 | 1194 | Using `ph plot --index=x` results in this plot: 1195 | 1196 | ![polyfit](https://raw.githubusercontent.com/pgdr/ph/master/assets/polyfit.png) 1197 | 1198 | ## Working with different formats 1199 | 1200 | 1201 | ### `open` 1202 | 1203 | Pandas supports reading a multitude of [readers](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html). 1204 | 1205 | To read an Excel file and pipe the stream, you can use `ph open`. 1206 | 1207 | The syntax of `ph open` is `ph open ftype fname`, where `fname` is the 1208 | file you want to stream and `ftype` is the type of the file. 1209 | 1210 | A list of all available formats is given below. 1211 | 1212 | ```bash 1213 | $ ph open xls a.xlsx 1214 | x,y 1215 | 3,8 1216 | 4,9 1217 | 5,10 1218 | 6,11 1219 | 7,12 1220 | 8,13 1221 | ``` 1222 | 1223 | 1224 | You can open a _semicolon separated values_ file using `--sep=";"` 1225 | 1226 | ```bash 1227 | $ ph open csv --sep=";" fname.csv 1228 | ``` 1229 | 1230 | 1231 | 1232 | ### `to` and `from`; Exporting and importing 1233 | 1234 | Observe the following: 1235 | 1236 | ```json 1237 | {"x":{"0":3,"1":4,"2":5,"3":6,"4":7,"5":8}, 1238 | "y":{"0":8,"1":9,"2":10,"3":11,"4":12,"5":13}} 1239 | ``` 1240 | 1241 | Of course, then, 1242 | 1243 | ```bash 1244 | $ cat a.csv | ph to json | ph from json 1245 | x,y 1246 | 3,8 1247 | 4,9 1248 | 5,10 1249 | 6,11 1250 | 7,12 1251 | 8,13 1252 | ``` 1253 | 1254 | This also means that 1255 | 1256 | ```bash 1257 | $ cat a.csv | ph to json > a.json 1258 | $ cat a.json 1259 | {"x":{"0":3,"1":4,"2":5,"3":6,"4":7,"5":8}, 1260 | "y":{"0":8,"1":9,"2":10,"3":11,"4":12,"5":13}} 1261 | $ cat a.json | ph from json 1262 | x,y 1263 | 3,8 1264 | 4,9 1265 | 5,10 1266 | 6,11 1267 | 7,12 1268 | 8,13 1269 | ``` 1270 | 1271 | You can open Excel-like formats using `ph open excel fname.xls[x]`, `parquet` 1272 | files with `ph open parquet data.parquet`. 
Note that these two examples require 1273 | `xlrd` and `pyarrow`, respectively, or simply 1274 | 1275 | ``` 1276 | pip install ph[complete] 1277 | ``` 1278 | 1279 | 1280 | ### Supported formats 1281 | 1282 | * `csv` / `tsv` (the latter for tab-separated values) 1283 | * `fwf` (fixed-width file format) 1284 | * `json` 1285 | * `html` 1286 | * `clipboard` (pastes tab-separated content from clipboard) 1287 | * `xls` 1288 | * `odf` 1289 | * `hdf5` 1290 | * `feather` 1291 | * `parquet` 1292 | * `orc` 1293 | * `stata` 1294 | * `sas` 1295 | * `spss` 1296 | * `pickle` 1297 | * `sql` 1298 | * `gbq` / `google` / `bigquery` 1299 | 1300 | We also support reading GPX files with `ph open gpx`. 1301 | This uses the GPX Python library [gpxpy](https://github.com/tkrajina/gpxpy). 1302 | -------------------------------------------------------------------------------- /tests/test_ph.py: -------------------------------------------------------------------------------- 1 | import ph 2 | 3 | import os.path 4 | import io 5 | 6 | import pytest 7 | import contextlib 8 | 9 | import pandas as pd 10 | import datetime as dt 11 | import math 12 | 13 | NAN = float("nan") 14 | LEFT_COLUMNS = ["key1", "key2", "A", "B"] # columns of left.csv 15 | 16 | 17 | def __have_xlrd(): 18 | try: 19 | import xlrd # noqa 20 | 21 | return True 22 | except ImportError: 23 | return False 24 | 25 | 26 | def _assert_a(df): 27 | assert list(df.shape) == [6, 2] 28 | assert list(df.columns) == ["x", "y"] 29 | assert list(df["x"]) == list(range(3, 9)) 30 | assert list(df["y"]) == list(range(8, 14)) 31 | 32 | 33 | def _get_path(name, extension="csv"): 34 | pth = "test_data/{}.{}".format(name, extension) 35 | root = os.path.dirname(__file__) 36 | path = os.path.abspath(os.path.join(root, pth)) 37 | 38 | return path 39 | 40 | 41 | def _get_data(name, extension="csv"): 42 | path = _get_path(name, extension) 43 | with open(path, "r") as fin: 44 | data = "".join(fin.readlines()) 45 | 46 | return data 47 | 48 | 49 | def _get_io(name, extension="csv"): 50 | return io.StringIO(_get_data(name, extension)) 51 | 52 | 53 | class Capture: 54 | # Just a mutable string container for ctx mgr around capture.out 55 | def __init__(self, outerr=None): 56 | if outerr is not None: 57 | self.out = outerr.out 58 | self.err = outerr.err 59 | else: 60 | self.out = "" 61 | self.err = "" 62 | self._df = None 63 | 64 | @property 65 | def df(self): 66 | if self._df is None: 67 | self._df = pd.read_csv(io.StringIO(self.out)) 68 | return self._df 69 | 70 | def read_df(self, *args, **kwargs): 71 | self._df = pd.read_csv(io.StringIO(self.out), *args, **kwargs) 72 | return self.df 73 | 74 | def assert_shape(self, rows, cols): 75 | assert list(self.df.shape) == [rows, cols] 76 | 77 | def assert_columns(self, columns): 78 | assert list(self.df.columns) == list(columns) 79 | 80 | 81 | @pytest.fixture 82 | def phmgr(capsys, monkeypatch): 83 | @contextlib.contextmanager 84 | def phmgr(dataset="a", extension="csv"): 85 | monkeypatch.setattr("sys.stdin", _get_io(dataset, extension)) 86 | cap = Capture() 87 | yield cap 88 | outerr = capsys.readouterr() 89 | cap.out, cap.err = outerr.out, outerr.err 90 | assert not cap.err, "Std error not empty: {}".format(cap.err) 91 | 92 | return phmgr 93 | 94 | 95 | def _call(cmd, extra=None): 96 | if extra is None: 97 | extra = [] 98 | ph._main(["ph"] + cmd.split(" ") + extra) 99 | 100 | 101 | def test_cat(phmgr): 102 | with phmgr() as captured: 103 | _call("cat") 104 | assert captured.out == _get_data("a") 105 | 106 | 107 | def test_cat_many(capsys): 
108 | _call("cat {} {} --axis=index".format(_get_path("a"), _get_path("covid"))) 109 | cap = Capture(capsys.readouterr()) 110 | assert not cap.err 111 | cap.assert_shape(35, 12) 112 | 113 | _call("cat {} {} --axis=columns".format(_get_path("a"), _get_path("covid"))) 114 | cap = Capture(capsys.readouterr()) 115 | assert not cap.err 116 | cap.assert_shape(29, 12) 117 | 118 | 119 | def test_columns(phmgr): 120 | with phmgr("iris") as captured: 121 | _call("columns") 122 | assert not captured.err 123 | captured.assert_columns(["columns"]) 124 | assert list(captured.df["columns"]) == [ 125 | "150", 126 | "4", 127 | "setosa", 128 | "versicolor", 129 | "virginica", 130 | ] 131 | 132 | 133 | def test_columns_args(phmgr): 134 | with phmgr("iris") as captured: 135 | _call("columns setosa versicolor") 136 | assert not captured.err 137 | captured.assert_shape(150, 2) 138 | captured.assert_columns(["setosa", "versicolor"]) 139 | 140 | 141 | def test_drop_columns(phmgr): 142 | with phmgr("iris") as captured: 143 | _call("drop setosa virginica --axis=columns") 144 | assert not captured.err 145 | df = captured.df 146 | captured.assert_shape(150, 3) 147 | captured.assert_columns( 148 | [ 149 | "150", 150 | "4", 151 | "versicolor", 152 | ] 153 | ) 154 | assert list(df.iloc[0]) == [5.1, 3.5, 0.2] 155 | 156 | 157 | def test_drop_index(phmgr): 158 | with phmgr("iris") as captured: 159 | _call("drop 0 --axis=index") 160 | assert not captured.err 161 | df = captured.df 162 | captured.assert_shape(149, 5) 163 | assert list(df.iloc[0]) == [4.9, 3.0, 1.4, 0.2, 0] 164 | 165 | 166 | def test_open_skiprows(capsys): 167 | _call("open csv {} --skiprows=6".format(_get_path("f"))) 168 | captured = Capture(capsys.readouterr()) 169 | assert not captured.err 170 | df = captured.df 171 | captured.assert_shape(2, 2) 172 | assert list(df.iloc[0]) == [14, 13] 173 | assert list(df.iloc[1]) == [16, 21] 174 | 175 | 176 | def test_from_headless(phmgr): 177 | with phmgr("headless") as captured: 178 | _call("from --header=None") 179 | assert not captured.err 180 | captured.assert_shape(5, 2) 181 | captured.assert_columns(["0", "1"]) 182 | 183 | 184 | def test_open_headless(capsys): 185 | _call("open csv {} --header=None".format(_get_path("headless"))) 186 | captured = Capture(capsys.readouterr()) 187 | assert not captured.err 188 | captured.assert_shape(5, 2) 189 | captured.assert_columns(["0", "1"]) 190 | 191 | 192 | def test_diff_all(phmgr): 193 | with phmgr() as captured: 194 | _call("diff --periods=2") 195 | assert not captured.err 196 | captured.assert_shape(6, 2) 197 | captured.assert_columns(["x", "y"]) 198 | 199 | 200 | def test_diff_xy(phmgr): 201 | with phmgr() as captured: 202 | _call("diff x y --periods=2") 203 | assert not captured.err 204 | 205 | 206 | def test_diff_with_date_col(phmgr): 207 | with phmgr("usa") as captured: 208 | _call("diff cases deaths") 209 | assert not captured.err 210 | captured.assert_shape(93, 7) 211 | df = captured.df 212 | c = list(df["cases"]) 213 | d = list(df["deaths"]) 214 | assert c[1:6] == [-3403.0, -3235.0, 1619.0, -1284.0, -1898.0] 215 | assert d[1:7] == [-248.0, -343.0, 166.0, -73.0, -165.0, 3.0] 216 | 217 | 218 | @pytest.mark.skipif( 219 | os.getenv("GITHUB_WORKFLOW") is not None, reason="clipboard not on headless" 220 | ) 221 | def test_clipboard(capsys): 222 | # This test is a bit nasty as we necessarily need to modify the 223 | # clipboard. We do, however, try to preserve the content. YMMV. 
224 | import pandas.io.clipboard as cp 225 | 226 | old = cp.paste() 227 | try: 228 | df = pd.read_csv(_get_path("a")) 229 | df.to_clipboard() 230 | 231 | _call("from clipboard") 232 | captured = Capture(capsys.readouterr()) 233 | assert not captured.err 234 | df = captured.df 235 | captured.assert_shape(6, 2) 236 | finally: 237 | cp.copy(old) 238 | 239 | 240 | def test_sep_from(phmgr): 241 | with phmgr("d", extension="scsv") as captured: 242 | _call("from csv --sep=;") 243 | assert not captured.err 244 | captured.assert_shape(6, 3) 245 | 246 | 247 | def test_from_skiprows(phmgr): 248 | with phmgr("f") as captured: 249 | _call("from csv --skiprows=6") 250 | assert not captured.err 251 | df = captured.df 252 | captured.assert_shape(2, 2) 253 | assert list(df.iloc[0]) == [14, 13] 254 | assert list(df.iloc[1]) == [16, 21] 255 | 256 | 257 | def test_sep_to_with_sep(capsys, monkeypatch): 258 | monkeypatch.setattr("sys.stdin", _get_io("d")) 259 | _call("to csv --sep=_") 260 | captured = Capture(capsys.readouterr()) 261 | assert not captured.err 262 | captured.assert_shape(6, 1) 263 | 264 | df = pd.read_csv(io.StringIO(captured.out), sep="_") 265 | assert list(df.shape) == [6, 3] 266 | assert list(df["year"]) == list(range(2003, 2009)) 267 | 268 | 269 | def test_sep_to_with_index(capsys, monkeypatch): 270 | monkeypatch.setattr("sys.stdin", _get_io("d")) 271 | _call("to csv --index=true") 272 | captured = Capture(capsys.readouterr()) 273 | assert not captured.err 274 | captured.assert_shape(6, 4) 275 | 276 | 277 | def test_thousands_from(capsys, monkeypatch): 278 | monkeypatch.setattr("sys.stdin", _get_io("t", extension="tsv")) 279 | _call("from csv --thousands=, --sep=\t") 280 | captured = Capture(capsys.readouterr()) 281 | assert not captured.err 282 | df = captured.df 283 | captured.assert_shape(7, 2) 284 | assert all(df["a"] == 10 ** df["b"]) 285 | 286 | 287 | def test_thousands_from_escaped_tab(capsys, monkeypatch): 288 | monkeypatch.setattr("sys.stdin", _get_io("t", extension="tsv")) 289 | _call("from csv --thousands=, --sep=\\t") 290 | captured = Capture(capsys.readouterr()) 291 | assert not captured.err 292 | df = captured.df 293 | captured.assert_shape(7, 2) 294 | assert all(df["a"] == 10 ** df["b"]) 295 | 296 | 297 | def test_strip_default(phmgr): 298 | with phmgr("right") as captured: 299 | _call("strip --lstrip=True") 300 | assert not captured.err 301 | captured.assert_shape(4, 4) 302 | captured.assert_columns(["key1", "key2", "C", "D"]) 303 | 304 | assert list(captured.df["C"]) == ["C0", "C1", "C2", "C3"] 305 | 306 | 307 | def test_strip_actual(phmgr): 308 | with phmgr("strip") as captured: 309 | _call("strip date") 310 | assert not captured.err 311 | 312 | captured.assert_shape(6, 4) 313 | captured.assert_columns(["idx", "date", "x", "y"]) 314 | _assert_a(captured.df[["x", "y"]]) 315 | assert list(captured.df.date) == ["2020-05-{}".format(i) for i in range(12, 18)] 316 | 317 | 318 | def test_removeprefix(phmgr): 319 | with phmgr("left") as captured: 320 | _call("removeprefix A A") 321 | assert not captured.err 322 | captured.assert_shape(4, 4) 323 | captured.assert_columns(LEFT_COLUMNS) 324 | assert list(captured.df["A"]) == [0, 1, 2, 3] 325 | 326 | 327 | def test_removesuffix(phmgr): 328 | with phmgr("left") as captured: 329 | _call("removesuffix key1 0") 330 | assert not captured.err 331 | captured.assert_shape(4, 4) 332 | captured.assert_columns(LEFT_COLUMNS) 333 | assert list(captured.df["key1"]) == ["K", "K", "K1", "K2"] 334 | 335 | 336 | def test_describe(phmgr): 337 | with 
phmgr() as captured: 338 | _call("describe") 339 | assert len(captured.out.split("\n")) == 10 340 | header = set(captured.out.split("\n")[0].split()) 341 | assert "x" in header 342 | assert "y" in header 343 | assert "max" in captured.out 344 | 345 | 346 | def test_shape(phmgr): 347 | with phmgr("covid") as captured: 348 | _call("shape") 349 | df = captured.df 350 | captured.assert_columns(["rows", "columns"]) 351 | assert list(df["rows"]) == [29] 352 | assert list(df["columns"]) == [10] 353 | 354 | 355 | def test_transpose(phmgr): 356 | with phmgr() as captured: 357 | _call("transpose") 358 | assert ( 359 | captured.out 360 | == """\ 361 | 0,1,2,3,4,5 362 | 3,4,5,6,7,8 363 | 8,9,10,11,12,13 364 | """ 365 | ) 366 | 367 | 368 | def test_head_tail(capsys, monkeypatch): 369 | monkeypatch.setattr("sys.stdin", _get_io("a")) 370 | _call("head 7") 371 | captured = capsys.readouterr() 372 | assert not captured.err 373 | 374 | monkeypatch.setattr("sys.stdin", io.StringIO(captured.out)) 375 | _call("tail 3") 376 | captured = capsys.readouterr() 377 | assert ( 378 | captured.out 379 | == """\ 380 | x,y 381 | 6,11 382 | 7,12 383 | 8,13 384 | """ 385 | ) 386 | assert not captured.err 387 | 388 | 389 | def test_open_with_decimals(phmgr): 390 | with phmgr("padded_decimals") as captured: 391 | _call("from csv --decimal=, --thousands=.") 392 | assert not captured.err 393 | df = captured.df 394 | captured.assert_shape(7, 2) 395 | assert "paddecim" in df.columns 396 | assert str(df["paddecim"].dtype).startswith("float") 397 | assert df["paddecim"].sum() == 1470.0 * 2 398 | 399 | 400 | def test_from_with_decimals(capsys, monkeypatch): 401 | monkeypatch.setattr("sys.stdin", _get_io("padded_decimals")) 402 | _call("from csv --decimal=, --thousands=.") 403 | captured = Capture(capsys.readouterr()) 404 | 405 | assert not captured.err 406 | df = captured.df 407 | captured.assert_shape(7, 2) 408 | assert "paddecim" in df.columns 409 | assert str(df["paddecim"].dtype).startswith("float") 410 | assert df["paddecim"].sum() == 1470.0 * 2 411 | 412 | 413 | def test_date(phmgr): 414 | with phmgr() as captured: 415 | _call("date x --unit=D") 416 | df = captured.df 417 | df["x"] = pd.to_datetime(captured.df["x"]) 418 | assert list(df["y"]) == list(range(8, 14)) 419 | x = list(df["x"]) 420 | assert len(list(df["x"])) == 6 421 | for i in range(6): 422 | assert str(x[i]) == "1970-01-0{} 00:00:00".format(i + 4) 423 | 424 | with phmgr("d") as captured: 425 | _call("date") 426 | df = captured.df 427 | assert len(df) == 6 428 | captured.assert_columns(["0"]) 429 | act = [str(x) for x in df["0"]] 430 | exp = [ 431 | "2003-03-08", 432 | "2004-04-09", 433 | "2005-05-10", 434 | "2006-06-11", 435 | "2007-07-12", 436 | "2008-08-13", 437 | ] 438 | assert act == exp 439 | 440 | 441 | def test_date_dayfirst(phmgr): 442 | with phmgr("usa") as captured: 443 | _call("date dateRep --dayfirst=True") 444 | df = captured.df 445 | captured.assert_shape(93, 7) 446 | df["dateRep"] = pd.to_datetime(df["dateRep"]) 447 | df["realdate"] = pd.to_datetime(df[["year", "month", "day"]]) 448 | assert all(df["realdate"] == df["dateRep"]) 449 | 450 | 451 | def test_date_errors(phmgr): 452 | with pytest.raises(SystemExit) as exit_: 453 | with phmgr("derr") as captured: 454 | _call("date --col=x") 455 | assert str(exit_.value) == "ph date: Unknown column x" 456 | 457 | with pytest.raises(SystemExit) as exit_: 458 | with phmgr("derr") as captured: 459 | _call("date --col=year") 460 | assert str(exit_.value).startswith("Out of bounds nanosecond timestamp") 461 | 
462 | with pytest.raises(SystemExit) as exit_: 463 | with phmgr("derr") as captured: 464 | _call("date --col=year --errors=nosucherr") 465 | assert str(exit_.value).startswith("Errors must be one of") 466 | 467 | with phmgr("derr") as captured: 468 | _call("date --col=year --errors=coerce") 469 | assert not captured.err 470 | df = captured.df 471 | assert df["year"].dtype == dt.datetime 472 | 473 | with phmgr("derr") as captured: 474 | _call("date --col=year --errors=ignore") 475 | assert not captured.err 476 | df = captured.df 477 | assert "200-01" in list(df["year"]) 478 | 479 | 480 | def test_date_fmt(phmgr): 481 | with phmgr("date-fmt") as captured: 482 | _call("date date --format=%Y_%m/%d") 483 | assert not captured.err 484 | captured.assert_shape(6, 3) 485 | captured.assert_columns(["date", "x", "y"]) 486 | _assert_a(captured.df[["x", "y"]]) 487 | dates = list(captured.df["date"]) 488 | assert dates == [ 489 | "2020-02-02", 490 | "2020-02-03", 491 | "2020-02-04", 492 | "2020-02-05", 493 | "2020-02-06", 494 | "2020-02-07", 495 | ] 496 | 497 | 498 | def test_date_utc(phmgr): 499 | with phmgr("date-utc") as captured: 500 | _call("date date --utc=True") 501 | assert not captured.err 502 | captured.assert_shape(6, 3) 503 | captured.assert_columns(["date", "x", "y"]) 504 | _assert_a(captured.df[["x", "y"]]) 505 | dates = list(captured.df["date"]) 506 | assert dates == [ 507 | "2020-02-02", 508 | "2020-02-03", 509 | "2020-02-04", 510 | "2020-02-05", 511 | "2020-02-06", 512 | "2020-02-07", 513 | ] 514 | 515 | 516 | def test_eval(phmgr): 517 | with phmgr() as captured: 518 | _call("eval", ["x = x**2"]) 519 | assert ( 520 | captured.out 521 | == """\ 522 | x,y 523 | 9,8 524 | 16,9 525 | 25,10 526 | 36,11 527 | 49,12 528 | 64,13 529 | """ 530 | ) 531 | 532 | 533 | def test_dropna(phmgr): 534 | with phmgr("covid") as captured: 535 | _call("dropna") 536 | captured.assert_shape(5, 10) 537 | 538 | with phmgr("covid") as captured: 539 | _call("dropna --thresh=7") 540 | captured.assert_shape(15, 10) 541 | 542 | with phmgr("covid") as captured: 543 | _call("dropna --axis=1 --thresh=17") 544 | captured.assert_shape(29, 5) 545 | 546 | 547 | def test_fillna(phmgr): 548 | with phmgr("covid") as captured: 549 | _call("fillna 17") 550 | assert captured.df["Canada"].sum() == 1401 551 | 552 | with phmgr("covid") as captured: 553 | _call("fillna 19 --limit=3") 554 | assert captured.df["Canada"].sum() == 1050 555 | 556 | with phmgr("covid") as captured: 557 | _call("fillna --method=pad --limit=5") 558 | assert captured.df["Canada"].sum() == 2493 559 | 560 | with pytest.raises(SystemExit) as exit_: 561 | _call("fillna") 562 | assert "'ph fillna' needs exactly one of" in str(exit_.value) 563 | 564 | 565 | def test_merge(capsys): 566 | lft = _get_path("left") 567 | rht = _get_path("right") 568 | ph.merge(lft, rht) 569 | cap = Capture(capsys.readouterr()) 570 | assert not cap.err 571 | cap.assert_shape(3, 6) 572 | 573 | ph.merge(lft, rht, how="left") 574 | cap = Capture(capsys.readouterr()) 575 | assert not cap.err 576 | cap.assert_shape(5, 6) 577 | 578 | ph.merge(lft, rht, how="outer") 579 | cap = Capture(capsys.readouterr()) 580 | assert not cap.err 581 | cap.assert_shape(6, 6) 582 | 583 | ph.merge(lft, rht, on="key1") 584 | cap = Capture(capsys.readouterr()) 585 | assert not cap.err 586 | cap.assert_shape(5, 7) 587 | 588 | lm = _get_path("mergel") 589 | rm = _get_path("merger") 590 | ph.merge(lm, rm, left="lk2", right="rk2") 591 | cap = Capture(capsys.readouterr()) 592 | assert not cap.err 593 | 
cap.assert_shape(3, 8) 594 | assert list(cap.df.iloc[0]) == [ 595 | "K0", 596 | "K5", 597 | "A0", 598 | "B0", 599 | "K4", 600 | "K5", 601 | "A2", 602 | "B2", 603 | ] 604 | 605 | 606 | def test_groupby_sum_default(phmgr): 607 | with phmgr("group") as captured: 608 | _call("groupby Animal") 609 | assert not captured.err 610 | df = captured.df 611 | captured.assert_shape(2, 2) 612 | assert list(df.iloc[0]) == ["Falcon", 750.0] 613 | assert list(df.iloc[1]) == ["Parrot", 50.0] 614 | 615 | 616 | def test_groupby_sum(phmgr): 617 | with phmgr("group") as captured: 618 | _call("groupby Animal --how=sum") 619 | assert not captured.err 620 | df = captured.df 621 | captured.assert_shape(2, 2) 622 | assert list(df.iloc[0]) == ["Falcon", 750.0] 623 | assert list(df.iloc[1]) == ["Parrot", 50.0] 624 | 625 | 626 | def test_groupby_mean(phmgr): 627 | with phmgr("group") as captured: 628 | _call("groupby Animal --how=count --as_index=True") 629 | assert not captured.err 630 | df = captured.df 631 | captured.assert_shape(2, 1) 632 | assert list(df.iloc[0]) == [2] 633 | assert list(df.iloc[1]) == [2] 634 | 635 | 636 | def test_groupby_first(phmgr): 637 | with phmgr("group") as captured: 638 | _call("groupby Animal --how=first") 639 | assert not captured.err 640 | df = captured.df 641 | captured.assert_shape(2, 2) 642 | assert list(df.iloc[0]) == ["Falcon", 380.0] 643 | assert list(df.iloc[1]) == ["Parrot", 24.0] 644 | 645 | 646 | def test_rolling_default(phmgr): 647 | with phmgr("iris") as captured: 648 | _call("rolling 3") 649 | assert not captured.err 650 | captured.assert_shape(150, 5) 651 | assert captured.df["setosa"].dropna().sum() == pytest.approx(1671.0, 0.01) 652 | 653 | 654 | def test_rolling_mean(phmgr): 655 | with phmgr("iris") as captured: 656 | _call("rolling 7 --how=mean") 657 | assert not captured.err 658 | captured.assert_shape(150, 5) 659 | assert captured.df["setosa"].dropna().sum() == pytest.approx(543.83, 0.01) 660 | 661 | 662 | def test_rolling_subset_columns(phmgr): 663 | with phmgr("date-fmt") as captured: 664 | _call("rolling 3 x y --how=median") 665 | assert not captured.err 666 | captured.assert_shape(6, 3) 667 | captured.assert_columns(["date", "x", "y"]) 668 | x = list(captured.df["x"]) 669 | y = list(captured.df["y"]) 670 | date = list(captured.df["date"]) 671 | assert math.isnan(x[0]) 672 | assert math.isnan(x[1]) 673 | assert math.isnan(y[0]) 674 | assert math.isnan(y[1]) 675 | assert x[2:] == [4, 5, 6, 7] 676 | assert y[2:] == [9, 10, 11, 12] 677 | assert date == ["2020_02/0{}".format(i) for i in range(2, 8)] 678 | 679 | 680 | def test_rolling_broken_window(phmgr): 681 | with phmgr("date-fmt") as _: 682 | with pytest.raises(SystemExit) as exit_: 683 | _call("rolling 3") 684 | err = 'ph rolling: Could not perform rolling window on column "date"' 685 | assert str(exit_.value) == err 686 | 687 | 688 | def test_ewm_default(phmgr): 689 | with phmgr("iris") as captured: 690 | _call("ewm 2 --com=0.5") 691 | assert not captured.err 692 | captured.assert_shape(150, 5) 693 | assert captured.df["setosa"].dropna().sum() == pytest.approx(560.411) 694 | 695 | 696 | def test_expanding_default(phmgr): 697 | with phmgr("iris") as captured: 698 | _call("expanding 3") 699 | assert not captured.err 700 | captured.assert_shape(150, 5) 701 | assert captured.df["setosa"].dropna().sum() == pytest.approx(32468.9) 702 | 703 | 704 | def test_expanding_quantile(phmgr): 705 | with phmgr("iris") as captured: 706 | _call("expanding 3 --how=quantile --quantile=0.9") 707 | assert not captured.err 708 | 
captured.assert_shape(150, 5) 709 | assert captured.df["setosa"].dropna().sum() == pytest.approx(563.81) 710 | 711 | 712 | def test_index(phmgr): 713 | with phmgr("a") as captured: 714 | _call("index") 715 | 716 | assert not captured.err 717 | assert list(captured.df["index"]) == [i for i in range(6)] 718 | 719 | 720 | def test_split(phmgr): 721 | with phmgr("padded_decimals") as captured: 722 | _call("split paddecim ,") 723 | assert not captured.err 724 | captured.assert_shape(7, 3) 725 | captured.assert_columns(["idx", "paddecim", "paddecim_rhs"]) 726 | captured.read_df(thousands=".") 727 | assert set(captured.df["paddecim_rhs"]) == {0, 50} 728 | assert list(captured.df["paddecim"]) == [ 729 | 502, 730 | 172, 731 | 7, 732 | 142, 733 | 157, 734 | 487, 735 | 1470, 736 | ] 737 | 738 | 739 | def test_split_intcol(phmgr): 740 | """Testing that columns that are of int type can be split""" 741 | with phmgr("usa") as captured: 742 | _call("split year 0") 743 | assert not captured.err 744 | captured.assert_shape(93, 8) 745 | captured.assert_columns( 746 | ["dateRep", "day", "month", "year", "cases", "deaths", "geoId", "year_rhs"] 747 | ) 748 | 749 | 750 | def test_split_twice(capsys, monkeypatch): 751 | monkeypatch.setattr("sys.stdin", _get_io("date-fmt")) 752 | _call("split date /") 753 | captured = capsys.readouterr() 754 | assert not captured.err 755 | 756 | monkeypatch.setattr("sys.stdin", io.StringIO(captured.out)) 757 | _call("split date _") 758 | captured = capsys.readouterr() 759 | assert ( 760 | captured.out 761 | == """\ 762 | date,x,y,date_rhs,date_rhs_2 763 | 2020,3,8,2,02 764 | 2020,4,9,3,02 765 | 2020,5,10,4,02 766 | 2020,6,11,5,02 767 | 2020,7,12,6,02 768 | 2020,8,13,7,02 769 | """ 770 | ) 771 | assert not captured.err 772 | 773 | 774 | def test_sort(phmgr): 775 | with phmgr("iris") as captured: 776 | _call("sort setosa") 777 | assert not captured.err 778 | lst = list(captured.df["setosa"]) 779 | assert lst == sorted(lst) 780 | 781 | 782 | def test_grep_case_1(phmgr): 783 | with phmgr("left") as captured: 784 | _call("grep k0") 785 | assert not captured.err 786 | captured.assert_shape(0, 4) 787 | captured.assert_columns(LEFT_COLUMNS) 788 | 789 | 790 | def test_grep_case_0(phmgr): 791 | with phmgr("left") as captured: 792 | _call("grep k0 --case=0") 793 | assert not captured.err 794 | captured.assert_shape(3, 4) 795 | captured.assert_columns(LEFT_COLUMNS) 796 | 797 | 798 | def test_grep_case_false(phmgr): 799 | with phmgr("left") as captured: 800 | _call("grep k0 --case=False") 801 | assert not captured.err 802 | captured.assert_shape(3, 4) 803 | captured.assert_columns(LEFT_COLUMNS) 804 | 805 | 806 | def test_grep_col1(phmgr): 807 | with phmgr("left") as captured: 808 | _call("grep K0 --column=key1") 809 | assert not captured.err 810 | captured.assert_shape(2, 4) 811 | captured.assert_columns(LEFT_COLUMNS) 812 | assert list(captured.df["A"]) == ["A0", "A1"] 813 | 814 | 815 | def test_grep_col2(phmgr): 816 | with phmgr("left") as captured: 817 | _call("grep K0 --column=key2") 818 | assert not captured.err 819 | captured.assert_shape(2, 4) 820 | captured.assert_columns(LEFT_COLUMNS) 821 | assert list(captured.df["A"]) == ["A0", "A2"] 822 | 823 | 824 | def test_grep_col1_pattern(phmgr): 825 | with phmgr("left") as captured: 826 | ph.grep("K[0|1]", column="key1") 827 | assert not captured.err 828 | captured.assert_shape(3, 4) 829 | captured.assert_columns(LEFT_COLUMNS) 830 | assert list(captured.df["A"]) == ["A0", "A1", "A2"] 831 | 832 | 833 | def test_grep_col1_pattern_regex(phmgr): 
834 | with phmgr("left") as captured: 835 | _call("grep K. --column=key1") 836 | assert not captured.err 837 | captured.assert_shape(4, 4) 838 | captured.assert_columns(LEFT_COLUMNS) 839 | assert list(captured.df["A"]) == ["A0", "A1", "A2", "A3"] 840 | 841 | 842 | def test_grep_col1_pattern_regex_off(phmgr): 843 | with phmgr("left") as captured: 844 | _call("grep K. --column=key1 --regex=False") 845 | assert not captured.err 846 | captured.assert_shape(0, 4) 847 | captured.assert_columns(LEFT_COLUMNS) 848 | 849 | 850 | def test_polyfit(phmgr): 851 | with phmgr() as captured: 852 | _call("polyfit x y") 853 | assert not captured.err 854 | df = captured.df 855 | assert list(df.columns) == ["x", "y", "polyfit_1"] 856 | assert df["y"].equals(df["polyfit_1"].astype(int)) 857 | 858 | 859 | def test_version(phmgr): 860 | import ph._version 861 | 862 | with phmgr() as captured: 863 | ph.print_version() 864 | assert not captured.err 865 | assert captured.out == ph._version.__version__ + "\n" 866 | 867 | 868 | def test_slugify_method(): 869 | actexp = { 870 | "abc": "abc", 871 | "abc123": "abc123", 872 | "abc_ 123 ": "abc_123", 873 | "abc(123)": "abc_123", 874 | "abc(123)_": "abc_123_", 875 | "(abc)/123": "abc_123", 876 | "_abc: 123": "_abc_123", 877 | '[]()abc-^ \\ "': "abc", 878 | "0": "0_", 879 | 0: "0_", 880 | -3: "3_", 881 | "-3": "3_", 882 | "3.14": "3_14_", 883 | 3.14: "3_14_", 884 | } 885 | for act, exp in actexp.items(): 886 | assert ph.slugify_name(act) == exp 887 | 888 | 889 | def test_replace(phmgr): 890 | with phmgr() as captured: 891 | _call("replace 8 100") 892 | assert not captured.err 893 | captured.assert_shape(6, 2) 894 | assert list(captured.df.x) == list(range(3, 8)) + [100] 895 | assert list(captured.df.y) == [100] + list(range(9, 14)) 896 | 897 | 898 | def test_replace_col_and_inf(phmgr): 899 | with phmgr("inf") as captured: 900 | _call("replace inf 0 --column=x") 901 | assert not captured.err 902 | captured.assert_shape(6, 2) 903 | 904 | x = captured.df.x 905 | lx = list(x) 906 | xna = x.dropna() 907 | assert list(xna) == [0, 7, 8] 908 | assert math.isnan(lx[0]) 909 | assert math.isnan(lx[1]) 910 | assert math.isnan(lx[2]) 911 | assert list(captured.df.y) == list(range(8, 14)) 912 | 913 | 914 | def test_slice(phmgr): 915 | with phmgr("a") as captured: 916 | _call("slice 0:3") 917 | assert not captured.err 918 | captured.assert_shape(3, 2) 919 | assert list(captured.df.x) == list(range(3, 6)) 920 | assert list(captured.df.y) == list(range(8, 11)) 921 | 922 | 923 | def test_slice_end(phmgr): 924 | with phmgr("a") as captured: 925 | _call("slice :3") 926 | assert not captured.err 927 | captured.assert_shape(3, 2) 928 | assert list(captured.df.x) == list(range(3, 6)) 929 | assert list(captured.df.y) == list(range(8, 11)) 930 | 931 | 932 | def test_slice_start(phmgr): 933 | with phmgr("a") as captured: 934 | _call("slice 3:") 935 | assert not captured.err 936 | captured.assert_shape(3, 2) 937 | assert list(captured.df.x) == list(range(6, 9)) 938 | assert list(captured.df.y) == list(range(11, 14)) 939 | 940 | 941 | def test_slice_start_step(phmgr): 942 | with phmgr("a") as captured: 943 | _call("slice 1::2") 944 | assert not captured.err 945 | captured.assert_shape(3, 2) 946 | assert list(captured.df.x) == list(range(4, 9, 2)) 947 | assert list(captured.df.y) == list(range(9, 14, 2)) 948 | 949 | 950 | def test_slice_start_end_step(phmgr): 951 | with phmgr("a") as captured: 952 | _call("slice 1:5:2") 953 | assert not captured.err 954 | captured.assert_shape(2, 2) 955 | assert 
list(captured.df.x) == list(range(4, 7, 2)) 956 | assert list(captured.df.y) == list(range(9, 12, 2)) 957 | 958 | 959 | def test_slugify_df(phmgr): 960 | with phmgr("slugit") as captured: 961 | _call("slugify") 962 | 963 | assert not captured.err 964 | 965 | cols = list(captured.df.columns) 966 | assert cols == ["stupid_column_1", "jerky_column_no_2"] 967 | 968 | 969 | def test_slugify_rename_df(capsys, monkeypatch): 970 | monkeypatch.setattr("sys.stdin", _get_io("slugit")) 971 | _call("slugify") 972 | captured = Capture(capsys.readouterr()) 973 | 974 | assert not captured.err 975 | cols = list(captured.df.columns) 976 | assert cols == ["stupid_column_1", "jerky_column_no_2"] 977 | 978 | monkeypatch.setattr("sys.stdin", io.StringIO(captured.out)) 979 | _call("rename stupid_column_1 first") 980 | captured = Capture(capsys.readouterr()) 981 | assert not captured.err 982 | cols = list(captured.df.columns) 983 | assert cols == ["first", "jerky_column_no_2"] 984 | 985 | monkeypatch.setattr("sys.stdin", io.StringIO(captured.out)) 986 | _call("rename jerky_column_no_2 second") 987 | captured = Capture(capsys.readouterr()) 988 | assert not captured.err 989 | cols = list(captured.df.columns) 990 | assert cols == ["first", "second"] 991 | 992 | 993 | def test_doc_plot(capsys): 994 | _call("help plot") 995 | captured = Capture(capsys.readouterr()) 996 | assert not captured.err 997 | assert "Plot the csv file" in captured.out 998 | 999 | 1000 | def test_doc_open_(capsys): # tests registerx 1001 | _call("help open") 1002 | captured = Capture(capsys.readouterr()) 1003 | assert not captured.err 1004 | assert "Use a reader to open a file" in captured.out 1005 | 1006 | 1007 | def test_median(phmgr): 1008 | with phmgr() as captured: 1009 | _call("median") 1010 | assert not captured.err 1011 | assert captured.out == "x,y\n5.5,10.5\n" 1012 | 1013 | 1014 | @pytest.mark.filterwarnings("ignore") 1015 | @pytest.mark.skipif(not __have_xlrd(), reason="missing xlrd") 1016 | def test_xlsx_default_sheet_0(capsys): 1017 | pth = _get_path("sheet", extension="xlsx") 1018 | cmd = "open excel {} {}".format(pth, "--skiprows=4") 1019 | _call(cmd) 1020 | captured = Capture(capsys.readouterr()) 1021 | assert not captured.err 1022 | _assert_a(captured.df) 1023 | 1024 | 1025 | @pytest.mark.filterwarnings("ignore") 1026 | @pytest.mark.skipif(not __have_xlrd(), reason="missing xlrd") 1027 | def test_xlsx_explicit_sheet_0(capsys): 1028 | pth = _get_path("sheet", extension="xlsx") 1029 | cmd = "open excel {} {} {}".format(pth, "--skiprows=4", "--sheet_name=0") 1030 | _call(cmd) 1031 | captured = Capture(capsys.readouterr()) 1032 | assert not captured.err 1033 | _assert_a(captured.df) 1034 | 1035 | 1036 | @pytest.mark.filterwarnings("ignore") 1037 | @pytest.mark.skipif(not __have_xlrd(), reason="missing xlrd") 1038 | def test_xlsx_sheet_1(capsys): 1039 | pth = _get_path("sheet", extension="xlsx") 1040 | cmd = "open excel {} {} {}".format(pth, "--skiprows=1", "--sheet_name=1") 1041 | _call(cmd) 1042 | captured = Capture(capsys.readouterr()) 1043 | assert not captured.err 1044 | captured.assert_shape(6, 4) 1045 | captured.assert_columns(["Unnamed: 0", "year", "month", "day"]) 1046 | 1047 | 1048 | @pytest.mark.filterwarnings("ignore") 1049 | @pytest.mark.skipif(not __have_xlrd(), reason="missing xlrd") 1050 | def test_xlsx_borked(capsys): 1051 | with pytest.raises(SystemExit) as exit_: 1052 | pth = _get_path("sheet", extension="xlsx") 1053 | cmd = "open excel {} {} {}".format(pth, "--skiprows=4", "--sheet_name=None") 1054 | _call(cmd) 
1055 | 1056 | errm = 'Specify --sheet_name="a sheet with spaces|the other sheet"' 1057 | assert str(exit_.value) == errm 1058 | 1059 | 1060 | def test_raw(phmgr): 1061 | with phmgr("broken") as captured: 1062 | _call("raw") 1063 | 1064 | assert not captured.err 1065 | captured.assert_shape(7, 5) 1066 | 1067 | 1068 | _COVID_COLS = [ 1069 | "China", 1070 | "S. Korea", 1071 | "Italy", 1072 | "Iran", 1073 | "France", 1074 | "Germany", 1075 | "Spain", 1076 | "USA", 1077 | "UK", 1078 | "Canada", 1079 | ] 1080 | 1081 | 1082 | def test_spencer(phmgr): 1083 | with phmgr("covid") as captured: 1084 | _call("spencer") 1085 | 1086 | assert not captured.err 1087 | captured.assert_shape(29, 10) 1088 | captured.assert_columns(_COVID_COLS) 1089 | -------------------------------------------------------------------------------- /ph/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from .tabulate import tabulate as tabulate_ 4 | import sys 5 | import pandas as pd 6 | import re 7 | import datetime 8 | 9 | 10 | def _get_version(): 11 | import ph._version 12 | 13 | return ph._version.__version__ 14 | 15 | 16 | def print_version(): 17 | print(_get_version()) 18 | 19 | 20 | # Command line parsing of (1) --abc and (2) --abc=def 21 | KWARG = re.compile("^--[a-z0-9_-]+$") 22 | KWARG_WITH_VALUE = re.compile("^--[a-z0-9_-]+=") 23 | 24 | 25 | USAGE_TEXT = """ 26 | ph is a command line tool for streaming csv data. 27 | 28 | If you have a csv file `a.csv`, you can pipe it through `ph` on the 29 | command line by using 30 | 31 | $ cat a.csv | ph columns x y | ph eval "z = x**2 - y" | ph show 32 | 33 | Use ph help [command] for help on the individual commands. 34 | 35 | A list of available commands follows. 36 | """ 37 | 38 | COMMANDS = {} 39 | DOCS = {} 40 | 41 | 42 | def _gpx(fname): 43 | try: 44 | import gpxpy 45 | except ImportError: 46 | sys.exit("ph gpx needs gpxpy, pip install ph[gpx]") 47 | 48 | def from_trackpoint(tp=None): 49 | if tp is None: 50 | return "time", "latitude", "longitude", "elevation", "distance" 51 | p = tp.point 52 | return str(p.time), p.latitude, p.longitude, p.elevation, tp.distance_from_start 53 | 54 | with open(fname, "r") as fin: 55 | gpx = gpxpy.parse(fin) 56 | data = gpx.get_points_data() 57 | columns = from_trackpoint() 58 | dfdata = [from_trackpoint(tp) for tp in data] 59 | return pd.DataFrame(dfdata, columns=columns) 60 | 61 | 62 | def _tsv(*args, **kwargs): 63 | kwargs["sep"] = "\t" 64 | return pd.read_csv(*args, **kwargs) 65 | 66 | 67 | # These are all lambdas because they lazy load, and some of these 68 | # readers are introduced in later pandas. 
69 | READERS = { 70 | "csv": pd.read_csv, 71 | "clipboard": pd.read_clipboard, 72 | "fwf": pd.read_fwf, 73 | "json": pd.read_json, 74 | "html": pd.read_html, 75 | "tsv": _tsv, 76 | "gpx": _gpx, 77 | } 78 | 79 | try: 80 | READERS["excel"] = pd.read_excel 81 | READERS["xls"] = pd.read_excel 82 | READERS["odf"] = pd.read_excel 83 | except AttributeError: 84 | pass 85 | 86 | try: 87 | READERS["hdf5"] = pd.read_hdf 88 | except AttributeError: 89 | pass 90 | 91 | 92 | try: 93 | READERS["feather"] = pd.read_feather 94 | except AttributeError: 95 | pass 96 | 97 | 98 | try: 99 | READERS["parquet"] = pd.read_parquet 100 | except AttributeError: 101 | pass 102 | 103 | 104 | try: 105 | READERS["orc"] = pd.read_orc 106 | except AttributeError: 107 | pass 108 | 109 | 110 | try: 111 | READERS["msgpack"] = pd.read_msgpack 112 | except AttributeError: 113 | pass 114 | 115 | 116 | try: 117 | READERS["stata"] = pd.read_stata 118 | except AttributeError: 119 | pass 120 | 121 | 122 | try: 123 | READERS["sas"] = pd.read_sas 124 | except AttributeError: 125 | pass 126 | 127 | 128 | try: 129 | READERS["spss"] = pd.read_spss 130 | except AttributeError: 131 | pass 132 | 133 | 134 | try: 135 | READERS["pickle"] = pd.read_pickle 136 | except AttributeError: 137 | pass 138 | 139 | 140 | try: 141 | READERS["gbq"] = pd.read_gbq 142 | except AttributeError: 143 | pass 144 | 145 | 146 | try: 147 | READERS["google"] = pd.read_gbq 148 | except AttributeError: 149 | pass 150 | 151 | 152 | try: 153 | READERS["bigquery"] = pd.read_gb 154 | except AttributeError: 155 | pass 156 | 157 | 158 | WRITERS = { 159 | "csv": "to_csv", 160 | "fwf": "to_fwf", 161 | "json": "to_json", 162 | "html": "to_html", 163 | "clipboard": "to_clipboard", 164 | "xls": "to_excel", 165 | "odf": "to_excel", 166 | "hdf5": "to_hdf", 167 | "feather": "to_feather", 168 | "parquet": "to_parquet", 169 | "orc": "to_orc", 170 | "msgpack": "to_msgpack", 171 | "stata": "to_stata", 172 | "sas": "to_sas", 173 | "spss": "to_spss", 174 | "pickle": "to_pickle", 175 | "gbq": "to_gbq", 176 | "google": "to_gbq", 177 | "bigquery": "to_gbq", 178 | # extras 179 | "tsv": "to_csv", 180 | } 181 | 182 | 183 | FALSY = ("False", "false", "No", "no", "0", False, 0, "None") 184 | TRUTHY = ("True", "true", "Yes", "yes", "1", True, 1) 185 | 186 | 187 | def _assert_col(df, col, caller=None): 188 | if col not in df.columns: 189 | if caller is not None: 190 | sys.exit("ph {}: Unknown column {}".format(caller, col)) 191 | sys.exit("Unknown column {}".format(col)) 192 | 193 | 194 | def _assert_cols(df, cols, caller=None): 195 | for col in cols: 196 | _assert_col(df, col, caller=caller) 197 | 198 | 199 | def register(fn, name=None): 200 | if name is None: 201 | name = fn.__name__ 202 | COMMANDS[name] = fn 203 | DOCS[name] = fn.__doc__ 204 | return fn 205 | 206 | 207 | def registerx(name): 208 | def inner(fn): 209 | register(fn, name) 210 | return fn 211 | 212 | return inner 213 | 214 | 215 | @register 216 | def dataset(dset=None): 217 | """Load dataset as csv. 218 | 219 | Usage: ph dataset linnerud | ph describe 220 | """ 221 | try: 222 | import sklearn.datasets 223 | except ImportError: 224 | sys.exit("You need scikit-learn. 
Install ph[data].") 225 | 226 | REALDATA = { 227 | "olivetti_faces": sklearn.datasets.fetch_olivetti_faces, 228 | "20newsgroups": sklearn.datasets.fetch_20newsgroups, 229 | "20newsgroups_vectorized": sklearn.datasets.fetch_20newsgroups_vectorized, 230 | "lfw_people": sklearn.datasets.fetch_lfw_people, 231 | "lfw_pairs": sklearn.datasets.fetch_lfw_pairs, 232 | "covtype": sklearn.datasets.fetch_covtype, 233 | "rcv1": sklearn.datasets.fetch_rcv1, 234 | "kddcup99": sklearn.datasets.fetch_kddcup99, 235 | "california_housing": sklearn.datasets.fetch_california_housing, 236 | } 237 | 238 | TOYDATA = { 239 | "boston": sklearn.datasets.load_boston, 240 | "iris": sklearn.datasets.load_iris, 241 | "diabetes": sklearn.datasets.load_diabetes, 242 | "digits": sklearn.datasets.load_digits, 243 | "linnerud": sklearn.datasets.load_linnerud, 244 | "wine": sklearn.datasets.load_wine, 245 | "breast_cancer": sklearn.datasets.load_breast_cancer, 246 | } 247 | 248 | if dset is None: 249 | print("type,name") 250 | print("\n".join("{},{}".format("real", k) for k in REALDATA)) 251 | print("\n".join("{},{}".format("toy", k) for k in TOYDATA)) 252 | sys.exit() 253 | 254 | if dset not in TOYDATA.keys() | REALDATA.keys(): 255 | sys.exit("Unknown dataset {}. See ph help dataset.".format(dset)) 256 | if dset in TOYDATA: 257 | data = TOYDATA[dset]() 258 | else: 259 | data = REALDATA[dset]() 260 | try: 261 | df = pd.DataFrame(data.data, columns=data.feature_names) 262 | except AttributeError: 263 | df = pd.DataFrame(data.data) 264 | try: 265 | df["target"] = pd.Series(data.target) 266 | except Exception: 267 | pass 268 | pipeout(df) 269 | 270 | 271 | @register 272 | def drop_duplicates(*cols): 273 | """Drop duplicates""" 274 | df = pipein() 275 | pipeout(df.drop_duplicates(cols)) 276 | 277 | 278 | @register 279 | def diff(*cols, periods=1, axis=0): 280 | """Calculate the difference of an element compared with another element 281 | in the csv file (default is element in previous row). 282 | 283 | Argument: --periods=1 284 | 285 | Periods to shift for calculating difference, default 1. Accepts 286 | negative values. 287 | 288 | Argument: --axis=0 289 | 290 | Take difference over rows (0) or columns (1), default 0. 291 | 292 | """ 293 | 294 | df = pipein() 295 | if not cols: 296 | df = df.diff(periods=periods, axis=axis) 297 | else: 298 | _assert_cols(df, cols, "diff") 299 | columns = list(cols) 300 | df[columns] = df[columns].diff(periods=periods, axis=axis) 301 | pipeout(df) 302 | 303 | 304 | @register 305 | def dropna(axis=0, how="any", thresh=None): 306 | """Remove rows (or columns) with N/A values. 307 | 308 | Argument: --axis=0 309 | Defaults to axis=0 (columns), use --axis=1 to remove rows. 310 | 311 | Argument: --how=any 312 | Defaults to how='any', which removes columns (resp. rows) containing 313 | nan values. Use how='all' to remove columns (resp. rows) containing 314 | only nan values. 315 | 316 | Argument: --thresh=5 317 | If --thresh=x is specified, will delete any column (resp. row) with 318 | fewer than x non-na values. 
319 | 320 | Usage: cat a.csv | ph dropna 321 | cat a.csv | ph dropna --axis=1 # for row-wise 322 | cat a.csv | ph dropna --thresh=5 # keep cols with >= 5 non-na 323 | cat a.csv | ph dropna --how=all # delete only if all vals na 324 | 325 | """ 326 | try: 327 | axis = int(axis) 328 | if axis not in (0, 1): 329 | sys.exit("ph dropna --axis=0 or --axis=1, not {}".format(axis)) 330 | except ValueError: 331 | sys.exit("ph dropna --axis=0 or --axis=1, not {}".format(axis)) 332 | 333 | if thresh is not None: 334 | try: 335 | thresh = int(thresh) 336 | except ValueError: 337 | sys.exit("ph dropna --thresh=0 or --thresh=1, not {}".format(thresh)) 338 | 339 | df = pipein() 340 | try: 341 | df = df.dropna(axis=axis, how=how, thresh=thresh) 342 | except Exception as err: 343 | sys.exit(str(err)) 344 | pipeout(df) 345 | 346 | 347 | def _safe_out(output): 348 | """Prints output to standard out, catching broken pipe.""" 349 | try: 350 | print(output) 351 | except BrokenPipeError: 352 | try: 353 | sys.stdout.close() 354 | except IOError: 355 | pass 356 | try: 357 | sys.stderr.close() 358 | except IOError: 359 | pass 360 | 361 | 362 | def pipeout(df, sep=",", index=False, *args, **kwargs): 363 | csv = df.to_csv(sep=sep, index=index, *args, **kwargs) 364 | output = csv.rstrip("\n") 365 | _safe_out(output) 366 | 367 | 368 | def pipein(ftype="csv", **kwargs): 369 | skiprows = kwargs.get("skiprows") 370 | if skiprows is not None: 371 | try: 372 | skiprows = int(skiprows) 373 | if skiprows < 0: 374 | raise ValueError("Negative") 375 | except ValueError: 376 | sys.exit("skiprows must be a non-negative int, not {}".format(skiprows)) 377 | kwargs["skiprows"] = skiprows 378 | 379 | if kwargs.get("sep") == "\\t": 380 | kwargs["sep"] = "\t" 381 | 382 | try: 383 | return READERS[ftype](sys.stdin, **kwargs) 384 | except pd.errors.EmptyDataError: 385 | return pd.DataFrame() 386 | except pd.errors.ParserError as err: 387 | sys.exit(str(err)) 388 | 389 | 390 | @register 391 | def fillna(value=None, method=None, limit=None): 392 | """Fill na values with a certain value or method, at most `limit` many. 393 | 394 | Takes either a value, or a method using (e.g.) --method=ffill. 395 | 396 | Argument: value 397 | If provided, replaces all N/A values with prescribed value. 398 | 399 | Argument: --method=pad 400 | If provided, value cannot be provided. Allowed methods are 401 | backfill, bfill, pad, ffill 402 | 403 | Argument: --limit=x 404 | If provided, limits number of consecutive N/A values to fill. 405 | 406 | 407 | Usage: cat a.csv | ph fillna 999.75 408 | cat a.csv | ph fillna -1 409 | cat a.csv | ph fillna --method=pad 410 | cat a.csv | ph fillna --method=pad --limit=5 411 | 412 | """ 413 | if limit is not None: 414 | try: 415 | limit = int(limit) 416 | except ValueError: 417 | sys.exit("--limit=x must be an integer, not {}".format(limit)) 418 | METHODS = ("backfill", "bfill", "pad", "ffill") 419 | if method is not None: 420 | if method not in METHODS: 421 | sys.exit("method must be one of {}, not {}".format(METHODS, method)) 422 | pipeout(pipein().fillna(method=method, limit=limit)) 423 | elif value is not None: 424 | value = __tryparse(value) 425 | pipeout(pipein().fillna(value=value, limit=limit)) 426 | else: 427 | sys.exit("'ph fillna' needs exactly one of value and method") 428 | 429 | 430 | @register 431 | def query(expr): 432 | """Using pandas queries. 
433 | 434 | Usage: cat a.csv | ph query "x > 5" 435 | 436 | """ 437 | df = pipein() 438 | new_df = df.query(expr) 439 | pipeout(new_df) 440 | 441 | 442 | @register 443 | def grep(*expr, case=True, na=float("nan"), regex=True, column=None): 444 | """Grep (with regex) for content in csv file. 445 | 446 | Usage: cat a.csv | ph grep 0 447 | cat a.csv | ph grep search_string 448 | cat a.csv | ph grep "A|B" # search hits a or b 449 | cat a.csv | ph grep "a|b" --case=False # case insensitive 450 | cat a.csv | ph grep 4 --column=x 451 | 452 | To disable regex (e.g. simple search for "." or "*" characters, use 453 | --regex=False). 454 | 455 | Search only in a specific column with --column=col. 456 | 457 | Supports regex search queries such as "0-9A-F" and "\\d" (possibly 458 | double-escaped.) 459 | 460 | """ 461 | df = pipein() 462 | 463 | if case is True or case in TRUTHY: 464 | case = True 465 | elif case in FALSY: 466 | case = False 467 | else: 468 | sys.exit("ph grep: Unknown --case={} should be True or False".format(case)) 469 | 470 | if regex is True or regex in TRUTHY: 471 | regex = True 472 | elif regex in FALSY: 473 | regex = False 474 | else: 475 | sys.exit("ph grep: Unknown --regex={} should be True or False".format(regex)) 476 | 477 | if column is not None: 478 | _assert_col(df, column, "grep") 479 | 480 | expr = " ".join(str(e) for e in expr) # force string input 481 | 482 | try: 483 | import numpy 484 | except ImportError: 485 | sys.exit("numpy needed for grep. pip install numpy") 486 | 487 | retval = df[ 488 | numpy.logical_or.reduce( 489 | [ 490 | df[col].astype(str).str.contains(expr, case=case, na=na, regex=regex) 491 | for col in (df.columns if column is None else [column]) 492 | ] 493 | ) 494 | ] 495 | pipeout(retval) 496 | 497 | 498 | @register 499 | def appendstr(col, s, newcol=None): 500 | """Special method to append a string to the end of a column. 501 | 502 | Usage: cat e.csv | ph appendstr year -01-01 | ph date year 503 | """ 504 | df = pipein() 505 | if newcol is None: 506 | newcol = col 507 | df[newcol] = df[col].astype(str) + s 508 | pipeout(df) 509 | 510 | 511 | @register 512 | def split(col, pat=" "): 513 | """Split a column in two based on a given pattern, default is " ". 514 | 515 | The resulting csv will have one extra column called "col_rhs" where 516 | "col" is the name of the column being split. 517 | 518 | Usage: cat dates.csv | ph split date / 519 | 520 | """ 521 | pat = str(pat) 522 | df = pipein() 523 | _assert_col(df, col, "split") 524 | new_name = col + "_rhs" 525 | suffix = "" 526 | name = lambda: (new_name + "_" + str(suffix)).rstrip("_") 527 | while name() in df.columns: 528 | if not suffix: 529 | suffix = 1 530 | suffix += 1 531 | df[[col, name()]] = df[col].astype(str).str.split(pat=pat, n=1, expand=True) 532 | pipeout(df) 533 | 534 | 535 | @register 536 | def strip(*cols, lstrip=False, rstrip=False): 537 | """Strip (trim) a string. 538 | 539 | Usage: cat x.csv | ph strip 540 | cat x.csv | ph strip --lstrip=True 541 | cat x.csv | ph strip --rstrip=True 542 | 543 | """ 544 | df = pipein() 545 | if not cols: 546 | cols = list(df.columns) 547 | else: 548 | cols = list(cols) 549 | _assert_cols(df, cols, "strip") 550 | for c in cols: 551 | if lstrip in TRUTHY: 552 | df[c] = df[c].str.lstrip() 553 | elif rstrip in TRUTHY: 554 | df[c] = df[c].str.rstrip() 555 | else: 556 | df[c] = df[c].str.strip() 557 | pipeout(df) 558 | 559 | 560 | @register 561 | def removeprefix(col, prefix=" "): 562 | """Remove prefix of contents of a column. 
563 | 564 | Usage: cat a.csv | ph removeprefix col1 .. 565 | 566 | See also @removesuffix @strip 567 | 568 | """ 569 | prefix = str(prefix) 570 | plen = len(prefix) 571 | df = pipein() 572 | _assert_col(df, col, "removeprefix") 573 | df[col] = df[col].apply( 574 | lambda s: str(s)[plen:] if str(s).startswith(prefix) else str(s) 575 | ) 576 | pipeout(df) 577 | 578 | 579 | @register 580 | def removesuffix(col, suffix=" "): 581 | """Remove suffix of contents of a column. 582 | 583 | Usage: cat a.csv | ph removesuffix col1 .. 584 | 585 | See also @removeprefix @strip 586 | 587 | """ 588 | suffix = str(suffix) 589 | plen = len(suffix) 590 | df = pipein() 591 | _assert_col(df, col, "removesuffix") 592 | df[col] = df[col].apply( 593 | lambda s: str(s)[:-plen] if str(s).endswith(suffix) else str(s) 594 | ) 595 | pipeout(df) 596 | 597 | 598 | @register 599 | def astype(type, column=None, newcolumn=None): 600 | """Cast a column to a different type. 601 | 602 | Usage: cat a.csv | ph astype double x [new_x] 603 | 604 | """ 605 | df = pipein() 606 | try: 607 | if column is None: 608 | df = df.astype(type) 609 | elif newcolumn is not None: 610 | df[newcolumn] = df[column].astype(type) 611 | else: 612 | df[column] = df[column].astype(type) 613 | except ValueError as err: 614 | sys.exit("Could not convert to {}: {}".format(type, err)) 615 | pipeout(df) 616 | 617 | 618 | @register 619 | def dtypes(t=None): 620 | """If no argument is provided, output types, otherwise filter on types. 621 | 622 | If no argument is provided, output a csv with two columns, "column" and 623 | "dtype". The "column" column contains the names of the columns in the input 624 | csv and the "dtype" column contains their respective types. 625 | 626 | If an argument is provided, all columns with the prescribed type is output. 627 | 628 | Usage: cat a.csv | ph dtypes 629 | cat a.csv | ph dtypes float64 630 | 631 | """ 632 | if t is None: 633 | df = pipein() 634 | newdf = pd.DataFrame(pd.Series(df.columns), columns=["column"]) 635 | newdf["dtype"] = pd.Series([str(e) for e in df.dtypes]) 636 | pipeout(newdf.T, header=False) 637 | else: 638 | df = pipein().select_dtypes(t) 639 | pipeout(df) 640 | 641 | 642 | @register 643 | def pivot(columns, index=None, values=None): 644 | """Reshape csv organized by given index / column values. 645 | 646 | Suppose b.csv is 647 | foo,bar,baz,zoo 648 | one,A,1,x 649 | one,B,2,y 650 | one,C,3,z 651 | two,A,4,q 652 | two,B,5,w 653 | two,C,6,t 654 | 655 | Usage: cat b.csv | ph pivot bar --index=foo --values=baz 656 | 657 | A B C 658 | -- --- --- --- 659 | 0 1 2 3 660 | 1 4 5 6 661 | 662 | """ 663 | pipeout(pipein().pivot(index=index, columns=columns, values=values)) 664 | 665 | 666 | @register 667 | def crosstab(column): 668 | """Perform a very simplistic crosstabulation on one column of the input csv. 669 | 670 | Usage: cat b.csv | ph crosstab foo 671 | """ 672 | newcol = "crosstab_{}".format(column) 673 | df = pd.crosstab(pipein()[column], newcol) 674 | df["id"] = list(df[newcol].index) 675 | pipeout(df) 676 | 677 | 678 | @register 679 | def groupby(*columns, how="sum", as_index=False): 680 | """Group by columns, then apply `how` function. 
681 | 682 | Usage: cat a.csv | ph groupby animal # default to sum 683 | cat a.csv | ph groupby animal --how=mean 684 | cat a.csv | ph groupby animal --how=prod 685 | cat a.csv | ph groupby animal --as_index=True # removes index 686 | """ 687 | columns = list(columns) 688 | if not columns: 689 | sys.exit("Needs at least one column to group by") 690 | df = pipein() 691 | _assert_cols(df, columns, "groupby") 692 | if as_index in TRUTHY: 693 | as_index = True 694 | elif as_index in FALSY: 695 | as_index = False 696 | else: 697 | sys.exit("--as_index=True or False, not {}".format(as_index)) 698 | 699 | grouped = df.groupby(columns, as_index=as_index) 700 | try: 701 | fn = getattr(grouped, how) 702 | except AttributeError: 703 | sys.exit("Unknown --how={}, should be sum, mean, ...".format(how)) 704 | retval = fn() 705 | 706 | pipeout(retval) 707 | 708 | 709 | @register 710 | def rolling(window, *columns, how="sum", win_type=None, std=None, beta=None, tau=None): 711 | """Rolling window calculations using provided `how` function. 712 | 713 | Usage: cat a.csv | ph rolling 3 714 | cat a.csv | ph rolling 5 --how=mean 715 | cat a.csv | ph rolling 5 colA colB --how=mean 716 | cat a.csv | ph rolling 5 --win_type=gaussian --std=7.62 717 | """ 718 | df = pipein() 719 | orig_columns = list(df.columns) 720 | columns = list(columns) 721 | _assert_cols(df, columns, "rolling") 722 | 723 | if not columns: 724 | columns = list(df.columns) 725 | 726 | noncols = [c for c in df.columns if c not in columns] 727 | 728 | rollin = df[columns].rolling(window, win_type=win_type) 729 | nonrollin = df[noncols] 730 | try: 731 | fn = getattr(rollin, how) 732 | except AttributeError: 733 | sys.exit("Unknown --how={}, should be sum, mean, ...".format(how)) 734 | 735 | if {std, beta, tau} != {None}: 736 | retval = fn(std=std, beta=beta, tau=tau) 737 | else: 738 | retval = fn() 739 | 740 | df = pd.concat([retval, nonrollin], axis=1) 741 | for col in orig_columns: 742 | if col not in df.columns: 743 | op = "ph rolling" 744 | sys.exit( 745 | '{}: Could not perform rolling window on column "{}"'.format(op, col) 746 | ) 747 | df = df[orig_columns] 748 | pipeout(df) 749 | 750 | 751 | @register 752 | def ewm( 753 | min_periods=0, 754 | adjust=True, 755 | ignore_na=False, 756 | axis=0, 757 | com=None, 758 | span=None, 759 | halflife=None, 760 | alpha=None, 761 | how="mean", 762 | ): 763 | """Provide exponential weighted functions. 764 | 765 | A related set of functions are exponentially weighted versions of 766 | several of the above statistics. A similar interface to rolling and 767 | expanding is accessed through the ewm method to receive an EWM 768 | object. 
A number of expanding EW (exponentially weighted) methods 769 | are provided: 770 | 771 | * mean 772 | * var 773 | * std 774 | * corr 775 | * cov 776 | 777 | Usage: cat a.csv | ph ewm --com=0.5 --how=mean 778 | cat a.csv | ph ewm --halflife=0.5 --how=std 779 | 780 | """ 781 | if {com, span, halflife, alpha} == {None}: 782 | sys.exit("Must pass one of com, span, halflife, or alpha") 783 | 784 | df = pipein() 785 | 786 | ewm_ = df.ewm( 787 | min_periods=min_periods, 788 | adjust=adjust, 789 | ignore_na=ignore_na, 790 | axis=axis, 791 | com=com, 792 | span=span, 793 | halflife=halflife, 794 | alpha=alpha, 795 | ) 796 | try: 797 | fn = getattr(ewm_, how) 798 | except AttributeError: 799 | sys.exit("Unknown --how={}, should be mean, var, std, corr, cov..".format(how)) 800 | 801 | retval = fn() 802 | 803 | pipeout(retval) 804 | 805 | 806 | @register 807 | def expanding(min_periods=1, axis=0, how="sum", quantile=None): 808 | """Provide expanding transformations. 809 | 810 | A common alternative to rolling statistics is to use an expanding 811 | window, which yields the value of the statistic with all the data 812 | available up to that point in time. 813 | 814 | For working with data, a number of window functions are provided for 815 | computing common window or rolling statistics. Among these are 816 | count, sum, mean, median, correlation, variance, covariance, 817 | standard deviation, skewness, and kurtosis. 818 | 819 | 820 | Usage: cat a.csv | ph expanding 821 | cat a.csv | ph expanding 1 --how=sum # above equivalent to this 822 | cat a.csv | ph expanding 2 823 | cat a.csv | ph expanding 5 --how=quantile --quantile=0.25 824 | 825 | """ 826 | 827 | df = pipein() 828 | 829 | if quantile is not None: 830 | if how != "quantile": 831 | sys.exit("Use both or none of --how=quantile and --quantile=") 832 | if how == "quantile" and quantile is None: 833 | 834 | sys.exit("--how=quantile needs --quantile=, e.g. --quantile=0.25") 835 | expanding_ = df.expanding(min_periods=min_periods, axis=axis) 836 | try: 837 | fn = getattr(expanding_, how) 838 | except AttributeError: 839 | sys.exit("Unknown --how={}, should be sum, mean, max, quantile..".format(how)) 840 | 841 | if how == "quantile": 842 | retval = fn(quantile) 843 | else: 844 | retval = fn() 845 | 846 | pipeout(retval) 847 | 848 | 849 | @register 850 | def monotonic(column, direction="+"): 851 | """Check if a certain column is monotonically increasing or decreasing. 852 | 853 | Usage: cat a.csv | ph monotonic x 854 | cat a.csv | ph monotonic x + # equivalent to above 855 | cat a.csv | ph monotonic x - # for decreasing 856 | 857 | """ 858 | df = pipein() 859 | if column not in df: 860 | sys.exit("Unknown column {}".format(column)) 861 | if direction not in "+-": 862 | sys.exit("direction must be either + or -") 863 | print("{}_monotonic".format(column)) 864 | if direction == "+": 865 | print(df[column].is_monotonic) 866 | else: 867 | print(df[column].is_monotonic_decreasing) 868 | 869 | 870 | @register 871 | def iplot(*args, **kwargs): 872 | """Use plotly/cufflinks for interactive plot. 873 | 874 | This option is similar to `plot` but creates an HTML file and opens a 875 | browser for an interactive plot. 876 | 877 | Usage: cat a.csv | ph iplot 878 | cat a.csv | ph iplot --kind=bar 879 | cat a.csv | ph iplot --kind=bar --barmode=stack 880 | cat a.csv | ph iplot --kind=scatter --mode=markers --x=x --y=y 881 | 882 | 883 | Depends on cufflinks: pip install ph[iplot]. 
884 | 885 | """ 886 | try: 887 | import cufflinks # noqa 888 | import plotly as py 889 | except ImportError: 890 | sys.exit("iplot needs cufflinks, pip install ph[iplot]") 891 | 892 | df = pipein() 893 | fig = df.iplot(*args, asFigure=True, **kwargs) 894 | py.offline.plot(fig) 895 | pipeout(df) 896 | 897 | 898 | @register 899 | def plot(*args, **kwargs): 900 | """Plot the csv file. 901 | 902 | Usage: ph plot 903 | ph plot --index=col 904 | ph plot --kind=bar 905 | ph plot --kind=scatter --x=col1 --y=col2 906 | ph plot --style=k-- 907 | ph plot --logx=True 908 | ph plot --logy=True 909 | ph plot --loglog=True 910 | ph plot --savefig=fname.png 911 | ph plot --savefig=fname.svg 912 | ph plot --savefig=fname.svg --savefig-dpi=300 913 | """ 914 | try: 915 | import matplotlib.pyplot as plt 916 | except ImportError: 917 | sys.exit("plot depends on matplotlib, install ph[plot]") 918 | 919 | df = pipein() 920 | index = kwargs.get("index") 921 | if index is not None: 922 | _assert_col(df, index, caller="plot") 923 | df = df.set_index(index) 924 | del kwargs["index"] 925 | for log_ in ("logx", "logy", "loglog"): 926 | if kwargs.get(log_) in TRUTHY: 927 | kwargs[log_] = True 928 | fname = kwargs.get("savefig") 929 | dpi = kwargs.get("savefig-dpi") 930 | 931 | if fname: 932 | del kwargs["savefig"] 933 | if dpi: 934 | del kwargs["savefig-dpi"] 935 | 936 | fig, ax = plt.subplots() 937 | df.plot(**kwargs, ax=ax) 938 | 939 | if index == "date": 940 | fig.autofmt_xdate() 941 | 942 | if fname: 943 | plt.tight_layout() 944 | plt.savefig(fname, dpi=dpi) 945 | else: 946 | plt.show() 947 | pipeout(df) 948 | 949 | 950 | @register 951 | def eval(expr): 952 | """Eval expr using pandas.DataFrame.eval. 953 | 954 | Example: cat a.csv | ph eval "z = x + y" 955 | 956 | """ 957 | df = pipein() 958 | pipeout(df.eval(expr)) 959 | 960 | 961 | @register 962 | def normalize(col=None): 963 | """Normalize a column or an entire dataframe. 964 | 965 | Usage: cat a.csv | ph normalize 966 | cat a.csv | ph normalize x 967 | 968 | 969 | Warning: This is probably not what you want. 970 | 971 | """ 972 | df = pipein() 973 | if col is None: 974 | df = (df - df.min()) / (df.max() - df.min()) 975 | else: 976 | df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min()) 977 | pipeout(df) 978 | 979 | 980 | @register 981 | def date(col=None, unit=None, origin="unix", errors="raise", dayfirst=False, **kwargs): 982 | """Assemble datetime from multiple columns or from one column 983 | 984 | --unit can be D, s, us, ns (defaults to ns, ns from origin) 985 | 986 | --origin can be unix, julian, or time offset, e.g. '2000-01-01' 987 | 988 | --errors can be raise, coerce, ignore (see pandas.to_datetime) 989 | 990 | --format a strptime format string, e.g. '%Y-%m-%d %H:%M:%S' 991 | 992 | --utc=True if the input is in utc, i.e. 
seconds from epoch 993 | 994 | Usage: cat a.csv | ph date x 995 | cat a.csv | ph date x --unit=s --origin="1984-05-17 09:30" 996 | cat a.csv | ph date x --dayfirst=True 997 | cat a.csv | ph date # if a.csv contains year, month, date 998 | cat a.csv | ph date x --format="%Y-%m-%d" 999 | cat a.csv | ph date x --utc=True 1000 | 1001 | """ 1002 | DATE_ERRORS = ("ignore", "raise", "coerce") 1003 | if errors not in DATE_ERRORS: 1004 | sys.exit("Errors must be one of {}, not {}.".format(DATE_ERRORS, errors)) 1005 | 1006 | dayfirst = dayfirst in TRUTHY 1007 | 1008 | date_parser = None 1009 | if "format" in kwargs: 1010 | date_parser = lambda d: [ 1011 | datetime.datetime.strptime(str(e), kwargs["format"]) for e in d 1012 | ] 1013 | if kwargs.get("utc") in TRUTHY: 1014 | date_parser = lambda d: [datetime.datetime.utcfromtimestamp(e) for e in d] 1015 | df = pipein() 1016 | try: 1017 | if col is None: 1018 | df = pd.to_datetime(df, unit=unit, origin=origin, errors=errors) 1019 | else: 1020 | _assert_col(df, col, "date") 1021 | if date_parser is None: 1022 | df[col] = pd.to_datetime( 1023 | df[col], unit=unit, origin=origin, errors=errors, dayfirst=dayfirst 1024 | ) 1025 | else: 1026 | df[col] = date_parser(df[col]) 1027 | except Exception as err: 1028 | sys.exit(err) 1029 | 1030 | pipeout(df) 1031 | 1032 | 1033 | @register 1034 | def round(col, decimals=0): 1035 | """Round column to `decimals` decimals. 1036 | 1037 | Usage: cat a.csv | ph round x 2 1038 | """ 1039 | df = pipein() 1040 | _assert_col(df, col, "round") 1041 | df[col] = df[col].round(decimals=decimals) 1042 | pipeout(df) 1043 | 1044 | @register 1045 | def describe(): 1046 | """Run DataFrame's describe method. 1047 | 1048 | The result is NOT tabular data, so pipeline ends. 1049 | 1050 | Usage: cat a.csv | ph describe 1051 | """ 1052 | df = pipein() 1053 | try: 1054 | out = df.describe() 1055 | except ValueError as err: 1056 | sys.exit(str(err)) 1057 | _safe_out(out) 1058 | 1059 | 1060 | @register 1061 | def info(): 1062 | """Run DataFrame's info method. 1063 | 1064 | The result is NOT tabular data, so pipeline ends. 1065 | 1066 | Usage: cat a.csv | ph info 1067 | """ 1068 | print(pipein().info()) 1069 | 1070 | 1071 | @register 1072 | def to(ftype, fname=None, sep=None, index=False): 1073 | """Export csv to given format (possibly csv). 1074 | 1075 | Supports csv, html, json, parquet, bigquery, tsv, etc. (see README for full 1076 | list). 
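# Illustrative sketch (not part of ph): min-max normalization as in
# `ph normalize`, and datetime assembly as in `ph date` without a column
# argument (the frame must then contain year/month/day columns).
# The data below is hypothetical.
import pandas as pd

_d = pd.DataFrame({"year": [2001, 2002], "month": [1, 2], "day": [3, 4]})
# cat dates.csv | ph date
print(pd.to_datetime(_d))

_n = pd.DataFrame({"x": [3.0, 4.0, 5.0]})
# cat a.csv | ph normalize x  is  (x - x.min()) / (x.max() - x.min())
_n["x"] = (_n["x"] - _n["x"].min()) / (_n["x"].max() - _n["x"].min())
print(_n)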
1077 | 1078 | Usage: cat a.csv | ph to html 1079 | cat a.csv | ph to tsv 1080 | cat a.csv | ph to csv --index=True 1081 | cat a.csv | ph to csv --sep=';' 1082 | cat a.csv | ph to clipboard 1083 | cat a.csv | ph to json 1084 | cat a.csv | ph to parquet out.parquet 1085 | 1086 | """ 1087 | if ftype not in WRITERS: 1088 | sys.exit("Unknown datatype {}.".format(ftype)) 1089 | 1090 | if not fname: 1091 | if ftype in ("parquet", "xls", "xlsx", "ods", "pickle"): 1092 | sys.exit("{} needs a path".format(ftype)) 1093 | 1094 | if ftype == "hdf5": 1095 | sys.exit("hdf5 writer not implemented") 1096 | 1097 | if index not in TRUTHY + FALSY: 1098 | sys.exit("Index must be True or False, not {}".format(index)) 1099 | index = index in TRUTHY 1100 | 1101 | if ftype == "fwf": 1102 | # pandas has not yet implemented to_fwf 1103 | df = pipein() 1104 | content = tabulate_(df.values.tolist(), list(df.columns), tablefmt="plain") 1105 | if fname: 1106 | with open(fname, "w") as wout: 1107 | wout.write(content) 1108 | else: 1109 | print(content) 1110 | sys.exit() 1111 | 1112 | if sep is not None: 1113 | if ftype != "csv": 1114 | sys.exit("Only csv mode supports separator") 1115 | 1116 | writer = WRITERS[ftype] 1117 | df = pipein() 1118 | fn = getattr(df, writer) 1119 | kwargs = {} 1120 | if ftype == "tsv": 1121 | kwargs["sep"] = "\t" 1122 | elif ftype == "csv" and sep is not None: 1123 | kwargs["sep"] = sep 1124 | 1125 | if ftype == "json": 1126 | index = True 1127 | 1128 | if fname is not None: 1129 | print(fn(fname, index=index, **kwargs)) 1130 | else: 1131 | print(fn(index=index, **kwargs)) 1132 | 1133 | 1134 | @registerx("from") 1135 | def from_(ftype="csv", **kwargs): 1136 | """Read a certain (default csv) format from standard in and stream out as csv. 1137 | 1138 | Usage: cat a.json | ph from json 1139 | cat /etc/passwd | ph from csv --sep=':' --header=None 1140 | 1141 | The following pipes should be equivalent: 1142 | 1143 | cat a.csv 1144 | cat a.csv | ph to json | ph from json 1145 | cat a.tsv | ph from tsv 1146 | cat a.tsv | ph from csv --sep='\t' 1147 | cat a.tsv | ph from csv --sep='\t' --thousands=',' 1148 | 1149 | In the event that the csv data starts on the first line (i.e. no 1150 | header is present), use --header=None. 1151 | """ 1152 | if "header" in kwargs: 1153 | kwargs["header"] = __tryparse(kwargs["header"]) 1154 | skiprows = kwargs.get("skiprows") 1155 | if skiprows is not None: 1156 | try: 1157 | skiprows = int(skiprows) 1158 | if skiprows < 0: 1159 | raise ValueError("Negative") 1160 | except ValueError: 1161 | sys.exit("skiprows must be a non-negative int, not {}".format(skiprows)) 1162 | kwargs["skiprows"] = skiprows 1163 | 1164 | if kwargs.get("sep") == "\\t": 1165 | kwargs["sep"] = "\t" 1166 | 1167 | if ftype == "clipboard": 1168 | pipeout(READERS["clipboard"](**kwargs)) 1169 | return 1170 | 1171 | pipeout(pipein(ftype, **kwargs)) 1172 | 1173 | 1174 | @register 1175 | def cat(*fnames, axis="index"): 1176 | """Concatenates all files provided. 1177 | 1178 | Usage: ph cat a.csv b.csv c.csv 1179 | ph cat a.csv b.csv c.csv --axis=index # default 1180 | ph cat a.csv b.csv c.csv --axis=columns 1181 | 1182 | If no arguments are provided, read from std in. 
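# Illustrative sketch (not part of ph): `ph to` and `ph from` dispatch to the
# pandas to_*/read_* families via the WRITERS/READERS tables defined earlier
# in this module; two of the usage lines above reduce to the following.
import io
import pandas as pd

_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# cat a.csv | ph to tsv
print(_df.to_csv(index=False, sep="\t"))

# cat /etc/passwd | ph from csv --sep=':' --header=None  (toy input below)
_raw = io.StringIO("root:x:0\nbin:x:1\n")
print(pd.read_csv(_raw, sep=":", header=None).to_csv(index=False))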
1183 | 1184 | """ 1185 | if axis not in ("index", "columns"): 1186 | sys.exit("Unknown axis command '{}'".format(axis)) 1187 | if not fnames: 1188 | pipeout(pipein()) 1189 | else: 1190 | dfs = [] 1191 | for fname in fnames: 1192 | df = pd.read_csv(fname) 1193 | dfs.append(df) 1194 | retval = pd.concat(dfs, axis=axis) 1195 | pipeout(retval) 1196 | 1197 | 1198 | @register 1199 | def merge(fname1, fname2, how="inner", on=None, left=None, right=None): 1200 | """Merging two csv files. 1201 | 1202 | If the two files have a common column name, then the merge will be 1203 | on that column. If the files have several common column names, use 1204 | --on=key for merging on a specific column. 1205 | 1206 | If you want to merge on columns with different names, use 1207 | --left=lkey --right=rkey. 1208 | 1209 | Choose between left merge, right merge, inner merge and outer merge 1210 | by using (e.g.) --how=inner. 1211 | 1212 | Usage: ph merge a.csv b.csv --on=ijk 1213 | ph merge a.csv b.csv --on ijk --how=inner 1214 | ph merge a.csv b.csv --left=key_a --right=key_b 1215 | 1216 | """ 1217 | hows = ("left", "right", "outer", "inner") 1218 | if how not in hows: 1219 | sys.exit("Unknown merge --how={}, must be one of {}".format(how, hows)) 1220 | try: 1221 | df1 = pd.read_csv(fname1) 1222 | df2 = pd.read_csv(fname2) 1223 | except Exception as err: 1224 | sys.exit(str(err)) 1225 | if set([on, left, right]) == set([None]) and not set(df1.columns).intersection(set(df2.columns)): 1226 | sys.exit("No common columns to perform merge on. Merge options: on, or: left=None, right=None.") 1227 | if set([on, left, right]) == set([None]): 1228 | pipeout(pd.merge(df1, df2, how=how)) 1229 | else: 1230 | if left is None and right is None: 1231 | pipeout(pd.merge(df1, df2, how=how, on=on)) 1232 | elif left is not None and right is not None: 1233 | _assert_col(df1, left, "merge") 1234 | _assert_col(df2, right, "merge") 1235 | pipeout(pd.merge(df1, df2, how=how, left_on=left, right_on=right)) 1236 | else: 1237 | sys.exit("Specify columns in both files. left was {}, right was {}".format(left, right)) 1238 | 1239 | 1240 | @register 1241 | def tab(): 1242 | """Equivalent to `ph to tsv`. 1243 | 1244 | Usage: cat a.csv | ph tab 1245 | """ 1246 | pipeout(pipein(), sep="\t") 1247 | 1248 | 1249 | @register 1250 | def tabulate(*args, **kwargs): 1251 | """Tabulate the output for pretty-printing. 1252 | 1253 | Usage: cat a.csv | ph tabulate --headers --noindex --format=grid 1254 | 1255 | Takes arguments 1256 | * --headers 1257 | * --noindex 1258 | * --format=[grid, latex, pretty, ...]. 1259 | 1260 | For a full list of format styles confer the README. 1261 | 1262 | This function uses the tabulate project available as a standalone 1263 | package from PyPI. 1264 | 1265 | Using `tabulate` in a pipeline usually means that the `ph` pipeline ends. 1266 | This is because of `tabulate`'s focus on user readability over machine 1267 | readability. 1268 | 1269 | """ 1270 | headers = tuple() 1271 | fmt = kwargs.get("format") 1272 | index = True 1273 | if "--noindex" in args: 1274 | index = False 1275 | if "--headers" in args: 1276 | headers = "keys" 1277 | df = pipein() 1278 | out = tabulate_(df, tablefmt=fmt, headers=headers, showindex=index) 1279 | _safe_out(out) 1280 | 1281 | 1282 | @register 1283 | def show(noindex=False): 1284 | """Similar to ph tabulate --headers [--noindex]. 
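# Illustrative sketch (not part of ph): the merge command above reduces to
# pd.merge; the hypothetical frames below have differently named keys to
# illustrate --left= and --right=.
import pandas as pd

_a = pd.DataFrame({"lkey": ["K0", "K1"], "A": [1, 2]})
_b = pd.DataFrame({"rkey": ["K1", "K2"], "B": [3, 4]})

# ph merge a.csv b.csv --left=lkey --right=rkey --how=outer
print(pd.merge(_a, _b, how="outer", left_on="lkey", right_on="rkey"))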
1285 | 1286 | Usage: cat a.csv | ph show 1287 | cat a.csv | ph show --noindex 1288 | """ 1289 | if noindex: 1290 | tabulate("--headers", "--noindex") 1291 | else: 1292 | tabulate("--headers") 1293 | 1294 | 1295 | def _print_commands(cmds): 1296 | num_cols = 72 // max(len(cmd) for cmd in cmds) 1297 | while (len(cmds) % num_cols) != 0: 1298 | cmds.append("") 1299 | df = pd.DataFrame(pd.Series(cmds).values.reshape(num_cols, -1)) 1300 | print(tabulate_(df.transpose(), showindex=False)) 1301 | 1302 | 1303 | @registerx("help") 1304 | def help_(*args, **kwargs): 1305 | """Writes help (docstring) about the different commands.""" 1306 | if not args: 1307 | print("Usage: ph command arguments") 1308 | print(USAGE_TEXT) 1309 | _print_commands(sorted(COMMANDS.keys())) 1310 | sys.exit(0) 1311 | cmd = args[0] 1312 | 1313 | ds = None 1314 | if cmd in DOCS: 1315 | ds = DOCS[cmd] 1316 | else: 1317 | try: 1318 | fn = getattr(pd.DataFrame, cmd) 1319 | ds = getattr(fn, "__doc__") 1320 | except AttributeError: 1321 | pass 1322 | if ds is None: 1323 | sys.exit("Unknown command {}".format(cmd)) 1324 | print("Usage: ph {}".format(cmd)) 1325 | print(" {}".format(ds.strip())) 1326 | 1327 | 1328 | def slugify_name(name): 1329 | name_ = name 1330 | try: 1331 | name = float(name_) 1332 | except ValueError: 1333 | pass 1334 | if isinstance(name_, (int, str)): 1335 | try: 1336 | name = int(name_) 1337 | except ValueError: 1338 | pass 1339 | if isinstance(name, (int, float)): 1340 | name = str(name) + "_" 1341 | if not name: 1342 | return "unnamed" 1343 | if name == "_": 1344 | return "_" 1345 | lead_under = name[0] == "_" 1346 | trail_under = name[-1] == "_" 1347 | 1348 | name = name.strip().lower() 1349 | unwanted = set(c for c in name if not c.isalnum()) 1350 | for u in unwanted: 1351 | name = name.replace(u, "_").strip() 1352 | while "__" in name: 1353 | name = name.replace("__", "_").strip() 1354 | name = name.strip("_") 1355 | if lead_under: 1356 | name = "_" + name 1357 | if trail_under: 1358 | name = name + "_" 1359 | return name 1360 | 1361 | 1362 | @register 1363 | def slugify(): 1364 | """Slugify the column headers. 1365 | 1366 | Usage: cat a.csv | ph slugify 1367 | 1368 | Removes all non-alphanumeric characters aside from the underscore. 1369 | 1370 | Is useful in scenarios where you have possibly many columns with 1371 | very ugly names. Can be a good preprocessor to @rename: 1372 | 1373 | Usage: cat a.csv | ph slugify | ph rename less_bad_name good_name 1374 | 1375 | """ 1376 | df = pipein() 1377 | df.columns = [slugify_name(name) for name in df.columns] 1378 | pipeout(df) 1379 | 1380 | 1381 | @register 1382 | def raw(fname=None): 1383 | """Do your best to read this comma-separated input.""" 1384 | import csv 1385 | 1386 | if fname is None: 1387 | d = csv.reader(sys.stdin) 1388 | df = pd.DataFrame(d) 1389 | else: 1390 | with open(fname, "r") as fin: 1391 | d = csv.reader(fin) 1392 | df = pd.DataFrame(d) 1393 | pipeout(df) 1394 | 1395 | 1396 | @registerx("open") 1397 | def open_(ftype, fname, **kwargs): 1398 | """Use a reader to open a file. 1399 | 1400 | Open ftype file with name fname and stream out. 
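# Illustrative sketch (not part of ph): expected behaviour of slugify_name
# above on a few hypothetical column headers.
assert slugify_name("Country Name") == "country_name"
assert slugify_name("Wind Speed (m/s)") == "wind_speed_m_s"
assert slugify_name("_private ") == "_private"
assert slugify_name(3.14) == "3_14_"  # numeric names get a trailing underscore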
1401 | 1402 | Usage: ph open csv a.csv 1403 | ph open csv a.csv --skiprows=7 1404 | ph open json a.json 1405 | ph open parquet a.parquet 1406 | ph open excel a.ods 1407 | ph open excel a.xls 1408 | ph open excel a.xlsx 1409 | ph open excel a.xls --sheet_name=2 1410 | ph open excel a.xls --sheet_name="The Real Dataset sheet" 1411 | ph open csv a.csv --thousands=',' 1412 | 1413 | 1414 | In the event that the csv data starts on the first line (i.e. no 1415 | header is present), use --header=None. 1416 | 1417 | """ 1418 | if "header" in kwargs: 1419 | kwargs["header"] = __tryparse(kwargs["header"]) 1420 | 1421 | if ftype not in READERS: 1422 | sys.exit("Unknown filetype {}".format(ftype)) 1423 | reader = READERS[ftype] 1424 | 1425 | if kwargs.get("sep") == "\\t": 1426 | kwargs["sep"] = "\t" 1427 | 1428 | if ftype == "clipboard" and fname is not None: 1429 | sys.exit("clipboard does not take fname") 1430 | if ftype != "clipboard" and fname is None: 1431 | sys.exit("filename is required for {}".format(ftype)) 1432 | 1433 | skiprows = kwargs.get("skiprows") 1434 | if skiprows is not None: 1435 | try: 1436 | skiprows = int(skiprows) 1437 | if skiprows < 0: 1438 | raise ValueError("Negative") 1439 | except ValueError: 1440 | sys.exit("skiprows must be a non-negative int, not {}".format(skiprows)) 1441 | kwargs["skiprows"] = skiprows 1442 | 1443 | try: 1444 | if ftype == "clipboard": 1445 | df = reader(**kwargs) 1446 | elif ftype in ("excel", "xls", "odf"): 1447 | try: 1448 | df = reader(fname, **kwargs) 1449 | except Exception as err: 1450 | sys.exit(err) 1451 | if not isinstance(df, pd.DataFrame): # could be dict 1452 | try: 1453 | errormsg = 'Specify --sheet_name="{}"'.format( 1454 | "|".join(str(k) for k in df.keys()) 1455 | ) 1456 | except Exception: 1457 | errormsg = "Specify --sheet_name" 1458 | sys.exit(errormsg) 1459 | else: 1460 | df = reader(fname, **kwargs) 1461 | except AttributeError as err: 1462 | sys.exit( 1463 | "{} is not supported in your Pandas installation\n{}".format(ftype, err) 1464 | ) 1465 | except ImportError as err: 1466 | sys.exit( 1467 | "{} is not supported in your Pandas installation\n{}".format(ftype, err) 1468 | ) 1469 | except FileNotFoundError as err: 1470 | sys.exit("File not found: {}".format(err)) 1471 | pipeout(df) 1472 | 1473 | 1474 | _ATTRS_WITH_SERIES_OUTPUT = ( 1475 | "all", 1476 | "any", 1477 | "count", 1478 | "kurt", 1479 | "kurtosis", 1480 | "mad", 1481 | "mean", 1482 | "median", 1483 | "min", 1484 | "nunique", 1485 | "prod", 1486 | "product", 1487 | "quantile", 1488 | "sem", 1489 | "skew", 1490 | "std", 1491 | "sum", 1492 | "var", 1493 | ) 1494 | 1495 | 1496 | def _call(attr, *args, **kwargs): 1497 | df = pipein() 1498 | dfn = getattr(df, attr)(*args, **kwargs) 1499 | if attr in _ATTRS_WITH_SERIES_OUTPUT: 1500 | dfn = dfn.reset_index() 1501 | dfn = dfn.T 1502 | pipeout(dfn, header=False) 1503 | else: 1504 | pipeout(dfn) 1505 | 1506 | 1507 | def register_forward(attr): 1508 | def partial(*args, **kwargs): 1509 | return _call(attr, *args, **kwargs) 1510 | 1511 | partial.__name__ = attr 1512 | COMMANDS[attr] = partial 1513 | 1514 | 1515 | @register 1516 | def head(n=10): 1517 | """Similar to `head` but keeps the header. 1518 | 1519 | Print the header followed by the first 10 (or n) lines of the stream to 1520 | standard output. 1521 | 1522 | Usage: cat a.csv | ph head 1523 | cat a.csv | ph head 8 1524 | 1525 | 1526 | """ 1527 | _call("head", int(n)) 1528 | 1529 | 1530 | @register 1531 | def tail(n=10): 1532 | """Similar to `tail` but keeps the header. 
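# Illustrative sketch (not part of ph): what a forwarded command such as
# `cat a.csv | ph sum` amounts to, per _call above: call the DataFrame
# method, and, because "sum" is in _ATTRS_WITH_SERIES_OUTPUT, transpose the
# resulting Series so that it streams out as a single csv row.
import pandas as pd

_df = pd.DataFrame({"x": [3, 4, 5], "y": [8, 9, 10]})
print(_df.sum().reset_index().T)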
1533 | 
1534 |     Print the header followed by the last 10 (or n) lines of the stream to
1535 |     standard output.
1536 | 
1537 |     """
1538 |     _call("tail", int(n))
1539 | 
1540 | 
1541 | def __tryparse(x):
1542 |     if x is None or x == "None":
1543 |         return None
1544 |     x_ = x
1545 |     try:
1546 |         x_ = float(x)
1547 |         x_ = int(x)
1548 |     except ValueError:
1549 |         pass
1550 |     except OverflowError:
1551 |         x_ = float("inf")
1552 |     return x_
1553 | 
1554 | 
1555 | @register
1556 | def replace(old, new, column=None, newcolumn=None):
1557 |     """Replace a value (in a column) with a new value.
1558 | 
1559 |     Usage: cat a.csv | ph replace 8 100 # replace in all columns
1560 |            cat a.csv | ph replace 8 100 --column=y
1561 |            cat a.csv | ph replace 8 100 --column=y --newcolumn=z
1562 | 
1563 |     Beware that it is difficult to know which _types_ we are searching for;
1564 |     only a heuristic is applied, so the result may occasionally be wrong.
1565 |     """
1566 |     if newcolumn is None:
1567 |         newcolumn = column
1568 |     df = pipein()
1569 | 
1570 |     if column is None:
1571 |         if newcolumn is not None:
1572 |             sys.exit("Cannot use newcolumn without column.")
1573 |         df = df.replace(to_replace=old, value=new, inplace=False)
1574 |     elif column not in df:
1575 |         sys.exit("Column {} does not exist.".format(column))
1576 |     else:
1577 |         df[newcolumn] = df[column].replace(to_replace=old, value=new, inplace=False)
1578 |     pipeout(df)
1579 | 
1580 | 
1581 | @register
1582 | def rename(before, after):
1583 |     """Rename a column.
1584 | 
1585 |     Usage: ph rename before after
1586 | 
1587 |     Example: cat a.csv | ph rename x a | ph rename y b
1588 | 
1589 |     """
1590 |     pipeout(pipein().rename(columns={before: after}))
1591 | 
1592 | 
1593 | @register
1594 | def columns(*cols, **kwargs):
1595 |     """ph columns serves two purposes.
1596 | 
1597 |     Called without any arguments, it lists the names of the columns in
1598 |     the stream.
1599 | 
1600 |     Called with arguments, it streams out the csv data from the given columns
1601 |     in the prescribed order.
1602 | 
1603 |     Also takes the arguments --startswith=the_prefix and --endswith=the_suffix,
1604 |     which select all columns matching the given pattern.
1605 | 
1606 | 
1607 |     Usage: cat a.csv | ph columns # will list all column names
1608 |            cat a.csv | ph columns y x # select only columns y and x
1609 |            cat a.csv | ph columns --startswith=sepal
1610 | 
1611 |     """
1612 |     cols = list(cols)
1613 |     df = pipein()
1614 |     if "startswith" in kwargs:
1615 |         q = kwargs["startswith"]
1616 |         for col in df.columns:
1617 |             if col.startswith(q) and col not in cols:
1618 |                 cols.append(col)
1619 |     if "endswith" in kwargs:
1620 |         q = kwargs["endswith"]
1621 |         for col in df.columns:
1622 |             if col.endswith(q) and col not in cols:
1623 |                 cols.append(col)
1624 | 
1625 |     _assert_cols(df, cols, "columns")
1626 | 
1627 |     if not cols and not kwargs:
1628 |         print("columns")
1629 |         print("\n".join(list(df.columns)))
1630 |     else:
1631 |         pipeout(df[cols])
1632 | 
1633 | 
1634 | @register
1635 | def spencer(*cols):
1636 |     """Compute Spencer's 15-point weighted moving average.
1637 | 
1638 |     Usage: cat a.csv | ph spencer
1639 | 
1640 |     Experimental feature for computing Spencer's 15-point weighted average.
1641 |     Smooths out curves by removing high-frequency noise.  Will
1642 |     ultimately lose some data on each end of the timeseries.
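# Illustrative sketch (not part of ph): how __tryparse above coerces
# command-line strings; int where possible, then float, otherwise the
# original string, and the literal "None" becomes None.
assert __tryparse("8") == 8 and isinstance(__tryparse("8"), int)
assert __tryparse("2.5") == 2.5
assert __tryparse("None") is None
assert __tryparse("abc") == "abc"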
1643 | 1644 | """ 1645 | _SPENCER = (-3, -6, -5, 3, 21, 46, 67, 74, 67, 46, 21, 3, -5, -6, -3) 1646 | _SPENCER_SUM = sum(_SPENCER) 1647 | 1648 | def spencer_(lst): 1649 | for i in range(7, len(lst) - 8): 1650 | seq = lst[i - 7 : i + 8] 1651 | yield sum(seq[i] * _SPENCER[i] / _SPENCER_SUM for i in range(15)) 1652 | 1653 | df = pipein() 1654 | _assert_cols(df, cols, "spencer") 1655 | prefix = [float("nan")] * 7 1656 | suffix = [float("nan")] * 8 1657 | if not cols: 1658 | cols = list(df.columns) 1659 | for col in cols: 1660 | lst = list(df[col]) 1661 | s = list(spencer_(lst)) 1662 | ncol = prefix + s + suffix 1663 | df[col] = ncol 1664 | pipeout(df) 1665 | 1666 | 1667 | def _parse_slice(slicestr): 1668 | pattern = ": | : | : | ::" 1669 | error = "Input to slice is {} _not_ {}".format(pattern, slicestr) 1670 | 1671 | assert ":" in slicestr, error 1672 | start = None 1673 | end = None 1674 | step = None 1675 | tup = slicestr.split(":") 1676 | if len(tup) > 3: 1677 | sys.exit(error) 1678 | start = tup[0] or None 1679 | if start is not None: 1680 | start = int(start) 1681 | end = tup[1] or None 1682 | if end is not None: 1683 | end = int(end) 1684 | if len(tup) == 3: 1685 | step = tup[2] or None 1686 | if step is not None: 1687 | step = int(step) 1688 | return start, end, step 1689 | 1690 | 1691 | @registerx("slice") 1692 | def slice_(slicestr): 1693 | """Slice a dataframe with Python slice pattern. 1694 | 1695 | Usage: cat a.csv | ph slice :10 # head 1696 | cat a.csv | ph slice -10: # tail 1697 | cat a.csv | ph slice ::2 # every even row 1698 | cat a.csv | ph slice 1::2 # every odd row 1699 | cat a.csv | ph slice ::-1 # reverse file 1700 | 1701 | """ 1702 | pattern = ": | : | : | ::" 1703 | error = "Input to slice is {} _not_ {}".format(pattern, slicestr) 1704 | df = pipein() 1705 | if isinstance(slicestr, int) or ":" not in slicestr: 1706 | sys.exit(error) 1707 | start, end, step = _parse_slice(slicestr) 1708 | retval = df[start:end:step] 1709 | pipeout(retval) 1710 | 1711 | 1712 | @register 1713 | def drop(*columns, **kwargs): 1714 | """Drop specified labels from rows or columns. 1715 | 1716 | Remove rows or columns by specifying label names and corresponding 1717 | axis, or by specifying directly index or column names. 1718 | 1719 | Usage: cat a.csv | ph drop 'x' --axis=columns 1720 | cat a.csv | ph drop 0 --axis=index 1721 | 1722 | """ 1723 | for opt in ("axis", "levels"): 1724 | if opt in kwargs: 1725 | kwargs[opt] = __tryparse(kwargs[opt]) 1726 | if "inplace" in kwargs: 1727 | sys.exit("inplace is nonsensical in ph") 1728 | 1729 | df = pipein() 1730 | 1731 | if kwargs.get("axis") in (None, 0, "index"): 1732 | columns = [__tryparse(col) for col in columns] 1733 | elif kwargs.get("axis") in (1, "columns"): 1734 | _assert_cols(df, columns, "drop") 1735 | else: 1736 | sys.exit( 1737 | "--axis=index (or 0) or --axis=columns (or 1), not {}".format( 1738 | kwargs.get("axis") 1739 | ) 1740 | ) 1741 | 1742 | ndf = df.drop(list(columns), **kwargs) 1743 | pipeout(ndf) 1744 | 1745 | 1746 | @register 1747 | def shape(): 1748 | """Print the shape of the csv file, i.e. num cols and num rows. 1749 | 1750 | The output will have two rows and two columns, with header "rows,columns". 1751 | 1752 | """ 1753 | print("rows,columns\n" + ",".join([str(x) for x in pipein().shape])) 1754 | 1755 | 1756 | @register 1757 | def empty(): 1758 | """Print a csv file with one column containing True or False. 1759 | 1760 | The output depends on whether the csv input was empty. 
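# Illustrative sketch (not part of ph): what _parse_slice above returns for
# the usage patterns listed in slice_.
assert _parse_slice(":10") == (None, 10, None)    # head
assert _parse_slice("-10:") == (-10, None, None)  # tail
assert _parse_slice("1::2") == (1, None, 2)       # every odd row
assert _parse_slice("::-1") == (None, None, -1)   # reverse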
1761 | 
1762 |     """
1763 |     print("empty\n{}".format(pipein().empty))
1764 | 
1765 | 
1766 | @register
1767 | def index():
1768 |     """Reset the index to a 0..n-1 counter.
1769 | 
1770 |     Usage: cat a.csv | ph index
1771 | 
1772 |     Adds a left-most column `index`.
1773 |     """
1774 |     pipeout(pipein().reset_index())
1775 | 
1776 | 
1777 | @register
1778 | def sort(*col):
1779 |     """Sort csv input by column(s).
1780 | 
1781 |     Since the sort is not stable, passing several columns here is the only way to sort on multiple columns.
1782 | 
1783 |     Usage: cat iris.csv | ph sort setosa
1784 |            cat iris.csv | ph sort setosa virginica
1785 | 
1786 |     """
1787 |     df = pipein()
1788 |     _assert_cols(df, col, "sort")
1789 |     pipeout(df.sort_values(list(col)))
1790 | 
1791 | 
1792 | @register
1793 | def polyfit(x, y, deg=1):
1794 |     """Perform linear/polynomial regression.
1795 | 
1796 |     Usage: cat a.csv | ph polyfit x y
1797 |            cat a.csv | ph polyfit x y --deg=1 # default
1798 |            cat a.csv | ph polyfit x y --deg=2 # quadratic
1799 | 
1800 |     Outputs a column polyfit_{deg} containing the fitted polynomial evaluated at each x.
1801 | 
1802 |     """
1803 |     df = pipein()
1804 |     _assert_cols(df, (x, y), "polyfit")
1805 |     deg = __tryparse(deg)
1806 |     if not isinstance(deg, int) or deg <= 0:
1807 |         sys.exit("deg={} should be a positive int".format(deg))
1808 |     try:
1809 |         import numpy
1810 |     except ImportError:
1811 |         sys.exit("numpy needed for polyfit. pip install numpy")
1812 | 
1813 |     polynomial = numpy.polynomial.Polynomial.fit(df[x], df[y], deg=deg)
1814 |     df["polyfit_{}".format(deg)] = df[x].map(polynomial)
1815 |     pipeout(df)
1816 | 
1817 | 
1818 | def __process(attr):
1819 |     if attr in COMMANDS:
1820 |         return False
1821 |     if attr.startswith("_"):
1822 |         return False
1823 |     if attr.startswith("to_"):
1824 |         return False
1825 |     if attr == "T":
1826 |         return False
1827 |     return True
1828 | 
1829 | 
1830 | for attr in dir(pd.DataFrame):
1831 |     if __process(attr):
1832 |         register_forward(attr)
1833 | 
1834 | 
1835 | def _main(argv):
1836 |     if len(argv) < 2:
1837 |         sys.exit("Usage: ph command [args]\n ph help")
1838 |     cmd = argv[1]
1839 |     if cmd in ("-v", "--version"):
1840 |         print_version()
1841 |         sys.exit()
1842 |     if cmd in ("-h", "--h", "--help"):
1843 |         cmd = "help"
1844 |     if cmd not in COMMANDS:
1845 |         sys.exit("Unknown command {}.".format(cmd))
1846 | 
1847 |     # Self-implemented parsing of arguments.
1848 |     # Arguments of type "abc" and "--abc" go into args
1849 |     # Arguments of type "--abc=def" go into kwargs as key, value pairs
1850 |     args = []
1851 |     kwarg = {}
1852 |     for a in argv[2:]:
1853 |         if KWARG.match(a):
1854 |             args.append(a)
1855 |         elif KWARG_WITH_VALUE.match(a):
1856 |             split = a.index("=")
1857 |             k = a[2:split]
1858 |             v = a[split + 1 :]
1859 |             kwarg[k] = __tryparse(v)
1860 |         else:
1861 |             args.append(__tryparse(a))
1862 |     try:
1863 |         COMMANDS[cmd](*args, **kwarg)
1864 |     except TypeError as err:
1865 |         sys.exit(err)
1866 | 
1867 | 
1868 | def main():
1869 |     _main(sys.argv)
1870 | 
1871 | 
1872 | if __name__ == "__main__":
1873 |     main()
1874 | 
--------------------------------------------------------------------------------
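# Illustrative sketch (not part of ph): how the hand-rolled argument parsing
# in _main splits a command line, assuming KWARG matches "--flag" and
# KWARG_WITH_VALUE matches "--key=value" (both are defined earlier in the
# module and are not shown here).
#
#   ph groupby animal --how=mean
#       cmd    = "groupby"
#       args   = ["animal"]
#       kwargs = {"how": "mean"}
#
#   ph slice ::2
#       cmd    = "slice"
#       args   = ["::2"]  # __tryparse leaves non-numeric strings untouched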