├── TAGS ├── debian ├── compat ├── rules ├── changelog └── control ├── requirements.txt ├── ph ├── _version.py └── __init__.py ├── assets ├── cases.png ├── lifeexp.png ├── polyfit.png ├── scatter.png └── covid-plot.png ├── tests ├── test_data │ ├── headless.csv │ ├── a.csv │ ├── broken.csv │ ├── inf.csv │ ├── sheet.xlsx │ ├── left.csv │ ├── right.csv │ ├── f.csv │ ├── group.csv │ ├── mergel.csv │ ├── merger.csv │ ├── t.tsv │ ├── d.csv │ ├── d.scsv │ ├── slugit.csv │ ├── derr.csv │ ├── date-fmt.csv │ ├── date-utc.csv │ ├── padded_decimals.csv │ ├── strip.csv │ ├── covid.csv │ ├── usa.csv │ └── iris.csv └── test_ph.py ├── .github └── workflows │ └── pythonapp.yml ├── LICENSE ├── setup.py ├── .gitignore └── README.md /TAGS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 10 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | -------------------------------------------------------------------------------- /ph/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1.5" 2 | -------------------------------------------------------------------------------- /assets/cases.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgdr/ph/HEAD/assets/cases.png -------------------------------------------------------------------------------- /tests/test_data/headless.csv: -------------------------------------------------------------------------------- 1 | 12,76 2 | 13,74 3 | 14,75 4 | 15,79 5 | 16,77 6 | -------------------------------------------------------------------------------- /assets/lifeexp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgdr/ph/HEAD/assets/lifeexp.png -------------------------------------------------------------------------------- /assets/polyfit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgdr/ph/HEAD/assets/polyfit.png -------------------------------------------------------------------------------- /assets/scatter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgdr/ph/HEAD/assets/scatter.png -------------------------------------------------------------------------------- /tests/test_data/a.csv: -------------------------------------------------------------------------------- 1 | x,y 2 | 3,8 3 | 4,9 4 | 5,10 5 | 6,11 6 | 7,12 7 | 8,13 8 | -------------------------------------------------------------------------------- /assets/covid-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgdr/ph/HEAD/assets/covid-plot.png -------------------------------------------------------------------------------- /tests/test_data/broken.csv: -------------------------------------------------------------------------------- 1 | x,y 2 | 3,8,,,3 3 | 4,9, 4 | 5,10,2 5 | 6,11 6 | 7,12 7 | 8,13 8 | -------------------------------------------------------------------------------- /tests/test_data/inf.csv: 
-------------------------------------------------------------------------------- 1 | x,y 2 | nan,8 3 | nan,9 4 | nan,10 5 | inf,11 6 | 7,12 7 | 8,13 8 | -------------------------------------------------------------------------------- /tests/test_data/sheet.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgdr/ph/HEAD/tests/test_data/sheet.xlsx -------------------------------------------------------------------------------- /tests/test_data/left.csv: -------------------------------------------------------------------------------- 1 | key1,key2,A,B 2 | K0,K0,A0,B0 3 | K0,K1,A1,B1 4 | K1,K0,A2,B2 5 | K2,K1,A3,B3 6 | -------------------------------------------------------------------------------- /tests/test_data/right.csv: -------------------------------------------------------------------------------- 1 | key1,key2,C,D 2 | K0,K0,C0,D0 3 | K1,K0,C1,D1 4 | K1,K0,C2,D2 5 | K2,K0,C3,D3 6 | -------------------------------------------------------------------------------- /tests/test_data/f.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | 2,1 3 | 4,1 4 | 6,2 5 | 8,3 6 | 10,5 7 | 12,8 8 | 14,13 9 | 16,21 10 | -------------------------------------------------------------------------------- /tests/test_data/group.csv: -------------------------------------------------------------------------------- 1 | Animal,Max Speed 2 | Falcon,380.0 3 | Falcon,370.0 4 | Parrot,24.0 5 | Parrot,26.0 6 | -------------------------------------------------------------------------------- /tests/test_data/mergel.csv: -------------------------------------------------------------------------------- 1 | lk1,lk2,lk3,lk4 2 | K0,K5,A0,B0 3 | K1,K4,A1,B1 4 | K2,K3,A2,B2 5 | K3,K2,A3,B3 6 | -------------------------------------------------------------------------------- /tests/test_data/merger.csv: -------------------------------------------------------------------------------- 1 | rk1,rk2,rk3,rk4 2 | K2,K3,A0,B0 3 | K3,K4,A1,B1 4 | K4,K5,A2,B2 5 | K5,K6,A3,B3 6 | -------------------------------------------------------------------------------- /tests/test_data/t.tsv: -------------------------------------------------------------------------------- 1 | a b 2 | 1 0 3 | 10 1 4 | 100 2 5 | 1,000 3 6 | 10,000 4 7 | 100,000 5 8 | 1,000,000 6 9 | -------------------------------------------------------------------------------- /tests/test_data/d.csv: -------------------------------------------------------------------------------- 1 | year,month,day 2 | 2003,3,8 3 | 2004,4,9 4 | 2005,5,10 5 | 2006,6,11 6 | 2007,7,12 7 | 2008,8,13 8 | -------------------------------------------------------------------------------- /tests/test_data/d.scsv: -------------------------------------------------------------------------------- 1 | year;month;day 2 | 2003;3;8 3 | 2004;4;9 4 | 2005;5;10 5 | 2006;6;11 6 | 2007;7;12 7 | 2008;8;13 8 | -------------------------------------------------------------------------------- /tests/test_data/slugit.csv: -------------------------------------------------------------------------------- 1 | Stupid column 1, Jerky-column No. 
2 2 | 3,8 3 | 4,9 4 | 5,10 5 | 6,11 6 | 7,12 7 | 8,13 8 | -------------------------------------------------------------------------------- /tests/test_data/derr.csv: -------------------------------------------------------------------------------- 1 | year,month,day 2 | 2003-01-01,3,8 3 | 2004-01-01,4,9 4 | 2005-01-01,5,10 5 | 200-01,6,11 6 | 2007,7,12 7 | 2008,8,13 8 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #! /usr/bin/make -f 2 | 3 | #export DH_VERBOSE = 1 4 | export PYBUILD_NAME=ph 5 | 6 | %: 7 | dh $@ --with python3 --buildsystem=pybuild -------------------------------------------------------------------------------- /tests/test_data/date-fmt.csv: -------------------------------------------------------------------------------- 1 | date,x,y 2 | 2020_02/02,3,8 3 | 2020_02/03,4,9 4 | 2020_02/04,5,10 5 | 2020_02/05,6,11 6 | 2020_02/06,7,12 7 | 2020_02/07,8,13 8 | -------------------------------------------------------------------------------- /tests/test_data/date-utc.csv: -------------------------------------------------------------------------------- 1 | date,x,y 2 | 1580601600,3,8 3 | 1580688000,4,9 4 | 1580774400,5,10 5 | 1580860800,6,11 6 | 1580947200,7,12 7 | 1581033600,8,13 8 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | python3-ph (1.1.2) bionic; urgency=medium 2 | 3 | * First deb release 4 | 5 | -- PG Drange Thu Jul 21 11:42:31 CEST 2022 6 | -------------------------------------------------------------------------------- /tests/test_data/padded_decimals.csv: -------------------------------------------------------------------------------- 1 | idx,paddecim 2 | 0," 502,50" 3 | 1," 172,50" 4 | 2," 7,50" 5 | 3," 142,50" 6 | 4," 157,50" 7 | 5," 487,50" 8 | 6," 1.470,00" 9 | -------------------------------------------------------------------------------- /tests/test_data/strip.csv: -------------------------------------------------------------------------------- 1 | idx,date,x,y 2 | 1, 2020-05-12 ,3,8 3 | 2, 2020-05-13 ,4,9 4 | 3, 2020-05-14 ,5,10 5 | 4, 2020-05-15 ,6,11 6 | 5, 2020-05-16 ,7,12 7 | 6, 2020-05-17 ,8,13 8 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: python3-ph 2 | Maintainer: PG Drange 3 | Build-Depends: debhelper,dh-python,python3-all,python3-setuptools 4 | Section: devel 5 | Priority: optional 6 | Standards-Version: 3.9.6 7 | X-Python3-Version: >= 3.6 8 | 9 | Package: python3-ph 10 | Architecture: all 11 | Description: Tabular data shell tool 12 | Depends: ${python3:Depends},python3-pandas,python3-matplotlib -------------------------------------------------------------------------------- /.github/workflows/pythonapp.yml: -------------------------------------------------------------------------------- 1 | name: ph tests 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python 3.7 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: 3.7 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | pip 
install . 25 | - name: Lint with flake8 26 | run: | 27 | pip install flake8 28 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 29 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=99 --statistics 30 | - name: Test with pytest 31 | run: | 32 | pip install pytest 33 | pytest 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Pål Grønås Drange 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/test_data/covid.csv: -------------------------------------------------------------------------------- 1 | China,S. 
Korea,Italy,Iran,France,Germany,Spain,USA,UK,Canada 2 | 571,58,79,95,57,79,84,75,87,96 3 | 830,111,157,139,100,130,120,100,116,158 4 | 1287,209,229,245,130,165,165,124,164,190 5 | 1975,436,323,388,191,203,228,158,209,249 6 | 2744,602,470,593,212,262,282,221,278,300 7 | 4515,833,655,978,285,545,401,319,321, 8 | 5974,977,889,1501,423,670,525,435,383, 9 | 7711,1261,1128,2336,653,800,674,541,456, 10 | 9692,1766,1701,2922,949,1040,1231,704,590, 11 | 11791,2337,2036,3513,1209,1224,1695,994,798, 12 | 14380,3150,2502,4747,1412,1565,2277,1329,1140, 13 | 17205,3736,3089,5823,1784,1966,3146,1762,1140, 14 | 20440,4335,3858,6566,2281,2745,5232,2247,, 15 | 24324,5186,4636,7161,2876,3675,6391,2943,, 16 | 28018,5621,5883,8042,3661,4599,7753,3046,, 17 | 31161,6284,7375,9000,4469,5381,,,, 18 | 34546,6593,9172,10075,4499,,,,, 19 | 37198,7041,10149,11364,,,,,, 20 | 40171,7313,12462,12729,,,,,, 21 | 42638,7478,15113,13938,,,,,, 22 | 44653,7513,17660,,,,,,, 23 | 58761,7755,21157,,,,,,, 24 | 63851,7869,21157,,,,,,, 25 | 66492,7979,,,,,,,, 26 | 68500,8086,,,,,,,, 27 | 70548,8162,,,,,,,, 28 | 72436,,,,,,,,, 29 | 74185,,,,,,,,, 30 | 74576,,,,,,,,, 31 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import re 5 | from setuptools import setup 6 | 7 | 8 | __pgdr = "PG Drange " 9 | __source = "https://github.com/pgdr/ph" 10 | __webpage = __source 11 | __description = "ph - the tabular data shell tool" 12 | 13 | _min_req = ["pandas"] 14 | requirements = { 15 | "minimum": _min_req, 16 | "parquet": _min_req + ["pyarrow"], 17 | "xls": _min_req + ["xlrd"], 18 | "xlsw": _min_req + ["xlrd", "xlwt"], 19 | "plot": _min_req + ["matplotlib"], 20 | "data": _min_req + ["scikit-learn"], 21 | "math": _min_req + ["numpy"], 22 | "iplot": _min_req + ["cufflinks"], 23 | "gpx": _min_req + ["gpxpy"], 24 | } 25 | requirements["complete"] = sorted(set(sum(requirements.values(), []))) 26 | 27 | 28 | def _src(x): 29 | root = os.path.dirname(__file__) 30 | return os.path.abspath(os.path.join(root, x)) 31 | 32 | 33 | def _read_file(fname, op): 34 | with open(_src(fname), "r") as fin: 35 | return op(fin.readlines()) 36 | 37 | 38 | def readme(): 39 | try: 40 | return _read_file("README.md", lambda lines: "".join(lines)) 41 | except Exception: 42 | return __description 43 | 44 | 45 | VERSIONFILE = "ph/_version.py" 46 | verstrline = open(VERSIONFILE, "rt").read() 47 | VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]" 48 | mo = re.search(VSRE, verstrline, re.M) 49 | if mo: 50 | verstr = mo.group(1) 51 | else: 52 | raise RuntimeError("Unable to find version string in %s." 
% (VERSIONFILE,)) 53 | 54 | setup( 55 | version=verstr, 56 | name="ph", 57 | packages=["ph"], 58 | description=__description, 59 | long_description=readme(), 60 | long_description_content_type="text/markdown", 61 | author="PG Drange", 62 | author_email="Pal.Drange@uib.no", 63 | maintainer=__pgdr, 64 | url=__webpage, 65 | project_urls={ 66 | "Bug Tracker": "{}/issues".format(__source), 67 | "Documentation": "{}/blob/master/README.md".format(__source), 68 | "Source Code": __source, 69 | }, 70 | license="MIT", 71 | keywords="tabular data, pandas, csv, pipeline, unix, command line tool", 72 | install_requires=requirements["minimum"], 73 | entry_points={ 74 | "console_scripts": [ 75 | "ph = ph:main", 76 | ], 77 | }, 78 | test_suite="tests", 79 | tests_require=["pytest"], 80 | extras_require=requirements, 81 | ) 82 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | TAGS 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /tests/test_data/usa.csv: -------------------------------------------------------------------------------- 1 | dateRep,day,month,year,cases,deaths,geoId 2 | 01/04/2020,1,4,2020,24998,909,US 3 | 31/03/2020,31,3,2020,21595,661,US 4 | 30/03/2020,30,3,2020,18360,318,US 5 | 29/03/2020,29,3,2020,19979,484,US 6 | 28/03/2020,28,3,2020,18695,411,US 7 | 27/03/2020,27,3,2020,16797,246,US 8 | 26/03/2020,26,3,2020,13963,249,US 9 | 25/03/2020,25,3,2020,8789,211,US 10 | 24/03/2020,24,3,2020,11236,119,US 11 | 23/03/2020,23,3,2020,8459,131,US 12 | 22/03/2020,22,3,2020,7123,80,US 13 | 21/03/2020,21,3,2020,5374,110,US 14 | 20/03/2020,20,3,2020,4835,0,US 15 | 19/03/2020,19,3,2020,2988,42,US 16 | 18/03/2020,18,3,2020,1766,23,US 17 | 17/03/2020,17,3,2020,887,16,US 18 | 16/03/2020,16,3,2020,823,12,US 19 | 15/03/2020,15,3,2020,777,10,US 20 | 14/03/2020,14,3,2020,511,7,US 21 | 13/03/2020,13,3,2020,351,10,US 22 | 12/03/2020,12,3,2020,287,2,US 23 | 11/03/2020,11,3,2020,271,2,US 24 | 10/03/2020,10,3,2020,200,5,US 25 | 09/03/2020,9,3,2020,121,4,US 26 | 08/03/2020,8,3,2020,95,3,US 27 | 07/03/2020,7,3,2020,105,2,US 28 | 06/03/2020,6,3,2020,74,1,US 29 | 05/03/2020,5,3,2020,34,2,US 30 | 04/03/2020,4,3,2020,22,3,US 31 | 03/03/2020,3,3,2020,14,4,US 32 | 02/03/2020,2,3,2020,20,1,US 33 | 01/03/2020,1,3,2020,3,1,US 34 | 29/02/2020,29,2,2020,6,0,US 35 | 28/02/2020,28,2,2020,1,0,US 36 | 27/02/2020,27,2,2020,6,0,US 37 | 26/02/2020,26,2,2020,0,0,US 38 | 25/02/2020,25,2,2020,18,0,US 39 | 24/02/2020,24,2,2020,0,0,US 40 | 23/02/2020,23,2,2020,0,0,US 41 | 22/02/2020,22,2,2020,19,0,US 42 | 21/02/2020,21,2,2020,1,0,US 43 | 20/02/2020,20,2,2020,0,0,US 44 | 19/02/2020,19,2,2020,0,0,US 45 | 18/02/2020,18,2,2020,0,0,US 46 | 17/02/2020,17,2,2020,0,0,US 47 | 16/02/2020,16,2,2020,0,0,US 48 | 15/02/2020,15,2,2020,0,0,US 49 | 14/02/2020,14,2,2020,1,0,US 50 | 13/02/2020,13,2,2020,1,0,US 51 | 12/02/2020,12,2,2020,0,0,US 52 | 11/02/2020,11,2,2020,1,0,US 53 | 10/02/2020,10,2,2020,0,0,US 54 | 09/02/2020,9,2,2020,0,0,US 55 | 08/02/2020,8,2,2020,0,0,US 56 | 07/02/2020,7,2,2020,0,0,US 57 | 06/02/2020,6,2,2020,1,0,US 58 | 05/02/2020,5,2,2020,0,0,US 59 | 04/02/2020,4,2,2020,0,0,US 60 | 03/02/2020,3,2,2020,3,0,US 61 | 02/02/2020,2,2,2020,1,0,US 62 | 01/02/2020,1,2,2020,1,0,US 63 | 31/01/2020,31,1,2020,1,0,US 64 | 30/01/2020,30,1,2020,0,0,US 65 | 29/01/2020,29,1,2020,0,0,US 66 | 28/01/2020,28,1,2020,0,0,US 67 | 27/01/2020,27,1,2020,3,0,US 68 | 26/01/2020,26,1,2020,0,0,US 69 | 25/01/2020,25,1,2020,1,0,US 70 | 24/01/2020,24,1,2020,0,0,US 71 | 23/01/2020,23,1,2020,0,0,US 72 | 22/01/2020,22,1,2020,0,0,US 73 | 21/01/2020,21,1,2020,1,0,US 74 | 20/01/2020,20,1,2020,0,0,US 75 | 19/01/2020,19,1,2020,0,0,US 76 | 18/01/2020,18,1,2020,0,0,US 77 | 17/01/2020,17,1,2020,0,0,US 78 | 16/01/2020,16,1,2020,0,0,US 79 | 15/01/2020,15,1,2020,0,0,US 80 | 14/01/2020,14,1,2020,0,0,US 81 | 13/01/2020,13,1,2020,0,0,US 82 
| 12/01/2020,12,1,2020,0,0,US 83 | 11/01/2020,11,1,2020,0,0,US 84 | 10/01/2020,10,1,2020,0,0,US 85 | 09/01/2020,9,1,2020,0,0,US 86 | 08/01/2020,8,1,2020,0,0,US 87 | 07/01/2020,7,1,2020,0,0,US 88 | 06/01/2020,6,1,2020,0,0,US 89 | 05/01/2020,5,1,2020,0,0,US 90 | 04/01/2020,4,1,2020,0,0,US 91 | 03/01/2020,3,1,2020,0,0,US 92 | 02/01/2020,2,1,2020,0,0,US 93 | 01/01/2020,1,1,2020,0,0,US 94 | 31/12/2019,31,12,2019,0,0,US 95 | -------------------------------------------------------------------------------- /tests/test_data/iris.csv: -------------------------------------------------------------------------------- 1 | 150,4,setosa,versicolor,virginica 2 | 5.1,3.5,1.4,0.2,0 3 | 4.9,3.0,1.4,0.2,0 4 | 4.7,3.2,1.3,0.2,0 5 | 4.6,3.1,1.5,0.2,0 6 | 5.0,3.6,1.4,0.2,0 7 | 5.4,3.9,1.7,0.4,0 8 | 4.6,3.4,1.4,0.3,0 9 | 5.0,3.4,1.5,0.2,0 10 | 4.4,2.9,1.4,0.2,0 11 | 4.9,3.1,1.5,0.1,0 12 | 5.4,3.7,1.5,0.2,0 13 | 4.8,3.4,1.6,0.2,0 14 | 4.8,3.0,1.4,0.1,0 15 | 4.3,3.0,1.1,0.1,0 16 | 5.8,4.0,1.2,0.2,0 17 | 5.7,4.4,1.5,0.4,0 18 | 5.4,3.9,1.3,0.4,0 19 | 5.1,3.5,1.4,0.3,0 20 | 5.7,3.8,1.7,0.3,0 21 | 5.1,3.8,1.5,0.3,0 22 | 5.4,3.4,1.7,0.2,0 23 | 5.1,3.7,1.5,0.4,0 24 | 4.6,3.6,1.0,0.2,0 25 | 5.1,3.3,1.7,0.5,0 26 | 4.8,3.4,1.9,0.2,0 27 | 5.0,3.0,1.6,0.2,0 28 | 5.0,3.4,1.6,0.4,0 29 | 5.2,3.5,1.5,0.2,0 30 | 5.2,3.4,1.4,0.2,0 31 | 4.7,3.2,1.6,0.2,0 32 | 4.8,3.1,1.6,0.2,0 33 | 5.4,3.4,1.5,0.4,0 34 | 5.2,4.1,1.5,0.1,0 35 | 5.5,4.2,1.4,0.2,0 36 | 4.9,3.1,1.5,0.2,0 37 | 5.0,3.2,1.2,0.2,0 38 | 5.5,3.5,1.3,0.2,0 39 | 4.9,3.6,1.4,0.1,0 40 | 4.4,3.0,1.3,0.2,0 41 | 5.1,3.4,1.5,0.2,0 42 | 5.0,3.5,1.3,0.3,0 43 | 4.5,2.3,1.3,0.3,0 44 | 4.4,3.2,1.3,0.2,0 45 | 5.0,3.5,1.6,0.6,0 46 | 5.1,3.8,1.9,0.4,0 47 | 4.8,3.0,1.4,0.3,0 48 | 5.1,3.8,1.6,0.2,0 49 | 4.6,3.2,1.4,0.2,0 50 | 5.3,3.7,1.5,0.2,0 51 | 5.0,3.3,1.4,0.2,0 52 | 7.0,3.2,4.7,1.4,1 53 | 6.4,3.2,4.5,1.5,1 54 | 6.9,3.1,4.9,1.5,1 55 | 5.5,2.3,4.0,1.3,1 56 | 6.5,2.8,4.6,1.5,1 57 | 5.7,2.8,4.5,1.3,1 58 | 6.3,3.3,4.7,1.6,1 59 | 4.9,2.4,3.3,1.0,1 60 | 6.6,2.9,4.6,1.3,1 61 | 5.2,2.7,3.9,1.4,1 62 | 5.0,2.0,3.5,1.0,1 63 | 5.9,3.0,4.2,1.5,1 64 | 6.0,2.2,4.0,1.0,1 65 | 6.1,2.9,4.7,1.4,1 66 | 5.6,2.9,3.6,1.3,1 67 | 6.7,3.1,4.4,1.4,1 68 | 5.6,3.0,4.5,1.5,1 69 | 5.8,2.7,4.1,1.0,1 70 | 6.2,2.2,4.5,1.5,1 71 | 5.6,2.5,3.9,1.1,1 72 | 5.9,3.2,4.8,1.8,1 73 | 6.1,2.8,4.0,1.3,1 74 | 6.3,2.5,4.9,1.5,1 75 | 6.1,2.8,4.7,1.2,1 76 | 6.4,2.9,4.3,1.3,1 77 | 6.6,3.0,4.4,1.4,1 78 | 6.8,2.8,4.8,1.4,1 79 | 6.7,3.0,5.0,1.7,1 80 | 6.0,2.9,4.5,1.5,1 81 | 5.7,2.6,3.5,1.0,1 82 | 5.5,2.4,3.8,1.1,1 83 | 5.5,2.4,3.7,1.0,1 84 | 5.8,2.7,3.9,1.2,1 85 | 6.0,2.7,5.1,1.6,1 86 | 5.4,3.0,4.5,1.5,1 87 | 6.0,3.4,4.5,1.6,1 88 | 6.7,3.1,4.7,1.5,1 89 | 6.3,2.3,4.4,1.3,1 90 | 5.6,3.0,4.1,1.3,1 91 | 5.5,2.5,4.0,1.3,1 92 | 5.5,2.6,4.4,1.2,1 93 | 6.1,3.0,4.6,1.4,1 94 | 5.8,2.6,4.0,1.2,1 95 | 5.0,2.3,3.3,1.0,1 96 | 5.6,2.7,4.2,1.3,1 97 | 5.7,3.0,4.2,1.2,1 98 | 5.7,2.9,4.2,1.3,1 99 | 6.2,2.9,4.3,1.3,1 100 | 5.1,2.5,3.0,1.1,1 101 | 5.7,2.8,4.1,1.3,1 102 | 6.3,3.3,6.0,2.5,2 103 | 5.8,2.7,5.1,1.9,2 104 | 7.1,3.0,5.9,2.1,2 105 | 6.3,2.9,5.6,1.8,2 106 | 6.5,3.0,5.8,2.2,2 107 | 7.6,3.0,6.6,2.1,2 108 | 4.9,2.5,4.5,1.7,2 109 | 7.3,2.9,6.3,1.8,2 110 | 6.7,2.5,5.8,1.8,2 111 | 7.2,3.6,6.1,2.5,2 112 | 6.5,3.2,5.1,2.0,2 113 | 6.4,2.7,5.3,1.9,2 114 | 6.8,3.0,5.5,2.1,2 115 | 5.7,2.5,5.0,2.0,2 116 | 5.8,2.8,5.1,2.4,2 117 | 6.4,3.2,5.3,2.3,2 118 | 6.5,3.0,5.5,1.8,2 119 | 7.7,3.8,6.7,2.2,2 120 | 7.7,2.6,6.9,2.3,2 121 | 6.0,2.2,5.0,1.5,2 122 | 6.9,3.2,5.7,2.3,2 123 | 5.6,2.8,4.9,2.0,2 124 | 7.7,2.8,6.7,2.0,2 125 | 6.3,2.7,4.9,1.8,2 126 | 6.7,3.3,5.7,2.1,2 127 | 
7.2,3.2,6.0,1.8,2 128 | 6.2,2.8,4.8,1.8,2 129 | 6.1,3.0,4.9,1.8,2 130 | 6.4,2.8,5.6,2.1,2 131 | 7.2,3.0,5.8,1.6,2 132 | 7.4,2.8,6.1,1.9,2 133 | 7.9,3.8,6.4,2.0,2 134 | 6.4,2.8,5.6,2.2,2 135 | 6.3,2.8,5.1,1.5,2 136 | 6.1,2.6,5.6,1.4,2 137 | 7.7,3.0,6.1,2.3,2 138 | 6.3,3.4,5.6,2.4,2 139 | 6.4,3.1,5.5,1.8,2 140 | 6.0,3.0,4.8,1.8,2 141 | 6.9,3.1,5.4,2.1,2 142 | 6.7,3.1,5.6,2.4,2 143 | 6.9,3.1,5.1,2.3,2 144 | 5.8,2.7,5.1,1.9,2 145 | 6.8,3.2,5.9,2.3,2 146 | 6.7,3.3,5.7,2.5,2 147 | 6.7,3.0,5.2,2.3,2 148 | 6.3,2.5,5.0,1.9,2 149 | 6.5,3.0,5.2,2.0,2 150 | 6.2,3.4,5.4,2.3,2 151 | 5.9,3.0,5.1,1.8,2 152 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ph (pronounced _φ_) - the tabular data shell tool ![ph tests](https://github.com/pgdr/ph/workflows/ph%20tests/badge.svg?branch=master) 2 | 3 | 4 | Spoiler: Working with tabular data (csv) in the command line is difficult. 5 | 6 | `ph` makes it easy: 7 | 8 | ```bash 9 | $ pip install ph 10 | $ cat iris.csv | ph columns 11 | 150 12 | 4 13 | setosa 14 | versicolor 15 | virginica 16 | $ cat iris.csv | ph columns setosa versicolor | ph head 15 | ph tail 5 | ph show 17 | setosa versicolor 18 | -- -------- ------------ 19 | 0 1.5 0.2 20 | 1 1.6 0.2 21 | 2 1.4 0.1 22 | 3 1.1 0.1 23 | 4 1.2 0.2 24 | ``` 25 | 26 | ```bash 27 | $ cat iris.csv | ph describe 28 | 150 4 setosa versicolor virginica 29 | count 150.000000 150.000000 150.000000 150.000000 150.000000 30 | mean 5.843333 3.057333 3.758000 1.199333 1.000000 31 | std 0.828066 0.435866 1.765298 0.762238 0.819232 32 | min 4.300000 2.000000 1.000000 0.100000 0.000000 33 | 25% 5.100000 2.800000 1.600000 0.300000 0.000000 34 | 50% 5.800000 3.000000 4.350000 1.300000 1.000000 35 | 75% 6.400000 3.300000 5.100000 1.800000 2.000000 36 | max 7.900000 4.400000 6.900000 2.500000 2.000000 37 | ``` 38 | 39 | Occasionally you would like to plot a CSV file real quick, in which case you can 40 | simply pipe it to `ph plot`: 41 | 42 | Suppose you have a dataset `covid.csv` 43 | 44 | ```csv 45 | SK,Italy,Iran,France,Spain,US 46 | 51,79,95,57,84,85 47 | 104,150,139,100,125,111 48 | 204,227,245,130,169,176 49 | 433,320,388,191,228,252 50 | 602,445,593,212,282,352 51 | 833,650,978,285,365,495 52 | 977,888,1501,423,430,640 53 | 1261,1128,2336,613,674,926 54 | 1766,1694,2922,949,1231,NaN 55 | 2337,2036,3513,1126,1696,NaN 56 | 3150,2502,4747,1412,NaN,NaN 57 | 4212,3089,5823,1748,NaN,NaN 58 | 4812,3858,6566,NaN,NaN,NaN 59 | 5328,4638,7161,NaN,NaN,NaN 60 | 5766,5883,8042,NaN,NaN,NaN 61 | 6284,7375,NaN,NaN,NaN,NaN 62 | 6767,9172,NaN,NaN,NaN,NaN 63 | 7134,10149,NaN,NaN,NaN,NaN 64 | 7382,NaN,NaN,NaN,NaN,NaN 65 | 7513,NaN,NaN,NaN,NaN,NaN 66 | ``` 67 | 68 | With this simple command, you get a certified _"So fancy" plot_. 69 | 70 | ```bash 71 | $ cat covid.csv | ph plot 72 | ``` 73 | 74 | ![So fancy covid plot](https://raw.githubusercontent.com/pgdr/ph/master/assets/covid-plot.png) 75 | 76 | 77 | _(Notice that this needs [matplotlib](https://matplotlib.org/): `pip install ph[plot]`)_ 78 | 79 | 80 | --- 81 | 82 | ## Raison d'être 83 | 84 | Using the _pipeline_ in Linux is nothing short of a dream in the life of the 85 | computer super user. 86 | 87 | However the pipe is clearly most suited for a stream of lines of textual data, 88 | and not when the stream is actually tabular data. 
89 | 90 | Tabular data is much more complex to work with due to its dual indexing and the 91 | fact that we often read horizontally and often read vertically. 92 | 93 | The defacto format for tabular data is `csv` 94 | ([comma-separated values](https://en.wikipedia.org/wiki/Comma-separated_values), 95 | which is not perfect in any sense 96 | of the word), and the defacto tool for working with tabular data in Python is 97 | Pandas. 98 | 99 | This is a shell utility `ph` (pronounced _phi_) 100 | that reads tabular data from 101 | [_standard in_](https://en.wikipedia.org/wiki/Standard_streams#Standard_input_(stdin)) 102 | and allows 103 | you to perform a pandas function on the data, before writing it to standard out 104 | in `csv` format. 105 | 106 | The goal is to create a tool which makes it nicer to work with tabular data in a 107 | pipeline. 108 | 109 | To achieve the goal, `ph` then reads csv data, does some manipulation, 110 | and prints out csv data. With csv as the invariant, `ph` can be used in 111 | a pipeline. 112 | 113 | --- 114 | 115 | A very quick introduction to what `ph` can do for you, 116 | run this in your shell: 117 | 118 | ```bash 119 | ph open csv https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/archived/ecdc/total_cases.csv \ 120 | | ph slugify \ 121 | | ph columns date norway sweden denmark \ 122 | | ph diff norway sweden denmark \ 123 | | ph spencer norway sweden denmark \ 124 | | ph rolling 7 norway sweden denmark --how=mean \ 125 | | ph dropna \ 126 | | ph slice 50: \ 127 | | ph plot --linewidth=3 --savefig=cases.svg --index=date 128 | ``` 129 | 130 | ![cases](https://raw.githubusercontent.com/pgdr/ph/master/assets/cases.png) 131 | 132 | --- 133 | 134 | ## Table of contents 135 | 136 | 1. [Getting started](#getting-started) 137 | 1. [Example usage](#example-usage) 138 | 1. [The tools](#the-tools) 139 | 1. [Concatenating, merging, filtering](#concatenating-merging-filtering) 140 | 1. [`cat`, `open`, `from`](#cat-open-from) 141 | 1. [`dropna` and `fillna`](#dropna-and-fillna) 142 | 1. [`head` and `tail`](#head-and-tail) 143 | 1. [`date`](#date) 144 | 1. [`merge`](#merge) 145 | 1. [Editing the csv](#editing-the-csv) 146 | 1. [`columns`, listing, selecting and re-ordering of](#columns-listing-selecting-and-re-ordering-of) 147 | 1. [`rename`](#rename) 148 | 1. [`replace`](#replace) 149 | 1. [`slice`](#slice) 150 | 1. [`eval`; Mathematipulating and creating new columns](#eval-mathematipulating-and-creating-new-columns) 151 | 1. [`normalize`](#normalize) 152 | 1. [`query`](#query) 153 | 1. [`grep`](#grep) 154 | 1. [`strip`](#strip) 155 | 1. [`removeprefix` and `removesuffix`](#removeprefix-and-removesuffix) 156 | 1. [Analyzing the csv file](#analyzing-the-csv-file) 157 | 1. [`describe`](#describe) 158 | 1. [`show`](#show) 159 | 1. [`tabulate`](#tabulate) 160 | 1. [`sort` values by column](#sort-values-by-column) 161 | 1. [`plot`](#plot) 162 | 1. [`groupby`](#groupby) 163 | 1. [`rolling`, `ewm`, `expanding`](#rolling-ewm-expanding) 164 | 1. [`index`](#index) 165 | 1. [`polyfit`](#polyfit) 166 | 1. [Working with different formats](#working-with-different-formats) 167 | 1. [`open`](#open) 168 | 1. [`to` and `from`; Exporting and importing](#to-and-from-exporting-and-importing) 169 | 1. [Supported formats](#supported-formats) 170 | 171 | 172 | --- 173 | 174 | 175 | ## Getting started 176 | 177 | If you have installed `ph[data]`, you can experiment using `ph dataset` if you 178 | don't have an appropriate csv file available. 
179 | 180 | 181 | ```bash 182 | ph dataset boston | ph describe 183 | ``` 184 | 185 | Available datasets are from 186 | [scikit-learn.datasets](https://scikit-learn.org/stable/datasets/index.html) 187 | 188 | Toy datasets: 189 | 190 | * `boston` 191 | * `iris` 192 | * `diabetes` 193 | * `digits` 194 | * `linnerud` 195 | * `wine` 196 | * `breast_cancer` 197 | 198 | 199 | Real world: 200 | 201 | * `olivetti_faces` 202 | * `lfw_people` 203 | * `lfw_pairs` 204 | * `rcv1` 205 | * `kddcup99` 206 | * `california_housing` 207 | 208 | 209 | ## Example usage 210 | 211 | Suppose you have a csv file `a.csv` that looks like this: 212 | 213 | ```csv 214 | x,y 215 | 3,8 216 | 4,9 217 | 5,10 218 | 6,11 219 | 7,12 220 | 8,13 221 | ``` 222 | 223 | Transpose: 224 | 225 | ```bash 226 | $ cat a.csv | ph transpose 227 | 0,1,2,3,4,5 228 | 3,4,5,6,7,8 229 | 8,9,10,11,12,13 230 | ``` 231 | 232 | `median` (as well as many others, e.g. `abs`, `corr`, `count`, `cov`, `cummax`, 233 | `cumsum`, `diff`, `max`, `product`, `quantile`, `rank`, `round`, `sum`, `std`, 234 | `var` etc.): 235 | 236 | ```bash 237 | $ cat a.csv | ph median 238 | x,y 239 | 5.5,10.5 240 | ``` 241 | 242 | **_Use `ph help` to list all commands_** 243 | 244 | 245 | ## The tools 246 | 247 | ### Concatenating, merging, filtering 248 | 249 | #### `cat`, `open`, `from` 250 | 251 | **cat** 252 | 253 | It is possible to _concatenate_ (`cat`) multiple csv-files with `ph cat`: 254 | 255 | ```bash 256 | $ ph cat a.csv b.csv --axis=index 257 | ``` 258 | 259 | ```bash 260 | $ ph cat a.csv b.csv --axis=columns 261 | ``` 262 | 263 | The functionality is described in 264 | [`pandas.concat`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html). 265 | 266 | 267 | **open** 268 | 269 | You can open a csv, json, excel, gpx (etc., see [_supported 270 | formats_](#supported-formats)) using `ph open type file`: 271 | 272 | ```bash 273 | $ ph open excel a.xlsx 274 | ``` 275 | 276 | ```bash 277 | $ ph open excel a.xlsx --sheet_name=0 --skiprows=3 278 | ``` 279 | 280 | 281 | ```bash 282 | $ ph open tsv a.tsv 283 | ``` 284 | 285 | ```bash 286 | $ ph open csv a.csv 287 | ``` 288 | 289 | In the event that the csv data starts on the first line (i.e. no 290 | header is present), use `--header=None`: 291 | 292 | ```bash 293 | $ ph open csv a.csv --header=None 294 | ``` 295 | 296 | 297 | 298 | **from** 299 | 300 | The `ph from` command works similarly to `ph open` but reads from stdin 301 | instead of opening a file. It therefore does not take a filename 302 | argument: 303 | 304 | ```bash 305 | $ cat /etc/passwd | ph from csv --sep=':' --header=None 306 | ``` 307 | 308 | 309 | #### `dropna` and `fillna` 310 | 311 | 312 | Consider again the `covid.csv` file from above. 313 | 314 | ```bash 315 | $ cat covid.csv | ph dropna 316 | ``` 317 | 318 | will remove all rows that contain N/A (`nan`) values. If we want to keep all 319 | rows with at least 5 non-N/A values, we can use 320 | 321 | ```bash 322 | $ cat covid.csv | ph dropna --thresh=5 323 | ``` 324 | 325 | If we want to drop all _columns_ with N/A values instead of all _rows_, we use 326 | `--axis=1`. 327 | 328 | If we want to drop only columns (resp. rows) with _all n/a_ values, we use 329 | `--how=all`. 
330 | 331 | 332 | To _replace_ N/A values with other values, we can simply run 333 | 334 | ```bash 335 | cat covid.csv | ph fillna 999.75 336 | ``` 337 | 338 | If we instead want to _pad_ the N/A values, we use `--method=pad` 339 | 340 | ```bash 341 | cat covid.csv | ph fillna --method=pad 342 | ``` 343 | 344 | We can limit the number of consecutive N/A values that are filled by using 345 | (e.g.) `--limit=7`. 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | #### `head` and `tail` 355 | 356 | Using `head` and `tail` works approximately as the normal shell equivalents, 357 | however they will preserve the header if there is one, e.g. 358 | 359 | ```bash 360 | $ cat a.csv | ph head 7 | ph tail 3 361 | x,y 362 | 6,11 363 | 7,12 364 | 8,13 365 | ``` 366 | 367 | #### `date` 368 | 369 | If the `csv` file contains a column, e.g. named `x` containing 370 | timestamps, it can be parsed as such with `ph date x`: 371 | 372 | ```bash 373 | $ cat a.csv | ph date x 374 | x,y 375 | 1970-01-04,8 376 | 1970-01-05,9 377 | 1970-01-06,10 378 | 1970-01-07,11 379 | 1970-01-08,12 380 | 1970-01-09,13 381 | ``` 382 | 383 | If your column is formatted with _freedom units_, `mm/dd/yyyy`, you can 384 | use the flag `--dayfirst=True`: 385 | 386 | ```csv 387 | dateRep,geoId 388 | 01/04/2020,US 389 | 31/03/2020,US 390 | 30/03/2020,US 391 | 29/03/2020,US 392 | 28/03/2020,US 393 | ``` 394 | 395 | ```bash 396 | $ cat ~/cov.csv | ph date dateRep --dayfirst=True 397 | dateRep,geoId 398 | 2020-04-01,US 399 | 2020-03-31,US 400 | 2020-03-30,US 401 | 2020-03-29,US 402 | 2020-03-28,US 403 | ``` 404 | 405 | 406 | 407 | To get a column with integers (e.g. 3-8) parsed as, e.g. 2003 - 2008, some 408 | amount of hacking is necessary. We will go into details later on the `eval` and 409 | `appendstr`. 410 | 411 | ```bash 412 | $ cat a.csv | ph eval "x = 2000 + x" | ph appendstr x - | ph date x 413 | x,y 414 | 2003-01-01,8 415 | 2004-01-01,9 416 | 2005-01-01,10 417 | 2006-01-01,11 418 | 2007-01-01,12 419 | 2008-01-01,13 420 | ``` 421 | 422 | However, it is possible to provide a `--format` instruction to `date`: 423 | 424 | ```bash 425 | $ cat a.csv | ph eval "x = 2000 + x" | ph date x --format="%Y" 426 | x,y 427 | 2003-01-01,8 428 | 2004-01-01,9 429 | 2005-01-01,10 430 | 2006-01-01,11 431 | 2007-01-01,12 432 | 2008-01-01,13 433 | ``` 434 | 435 | Under some very special circumstances, we may have a `unix timestamp` in 436 | a column, in which the `--utc=True` handle becomes useful: 437 | 438 | Consider `utc.csv`: 439 | 440 | ```csv 441 | date,x,y 442 | 1580601600,3,8 443 | 1580688000,4,9 444 | 1580774400,5,10 445 | 1580860800,6,11 446 | 1580947200,7,12 447 | 1581033600,8,13 448 | ``` 449 | 450 | where you get the correct dates: 451 | 452 | ```bash 453 | $ cat utc.csv | ph date date --utc=True 454 | date,x,y 455 | 2020-02-02,3,8 456 | 2020-02-03,4,9 457 | 2020-02-04,5,10 458 | 2020-02-05,6,11 459 | 2020-02-06,7,12 460 | 2020-02-07,8,13 461 | ``` 462 | 463 | 464 | #### `merge` 465 | 466 | Merging two csv files is made available through `ph merge f1 f2`. 
467 | 468 | Consider `left.csv` 469 | 470 | ```csv 471 | key1,key2,A,B 472 | K0,K0,A0,B0 473 | K0,K1,A1,B1 474 | K1,K0,A2,B2 475 | K2,K1,A3,B3 476 | ``` 477 | 478 | and `right.csv` 479 | 480 | ```csv 481 | key1,key2,C,D 482 | K0,K0,C0,D0 483 | K1,K0,C1,D1 484 | K1,K0,C2,D2 485 | K2,K0,C3,D3 486 | ``` 487 | 488 | We can merge them using (default to `--how=inner`) 489 | 490 | ```bash 491 | $ ph merge left.csv right.csv 492 | key1,key2,A,B,C,D 493 | K0,K0,A0,B0,C0,D0 494 | K1,K0,A2,B2,C1,D1 495 | K1,K0,A2,B2,C2,D2 496 | ``` 497 | 498 | or using an _outer_ join: 499 | 500 | ```bash 501 | $ ph merge left.csv right.csv --how=outer 502 | key1,key2,A,B,C,D 503 | K0,K0,A0,B0,C0,D0 504 | K0,K1,A1,B1,, 505 | K1,K0,A2,B2,C1,D1 506 | K1,K0,A2,B2,C2,D2 507 | K2,K1,A3,B3,, 508 | K2,K0,,,C3,D3 509 | ``` 510 | 511 | and we can specify on which column to join: 512 | 513 | ```bash 514 | $ ph merge left.csv right.csv --on=key1 --how=outer 515 | key1,key2_x,A,B,key2_y,C,D 516 | K0,K0,A0,B0,K0,C0,D0 517 | K0,K1,A1,B1,K0,C0,D0 518 | K1,K0,A2,B2,K0,C1,D1 519 | K1,K0,A2,B2,K0,C2,D2 520 | K2,K1,A3,B3,K0,C3,D3 521 | ``` 522 | 523 | 524 | In the case when the two files do not share a common column key, we can 525 | join them on key1 from the left file and key2 from the right file by specifying 526 | 527 | ```bash 528 | $ ph merge mergel.csv merger.csv --left=key1 --right=key2 529 | ``` 530 | 531 | 532 | 533 | ### Editing the csv 534 | 535 | #### `columns`, listing, selecting and re-ordering of 536 | 537 | Consider `c.csv`: 538 | 539 | ```csv 540 | it,fr,de 541 | 79,57,79 542 | 157,100,130 543 | 229,130,165 544 | 323,191,203 545 | 470,212,262 546 | 655,285,545 547 | 889,423,670 548 | 1128,653,800 549 | 1701,949,1040 550 | 2036,1209,1224 551 | 2502,1412,1565 552 | 3089,1784,1966 553 | 3858,2281,2745 554 | 4636,2876,3675 555 | 5883,3661,4181 556 | ``` 557 | 558 | Print the column names: 559 | 560 | ```bash 561 | $ cat c.csv | ph columns 562 | it 563 | fr 564 | de 565 | ``` 566 | 567 | Selecting only certain columns, e.g. `de` and `it` 568 | 569 | ```bash 570 | $ cat c.csv | ph columns de it | ph tail 3 571 | de,it 572 | 2745,3858 573 | 3675,4636 574 | 4181,5883 575 | ``` 576 | 577 | 578 | #### `rename` 579 | 580 | ```bash 581 | $ cat c.csv | ph rename de Germany | ph rename it Italy | ph columns Italy Germany 582 | Italy,Germany 583 | 79,79 584 | 157,130 585 | 229,165 586 | 323,203 587 | 470,262 588 | 655,545 589 | 889,670 590 | 1128,800 591 | 1701,1040 592 | 2036,1224 593 | 2502,1565 594 | 3089,1966 595 | 3858,2745 596 | 4636,3675 597 | 5883,4181 598 | ``` 599 | 600 | In addition to `rename` there is an auxiliary function `slugify` that 601 | lets you _slugify_ the column names. Consider `slugit.csv` 602 | 603 | ```csv 604 | Stupid column 1, Jerky-column No. 2 605 | 3,8 606 | 4,9 607 | 5,10 608 | 6,11 609 | 7,12 610 | 8,13 611 | ``` 612 | 613 | ```bash 614 | $ cat slugit.csv | ph slugify 615 | stupid_column_1,jerky_column_no_2 616 | 3,8 617 | 4,9 618 | 5,10 619 | 6,11 620 | 7,12 621 | 8,13 622 | ``` 623 | 624 | Then you can do 625 | 626 | ```bash 627 | $ cat slugit.csv | ph slugify | ph rename stupid_column_1 first | ph rename jerky_column_no_2 second 628 | first,second 629 | 3,8 630 | 4,9 631 | 5,10 632 | 6,11 633 | 7,12 634 | 8,13 635 | ``` 636 | 637 | 638 | #### `replace` 639 | 640 | We can replace values in the data (or in a single column) using `ph 641 | replace`. 
The syntax is 642 | `ph replace old new [--column=x [--newcolumn=xp]]`: 643 | 644 | ```bash 645 | $ cat a.csv| ph replace 8 100 646 | x,y 647 | 3,100 648 | 4,9 649 | 5,10 650 | 6,11 651 | 7,12 652 | 100,13 653 | ``` 654 | 655 | ```bash 656 | $ cat a.csv| ph replace 8 100 --column=x 657 | x,y 658 | 3,8 659 | 4,9 660 | 5,10 661 | 6,11 662 | 7,12 663 | 100,13 664 | ``` 665 | 666 | ```bash 667 | $ cat a.csv| ph replace 8 100 --column=x --newcolumn=xp 668 | x,y,xp 669 | 3,8,3 670 | 4,9,4 671 | 5,10,5 672 | 6,11,6 673 | 7,12,7 674 | 8,13,100 675 | ``` 676 | 677 | 678 | 679 | #### `slice` 680 | 681 | Slicing in Python is essential, and occasionally, we want to slice 682 | tabular data, e.g. look at only the 100 first, or 100 last rows, or 683 | perhaps we want to look at only every 10th row. All of this is achieved 684 | using `ph slice start:end:step` with standard Python slice syntax. 685 | 686 | ```bash 687 | $ cat a.csv | ph slice 1:9:2 688 | x,y 689 | 4,9 690 | 6,11 691 | 8,13 692 | ``` 693 | 694 | Reversing: 695 | 696 | ``` 697 | $ cat a.csv|ph slice ::-1 698 | x,y 699 | 8,13 700 | 7,12 701 | 6,11 702 | 5,10 703 | 4,9 704 | 3,8 705 | ``` 706 | 707 | See also `ph head` and `ph tail`. 708 | 709 | ```bash 710 | $ cat a.csv | ph slice :3 711 | x,y 712 | 3,8 713 | 4,9 714 | 5,10 715 | ``` 716 | 717 | equivalent to 718 | 719 | ```bash 720 | $ cat a.csv | ph head 3 721 | x,y 722 | 3,8 723 | 4,9 724 | 5,10 725 | ``` 726 | 727 | 728 | 729 | #### `eval`; Mathematipulating and creating new columns 730 | 731 | You can sum columns and place the result in a new column using 732 | `eval` (from 733 | [`pandas.DataFrame.eval`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.eval.html#pandas.DataFrame.eval)). 734 | 735 | ```bash 736 | $ cat c.csv | ph eval "total = it + fr + de" | ph tail 3 737 | it,fr,de,total 738 | 3858,2281,2745,8884 739 | 4636,2876,3675,11187 740 | 5883,3661,4181,13725 741 | ``` 742 | 743 | 744 | ```bash 745 | $ cat a.csv | ph eval "z = x**2 + y" 746 | x,y,z 747 | 3,8,17 748 | 4,9,25 749 | 5,10,35 750 | 6,11,47 751 | 7,12,61 752 | 8,13,77 753 | ``` 754 | 755 | 756 | If you only want the result, you leave the `eval` expression without assignment 757 | 758 | ```bash 759 | $ cat a.csv | ph eval "x**2" 760 | x 761 | 9 762 | 16 763 | 25 764 | 36 765 | 49 766 | 64 767 | ``` 768 | 769 | 770 | #### `normalize` 771 | 772 | You can normalize a column using `ph normalize col`. 773 | 774 | ```bash 775 | $ cat a.csv | ph eval "z = x * y" | ph normalize z 776 | x,y,z 777 | 3,8,0.0 778 | 4,9,0.15 779 | 5,10,0.325 780 | 6,11,0.525 781 | 7,12,0.75 782 | 8,13,1.0 783 | ``` 784 | 785 | 786 | 787 | #### `query` 788 | 789 | We can [query](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html) data using `ph query expr`. 
790 | 791 | ```bash 792 | $ cat a.csv | ph query "x > 5" 793 | x,y 794 | 6,11 795 | 7,12 796 | 8,13 797 | ``` 798 | 799 | 800 | ```bash 801 | $ ph open csv 'http://bit.ly/2cLzoxH' | ph query "country == 'Norway'" | ph tabulate --headers 802 | country year pop continent lifeExp gdpPercap 803 | -- --------- ------ ----------- ----------- --------- ----------- 804 | 0 Norway 1952 3.32773e+06 Europe 72.67 10095.4 805 | 1 Norway 1957 3.49194e+06 Europe 73.44 11654 806 | 2 Norway 1962 3.63892e+06 Europe 73.47 13450.4 807 | 3 Norway 1967 3.78602e+06 Europe 74.08 16361.9 808 | 4 Norway 1972 3.933e+06 Europe 74.34 18965.1 809 | 5 Norway 1977 4.04320e+06 Europe 75.37 23311.3 810 | 6 Norway 1982 4.11479e+06 Europe 75.97 26298.6 811 | 7 Norway 1987 4.18615e+06 Europe 75.89 31541 812 | 8 Norway 1992 4.28636e+06 Europe 77.32 33965.7 813 | 9 Norway 1997 4.40567e+06 Europe 78.32 41283.2 814 | 10 Norway 2002 4.53559e+06 Europe 79.05 44684 815 | 11 Norway 2007 4.62793e+06 Europe 80.196 49357.2 816 | ``` 817 | 818 | 819 | 820 | #### `grep` 821 | 822 | The powerful `grep` is one of the most used command line tools, and it 823 | would be silly to not ship a version of it ourselves. Using `ph grep` 824 | is rarely necessary, but helps when you want to ensure the header is 825 | kept. 826 | 827 | ```bash 828 | $ cat txtfile.csv | ph grep "a|b" --case=False --column=Text_Column --regex=False 829 | ``` 830 | 831 | The arguments denote 832 | 833 | * `--case` should be case sensitive? 834 | * `--column` grep only in given column 835 | * `--regex` use regex for pattern? 836 | 837 | 838 | 839 | #### `strip` 840 | 841 | Occasionally csv files come with additional spaces which can lead to 842 | difficulties in parsing the cells' contents. A csv file should be 843 | formatted without spaces after the comma `42,17` over `42, 17`. But 844 | since we are human, we sometimes make mistakes. 845 | 846 | If we want to _strip_, or _trim_, the contents of a column, we use `ph 847 | strip`: 848 | 849 | ```bash 850 | $ cat txtfile.csv | ph strip col1 col2 851 | ``` 852 | 853 | 854 | 855 | #### `removeprefix` and `removesuffix` 856 | 857 | If `strip` is not sufficiently powerful, it is possible to 858 | `removeprefix` or `removesuffix` using 859 | 860 | ```bash 861 | $cat txtfile.csv | ph removeprefix col1 pattern 862 | ``` 863 | 864 | and similarly for `removesuffix`. 865 | 866 | 867 | 868 | 869 | 870 | ### Analyzing the csv file 871 | 872 | 873 | #### `describe` 874 | 875 | The normal Pandas `describe` is of course available: 876 | 877 | ```bash 878 | $ cat a.csv | ph describe 879 | x y 880 | count 6.000000 6.000000 881 | mean 5.500000 10.500000 882 | std 1.870829 1.870829 883 | min 3.000000 8.000000 884 | 25% 4.250000 9.250000 885 | 50% 5.500000 10.500000 886 | 75% 6.750000 11.750000 887 | max 8.000000 13.000000 888 | ``` 889 | 890 | 891 | #### `show` 892 | 893 | The shorthand `ph show` simply calls the below `ph tabulate --headers`. 894 | 895 | ```bash 896 | $ cat a.csv | ph show 897 | x y 898 | -- --- --- 899 | 0 3 8 900 | 1 4 9 901 | 2 5 10 902 | 3 6 11 903 | 4 7 12 904 | 5 8 13 905 | ``` 906 | 907 | #### `tabulate` 908 | 909 | The amazing _tabulate_ tool comes from the Python package 910 | [tabulate on PyPI](https://pypi.org/project/tabulate/). 911 | 912 | The `tabulate` command takes arguments `--headers` to toggle printing of header 913 | row, `--format=[grid,...]` to modify the table style and `--noindex` to remove 914 | the running index (leftmost column in the example above). 
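For example, combining these flags (an illustrative sketch using `a.csv` from above):

```bash
$ cat a.csv | ph tabulate --headers --format=grid --noindex
```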
915 | 916 | Among the supported format styles are 917 | 918 | * `plain`, `simple`, 919 | * `grid`, `fancy_grid`, `pretty`, 920 | * `github`, `rst`, `mediawiki`, `html`, `latex`, 921 | * ... (See full list at the project homepage at 922 | [python-tabulate](https://github.com/astanin/python-tabulate).) 923 | 924 | 925 | #### `sort` values by column 926 | 927 | You can the columns in the csv data by a certain column: 928 | 929 | ```bash 930 | $ cat iris.csv | ph sort setosa | ph tail 5 931 | 150,4,setosa,versicolor,virginica 932 | 7.9,3.8,6.4,2.0,2 933 | 7.6,3.0,6.6,2.1,2 934 | 7.7,3.8,6.7,2.2,2 935 | 7.7,2.8,6.7,2.0,2 936 | 7.7,2.6,6.9,2.3,2 937 | ``` 938 | 939 | #### `plot` 940 | 941 | You can plot data using `ph plot [--index=col]`. 942 | 943 | ```bash 944 | $ ph open parquet 1A_2019.parquet | ph columns Time Value | ph plot --index=Time 945 | ``` 946 | 947 | This will take the columns `Time` and `Value` from the timeseries provided by 948 | the given `parquet` file and plot the `Value` series using `Time` as _index_. 949 | 950 | 951 | The following example plots the life expectancy in Norway using `year` as _index_: 952 | 953 | ```bash 954 | $ ph open csv http://bit.ly/2cLzoxH | ph query "country == 'Norway'" | ph appendstr year -01-01 | ph columns year lifeExp | ph plot --index=year 955 | ``` 956 | 957 | ![life-expectancy over time](https://raw.githubusercontent.com/pgdr/ph/master/assets/lifeexp.png) 958 | 959 | > _Note:_ The strange `ph appendstr year -01-01` turns the items `1956` into 960 | > `"1956-01-01"` and `2005` into `"2005-01-01"`. These are necessary to make 961 | > pandas to interpret `1956` as a _year_ and not as a _millisecond_. 962 | > 963 | > The command `ph appendstr col str [newcol]` takes a string and appends it to a 964 | > column, overwriting the original column, or writing it to `newcol` if provided. 965 | 966 | **Advanced plotting** 967 | 968 | You can choose the _kind_ of plotting ( ‘line’, ‘bar’, ‘barh’, ‘hist’, ‘box’, 969 | ‘kde’, ‘density’, ‘area’, ‘pie’, ‘scatter’, ‘hexbin’), the _style_ of plotting 970 | (e.g. `--style=o`), and in case of scatter plot, you need to specify `--x=col1` 971 | and `--y=col2`, e.g.: 972 | 973 | ```bash 974 | $ ph open csv http://bit.ly/2cLzoxH | ph query "continent == 'Europe'" | ph plot --kind=scatter --x=lifeExp --y=gdpPercap 975 | ``` 976 | 977 | ![life-expectancy vs gdp](https://raw.githubusercontent.com/pgdr/ph/master/assets/scatter.png) 978 | 979 | 980 | 981 | 982 | 983 | To specify the styling `k--` gives a black dashed line: 984 | 985 | ```bash 986 | $ ph open csv http://bit.ly/2cLzoxH | ph query "country == 'Norway'" | ph appendstr year -01-01 | ph columns year lifeExp | ph plot --index=year --style=k-- 987 | ``` 988 | 989 | 990 | **Using `plot` headless** 991 | 992 | Occasionally we would like to generate a plot to an image(-like) file on 993 | the command line or in a script, without necessarily launching any 994 | graphic user interface. 995 | 996 | Calling `ph plot` with the argument `--savefig=myfile.png` will create a 997 | PNG file called `myfile.png` instead of opening the matplotlib window. 998 | It is also possible to get other formats by using different extensions, 999 | like `eps`, `pdf`, `pgf`, `png`, `ps`, `raw`, `rgba`, `svg`, `svgz`. 1000 | 1001 | 1002 | **_`iplot`_ with `plotly` and `cufflinks`** 1003 | 1004 | Instead of using the `matplotlib` backend, there is an option for using `plotly` 1005 | and [`cufflinks`](https://github.com/santosjorge/cufflinks) to generate 1006 | interactive plots. 
1007 | This depends on `cufflinks`, and can be installed with `pip install ph[iplot]`. 1008 | 1009 | ```bash 1010 | $ cat a.csv | ph iplot --kind=bar --barmode=stack 1011 | ``` 1012 | 1013 | ```bash 1014 | $ cat a.csv | ph iplot --kind=scatter --mode=markers 1015 | ``` 1016 | 1017 | 1018 | #### `groupby` 1019 | 1020 | Suppose you have a csv file 1021 | 1022 | ```csv 1023 | Animal,Max Speed 1024 | Falcon,380.0 1025 | Falcon,370.0 1026 | Parrot,24.0 1027 | Parrot,26.0 1028 | ``` 1029 | 1030 | You can use Pandas' `groupby` functionality to get the aggregated `sum`, 1031 | `mean`, or `first` value: 1032 | 1033 | ```bash 1034 | $ cat group.csv | ph groupby Animal --how=mean 1035 | Max Speed 1036 | 375.0 1037 | 25.0 1038 | ``` 1039 | 1040 | If you want to retain the index column, 1041 | 1042 | ```bash 1043 | $ cat group.csv | ph groupby Animal --how=mean --as_index=False 1044 | Animal,Max Speed 1045 | Falcon,375.0 1046 | Parrot,25.0 1047 | ``` 1048 | 1049 | 1050 | 1051 | #### `rolling`, `ewm`, `expanding` 1052 | 1053 | **rolling** 1054 | 1055 | Compute rolling averages/sums using `ph rolling 3 --how=mean` 1056 | 1057 | Consider again `a.csv`: 1058 | 1059 | ```csv 1060 | x,y 1061 | 3,8 1062 | 4,9 1063 | 5,10 1064 | 6,11 1065 | 7,12 1066 | 8,13 1067 | ``` 1068 | 1069 | Moving average with window size 3: 1070 | 1071 | ```bash 1072 | $ cat a.csv|ph rolling 3 --how=mean | ph dropna 1073 | x,y 1074 | 4.0,9.0 1075 | 5.0,10.0 1076 | 6.0,11.0 1077 | 7.0,12.0 1078 | ``` 1079 | 1080 | 1081 | Rolling sum with window size 2: 1082 | 1083 | ```bash 1084 | $ cat a.csv|ph rolling 2 --how=sum | ph dropna 1085 | x,y 1086 | 7.0,17.0 1087 | 9.0,19.0 1088 | 11.0,21.0 1089 | 13.0,23.0 1090 | 15.0,25.0 1091 | ``` 1092 | 1093 | 1094 | **ewm — exponentially weighted methods** 1095 | 1096 | ```bash 1097 | $ cat a.csv | ph ewm --com=0.5 --how=mean | ph show 1098 | x y 1099 | -- ------- -------- 1100 | 0 3 8 1101 | 1 3.75 8.75 1102 | 2 4.61538 9.61538 1103 | 3 5.55 10.55 1104 | 4 6.52066 11.5207 1105 | 5 7.50824 12.5082 1106 | ``` 1107 | 1108 | Use either `com` (center of mass), `span`, `halflife`, or `alpha`, 1109 | together with `--how=mean`, `--how=std`, `--how=var`, etc. 1110 | 1111 | 1112 | **expanding — expanding window** 1113 | 1114 | > A common alternative to rolling statistics is to use an expanding 1115 | > window, which yields the value of the statistic with all the data 1116 | > available up to that point in time. 1117 | 1118 | ```bash 1119 | $ cat a.csv | ph expanding 3 1120 | x,y 1121 | , 1122 | , 1123 | 12.0,27.0 1124 | 18.0,38.0 1125 | 25.0,50.0 1126 | 33.0,63.0 1127 | ``` 1128 | 1129 | 1130 | **Spencer's 15-weight average** 1131 | 1132 | We also support an experimental and slow version of Spencer's 15-weight 1133 | average. This method takes a window of size 15, and pointwise multiply 1134 | with the following vector (normalized) 1135 | 1136 | ``` 1137 | (-3, -6, -5, 3, 21, 46, 67, 74, 67, 46, 21, 3, -5, -6, -3) 1138 | ``` 1139 | 1140 | and then takes the sum of the resulting vector. 1141 | 1142 | Spencer's 15-weight average is an interesting (impulse response) filter 1143 | that preserves all up to cubic polynomial functions. 
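As in the introductory pipeline, `spencer` takes the columns to smooth, so an illustrative sketch on `covid.csv` could look like:

```bash
$ cat covid.csv | ph columns Italy Iran | ph spencer Italy Iran | ph dropna
```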
1144 | 1145 | 1146 | #### `index` 1147 | 1148 | Occasionally you need to have an index, in which case `ph index` is your tool: 1149 | 1150 | ```bash 1151 | $ cat a.csv | ph index 1152 | index,x,y 1153 | 0,3,8 1154 | 1,4,9 1155 | 2,5,10 1156 | 3,6,11 1157 | 4,7,12 1158 | 5,8,13 1159 | ``` 1160 | 1161 | #### `polyfit` 1162 | 1163 | You can perform **linear regression** and **polynomial regression** on a certain 1164 | index column `x` and a `y = f(x)` column using `ph polyfit`. It takes two 1165 | arguments, the `x` column name, the `y` column name and an optional 1166 | `--deg=`, the degree of the polynomial. The default option is `--deg=1` 1167 | which corresponds to a linear regression. 1168 | 1169 | Suppose you have a csv file `lr.csv` with content 1170 | 1171 | ```csv 1172 | x,y 1173 | 4,12 1174 | 5,19 1175 | 6,17 1176 | 7,24 1177 | 8,28 1178 | 9,34 1179 | ``` 1180 | 1181 | With linear (polynomial) regression, you get an extra column, `polyfit_{deg}`: 1182 | 1183 | ```bash 1184 | $ cat lr.csv | ph polyfit x y | ph astype int 1185 | x,y,polyfit_1 1186 | 4,12,12 1187 | 5,19,16 1188 | 6,17,20 1189 | 7,24,24 1190 | 8,28,28 1191 | 9,34,32 1192 | ``` 1193 | 1194 | Using `ph plot --index=x` results in this plot: 1195 | 1196 | ![polyfit](https://raw.githubusercontent.com/pgdr/ph/master/assets/polyfit.png) 1197 | 1198 | ## Working with different formats 1199 | 1200 | 1201 | ### `open` 1202 | 1203 | Pandas supports reading a multitude of [readers](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html). 1204 | 1205 | To read an Excel file and pipe the stream, you can use `ph open`. 1206 | 1207 | The syntax of `ph open` is `ph open ftype fname`, where `fname` is the 1208 | file you want to stream and `ftype` is the type of the file. 1209 | 1210 | A list of all available formats is given below. 1211 | 1212 | ```bash 1213 | $ ph open xls a.xlsx 1214 | x,y 1215 | 3,8 1216 | 4,9 1217 | 5,10 1218 | 6,11 1219 | 7,12 1220 | 8,13 1221 | ``` 1222 | 1223 | 1224 | You can open a _semicolon separated values_ file using `--sep=";"` 1225 | 1226 | ```bash 1227 | $ ph open csv --sep=";" fname.csv 1228 | ``` 1229 | 1230 | 1231 | 1232 | ### `to` and `from`; Exporting and importing 1233 | 1234 | Observe the following: 1235 | 1236 | ```json 1237 | {"x":{"0":3,"1":4,"2":5,"3":6,"4":7,"5":8}, 1238 | "y":{"0":8,"1":9,"2":10,"3":11,"4":12,"5":13}} 1239 | ``` 1240 | 1241 | Of course, then, 1242 | 1243 | ```bash 1244 | $ cat a.csv | ph to json | ph from json 1245 | x,y 1246 | 3,8 1247 | 4,9 1248 | 5,10 1249 | 6,11 1250 | 7,12 1251 | 8,13 1252 | ``` 1253 | 1254 | This also means that 1255 | 1256 | ```bash 1257 | $ cat a.csv | ph to json > a.json 1258 | $ cat a.json 1259 | {"x":{"0":3,"1":4,"2":5,"3":6,"4":7,"5":8}, 1260 | "y":{"0":8,"1":9,"2":10,"3":11,"4":12,"5":13}} 1261 | $ cat a.json | ph from json 1262 | x,y 1263 | 3,8 1264 | 4,9 1265 | 5,10 1266 | 6,11 1267 | 7,12 1268 | 8,13 1269 | ``` 1270 | 1271 | You can open Excel-like formats using `ph open excel fname.xls[x]`, `parquet` 1272 | files with `ph open parquet data.parquet`. 
Note that these two examples require 1273 | `xlrd` and `pyarrow`, respectively, or simply 1274 | 1275 | ``` 1276 | pip install ph[complete] 1277 | ``` 1278 | 1279 | 1280 | ### Supported formats 1281 | 1282 | * `csv` / `tsv` (the latter for tab-separated values) 1283 | * `fwf` (fixed-width file format) 1284 | * `json` 1285 | * `html` 1286 | * `clipboard` (pastes tab-separated content from clipboard) 1287 | * `xls` 1288 | * `odf` 1289 | * `hdf5` 1290 | * `feather` 1291 | * `parquet` 1292 | * `orc` 1293 | * `stata` 1294 | * `sas` 1295 | * `spss` 1296 | * `pickle` 1297 | * `sql` 1298 | * `gbq` / `google` / `bigquery` 1299 | 1300 | We also support reading GPX files with `ph open gpx`. 1301 | This uses the GPX Python library [gpxpy](https://github.com/tkrajina/gpxpy). 1302 | -------------------------------------------------------------------------------- /tests/test_ph.py: -------------------------------------------------------------------------------- 1 | import ph 2 | 3 | import os.path 4 | import io 5 | 6 | import pytest 7 | import contextlib 8 | 9 | import pandas as pd 10 | import datetime as dt 11 | import math 12 | 13 | NAN = float("nan") 14 | LEFT_COLUMNS = ["key1", "key2", "A", "B"] # columns of left.csv 15 | 16 | 17 | def __have_xlrd(): 18 | try: 19 | import xlrd # noqa 20 | 21 | return True 22 | except ImportError: 23 | return False 24 | 25 | 26 | def _assert_a(df): 27 | assert list(df.shape) == [6, 2] 28 | assert list(df.columns) == ["x", "y"] 29 | assert list(df["x"]) == list(range(3, 9)) 30 | assert list(df["y"]) == list(range(8, 14)) 31 | 32 | 33 | def _get_path(name, extension="csv"): 34 | pth = "test_data/{}.{}".format(name, extension) 35 | root = os.path.dirname(__file__) 36 | path = os.path.abspath(os.path.join(root, pth)) 37 | 38 | return path 39 | 40 | 41 | def _get_data(name, extension="csv"): 42 | path = _get_path(name, extension) 43 | with open(path, "r") as fin: 44 | data = "".join(fin.readlines()) 45 | 46 | return data 47 | 48 | 49 | def _get_io(name, extension="csv"): 50 | return io.StringIO(_get_data(name, extension)) 51 | 52 | 53 | class Capture: 54 | # Just a mutable string container for ctx mgr around capture.out 55 | def __init__(self, outerr=None): 56 | if outerr is not None: 57 | self.out = outerr.out 58 | self.err = outerr.err 59 | else: 60 | self.out = "" 61 | self.err = "" 62 | self._df = None 63 | 64 | @property 65 | def df(self): 66 | if self._df is None: 67 | self._df = pd.read_csv(io.StringIO(self.out)) 68 | return self._df 69 | 70 | def read_df(self, *args, **kwargs): 71 | self._df = pd.read_csv(io.StringIO(self.out), *args, **kwargs) 72 | return self.df 73 | 74 | def assert_shape(self, rows, cols): 75 | assert list(self.df.shape) == [rows, cols] 76 | 77 | def assert_columns(self, columns): 78 | assert list(self.df.columns) == list(columns) 79 | 80 | 81 | @pytest.fixture 82 | def phmgr(capsys, monkeypatch): 83 | @contextlib.contextmanager 84 | def phmgr(dataset="a", extension="csv"): 85 | monkeypatch.setattr("sys.stdin", _get_io(dataset, extension)) 86 | cap = Capture() 87 | yield cap 88 | outerr = capsys.readouterr() 89 | cap.out, cap.err = outerr.out, outerr.err 90 | assert not cap.err, "Std error not empty: {}".format(cap.err) 91 | 92 | return phmgr 93 | 94 | 95 | def _call(cmd, extra=None): 96 | if extra is None: 97 | extra = [] 98 | ph._main(["ph"] + cmd.split(" ") + extra) 99 | 100 | 101 | def test_cat(phmgr): 102 | with phmgr() as captured: 103 | _call("cat") 104 | assert captured.out == _get_data("a") 105 | 106 | 107 | def test_cat_many(capsys): 
108 | _call("cat {} {} --axis=index".format(_get_path("a"), _get_path("covid"))) 109 | cap = Capture(capsys.readouterr()) 110 | assert not cap.err 111 | cap.assert_shape(35, 12) 112 | 113 | _call("cat {} {} --axis=columns".format(_get_path("a"), _get_path("covid"))) 114 | cap = Capture(capsys.readouterr()) 115 | assert not cap.err 116 | cap.assert_shape(29, 12) 117 | 118 | 119 | def test_columns(phmgr): 120 | with phmgr("iris") as captured: 121 | _call("columns") 122 | assert not captured.err 123 | captured.assert_columns(["columns"]) 124 | assert list(captured.df["columns"]) == [ 125 | "150", 126 | "4", 127 | "setosa", 128 | "versicolor", 129 | "virginica", 130 | ] 131 | 132 | 133 | def test_columns_args(phmgr): 134 | with phmgr("iris") as captured: 135 | _call("columns setosa versicolor") 136 | assert not captured.err 137 | captured.assert_shape(150, 2) 138 | captured.assert_columns(["setosa", "versicolor"]) 139 | 140 | 141 | def test_drop_columns(phmgr): 142 | with phmgr("iris") as captured: 143 | _call("drop setosa virginica --axis=columns") 144 | assert not captured.err 145 | df = captured.df 146 | captured.assert_shape(150, 3) 147 | captured.assert_columns( 148 | [ 149 | "150", 150 | "4", 151 | "versicolor", 152 | ] 153 | ) 154 | assert list(df.iloc[0]) == [5.1, 3.5, 0.2] 155 | 156 | 157 | def test_drop_index(phmgr): 158 | with phmgr("iris") as captured: 159 | _call("drop 0 --axis=index") 160 | assert not captured.err 161 | df = captured.df 162 | captured.assert_shape(149, 5) 163 | assert list(df.iloc[0]) == [4.9, 3.0, 1.4, 0.2, 0] 164 | 165 | 166 | def test_open_skiprows(capsys): 167 | _call("open csv {} --skiprows=6".format(_get_path("f"))) 168 | captured = Capture(capsys.readouterr()) 169 | assert not captured.err 170 | df = captured.df 171 | captured.assert_shape(2, 2) 172 | assert list(df.iloc[0]) == [14, 13] 173 | assert list(df.iloc[1]) == [16, 21] 174 | 175 | 176 | def test_from_headless(phmgr): 177 | with phmgr("headless") as captured: 178 | _call("from --header=None") 179 | assert not captured.err 180 | captured.assert_shape(5, 2) 181 | captured.assert_columns(["0", "1"]) 182 | 183 | 184 | def test_open_headless(capsys): 185 | _call("open csv {} --header=None".format(_get_path("headless"))) 186 | captured = Capture(capsys.readouterr()) 187 | assert not captured.err 188 | captured.assert_shape(5, 2) 189 | captured.assert_columns(["0", "1"]) 190 | 191 | 192 | def test_diff_all(phmgr): 193 | with phmgr() as captured: 194 | _call("diff --periods=2") 195 | assert not captured.err 196 | captured.assert_shape(6, 2) 197 | captured.assert_columns(["x", "y"]) 198 | 199 | 200 | def test_diff_xy(phmgr): 201 | with phmgr() as captured: 202 | _call("diff x y --periods=2") 203 | assert not captured.err 204 | 205 | 206 | def test_diff_with_date_col(phmgr): 207 | with phmgr("usa") as captured: 208 | _call("diff cases deaths") 209 | assert not captured.err 210 | captured.assert_shape(93, 7) 211 | df = captured.df 212 | c = list(df["cases"]) 213 | d = list(df["deaths"]) 214 | assert c[1:6] == [-3403.0, -3235.0, 1619.0, -1284.0, -1898.0] 215 | assert d[1:7] == [-248.0, -343.0, 166.0, -73.0, -165.0, 3.0] 216 | 217 | 218 | @pytest.mark.skipif( 219 | os.getenv("GITHUB_WORKFLOW") is not None, reason="clipboard not on headless" 220 | ) 221 | def test_clipboard(capsys): 222 | # This test is a bit nasty as we necessarily need to modify the 223 | # clipboard. We do, however, try to preserve the content. YMMV. 
224 | import pandas.io.clipboard as cp 225 | 226 | old = cp.paste() 227 | try: 228 | df = pd.read_csv(_get_path("a")) 229 | df.to_clipboard() 230 | 231 | _call("from clipboard") 232 | captured = Capture(capsys.readouterr()) 233 | assert not captured.err 234 | df = captured.df 235 | captured.assert_shape(6, 2) 236 | finally: 237 | cp.copy(old) 238 | 239 | 240 | def test_sep_from(phmgr): 241 | with phmgr("d", extension="scsv") as captured: 242 | _call("from csv --sep=;") 243 | assert not captured.err 244 | captured.assert_shape(6, 3) 245 | 246 | 247 | def test_from_skiprows(phmgr): 248 | with phmgr("f") as captured: 249 | _call("from csv --skiprows=6") 250 | assert not captured.err 251 | df = captured.df 252 | captured.assert_shape(2, 2) 253 | assert list(df.iloc[0]) == [14, 13] 254 | assert list(df.iloc[1]) == [16, 21] 255 | 256 | 257 | def test_sep_to_with_sep(capsys, monkeypatch): 258 | monkeypatch.setattr("sys.stdin", _get_io("d")) 259 | _call("to csv --sep=_") 260 | captured = Capture(capsys.readouterr()) 261 | assert not captured.err 262 | captured.assert_shape(6, 1) 263 | 264 | df = pd.read_csv(io.StringIO(captured.out), sep="_") 265 | assert list(df.shape) == [6, 3] 266 | assert list(df["year"]) == list(range(2003, 2009)) 267 | 268 | 269 | def test_sep_to_with_index(capsys, monkeypatch): 270 | monkeypatch.setattr("sys.stdin", _get_io("d")) 271 | _call("to csv --index=true") 272 | captured = Capture(capsys.readouterr()) 273 | assert not captured.err 274 | captured.assert_shape(6, 4) 275 | 276 | 277 | def test_thousands_from(capsys, monkeypatch): 278 | monkeypatch.setattr("sys.stdin", _get_io("t", extension="tsv")) 279 | _call("from csv --thousands=, --sep=\t") 280 | captured = Capture(capsys.readouterr()) 281 | assert not captured.err 282 | df = captured.df 283 | captured.assert_shape(7, 2) 284 | assert all(df["a"] == 10 ** df["b"]) 285 | 286 | 287 | def test_thousands_from_escaped_tab(capsys, monkeypatch): 288 | monkeypatch.setattr("sys.stdin", _get_io("t", extension="tsv")) 289 | _call("from csv --thousands=, --sep=\\t") 290 | captured = Capture(capsys.readouterr()) 291 | assert not captured.err 292 | df = captured.df 293 | captured.assert_shape(7, 2) 294 | assert all(df["a"] == 10 ** df["b"]) 295 | 296 | 297 | def test_strip_default(phmgr): 298 | with phmgr("right") as captured: 299 | _call("strip --lstrip=True") 300 | assert not captured.err 301 | captured.assert_shape(4, 4) 302 | captured.assert_columns(["key1", "key2", "C", "D"]) 303 | 304 | assert list(captured.df["C"]) == ["C0", "C1", "C2", "C3"] 305 | 306 | 307 | def test_strip_actual(phmgr): 308 | with phmgr("strip") as captured: 309 | _call("strip date") 310 | assert not captured.err 311 | 312 | captured.assert_shape(6, 4) 313 | captured.assert_columns(["idx", "date", "x", "y"]) 314 | _assert_a(captured.df[["x", "y"]]) 315 | assert list(captured.df.date) == ["2020-05-{}".format(i) for i in range(12, 18)] 316 | 317 | 318 | def test_removeprefix(phmgr): 319 | with phmgr("left") as captured: 320 | _call("removeprefix A A") 321 | assert not captured.err 322 | captured.assert_shape(4, 4) 323 | captured.assert_columns(LEFT_COLUMNS) 324 | assert list(captured.df["A"]) == [0, 1, 2, 3] 325 | 326 | 327 | def test_removesuffix(phmgr): 328 | with phmgr("left") as captured: 329 | _call("removesuffix key1 0") 330 | assert not captured.err 331 | captured.assert_shape(4, 4) 332 | captured.assert_columns(LEFT_COLUMNS) 333 | assert list(captured.df["key1"]) == ["K", "K", "K1", "K2"] 334 | 335 | 336 | def test_describe(phmgr): 337 | with 
phmgr() as captured: 338 | _call("describe") 339 | assert len(captured.out.split("\n")) == 10 340 | header = set(captured.out.split("\n")[0].split()) 341 | assert "x" in header 342 | assert "y" in header 343 | assert "max" in captured.out 344 | 345 | 346 | def test_shape(phmgr): 347 | with phmgr("covid") as captured: 348 | _call("shape") 349 | df = captured.df 350 | captured.assert_columns(["rows", "columns"]) 351 | assert list(df["rows"]) == [29] 352 | assert list(df["columns"]) == [10] 353 | 354 | 355 | def test_transpose(phmgr): 356 | with phmgr() as captured: 357 | _call("transpose") 358 | assert ( 359 | captured.out 360 | == """\ 361 | 0,1,2,3,4,5 362 | 3,4,5,6,7,8 363 | 8,9,10,11,12,13 364 | """ 365 | ) 366 | 367 | 368 | def test_head_tail(capsys, monkeypatch): 369 | monkeypatch.setattr("sys.stdin", _get_io("a")) 370 | _call("head 7") 371 | captured = capsys.readouterr() 372 | assert not captured.err 373 | 374 | monkeypatch.setattr("sys.stdin", io.StringIO(captured.out)) 375 | _call("tail 3") 376 | captured = capsys.readouterr() 377 | assert ( 378 | captured.out 379 | == """\ 380 | x,y 381 | 6,11 382 | 7,12 383 | 8,13 384 | """ 385 | ) 386 | assert not captured.err 387 | 388 | 389 | def test_open_with_decimals(phmgr): 390 | with phmgr("padded_decimals") as captured: 391 | _call("from csv --decimal=, --thousands=.") 392 | assert not captured.err 393 | df = captured.df 394 | captured.assert_shape(7, 2) 395 | assert "paddecim" in df.columns 396 | assert str(df["paddecim"].dtype).startswith("float") 397 | assert df["paddecim"].sum() == 1470.0 * 2 398 | 399 | 400 | def test_from_with_decimals(capsys, monkeypatch): 401 | monkeypatch.setattr("sys.stdin", _get_io("padded_decimals")) 402 | _call("from csv --decimal=, --thousands=.") 403 | captured = Capture(capsys.readouterr()) 404 | 405 | assert not captured.err 406 | df = captured.df 407 | captured.assert_shape(7, 2) 408 | assert "paddecim" in df.columns 409 | assert str(df["paddecim"].dtype).startswith("float") 410 | assert df["paddecim"].sum() == 1470.0 * 2 411 | 412 | 413 | def test_date(phmgr): 414 | with phmgr() as captured: 415 | _call("date x --unit=D") 416 | df = captured.df 417 | df["x"] = pd.to_datetime(captured.df["x"]) 418 | assert list(df["y"]) == list(range(8, 14)) 419 | x = list(df["x"]) 420 | assert len(list(df["x"])) == 6 421 | for i in range(6): 422 | assert str(x[i]) == "1970-01-0{} 00:00:00".format(i + 4) 423 | 424 | with phmgr("d") as captured: 425 | _call("date") 426 | df = captured.df 427 | assert len(df) == 6 428 | captured.assert_columns(["0"]) 429 | act = [str(x) for x in df["0"]] 430 | exp = [ 431 | "2003-03-08", 432 | "2004-04-09", 433 | "2005-05-10", 434 | "2006-06-11", 435 | "2007-07-12", 436 | "2008-08-13", 437 | ] 438 | assert act == exp 439 | 440 | 441 | def test_date_dayfirst(phmgr): 442 | with phmgr("usa") as captured: 443 | _call("date dateRep --dayfirst=True") 444 | df = captured.df 445 | captured.assert_shape(93, 7) 446 | df["dateRep"] = pd.to_datetime(df["dateRep"]) 447 | df["realdate"] = pd.to_datetime(df[["year", "month", "day"]]) 448 | assert all(df["realdate"] == df["dateRep"]) 449 | 450 | 451 | def test_date_errors(phmgr): 452 | with pytest.raises(SystemExit) as exit_: 453 | with phmgr("derr") as captured: 454 | _call("date --col=x") 455 | assert str(exit_.value) == "ph date: Unknown column x" 456 | 457 | with pytest.raises(SystemExit) as exit_: 458 | with phmgr("derr") as captured: 459 | _call("date --col=year") 460 | assert str(exit_.value).startswith("Out of bounds nanosecond timestamp") 461 | 
462 | with pytest.raises(SystemExit) as exit_: 463 | with phmgr("derr") as captured: 464 | _call("date --col=year --errors=nosucherr") 465 | assert str(exit_.value).startswith("Errors must be one of") 466 | 467 | with phmgr("derr") as captured: 468 | _call("date --col=year --errors=coerce") 469 | assert not captured.err 470 | df = captured.df 471 | assert df["year"].dtype == dt.datetime 472 | 473 | with phmgr("derr") as captured: 474 | _call("date --col=year --errors=ignore") 475 | assert not captured.err 476 | df = captured.df 477 | assert "200-01" in list(df["year"]) 478 | 479 | 480 | def test_date_fmt(phmgr): 481 | with phmgr("date-fmt") as captured: 482 | _call("date date --format=%Y_%m/%d") 483 | assert not captured.err 484 | captured.assert_shape(6, 3) 485 | captured.assert_columns(["date", "x", "y"]) 486 | _assert_a(captured.df[["x", "y"]]) 487 | dates = list(captured.df["date"]) 488 | assert dates == [ 489 | "2020-02-02", 490 | "2020-02-03", 491 | "2020-02-04", 492 | "2020-02-05", 493 | "2020-02-06", 494 | "2020-02-07", 495 | ] 496 | 497 | 498 | def test_date_utc(phmgr): 499 | with phmgr("date-utc") as captured: 500 | _call("date date --utc=True") 501 | assert not captured.err 502 | captured.assert_shape(6, 3) 503 | captured.assert_columns(["date", "x", "y"]) 504 | _assert_a(captured.df[["x", "y"]]) 505 | dates = list(captured.df["date"]) 506 | assert dates == [ 507 | "2020-02-02", 508 | "2020-02-03", 509 | "2020-02-04", 510 | "2020-02-05", 511 | "2020-02-06", 512 | "2020-02-07", 513 | ] 514 | 515 | 516 | def test_eval(phmgr): 517 | with phmgr() as captured: 518 | _call("eval", ["x = x**2"]) 519 | assert ( 520 | captured.out 521 | == """\ 522 | x,y 523 | 9,8 524 | 16,9 525 | 25,10 526 | 36,11 527 | 49,12 528 | 64,13 529 | """ 530 | ) 531 | 532 | 533 | def test_dropna(phmgr): 534 | with phmgr("covid") as captured: 535 | _call("dropna") 536 | captured.assert_shape(5, 10) 537 | 538 | with phmgr("covid") as captured: 539 | _call("dropna --thresh=7") 540 | captured.assert_shape(15, 10) 541 | 542 | with phmgr("covid") as captured: 543 | _call("dropna --axis=1 --thresh=17") 544 | captured.assert_shape(29, 5) 545 | 546 | 547 | def test_fillna(phmgr): 548 | with phmgr("covid") as captured: 549 | _call("fillna 17") 550 | assert captured.df["Canada"].sum() == 1401 551 | 552 | with phmgr("covid") as captured: 553 | _call("fillna 19 --limit=3") 554 | assert captured.df["Canada"].sum() == 1050 555 | 556 | with phmgr("covid") as captured: 557 | _call("fillna --method=pad --limit=5") 558 | assert captured.df["Canada"].sum() == 2493 559 | 560 | with pytest.raises(SystemExit) as exit_: 561 | _call("fillna") 562 | assert "'ph fillna' needs exactly one of" in str(exit_.value) 563 | 564 | 565 | def test_merge(capsys): 566 | lft = _get_path("left") 567 | rht = _get_path("right") 568 | ph.merge(lft, rht) 569 | cap = Capture(capsys.readouterr()) 570 | assert not cap.err 571 | cap.assert_shape(3, 6) 572 | 573 | ph.merge(lft, rht, how="left") 574 | cap = Capture(capsys.readouterr()) 575 | assert not cap.err 576 | cap.assert_shape(5, 6) 577 | 578 | ph.merge(lft, rht, how="outer") 579 | cap = Capture(capsys.readouterr()) 580 | assert not cap.err 581 | cap.assert_shape(6, 6) 582 | 583 | ph.merge(lft, rht, on="key1") 584 | cap = Capture(capsys.readouterr()) 585 | assert not cap.err 586 | cap.assert_shape(5, 7) 587 | 588 | lm = _get_path("mergel") 589 | rm = _get_path("merger") 590 | ph.merge(lm, rm, left="lk2", right="rk2") 591 | cap = Capture(capsys.readouterr()) 592 | assert not cap.err 593 | 
cap.assert_shape(3, 8) 594 | assert list(cap.df.iloc[0]) == [ 595 | "K0", 596 | "K5", 597 | "A0", 598 | "B0", 599 | "K4", 600 | "K5", 601 | "A2", 602 | "B2", 603 | ] 604 | 605 | 606 | def test_groupby_sum_default(phmgr): 607 | with phmgr("group") as captured: 608 | _call("groupby Animal") 609 | assert not captured.err 610 | df = captured.df 611 | captured.assert_shape(2, 2) 612 | assert list(df.iloc[0]) == ["Falcon", 750.0] 613 | assert list(df.iloc[1]) == ["Parrot", 50.0] 614 | 615 | 616 | def test_groupby_sum(phmgr): 617 | with phmgr("group") as captured: 618 | _call("groupby Animal --how=sum") 619 | assert not captured.err 620 | df = captured.df 621 | captured.assert_shape(2, 2) 622 | assert list(df.iloc[0]) == ["Falcon", 750.0] 623 | assert list(df.iloc[1]) == ["Parrot", 50.0] 624 | 625 | 626 | def test_groupby_mean(phmgr): 627 | with phmgr("group") as captured: 628 | _call("groupby Animal --how=count --as_index=True") 629 | assert not captured.err 630 | df = captured.df 631 | captured.assert_shape(2, 1) 632 | assert list(df.iloc[0]) == [2] 633 | assert list(df.iloc[1]) == [2] 634 | 635 | 636 | def test_groupby_first(phmgr): 637 | with phmgr("group") as captured: 638 | _call("groupby Animal --how=first") 639 | assert not captured.err 640 | df = captured.df 641 | captured.assert_shape(2, 2) 642 | assert list(df.iloc[0]) == ["Falcon", 380.0] 643 | assert list(df.iloc[1]) == ["Parrot", 24.0] 644 | 645 | 646 | def test_rolling_default(phmgr): 647 | with phmgr("iris") as captured: 648 | _call("rolling 3") 649 | assert not captured.err 650 | captured.assert_shape(150, 5) 651 | assert captured.df["setosa"].dropna().sum() == pytest.approx(1671.0, 0.01) 652 | 653 | 654 | def test_rolling_mean(phmgr): 655 | with phmgr("iris") as captured: 656 | _call("rolling 7 --how=mean") 657 | assert not captured.err 658 | captured.assert_shape(150, 5) 659 | assert captured.df["setosa"].dropna().sum() == pytest.approx(543.83, 0.01) 660 | 661 | 662 | def test_rolling_subset_columns(phmgr): 663 | with phmgr("date-fmt") as captured: 664 | _call("rolling 3 x y --how=median") 665 | assert not captured.err 666 | captured.assert_shape(6, 3) 667 | captured.assert_columns(["date", "x", "y"]) 668 | x = list(captured.df["x"]) 669 | y = list(captured.df["y"]) 670 | date = list(captured.df["date"]) 671 | assert math.isnan(x[0]) 672 | assert math.isnan(x[1]) 673 | assert math.isnan(y[0]) 674 | assert math.isnan(y[1]) 675 | assert x[2:] == [4, 5, 6, 7] 676 | assert y[2:] == [9, 10, 11, 12] 677 | assert date == ["2020_02/0{}".format(i) for i in range(2, 8)] 678 | 679 | 680 | def test_rolling_broken_window(phmgr): 681 | with phmgr("date-fmt") as _: 682 | with pytest.raises(SystemExit) as exit_: 683 | _call("rolling 3") 684 | err = 'ph rolling: Could not perform rolling window on column "date"' 685 | assert str(exit_.value) == err 686 | 687 | 688 | def test_ewm_default(phmgr): 689 | with phmgr("iris") as captured: 690 | _call("ewm 2 --com=0.5") 691 | assert not captured.err 692 | captured.assert_shape(150, 5) 693 | assert captured.df["setosa"].dropna().sum() == pytest.approx(560.411) 694 | 695 | 696 | def test_expanding_default(phmgr): 697 | with phmgr("iris") as captured: 698 | _call("expanding 3") 699 | assert not captured.err 700 | captured.assert_shape(150, 5) 701 | assert captured.df["setosa"].dropna().sum() == pytest.approx(32468.9) 702 | 703 | 704 | def test_expanding_quantile(phmgr): 705 | with phmgr("iris") as captured: 706 | _call("expanding 3 --how=quantile --quantile=0.9") 707 | assert not captured.err 708 | 
captured.assert_shape(150, 5) 709 | assert captured.df["setosa"].dropna().sum() == pytest.approx(563.81) 710 | 711 | 712 | def test_index(phmgr): 713 | with phmgr("a") as captured: 714 | _call("index") 715 | 716 | assert not captured.err 717 | assert list(captured.df["index"]) == [i for i in range(6)] 718 | 719 | 720 | def test_split(phmgr): 721 | with phmgr("padded_decimals") as captured: 722 | _call("split paddecim ,") 723 | assert not captured.err 724 | captured.assert_shape(7, 3) 725 | captured.assert_columns(["idx", "paddecim", "paddecim_rhs"]) 726 | captured.read_df(thousands=".") 727 | assert set(captured.df["paddecim_rhs"]) == {0, 50} 728 | assert list(captured.df["paddecim"]) == [ 729 | 502, 730 | 172, 731 | 7, 732 | 142, 733 | 157, 734 | 487, 735 | 1470, 736 | ] 737 | 738 | 739 | def test_split_intcol(phmgr): 740 | """Testing that columns that are of int type can be split""" 741 | with phmgr("usa") as captured: 742 | _call("split year 0") 743 | assert not captured.err 744 | captured.assert_shape(93, 8) 745 | captured.assert_columns( 746 | ["dateRep", "day", "month", "year", "cases", "deaths", "geoId", "year_rhs"] 747 | ) 748 | 749 | 750 | def test_split_twice(capsys, monkeypatch): 751 | monkeypatch.setattr("sys.stdin", _get_io("date-fmt")) 752 | _call("split date /") 753 | captured = capsys.readouterr() 754 | assert not captured.err 755 | 756 | monkeypatch.setattr("sys.stdin", io.StringIO(captured.out)) 757 | _call("split date _") 758 | captured = capsys.readouterr() 759 | assert ( 760 | captured.out 761 | == """\ 762 | date,x,y,date_rhs,date_rhs_2 763 | 2020,3,8,2,02 764 | 2020,4,9,3,02 765 | 2020,5,10,4,02 766 | 2020,6,11,5,02 767 | 2020,7,12,6,02 768 | 2020,8,13,7,02 769 | """ 770 | ) 771 | assert not captured.err 772 | 773 | 774 | def test_sort(phmgr): 775 | with phmgr("iris") as captured: 776 | _call("sort setosa") 777 | assert not captured.err 778 | lst = list(captured.df["setosa"]) 779 | assert lst == sorted(lst) 780 | 781 | 782 | def test_grep_case_1(phmgr): 783 | with phmgr("left") as captured: 784 | _call("grep k0") 785 | assert not captured.err 786 | captured.assert_shape(0, 4) 787 | captured.assert_columns(LEFT_COLUMNS) 788 | 789 | 790 | def test_grep_case_0(phmgr): 791 | with phmgr("left") as captured: 792 | _call("grep k0 --case=0") 793 | assert not captured.err 794 | captured.assert_shape(3, 4) 795 | captured.assert_columns(LEFT_COLUMNS) 796 | 797 | 798 | def test_grep_case_false(phmgr): 799 | with phmgr("left") as captured: 800 | _call("grep k0 --case=False") 801 | assert not captured.err 802 | captured.assert_shape(3, 4) 803 | captured.assert_columns(LEFT_COLUMNS) 804 | 805 | 806 | def test_grep_col1(phmgr): 807 | with phmgr("left") as captured: 808 | _call("grep K0 --column=key1") 809 | assert not captured.err 810 | captured.assert_shape(2, 4) 811 | captured.assert_columns(LEFT_COLUMNS) 812 | assert list(captured.df["A"]) == ["A0", "A1"] 813 | 814 | 815 | def test_grep_col2(phmgr): 816 | with phmgr("left") as captured: 817 | _call("grep K0 --column=key2") 818 | assert not captured.err 819 | captured.assert_shape(2, 4) 820 | captured.assert_columns(LEFT_COLUMNS) 821 | assert list(captured.df["A"]) == ["A0", "A2"] 822 | 823 | 824 | def test_grep_col1_pattern(phmgr): 825 | with phmgr("left") as captured: 826 | ph.grep("K[0|1]", column="key1") 827 | assert not captured.err 828 | captured.assert_shape(3, 4) 829 | captured.assert_columns(LEFT_COLUMNS) 830 | assert list(captured.df["A"]) == ["A0", "A1", "A2"] 831 | 832 | 833 | def test_grep_col1_pattern_regex(phmgr): 
834 | with phmgr("left") as captured: 835 | _call("grep K. --column=key1") 836 | assert not captured.err 837 | captured.assert_shape(4, 4) 838 | captured.assert_columns(LEFT_COLUMNS) 839 | assert list(captured.df["A"]) == ["A0", "A1", "A2", "A3"] 840 | 841 | 842 | def test_grep_col1_pattern_regex_off(phmgr): 843 | with phmgr("left") as captured: 844 | _call("grep K. --column=key1 --regex=False") 845 | assert not captured.err 846 | captured.assert_shape(0, 4) 847 | captured.assert_columns(LEFT_COLUMNS) 848 | 849 | 850 | def test_polyfit(phmgr): 851 | with phmgr() as captured: 852 | _call("polyfit x y") 853 | assert not captured.err 854 | df = captured.df 855 | assert list(df.columns) == ["x", "y", "polyfit_1"] 856 | assert df["y"].equals(df["polyfit_1"].astype(int)) 857 | 858 | 859 | def test_version(phmgr): 860 | import ph._version 861 | 862 | with phmgr() as captured: 863 | ph.print_version() 864 | assert not captured.err 865 | assert captured.out == ph._version.__version__ + "\n" 866 | 867 | 868 | def test_slugify_method(): 869 | actexp = { 870 | "abc": "abc", 871 | "abc123": "abc123", 872 | "abc_ 123 ": "abc_123", 873 | "abc(123)": "abc_123", 874 | "abc(123)_": "abc_123_", 875 | "(abc)/123": "abc_123", 876 | "_abc: 123": "_abc_123", 877 | '[]()abc-^ \\ "': "abc", 878 | "0": "0_", 879 | 0: "0_", 880 | -3: "3_", 881 | "-3": "3_", 882 | "3.14": "3_14_", 883 | 3.14: "3_14_", 884 | } 885 | for act, exp in actexp.items(): 886 | assert ph.slugify_name(act) == exp 887 | 888 | 889 | def test_replace(phmgr): 890 | with phmgr() as captured: 891 | _call("replace 8 100") 892 | assert not captured.err 893 | captured.assert_shape(6, 2) 894 | assert list(captured.df.x) == list(range(3, 8)) + [100] 895 | assert list(captured.df.y) == [100] + list(range(9, 14)) 896 | 897 | 898 | def test_replace_col_and_inf(phmgr): 899 | with phmgr("inf") as captured: 900 | _call("replace inf 0 --column=x") 901 | assert not captured.err 902 | captured.assert_shape(6, 2) 903 | 904 | x = captured.df.x 905 | lx = list(x) 906 | xna = x.dropna() 907 | assert list(xna) == [0, 7, 8] 908 | assert math.isnan(lx[0]) 909 | assert math.isnan(lx[1]) 910 | assert math.isnan(lx[2]) 911 | assert list(captured.df.y) == list(range(8, 14)) 912 | 913 | 914 | def test_slice(phmgr): 915 | with phmgr("a") as captured: 916 | _call("slice 0:3") 917 | assert not captured.err 918 | captured.assert_shape(3, 2) 919 | assert list(captured.df.x) == list(range(3, 6)) 920 | assert list(captured.df.y) == list(range(8, 11)) 921 | 922 | 923 | def test_slice_end(phmgr): 924 | with phmgr("a") as captured: 925 | _call("slice :3") 926 | assert not captured.err 927 | captured.assert_shape(3, 2) 928 | assert list(captured.df.x) == list(range(3, 6)) 929 | assert list(captured.df.y) == list(range(8, 11)) 930 | 931 | 932 | def test_slice_start(phmgr): 933 | with phmgr("a") as captured: 934 | _call("slice 3:") 935 | assert not captured.err 936 | captured.assert_shape(3, 2) 937 | assert list(captured.df.x) == list(range(6, 9)) 938 | assert list(captured.df.y) == list(range(11, 14)) 939 | 940 | 941 | def test_slice_start_step(phmgr): 942 | with phmgr("a") as captured: 943 | _call("slice 1::2") 944 | assert not captured.err 945 | captured.assert_shape(3, 2) 946 | assert list(captured.df.x) == list(range(4, 9, 2)) 947 | assert list(captured.df.y) == list(range(9, 14, 2)) 948 | 949 | 950 | def test_slice_start_end_step(phmgr): 951 | with phmgr("a") as captured: 952 | _call("slice 1:5:2") 953 | assert not captured.err 954 | captured.assert_shape(2, 2) 955 | assert 
list(captured.df.x) == list(range(4, 7, 2)) 956 | assert list(captured.df.y) == list(range(9, 12, 2)) 957 | 958 | 959 | def test_slugify_df(phmgr): 960 | with phmgr("slugit") as captured: 961 | _call("slugify") 962 | 963 | assert not captured.err 964 | 965 | cols = list(captured.df.columns) 966 | assert cols == ["stupid_column_1", "jerky_column_no_2"] 967 | 968 | 969 | def test_slugify_rename_df(capsys, monkeypatch): 970 | monkeypatch.setattr("sys.stdin", _get_io("slugit")) 971 | _call("slugify") 972 | captured = Capture(capsys.readouterr()) 973 | 974 | assert not captured.err 975 | cols = list(captured.df.columns) 976 | assert cols == ["stupid_column_1", "jerky_column_no_2"] 977 | 978 | monkeypatch.setattr("sys.stdin", io.StringIO(captured.out)) 979 | _call("rename stupid_column_1 first") 980 | captured = Capture(capsys.readouterr()) 981 | assert not captured.err 982 | cols = list(captured.df.columns) 983 | assert cols == ["first", "jerky_column_no_2"] 984 | 985 | monkeypatch.setattr("sys.stdin", io.StringIO(captured.out)) 986 | _call("rename jerky_column_no_2 second") 987 | captured = Capture(capsys.readouterr()) 988 | assert not captured.err 989 | cols = list(captured.df.columns) 990 | assert cols == ["first", "second"] 991 | 992 | 993 | def test_doc_plot(capsys): 994 | _call("help plot") 995 | captured = Capture(capsys.readouterr()) 996 | assert not captured.err 997 | assert "Plot the csv file" in captured.out 998 | 999 | 1000 | def test_doc_open_(capsys): # tests registerx 1001 | _call("help open") 1002 | captured = Capture(capsys.readouterr()) 1003 | assert not captured.err 1004 | assert "Use a reader to open a file" in captured.out 1005 | 1006 | 1007 | def test_median(phmgr): 1008 | with phmgr() as captured: 1009 | _call("median") 1010 | assert not captured.err 1011 | assert captured.out == "x,y\n5.5,10.5\n" 1012 | 1013 | 1014 | @pytest.mark.filterwarnings("ignore") 1015 | @pytest.mark.skipif(not __have_xlrd(), reason="missing xlrd") 1016 | def test_xlsx_default_sheet_0(capsys): 1017 | pth = _get_path("sheet", extension="xlsx") 1018 | cmd = "open excel {} {}".format(pth, "--skiprows=4") 1019 | _call(cmd) 1020 | captured = Capture(capsys.readouterr()) 1021 | assert not captured.err 1022 | _assert_a(captured.df) 1023 | 1024 | 1025 | @pytest.mark.filterwarnings("ignore") 1026 | @pytest.mark.skipif(not __have_xlrd(), reason="missing xlrd") 1027 | def test_xlsx_explicit_sheet_0(capsys): 1028 | pth = _get_path("sheet", extension="xlsx") 1029 | cmd = "open excel {} {} {}".format(pth, "--skiprows=4", "--sheet_name=0") 1030 | _call(cmd) 1031 | captured = Capture(capsys.readouterr()) 1032 | assert not captured.err 1033 | _assert_a(captured.df) 1034 | 1035 | 1036 | @pytest.mark.filterwarnings("ignore") 1037 | @pytest.mark.skipif(not __have_xlrd(), reason="missing xlrd") 1038 | def test_xlsx_sheet_1(capsys): 1039 | pth = _get_path("sheet", extension="xlsx") 1040 | cmd = "open excel {} {} {}".format(pth, "--skiprows=1", "--sheet_name=1") 1041 | _call(cmd) 1042 | captured = Capture(capsys.readouterr()) 1043 | assert not captured.err 1044 | captured.assert_shape(6, 4) 1045 | captured.assert_columns(["Unnamed: 0", "year", "month", "day"]) 1046 | 1047 | 1048 | @pytest.mark.filterwarnings("ignore") 1049 | @pytest.mark.skipif(not __have_xlrd(), reason="missing xlrd") 1050 | def test_xlsx_borked(capsys): 1051 | with pytest.raises(SystemExit) as exit_: 1052 | pth = _get_path("sheet", extension="xlsx") 1053 | cmd = "open excel {} {} {}".format(pth, "--skiprows=4", "--sheet_name=None") 1054 | _call(cmd) 
1055 | 1056 | errm = 'Specify --sheet_name="a sheet with spaces|the other sheet"' 1057 | assert str(exit_.value) == errm 1058 | 1059 | 1060 | def test_raw(phmgr): 1061 | with phmgr("broken") as captured: 1062 | _call("raw") 1063 | 1064 | assert not captured.err 1065 | captured.assert_shape(7, 5) 1066 | 1067 | 1068 | _COVID_COLS = [ 1069 | "China", 1070 | "S. Korea", 1071 | "Italy", 1072 | "Iran", 1073 | "France", 1074 | "Germany", 1075 | "Spain", 1076 | "USA", 1077 | "UK", 1078 | "Canada", 1079 | ] 1080 | 1081 | 1082 | def test_spencer(phmgr): 1083 | with phmgr("covid") as captured: 1084 | _call("spencer") 1085 | 1086 | assert not captured.err 1087 | captured.assert_shape(29, 10) 1088 | captured.assert_columns(_COVID_COLS) 1089 | -------------------------------------------------------------------------------- /ph/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from .tabulate import tabulate as tabulate_ 4 | import sys 5 | import pandas as pd 6 | import re 7 | import datetime 8 | 9 | 10 | def _get_version(): 11 | import ph._version 12 | 13 | return ph._version.__version__ 14 | 15 | 16 | def print_version(): 17 | print(_get_version()) 18 | 19 | 20 | # Command line parsing of (1) --abc and (2) --abc=def 21 | KWARG = re.compile("^--[a-z0-9_-]+$") 22 | KWARG_WITH_VALUE = re.compile("^--[a-z0-9_-]+=") 23 | 24 | 25 | USAGE_TEXT = """ 26 | ph is a command line tool for streaming csv data. 27 | 28 | If you have a csv file `a.csv`, you can pipe it through `ph` on the 29 | command line by using 30 | 31 | $ cat a.csv | ph columns x y | ph eval "z = x**2 - y" | ph show 32 | 33 | Use ph help [command] for help on the individual commands. 34 | 35 | A list of available commands follows. 36 | """ 37 | 38 | COMMANDS = {} 39 | DOCS = {} 40 | 41 | 42 | def _gpx(fname): 43 | try: 44 | import gpxpy 45 | except ImportError: 46 | sys.exit("ph gpx needs gpxpy, pip install ph[gpx]") 47 | 48 | def from_trackpoint(tp=None): 49 | if tp is None: 50 | return "time", "latitude", "longitude", "elevation", "distance" 51 | p = tp.point 52 | return str(p.time), p.latitude, p.longitude, p.elevation, tp.distance_from_start 53 | 54 | with open(fname, "r") as fin: 55 | gpx = gpxpy.parse(fin) 56 | data = gpx.get_points_data() 57 | columns = from_trackpoint() 58 | dfdata = [from_trackpoint(tp) for tp in data] 59 | return pd.DataFrame(dfdata, columns=columns) 60 | 61 | 62 | def _tsv(*args, **kwargs): 63 | kwargs["sep"] = "\t" 64 | return pd.read_csv(*args, **kwargs) 65 | 66 | 67 | # These are all lambdas because they lazy load, and some of these 68 | # readers are introduced in later pandas. 
69 | READERS = { 70 | "csv": pd.read_csv, 71 | "clipboard": pd.read_clipboard, 72 | "fwf": pd.read_fwf, 73 | "json": pd.read_json, 74 | "html": pd.read_html, 75 | "tsv": _tsv, 76 | "gpx": _gpx, 77 | } 78 | 79 | try: 80 | READERS["excel"] = pd.read_excel 81 | READERS["xls"] = pd.read_excel 82 | READERS["odf"] = pd.read_excel 83 | except AttributeError: 84 | pass 85 | 86 | try: 87 | READERS["hdf5"] = pd.read_hdf 88 | except AttributeError: 89 | pass 90 | 91 | 92 | try: 93 | READERS["feather"] = pd.read_feather 94 | except AttributeError: 95 | pass 96 | 97 | 98 | try: 99 | READERS["parquet"] = pd.read_parquet 100 | except AttributeError: 101 | pass 102 | 103 | 104 | try: 105 | READERS["orc"] = pd.read_orc 106 | except AttributeError: 107 | pass 108 | 109 | 110 | try: 111 | READERS["msgpack"] = pd.read_msgpack 112 | except AttributeError: 113 | pass 114 | 115 | 116 | try: 117 | READERS["stata"] = pd.read_stata 118 | except AttributeError: 119 | pass 120 | 121 | 122 | try: 123 | READERS["sas"] = pd.read_sas 124 | except AttributeError: 125 | pass 126 | 127 | 128 | try: 129 | READERS["spss"] = pd.read_spss 130 | except AttributeError: 131 | pass 132 | 133 | 134 | try: 135 | READERS["pickle"] = pd.read_pickle 136 | except AttributeError: 137 | pass 138 | 139 | 140 | try: 141 | READERS["gbq"] = pd.read_gbq 142 | except AttributeError: 143 | pass 144 | 145 | 146 | try: 147 | READERS["google"] = pd.read_gbq 148 | except AttributeError: 149 | pass 150 | 151 | 152 | try: 153 | READERS["bigquery"] = pd.read_gb 154 | except AttributeError: 155 | pass 156 | 157 | 158 | WRITERS = { 159 | "csv": "to_csv", 160 | "fwf": "to_fwf", 161 | "json": "to_json", 162 | "html": "to_html", 163 | "clipboard": "to_clipboard", 164 | "xls": "to_excel", 165 | "odf": "to_excel", 166 | "hdf5": "to_hdf", 167 | "feather": "to_feather", 168 | "parquet": "to_parquet", 169 | "orc": "to_orc", 170 | "msgpack": "to_msgpack", 171 | "stata": "to_stata", 172 | "sas": "to_sas", 173 | "spss": "to_spss", 174 | "pickle": "to_pickle", 175 | "gbq": "to_gbq", 176 | "google": "to_gbq", 177 | "bigquery": "to_gbq", 178 | # extras 179 | "tsv": "to_csv", 180 | } 181 | 182 | 183 | FALSY = ("False", "false", "No", "no", "0", False, 0, "None") 184 | TRUTHY = ("True", "true", "Yes", "yes", "1", True, 1) 185 | 186 | 187 | def _assert_col(df, col, caller=None): 188 | if col not in df.columns: 189 | if caller is not None: 190 | sys.exit("ph {}: Unknown column {}".format(caller, col)) 191 | sys.exit("Unknown column {}".format(col)) 192 | 193 | 194 | def _assert_cols(df, cols, caller=None): 195 | for col in cols: 196 | _assert_col(df, col, caller=caller) 197 | 198 | 199 | def register(fn, name=None): 200 | if name is None: 201 | name = fn.__name__ 202 | COMMANDS[name] = fn 203 | DOCS[name] = fn.__doc__ 204 | return fn 205 | 206 | 207 | def registerx(name): 208 | def inner(fn): 209 | register(fn, name) 210 | return fn 211 | 212 | return inner 213 | 214 | 215 | @register 216 | def dataset(dset=None): 217 | """Load dataset as csv. 218 | 219 | Usage: ph dataset linnerud | ph describe 220 | """ 221 | try: 222 | import sklearn.datasets 223 | except ImportError: 224 | sys.exit("You need scikit-learn. 
Install ph[data].") 225 | 226 | REALDATA = { 227 | "olivetti_faces": sklearn.datasets.fetch_olivetti_faces, 228 | "20newsgroups": sklearn.datasets.fetch_20newsgroups, 229 | "20newsgroups_vectorized": sklearn.datasets.fetch_20newsgroups_vectorized, 230 | "lfw_people": sklearn.datasets.fetch_lfw_people, 231 | "lfw_pairs": sklearn.datasets.fetch_lfw_pairs, 232 | "covtype": sklearn.datasets.fetch_covtype, 233 | "rcv1": sklearn.datasets.fetch_rcv1, 234 | "kddcup99": sklearn.datasets.fetch_kddcup99, 235 | "california_housing": sklearn.datasets.fetch_california_housing, 236 | } 237 | 238 | TOYDATA = { 239 | "boston": sklearn.datasets.load_boston, 240 | "iris": sklearn.datasets.load_iris, 241 | "diabetes": sklearn.datasets.load_diabetes, 242 | "digits": sklearn.datasets.load_digits, 243 | "linnerud": sklearn.datasets.load_linnerud, 244 | "wine": sklearn.datasets.load_wine, 245 | "breast_cancer": sklearn.datasets.load_breast_cancer, 246 | } 247 | 248 | if dset is None: 249 | print("type,name") 250 | print("\n".join("{},{}".format("real", k) for k in REALDATA)) 251 | print("\n".join("{},{}".format("toy", k) for k in TOYDATA)) 252 | sys.exit() 253 | 254 | if dset not in TOYDATA.keys() | REALDATA.keys(): 255 | sys.exit("Unknown dataset {}. See ph help dataset.".format(dset)) 256 | if dset in TOYDATA: 257 | data = TOYDATA[dset]() 258 | else: 259 | data = REALDATA[dset]() 260 | try: 261 | df = pd.DataFrame(data.data, columns=data.feature_names) 262 | except AttributeError: 263 | df = pd.DataFrame(data.data) 264 | try: 265 | df["target"] = pd.Series(data.target) 266 | except Exception: 267 | pass 268 | pipeout(df) 269 | 270 | 271 | @register 272 | def drop_duplicates(*cols): 273 | """Drop duplicates""" 274 | df = pipein() 275 | pipeout(df.drop_duplicates(cols)) 276 | 277 | 278 | @register 279 | def diff(*cols, periods=1, axis=0): 280 | """Calculate the difference of an element compared with another element 281 | in the csv file (default is element in previous row). 282 | 283 | Argument: --periods=1 284 | 285 | Periods to shift for calculating difference, default 1. Accepts 286 | negative values. 287 | 288 | Argument: --axis=0 289 | 290 | Take difference over rows (0) or columns (1), default 0. 291 | 292 | """ 293 | 294 | df = pipein() 295 | if not cols: 296 | df = df.diff(periods=periods, axis=axis) 297 | else: 298 | _assert_cols(df, cols, "diff") 299 | columns = list(cols) 300 | df[columns] = df[columns].diff(periods=periods, axis=axis) 301 | pipeout(df) 302 | 303 | 304 | @register 305 | def dropna(axis=0, how="any", thresh=None): 306 | """Remove rows (or columns) with N/A values. 307 | 308 | Argument: --axis=0 309 | Defaults to axis=0 (columns), use --axis=1 to remove rows. 310 | 311 | Argument: --how=any 312 | Defaults to how='any', which removes columns (resp. rows) containing 313 | nan values. Use how='all' to remove columns (resp. rows) containing 314 | only nan values. 315 | 316 | Argument: --thresh=5 317 | If --thresh=x is specified, will delete any column (resp. row) with 318 | fewer than x non-na values. 
319 | 320 | Usage: cat a.csv | ph dropna 321 | cat a.csv | ph dropna --axis=1 # for row-wise 322 | cat a.csv | ph dropna --thresh=5 # keep cols with >= 5 non-na 323 | cat a.csv | ph dropna --how=all # delete only if all vals na 324 | 325 | """ 326 | try: 327 | axis = int(axis) 328 | if axis not in (0, 1): 329 | sys.exit("ph dropna --axis=0 or --axis=1, not {}".format(axis)) 330 | except ValueError: 331 | sys.exit("ph dropna --axis=0 or --axis=1, not {}".format(axis)) 332 | 333 | if thresh is not None: 334 | try: 335 | thresh = int(thresh) 336 | except ValueError: 337 | sys.exit("ph dropna --thresh=0 or --thresh=1, not {}".format(thresh)) 338 | 339 | df = pipein() 340 | try: 341 | df = df.dropna(axis=axis, how=how, thresh=thresh) 342 | except Exception as err: 343 | sys.exit(str(err)) 344 | pipeout(df) 345 | 346 | 347 | def _safe_out(output): 348 | """Prints output to standard out, catching broken pipe.""" 349 | try: 350 | print(output) 351 | except BrokenPipeError: 352 | try: 353 | sys.stdout.close() 354 | except IOError: 355 | pass 356 | try: 357 | sys.stderr.close() 358 | except IOError: 359 | pass 360 | 361 | 362 | def pipeout(df, sep=",", index=False, *args, **kwargs): 363 | csv = df.to_csv(sep=sep, index=index, *args, **kwargs) 364 | output = csv.rstrip("\n") 365 | _safe_out(output) 366 | 367 | 368 | def pipein(ftype="csv", **kwargs): 369 | skiprows = kwargs.get("skiprows") 370 | if skiprows is not None: 371 | try: 372 | skiprows = int(skiprows) 373 | if skiprows < 0: 374 | raise ValueError("Negative") 375 | except ValueError: 376 | sys.exit("skiprows must be a non-negative int, not {}".format(skiprows)) 377 | kwargs["skiprows"] = skiprows 378 | 379 | if kwargs.get("sep") == "\\t": 380 | kwargs["sep"] = "\t" 381 | 382 | try: 383 | return READERS[ftype](sys.stdin, **kwargs) 384 | except pd.errors.EmptyDataError: 385 | return pd.DataFrame() 386 | except pd.errors.ParserError as err: 387 | sys.exit(str(err)) 388 | 389 | 390 | @register 391 | def fillna(value=None, method=None, limit=None): 392 | """Fill na values with a certain value or method, at most `limit` many. 393 | 394 | Takes either a value, or a method using (e.g.) --method=ffill. 395 | 396 | Argument: value 397 | If provided, replaces all N/A values with prescribed value. 398 | 399 | Argument: --method=pad 400 | If provided, value cannot be provided. Allowed methods are 401 | backfill, bfill, pad, ffill 402 | 403 | Argument: --limit=x 404 | If provided, limits number of consecutive N/A values to fill. 405 | 406 | 407 | Usage: cat a.csv | ph fillna 999.75 408 | cat a.csv | ph fillna -1 409 | cat a.csv | ph fillna --method=pad 410 | cat a.csv | ph fillna --method=pad --limit=5 411 | 412 | """ 413 | if limit is not None: 414 | try: 415 | limit = int(limit) 416 | except ValueError: 417 | sys.exit("--limit=x must be an integer, not {}".format(limit)) 418 | METHODS = ("backfill", "bfill", "pad", "ffill") 419 | if method is not None: 420 | if method not in METHODS: 421 | sys.exit("method must be one of {}, not {}".format(METHODS, method)) 422 | pipeout(pipein().fillna(method=method, limit=limit)) 423 | elif value is not None: 424 | value = __tryparse(value) 425 | pipeout(pipein().fillna(value=value, limit=limit)) 426 | else: 427 | sys.exit("'ph fillna' needs exactly one of value and method") 428 | 429 | 430 | @register 431 | def query(expr): 432 | """Using pandas queries. 
433 | 434 | Usage: cat a.csv | ph query "x > 5" 435 | 436 | """ 437 | df = pipein() 438 | new_df = df.query(expr) 439 | pipeout(new_df) 440 | 441 | 442 | @register 443 | def grep(*expr, case=True, na=float("nan"), regex=True, column=None): 444 | """Grep (with regex) for content in csv file. 445 | 446 | Usage: cat a.csv | ph grep 0 447 | cat a.csv | ph grep search_string 448 | cat a.csv | ph grep "A|B" # search hits a or b 449 | cat a.csv | ph grep "a|b" --case=False # case insensitive 450 | cat a.csv | ph grep 4 --column=x 451 | 452 | To disable regex (e.g. simple search for "." or "*" characters, use 453 | --regex=False). 454 | 455 | Search only in a specific column with --column=col. 456 | 457 | Supports regex search queries such as "0-9A-F" and "\\d" (possibly 458 | double-escaped.) 459 | 460 | """ 461 | df = pipein() 462 | 463 | if case is True or case in TRUTHY: 464 | case = True 465 | elif case in FALSY: 466 | case = False 467 | else: 468 | sys.exit("ph grep: Unknown --case={} should be True or False".format(case)) 469 | 470 | if regex is True or regex in TRUTHY: 471 | regex = True 472 | elif regex in FALSY: 473 | regex = False 474 | else: 475 | sys.exit("ph grep: Unknown --regex={} should be True or False".format(regex)) 476 | 477 | if column is not None: 478 | _assert_col(df, column, "grep") 479 | 480 | expr = " ".join(str(e) for e in expr) # force string input 481 | 482 | try: 483 | import numpy 484 | except ImportError: 485 | sys.exit("numpy needed for grep. pip install numpy") 486 | 487 | retval = df[ 488 | numpy.logical_or.reduce( 489 | [ 490 | df[col].astype(str).str.contains(expr, case=case, na=na, regex=regex) 491 | for col in (df.columns if column is None else [column]) 492 | ] 493 | ) 494 | ] 495 | pipeout(retval) 496 | 497 | 498 | @register 499 | def appendstr(col, s, newcol=None): 500 | """Special method to append a string to the end of a column. 501 | 502 | Usage: cat e.csv | ph appendstr year -01-01 | ph date year 503 | """ 504 | df = pipein() 505 | if newcol is None: 506 | newcol = col 507 | df[newcol] = df[col].astype(str) + s 508 | pipeout(df) 509 | 510 | 511 | @register 512 | def split(col, pat=" "): 513 | """Split a column in two based on a given pattern, default is " ". 514 | 515 | The resulting csv will have one extra column called "col_rhs" where 516 | "col" is the name of the column being split. 517 | 518 | Usage: cat dates.csv | ph split date / 519 | 520 | """ 521 | pat = str(pat) 522 | df = pipein() 523 | _assert_col(df, col, "split") 524 | new_name = col + "_rhs" 525 | suffix = "" 526 | name = lambda: (new_name + "_" + str(suffix)).rstrip("_") 527 | while name() in df.columns: 528 | if not suffix: 529 | suffix = 1 530 | suffix += 1 531 | df[[col, name()]] = df[col].astype(str).str.split(pat=pat, n=1, expand=True) 532 | pipeout(df) 533 | 534 | 535 | @register 536 | def strip(*cols, lstrip=False, rstrip=False): 537 | """Strip (trim) a string. 538 | 539 | Usage: cat x.csv | ph strip 540 | cat x.csv | ph strip --lstrip=True 541 | cat x.csv | ph strip --rstrip=True 542 | 543 | """ 544 | df = pipein() 545 | if not cols: 546 | cols = list(df.columns) 547 | else: 548 | cols = list(cols) 549 | _assert_cols(df, cols, "strip") 550 | for c in cols: 551 | if lstrip in TRUTHY: 552 | df[c] = df[c].str.lstrip() 553 | elif rstrip in TRUTHY: 554 | df[c] = df[c].str.rstrip() 555 | else: 556 | df[c] = df[c].str.strip() 557 | pipeout(df) 558 | 559 | 560 | @register 561 | def removeprefix(col, prefix=" "): 562 | """Remove prefix of contents of a column. 
563 | 564 | Usage: cat a.csv | ph removeprefix col1 .. 565 | 566 | See also @removesuffix @strip 567 | 568 | """ 569 | prefix = str(prefix) 570 | plen = len(prefix) 571 | df = pipein() 572 | _assert_col(df, col, "removeprefix") 573 | df[col] = df[col].apply( 574 | lambda s: str(s)[plen:] if str(s).startswith(prefix) else str(s) 575 | ) 576 | pipeout(df) 577 | 578 | 579 | @register 580 | def removesuffix(col, suffix=" "): 581 | """Remove suffix of contents of a column. 582 | 583 | Usage: cat a.csv | ph removesuffix col1 .. 584 | 585 | See also @removeprefix @strip 586 | 587 | """ 588 | suffix = str(suffix) 589 | plen = len(suffix) 590 | df = pipein() 591 | _assert_col(df, col, "removesuffix") 592 | df[col] = df[col].apply( 593 | lambda s: str(s)[:-plen] if str(s).endswith(suffix) else str(s) 594 | ) 595 | pipeout(df) 596 | 597 | 598 | @register 599 | def astype(type, column=None, newcolumn=None): 600 | """Cast a column to a different type. 601 | 602 | Usage: cat a.csv | ph astype double x [new_x] 603 | 604 | """ 605 | df = pipein() 606 | try: 607 | if column is None: 608 | df = df.astype(type) 609 | elif newcolumn is not None: 610 | df[newcolumn] = df[column].astype(type) 611 | else: 612 | df[column] = df[column].astype(type) 613 | except ValueError as err: 614 | sys.exit("Could not convert to {}: {}".format(type, err)) 615 | pipeout(df) 616 | 617 | 618 | @register 619 | def dtypes(t=None): 620 | """If no argument is provided, output types, otherwise filter on types. 621 | 622 | If no argument is provided, output a csv with two columns, "column" and 623 | "dtype". The "column" column contains the names of the columns in the input 624 | csv and the "dtype" column contains their respective types. 625 | 626 | If an argument is provided, all columns with the prescribed type is output. 627 | 628 | Usage: cat a.csv | ph dtypes 629 | cat a.csv | ph dtypes float64 630 | 631 | """ 632 | if t is None: 633 | df = pipein() 634 | newdf = pd.DataFrame(pd.Series(df.columns), columns=["column"]) 635 | newdf["dtype"] = pd.Series([str(e) for e in df.dtypes]) 636 | pipeout(newdf.T, header=False) 637 | else: 638 | df = pipein().select_dtypes(t) 639 | pipeout(df) 640 | 641 | 642 | @register 643 | def pivot(columns, index=None, values=None): 644 | """Reshape csv organized by given index / column values. 645 | 646 | Suppose b.csv is 647 | foo,bar,baz,zoo 648 | one,A,1,x 649 | one,B,2,y 650 | one,C,3,z 651 | two,A,4,q 652 | two,B,5,w 653 | two,C,6,t 654 | 655 | Usage: cat b.csv | ph pivot bar --index=foo --values=baz 656 | 657 | A B C 658 | -- --- --- --- 659 | 0 1 2 3 660 | 1 4 5 6 661 | 662 | """ 663 | pipeout(pipein().pivot(index=index, columns=columns, values=values)) 664 | 665 | 666 | @register 667 | def crosstab(column): 668 | """Perform a very simplistic crosstabulation on one column of the input csv. 669 | 670 | Usage: cat b.csv | ph crosstab foo 671 | """ 672 | newcol = "crosstab_{}".format(column) 673 | df = pd.crosstab(pipein()[column], newcol) 674 | df["id"] = list(df[newcol].index) 675 | pipeout(df) 676 | 677 | 678 | @register 679 | def groupby(*columns, how="sum", as_index=False): 680 | """Group by columns, then apply `how` function. 
681 | 682 | Usage: cat a.csv | ph groupby animal # default to sum 683 | cat a.csv | ph groupby animal --how=mean 684 | cat a.csv | ph groupby animal --how=prod 685 | cat a.csv | ph groupby animal --as_index=True # removes index 686 | """ 687 | columns = list(columns) 688 | if not columns: 689 | sys.exit("Needs at least one column to group by") 690 | df = pipein() 691 | _assert_cols(df, columns, "groupby") 692 | if as_index in TRUTHY: 693 | as_index = True 694 | elif as_index in FALSY: 695 | as_index = False 696 | else: 697 | sys.exit("--as_index=True or False, not {}".format(as_index)) 698 | 699 | grouped = df.groupby(columns, as_index=as_index) 700 | try: 701 | fn = getattr(grouped, how) 702 | except AttributeError: 703 | sys.exit("Unknown --how={}, should be sum, mean, ...".format(how)) 704 | retval = fn() 705 | 706 | pipeout(retval) 707 | 708 | 709 | @register 710 | def rolling(window, *columns, how="sum", win_type=None, std=None, beta=None, tau=None): 711 | """Rolling window calculations using provided `how` function. 712 | 713 | Usage: cat a.csv | ph rolling 3 714 | cat a.csv | ph rolling 5 --how=mean 715 | cat a.csv | ph rolling 5 colA colB --how=mean 716 | cat a.csv | ph rolling 5 --win_type=gaussian --std=7.62 717 | """ 718 | df = pipein() 719 | orig_columns = list(df.columns) 720 | columns = list(columns) 721 | _assert_cols(df, columns, "rolling") 722 | 723 | if not columns: 724 | columns = list(df.columns) 725 | 726 | noncols = [c for c in df.columns if c not in columns] 727 | 728 | rollin = df[columns].rolling(window, win_type=win_type) 729 | nonrollin = df[noncols] 730 | try: 731 | fn = getattr(rollin, how) 732 | except AttributeError: 733 | sys.exit("Unknown --how={}, should be sum, mean, ...".format(how)) 734 | 735 | if {std, beta, tau} != {None}: 736 | retval = fn(std=std, beta=beta, tau=tau) 737 | else: 738 | retval = fn() 739 | 740 | df = pd.concat([retval, nonrollin], axis=1) 741 | for col in orig_columns: 742 | if col not in df.columns: 743 | op = "ph rolling" 744 | sys.exit( 745 | '{}: Could not perform rolling window on column "{}"'.format(op, col) 746 | ) 747 | df = df[orig_columns] 748 | pipeout(df) 749 | 750 | 751 | @register 752 | def ewm( 753 | min_periods=0, 754 | adjust=True, 755 | ignore_na=False, 756 | axis=0, 757 | com=None, 758 | span=None, 759 | halflife=None, 760 | alpha=None, 761 | how="mean", 762 | ): 763 | """Provide exponential weighted functions. 764 | 765 | A related set of functions are exponentially weighted versions of 766 | several of the above statistics. A similar interface to rolling and 767 | expanding is accessed through the ewm method to receive an EWM 768 | object. 
A number of expanding EW (exponentially weighted) methods 769 | are provided: 770 | 771 | * mean 772 | * var 773 | * std 774 | * corr 775 | * cov 776 | 777 | Usage: cat a.csv | ph ewm --com=0.5 --how=mean 778 | cat a.csv | ph ewm --halflife=0.5 --how=std 779 | 780 | """ 781 | if {com, span, halflife, alpha} == {None}: 782 | sys.exit("Must pass one of com, span, halflife, or alpha") 783 | 784 | df = pipein() 785 | 786 | ewm_ = df.ewm( 787 | min_periods=min_periods, 788 | adjust=adjust, 789 | ignore_na=ignore_na, 790 | axis=axis, 791 | com=com, 792 | span=span, 793 | halflife=halflife, 794 | alpha=alpha, 795 | ) 796 | try: 797 | fn = getattr(ewm_, how) 798 | except AttributeError: 799 | sys.exit("Unknown --how={}, should be mean, var, std, corr, cov..".format(how)) 800 | 801 | retval = fn() 802 | 803 | pipeout(retval) 804 | 805 | 806 | @register 807 | def expanding(min_periods=1, axis=0, how="sum", quantile=None): 808 | """Provide expanding transformations. 809 | 810 | A common alternative to rolling statistics is to use an expanding 811 | window, which yields the value of the statistic with all the data 812 | available up to that point in time. 813 | 814 | For working with data, a number of window functions are provided for 815 | computing common window or rolling statistics. Among these are 816 | count, sum, mean, median, correlation, variance, covariance, 817 | standard deviation, skewness, and kurtosis. 818 | 819 | 820 | Usage: cat a.csv | ph expanding 821 | cat a.csv | ph expanding 1 --how=sum # above equivalent to this 822 | cat a.csv | ph expanding 2 823 | cat a.csv | ph expanding 5 --how=quantile --quantile=0.25 824 | 825 | """ 826 | 827 | df = pipein() 828 | 829 | if quantile is not None: 830 | if how != "quantile": 831 | sys.exit("Use both or none of --how=quantile and --quantile=") 832 | if how == "quantile" and quantile is None: 833 | 834 | sys.exit("--how=quantile needs --quantile=, e.g. --quantile=0.25") 835 | expanding_ = df.expanding(min_periods=min_periods, axis=axis) 836 | try: 837 | fn = getattr(expanding_, how) 838 | except AttributeError: 839 | sys.exit("Unknown --how={}, should be sum, mean, max, quantile..".format(how)) 840 | 841 | if how == "quantile": 842 | retval = fn(quantile) 843 | else: 844 | retval = fn() 845 | 846 | pipeout(retval) 847 | 848 | 849 | @register 850 | def monotonic(column, direction="+"): 851 | """Check if a certain column is monotonically increasing or decreasing. 852 | 853 | Usage: cat a.csv | ph monotonic x 854 | cat a.csv | ph monotonic x + # equivalent to above 855 | cat a.csv | ph monotonic x - # for decreasing 856 | 857 | """ 858 | df = pipein() 859 | if column not in df: 860 | sys.exit("Unknown column {}".format(column)) 861 | if direction not in "+-": 862 | sys.exit("direction must be either + or -") 863 | print("{}_monotonic".format(column)) 864 | if direction == "+": 865 | print(df[column].is_monotonic) 866 | else: 867 | print(df[column].is_monotonic_decreasing) 868 | 869 | 870 | @register 871 | def iplot(*args, **kwargs): 872 | """Use plotly/cufflinks for interactive plot. 873 | 874 | This option is similar to `plot` but creates an HTML file and opens a 875 | browser for an interactive plot. 876 | 877 | Usage: cat a.csv | ph iplot 878 | cat a.csv | ph iplot --kind=bar 879 | cat a.csv | ph iplot --kind=bar --barmode=stack 880 | cat a.csv | ph iplot --kind=scatter --mode=markers --x=x --y=y 881 | 882 | 883 | Depends on cufflinks: pip install ph[iplot]. 
884 | 885 | """ 886 | try: 887 | import cufflinks # noqa 888 | import plotly as py 889 | except ImportError: 890 | sys.exit("iplot needs cufflinks, pip install ph[iplot]") 891 | 892 | df = pipein() 893 | fig = df.iplot(*args, asFigure=True, **kwargs) 894 | py.offline.plot(fig) 895 | pipeout(df) 896 | 897 | 898 | @register 899 | def plot(*args, **kwargs): 900 | """Plot the csv file. 901 | 902 | Usage: ph plot 903 | ph plot --index=col 904 | ph plot --kind=bar 905 | ph plot --kind=scatter --x=col1 --y=col2 906 | ph plot --style=k-- 907 | ph plot --logx=True 908 | ph plot --logy=True 909 | ph plot --loglog=True 910 | ph plot --savefig=fname.png 911 | ph plot --savefig=fname.svg 912 | ph plot --savefig=fname.svg --savefig-dpi=300 913 | """ 914 | try: 915 | import matplotlib.pyplot as plt 916 | except ImportError: 917 | sys.exit("plot depends on matplotlib, install ph[plot]") 918 | 919 | df = pipein() 920 | index = kwargs.get("index") 921 | if index is not None: 922 | _assert_col(df, index, caller="plot") 923 | df = df.set_index(index) 924 | del kwargs["index"] 925 | for log_ in ("logx", "logy", "loglog"): 926 | if kwargs.get(log_) in TRUTHY: 927 | kwargs[log_] = True 928 | fname = kwargs.get("savefig") 929 | dpi = kwargs.get("savefig-dpi") 930 | 931 | if fname: 932 | del kwargs["savefig"] 933 | if dpi: 934 | del kwargs["savefig-dpi"] 935 | 936 | fig, ax = plt.subplots() 937 | df.plot(**kwargs, ax=ax) 938 | 939 | if index == "date": 940 | fig.autofmt_xdate() 941 | 942 | if fname: 943 | plt.tight_layout() 944 | plt.savefig(fname, dpi=dpi) 945 | else: 946 | plt.show() 947 | pipeout(df) 948 | 949 | 950 | @register 951 | def eval(expr): 952 | """Eval expr using pandas.DataFrame.eval. 953 | 954 | Example: cat a.csv | ph eval "z = x + y" 955 | 956 | """ 957 | df = pipein() 958 | pipeout(df.eval(expr)) 959 | 960 | 961 | @register 962 | def normalize(col=None): 963 | """Normalize a column or an entire dataframe. 964 | 965 | Usage: cat a.csv | ph normalize 966 | cat a.csv | ph normalize x 967 | 968 | 969 | Warning: This is probably not what you want. 970 | 971 | """ 972 | df = pipein() 973 | if col is None: 974 | df = (df - df.min()) / (df.max() - df.min()) 975 | else: 976 | df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min()) 977 | pipeout(df) 978 | 979 | 980 | @register 981 | def date(col=None, unit=None, origin="unix", errors="raise", dayfirst=False, **kwargs): 982 | """Assemble datetime from multiple columns or from one column 983 | 984 | --unit can be D, s, us, ns (defaults to ns, ns from origin) 985 | 986 | --origin can be unix, julian, or time offset, e.g. '2000-01-01' 987 | 988 | --errors can be raise, coerce, ignore (see pandas.to_datetime) 989 | 990 | --format a strptime format string, e.g. '%Y-%m-%d %H:%M:%S' 991 | 992 | --utc=True if the input is in utc, i.e. 
seconds from epoch 993 | 994 | Usage: cat a.csv | ph date x 995 | cat a.csv | ph date x --unit=s --origin="1984-05-17 09:30" 996 | cat a.csv | ph date x --dayfirst=True 997 | cat a.csv | ph date # if a.csv contains year, month, date 998 | cat a.csv | ph date x --format="%Y-%m-%d" 999 | cat a.csv | ph date x --utc=True 1000 | 1001 | """ 1002 | DATE_ERRORS = ("ignore", "raise", "coerce") 1003 | if errors not in DATE_ERRORS: 1004 | sys.exit("Errors must be one of {}, not {}.".format(DATE_ERRORS, errors)) 1005 | 1006 | dayfirst = dayfirst in TRUTHY 1007 | 1008 | date_parser = None 1009 | if "format" in kwargs: 1010 | date_parser = lambda d: [ 1011 | datetime.datetime.strptime(str(e), kwargs["format"]) for e in d 1012 | ] 1013 | if kwargs.get("utc") in TRUTHY: 1014 | date_parser = lambda d: [datetime.datetime.utcfromtimestamp(e) for e in d] 1015 | df = pipein() 1016 | try: 1017 | if col is None: 1018 | df = pd.to_datetime(df, unit=unit, origin=origin, errors=errors) 1019 | else: 1020 | _assert_col(df, col, "date") 1021 | if date_parser is None: 1022 | df[col] = pd.to_datetime( 1023 | df[col], unit=unit, origin=origin, errors=errors, dayfirst=dayfirst 1024 | ) 1025 | else: 1026 | df[col] = date_parser(df[col]) 1027 | except Exception as err: 1028 | sys.exit(err) 1029 | 1030 | pipeout(df) 1031 | 1032 | 1033 | @register 1034 | def round(col, decimals=0): 1035 | """Round column to `decimals` decimals. 1036 | 1037 | Usage: cat a.csv | ph round x 2 1038 | """ 1039 | df = pipein() 1040 | _assert_col(df, col, "round") 1041 | df[col] = df[col].round(decimals=decimals) 1042 | pipeout(df) 1043 | 1044 | @register 1045 | def describe(): 1046 | """Run DataFrame's describe method. 1047 | 1048 | The result is NOT tabular data, so pipeline ends. 1049 | 1050 | Usage: cat a.csv | ph describe 1051 | """ 1052 | df = pipein() 1053 | try: 1054 | out = df.describe() 1055 | except ValueError as err: 1056 | sys.exit(str(err)) 1057 | _safe_out(out) 1058 | 1059 | 1060 | @register 1061 | def info(): 1062 | """Run DataFrame's info method. 1063 | 1064 | The result is NOT tabular data, so pipeline ends. 1065 | 1066 | Usage: cat a.csv | ph info 1067 | """ 1068 | print(pipein().info()) 1069 | 1070 | 1071 | @register 1072 | def to(ftype, fname=None, sep=None, index=False): 1073 | """Export csv to given format (possibly csv). 1074 | 1075 | Supports csv, html, json, parquet, bigquery, tsv, etc. (see README for full 1076 | list). 
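# Illustrative sketch (not part of ph): min-max normalization as in
# `ph normalize`, and datetime assembly as in `ph date` without a column
# argument (the frame must then contain year/month/day columns).
# The data below is hypothetical.
import pandas as pd

_d = pd.DataFrame({"year": [2001, 2002], "month": [1, 2], "day": [3, 4]})
# cat dates.csv | ph date
print(pd.to_datetime(_d))

_n = pd.DataFrame({"x": [3.0, 4.0, 5.0]})
# cat a.csv | ph normalize x  is  (x - x.min()) / (x.max() - x.min())
_n["x"] = (_n["x"] - _n["x"].min()) / (_n["x"].max() - _n["x"].min())
print(_n)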
1077 | 1078 | Usage: cat a.csv | ph to html 1079 | cat a.csv | ph to tsv 1080 | cat a.csv | ph to csv --index=True 1081 | cat a.csv | ph to csv --sep=';' 1082 | cat a.csv | ph to clipboard 1083 | cat a.csv | ph to json 1084 | cat a.csv | ph to parquet out.parquet 1085 | 1086 | """ 1087 | if ftype not in WRITERS: 1088 | sys.exit("Unknown datatype {}.".format(ftype)) 1089 | 1090 | if not fname: 1091 | if ftype in ("parquet", "xls", "xlsx", "ods", "pickle"): 1092 | sys.exit("{} needs a path".format(ftype)) 1093 | 1094 | if ftype == "hdf5": 1095 | sys.exit("hdf5 writer not implemented") 1096 | 1097 | if index not in TRUTHY + FALSY: 1098 | sys.exit("Index must be True or False, not {}".format(index)) 1099 | index = index in TRUTHY 1100 | 1101 | if ftype == "fwf": 1102 | # pandas has not yet implemented to_fwf 1103 | df = pipein() 1104 | content = tabulate_(df.values.tolist(), list(df.columns), tablefmt="plain") 1105 | if fname: 1106 | with open(fname, "w") as wout: 1107 | wout.write(content) 1108 | else: 1109 | print(content) 1110 | sys.exit() 1111 | 1112 | if sep is not None: 1113 | if ftype != "csv": 1114 | sys.exit("Only csv mode supports separator") 1115 | 1116 | writer = WRITERS[ftype] 1117 | df = pipein() 1118 | fn = getattr(df, writer) 1119 | kwargs = {} 1120 | if ftype == "tsv": 1121 | kwargs["sep"] = "\t" 1122 | elif ftype == "csv" and sep is not None: 1123 | kwargs["sep"] = sep 1124 | 1125 | if ftype == "json": 1126 | index = True 1127 | 1128 | if fname is not None: 1129 | print(fn(fname, index=index, **kwargs)) 1130 | else: 1131 | print(fn(index=index, **kwargs)) 1132 | 1133 | 1134 | @registerx("from") 1135 | def from_(ftype="csv", **kwargs): 1136 | """Read a certain (default csv) format from standard in and stream out as csv. 1137 | 1138 | Usage: cat a.json | ph from json 1139 | cat /etc/passwd | ph from csv --sep=':' --header=None 1140 | 1141 | The following pipes should be equivalent: 1142 | 1143 | cat a.csv 1144 | cat a.csv | ph to json | ph from json 1145 | cat a.tsv | ph from tsv 1146 | cat a.tsv | ph from csv --sep='\t' 1147 | cat a.tsv | ph from csv --sep='\t' --thousands=',' 1148 | 1149 | In the event that the csv data starts on the first line (i.e. no 1150 | header is present), use --header=None. 1151 | """ 1152 | if "header" in kwargs: 1153 | kwargs["header"] = __tryparse(kwargs["header"]) 1154 | skiprows = kwargs.get("skiprows") 1155 | if skiprows is not None: 1156 | try: 1157 | skiprows = int(skiprows) 1158 | if skiprows < 0: 1159 | raise ValueError("Negative") 1160 | except ValueError: 1161 | sys.exit("skiprows must be a non-negative int, not {}".format(skiprows)) 1162 | kwargs["skiprows"] = skiprows 1163 | 1164 | if kwargs.get("sep") == "\\t": 1165 | kwargs["sep"] = "\t" 1166 | 1167 | if ftype == "clipboard": 1168 | pipeout(READERS["clipboard"](**kwargs)) 1169 | return 1170 | 1171 | pipeout(pipein(ftype, **kwargs)) 1172 | 1173 | 1174 | @register 1175 | def cat(*fnames, axis="index"): 1176 | """Concatenates all files provided. 1177 | 1178 | Usage: ph cat a.csv b.csv c.csv 1179 | ph cat a.csv b.csv c.csv --axis=index # default 1180 | ph cat a.csv b.csv c.csv --axis=columns 1181 | 1182 | If no arguments are provided, read from std in. 
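# Illustrative sketch (not part of ph): `ph to` and `ph from` dispatch to the
# pandas to_*/read_* families via the WRITERS/READERS tables defined earlier
# in this module; two of the usage lines above reduce to the following.
import io
import pandas as pd

_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# cat a.csv | ph to tsv
print(_df.to_csv(index=False, sep="\t"))

# cat /etc/passwd | ph from csv --sep=':' --header=None  (toy input below)
_raw = io.StringIO("root:x:0\nbin:x:1\n")
print(pd.read_csv(_raw, sep=":", header=None).to_csv(index=False))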
1183 | 1184 | """ 1185 | if axis not in ("index", "columns"): 1186 | sys.exit("Unknown axis command '{}'".format(axis)) 1187 | if not fnames: 1188 | pipeout(pipein()) 1189 | else: 1190 | dfs = [] 1191 | for fname in fnames: 1192 | df = pd.read_csv(fname) 1193 | dfs.append(df) 1194 | retval = pd.concat(dfs, axis=axis) 1195 | pipeout(retval) 1196 | 1197 | 1198 | @register 1199 | def merge(fname1, fname2, how="inner", on=None, left=None, right=None): 1200 | """Merging two csv files. 1201 | 1202 | If the two files have a common column name, then the merge will be 1203 | on that column. If the files have several common column names, use 1204 | --on=key for merging on a specific column. 1205 | 1206 | If you want to merge on columns with different names, use 1207 | --left=lkey --right=rkey. 1208 | 1209 | Choose between left merge, right merge, inner merge and outer merge 1210 | by using (e.g.) --how=inner. 1211 | 1212 | Usage: ph merge a.csv b.csv --on=ijk 1213 | ph merge a.csv b.csv --on ijk --how=inner 1214 | ph merge a.csv b.csv --left=key_a --right=key_b 1215 | 1216 | """ 1217 | hows = ("left", "right", "outer", "inner") 1218 | if how not in hows: 1219 | sys.exit("Unknown merge --how={}, must be one of {}".format(how, hows)) 1220 | try: 1221 | df1 = pd.read_csv(fname1) 1222 | df2 = pd.read_csv(fname2) 1223 | except Exception as err: 1224 | sys.exit(str(err)) 1225 | if set([on, left, right]) == set([None]) and not set(df1.columns).intersection(set(df2.columns)): 1226 | sys.exit("No common columns to perform merge on. Merge options: on, or: left=None, right=None.") 1227 | if set([on, left, right]) == set([None]): 1228 | pipeout(pd.merge(df1, df2, how=how)) 1229 | else: 1230 | if left is None and right is None: 1231 | pipeout(pd.merge(df1, df2, how=how, on=on)) 1232 | elif left is not None and right is not None: 1233 | _assert_col(df1, left, "merge") 1234 | _assert_col(df2, right, "merge") 1235 | pipeout(pd.merge(df1, df2, how=how, left_on=left, right_on=right)) 1236 | else: 1237 | sys.exit("Specify columns in both files. left was {}, right was {}".format(left, right)) 1238 | 1239 | 1240 | @register 1241 | def tab(): 1242 | """Equivalent to `ph to tsv`. 1243 | 1244 | Usage: cat a.csv | ph tab 1245 | """ 1246 | pipeout(pipein(), sep="\t") 1247 | 1248 | 1249 | @register 1250 | def tabulate(*args, **kwargs): 1251 | """Tabulate the output for pretty-printing. 1252 | 1253 | Usage: cat a.csv | ph tabulate --headers --noindex --format=grid 1254 | 1255 | Takes arguments 1256 | * --headers 1257 | * --noindex 1258 | * --format=[grid, latex, pretty, ...]. 1259 | 1260 | For a full list of format styles confer the README. 1261 | 1262 | This function uses the tabulate project available as a standalone 1263 | package from PyPI. 1264 | 1265 | Using `tabulate` in a pipeline usually means that the `ph` pipeline ends. 1266 | This is because of `tabulate`'s focus on user readability over machine 1267 | readability. 1268 | 1269 | """ 1270 | headers = tuple() 1271 | fmt = kwargs.get("format") 1272 | index = True 1273 | if "--noindex" in args: 1274 | index = False 1275 | if "--headers" in args: 1276 | headers = "keys" 1277 | df = pipein() 1278 | out = tabulate_(df, tablefmt=fmt, headers=headers, showindex=index) 1279 | _safe_out(out) 1280 | 1281 | 1282 | @register 1283 | def show(noindex=False): 1284 | """Similar to ph tabulate --headers [--noindex]. 
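# Illustrative sketch (not part of ph): the merge command above reduces to
# pd.merge; the hypothetical frames below have differently named keys to
# illustrate --left= and --right=.
import pandas as pd

_a = pd.DataFrame({"lkey": ["K0", "K1"], "A": [1, 2]})
_b = pd.DataFrame({"rkey": ["K1", "K2"], "B": [3, 4]})

# ph merge a.csv b.csv --left=lkey --right=rkey --how=outer
print(pd.merge(_a, _b, how="outer", left_on="lkey", right_on="rkey"))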
1285 | 1286 | Usage: cat a.csv | ph show 1287 | cat a.csv | ph show --noindex 1288 | """ 1289 | if noindex: 1290 | tabulate("--headers", "--noindex") 1291 | else: 1292 | tabulate("--headers") 1293 | 1294 | 1295 | def _print_commands(cmds): 1296 | num_cols = 72 // max(len(cmd) for cmd in cmds) 1297 | while (len(cmds) % num_cols) != 0: 1298 | cmds.append("") 1299 | df = pd.DataFrame(pd.Series(cmds).values.reshape(num_cols, -1)) 1300 | print(tabulate_(df.transpose(), showindex=False)) 1301 | 1302 | 1303 | @registerx("help") 1304 | def help_(*args, **kwargs): 1305 | """Writes help (docstring) about the different commands.""" 1306 | if not args: 1307 | print("Usage: ph command arguments") 1308 | print(USAGE_TEXT) 1309 | _print_commands(sorted(COMMANDS.keys())) 1310 | sys.exit(0) 1311 | cmd = args[0] 1312 | 1313 | ds = None 1314 | if cmd in DOCS: 1315 | ds = DOCS[cmd] 1316 | else: 1317 | try: 1318 | fn = getattr(pd.DataFrame, cmd) 1319 | ds = getattr(fn, "__doc__") 1320 | except AttributeError: 1321 | pass 1322 | if ds is None: 1323 | sys.exit("Unknown command {}".format(cmd)) 1324 | print("Usage: ph {}".format(cmd)) 1325 | print(" {}".format(ds.strip())) 1326 | 1327 | 1328 | def slugify_name(name): 1329 | name_ = name 1330 | try: 1331 | name = float(name_) 1332 | except ValueError: 1333 | pass 1334 | if isinstance(name_, (int, str)): 1335 | try: 1336 | name = int(name_) 1337 | except ValueError: 1338 | pass 1339 | if isinstance(name, (int, float)): 1340 | name = str(name) + "_" 1341 | if not name: 1342 | return "unnamed" 1343 | if name == "_": 1344 | return "_" 1345 | lead_under = name[0] == "_" 1346 | trail_under = name[-1] == "_" 1347 | 1348 | name = name.strip().lower() 1349 | unwanted = set(c for c in name if not c.isalnum()) 1350 | for u in unwanted: 1351 | name = name.replace(u, "_").strip() 1352 | while "__" in name: 1353 | name = name.replace("__", "_").strip() 1354 | name = name.strip("_") 1355 | if lead_under: 1356 | name = "_" + name 1357 | if trail_under: 1358 | name = name + "_" 1359 | return name 1360 | 1361 | 1362 | @register 1363 | def slugify(): 1364 | """Slugify the column headers. 1365 | 1366 | Usage: cat a.csv | ph slugify 1367 | 1368 | Removes all non-alphanumeric characters aside from the underscore. 1369 | 1370 | Is useful in scenarios where you have possibly many columns with 1371 | very ugly names. Can be a good preprocessor to @rename: 1372 | 1373 | Usage: cat a.csv | ph slugify | ph rename less_bad_name good_name 1374 | 1375 | """ 1376 | df = pipein() 1377 | df.columns = [slugify_name(name) for name in df.columns] 1378 | pipeout(df) 1379 | 1380 | 1381 | @register 1382 | def raw(fname=None): 1383 | """Do your best to read this comma-separated input.""" 1384 | import csv 1385 | 1386 | if fname is None: 1387 | d = csv.reader(sys.stdin) 1388 | df = pd.DataFrame(d) 1389 | else: 1390 | with open(fname, "r") as fin: 1391 | d = csv.reader(fin) 1392 | df = pd.DataFrame(d) 1393 | pipeout(df) 1394 | 1395 | 1396 | @registerx("open") 1397 | def open_(ftype, fname, **kwargs): 1398 | """Use a reader to open a file. 1399 | 1400 | Open ftype file with name fname and stream out. 
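# Illustrative sketch (not part of ph): expected behaviour of slugify_name
# above on a few hypothetical column headers.
assert slugify_name("Country Name") == "country_name"
assert slugify_name("Wind Speed (m/s)") == "wind_speed_m_s"
assert slugify_name("_private ") == "_private"
assert slugify_name(3.14) == "3_14_"  # numeric names get a trailing underscore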
1401 | 1402 | Usage: ph open csv a.csv 1403 | ph open csv a.csv --skiprows=7 1404 | ph open json a.json 1405 | ph open parquet a.parquet 1406 | ph open excel a.ods 1407 | ph open excel a.xls 1408 | ph open excel a.xlsx 1409 | ph open excel a.xls --sheet_name=2 1410 | ph open excel a.xls --sheet_name="The Real Dataset sheet" 1411 | ph open csv a.csv --thousands=',' 1412 | 1413 | 1414 | In the event that the csv data starts on the first line (i.e. no 1415 | header is present), use --header=None. 1416 | 1417 | """ 1418 | if "header" in kwargs: 1419 | kwargs["header"] = __tryparse(kwargs["header"]) 1420 | 1421 | if ftype not in READERS: 1422 | sys.exit("Unknown filetype {}".format(ftype)) 1423 | reader = READERS[ftype] 1424 | 1425 | if kwargs.get("sep") == "\\t": 1426 | kwargs["sep"] = "\t" 1427 | 1428 | if ftype == "clipboard" and fname is not None: 1429 | sys.exit("clipboard does not take fname") 1430 | if ftype != "clipboard" and fname is None: 1431 | sys.exit("filename is required for {}".format(ftype)) 1432 | 1433 | skiprows = kwargs.get("skiprows") 1434 | if skiprows is not None: 1435 | try: 1436 | skiprows = int(skiprows) 1437 | if skiprows < 0: 1438 | raise ValueError("Negative") 1439 | except ValueError: 1440 | sys.exit("skiprows must be a non-negative int, not {}".format(skiprows)) 1441 | kwargs["skiprows"] = skiprows 1442 | 1443 | try: 1444 | if ftype == "clipboard": 1445 | df = reader(**kwargs) 1446 | elif ftype in ("excel", "xls", "odf"): 1447 | try: 1448 | df = reader(fname, **kwargs) 1449 | except Exception as err: 1450 | sys.exit(err) 1451 | if not isinstance(df, pd.DataFrame): # could be dict 1452 | try: 1453 | errormsg = 'Specify --sheet_name="{}"'.format( 1454 | "|".join(str(k) for k in df.keys()) 1455 | ) 1456 | except Exception: 1457 | errormsg = "Specify --sheet_name" 1458 | sys.exit(errormsg) 1459 | else: 1460 | df = reader(fname, **kwargs) 1461 | except AttributeError as err: 1462 | sys.exit( 1463 | "{} is not supported in your Pandas installation\n{}".format(ftype, err) 1464 | ) 1465 | except ImportError as err: 1466 | sys.exit( 1467 | "{} is not supported in your Pandas installation\n{}".format(ftype, err) 1468 | ) 1469 | except FileNotFoundError as err: 1470 | sys.exit("File not found: {}".format(err)) 1471 | pipeout(df) 1472 | 1473 | 1474 | _ATTRS_WITH_SERIES_OUTPUT = ( 1475 | "all", 1476 | "any", 1477 | "count", 1478 | "kurt", 1479 | "kurtosis", 1480 | "mad", 1481 | "mean", 1482 | "median", 1483 | "min", 1484 | "nunique", 1485 | "prod", 1486 | "product", 1487 | "quantile", 1488 | "sem", 1489 | "skew", 1490 | "std", 1491 | "sum", 1492 | "var", 1493 | ) 1494 | 1495 | 1496 | def _call(attr, *args, **kwargs): 1497 | df = pipein() 1498 | dfn = getattr(df, attr)(*args, **kwargs) 1499 | if attr in _ATTRS_WITH_SERIES_OUTPUT: 1500 | dfn = dfn.reset_index() 1501 | dfn = dfn.T 1502 | pipeout(dfn, header=False) 1503 | else: 1504 | pipeout(dfn) 1505 | 1506 | 1507 | def register_forward(attr): 1508 | def partial(*args, **kwargs): 1509 | return _call(attr, *args, **kwargs) 1510 | 1511 | partial.__name__ = attr 1512 | COMMANDS[attr] = partial 1513 | 1514 | 1515 | @register 1516 | def head(n=10): 1517 | """Similar to `head` but keeps the header. 1518 | 1519 | Print the header followed by the first 10 (or n) lines of the stream to 1520 | standard output. 1521 | 1522 | Usage: cat a.csv | ph head 1523 | cat a.csv | ph head 8 1524 | 1525 | 1526 | """ 1527 | _call("head", int(n)) 1528 | 1529 | 1530 | @register 1531 | def tail(n=10): 1532 | """Similar to `tail` but keeps the header. 
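# Illustrative sketch (not part of ph): what a forwarded command such as
# `cat a.csv | ph sum` amounts to, per _call above: call the DataFrame
# method, and, because "sum" is in _ATTRS_WITH_SERIES_OUTPUT, transpose the
# resulting Series so that it streams out as a single csv row.
import pandas as pd

_df = pd.DataFrame({"x": [3, 4, 5], "y": [8, 9, 10]})
print(_df.sum().reset_index().T)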
1533 | 
1534 |     Print the header followed by the last 10 (or n) lines of the stream to
1535 |     standard output.
1536 | 
1537 |     """
1538 |     _call("tail", int(n))
1539 | 
1540 | 
1541 | def __tryparse(x):
1542 |     if x is None or x == "None":
1543 |         return None
1544 |     x_ = x
1545 |     try:
1546 |         x_ = float(x)
1547 |         x_ = int(x)
1548 |     except ValueError:
1549 |         pass
1550 |     except OverflowError:
1551 |         x_ = float("inf")
1552 |     return x_
1553 | 
1554 | 
1555 | @register
1556 | def replace(old, new, column=None, newcolumn=None):
1557 |     """Replace a value (in a column) with a new value.
1558 | 
1559 |     Usage: cat a.csv | ph replace 8 100 # replace in all columns
1560 |            cat a.csv | ph replace 8 100 --column=y
1561 |            cat a.csv | ph replace 8 100 --column=y --newcolumn=z
1562 | 
1563 |     Beware that it is difficult to know which _types_ we are searching for;
1564 |     only a heuristic is applied, so the result may occasionally be wrong.
1565 |     """
1566 |     if newcolumn is None:
1567 |         newcolumn = column
1568 |     df = pipein()
1569 | 
1570 |     if column is None:
1571 |         if newcolumn is not None:
1572 |             sys.exit("Cannot use newcolumn without column.")
1573 |         df = df.replace(to_replace=old, value=new, inplace=False)
1574 |     elif column not in df:
1575 |         sys.exit("Column {} does not exist.".format(column))
1576 |     else:
1577 |         df[newcolumn] = df[column].replace(to_replace=old, value=new, inplace=False)
1578 |     pipeout(df)
1579 | 
1580 | 
1581 | @register
1582 | def rename(before, after):
1583 |     """Rename a column.
1584 | 
1585 |     Usage: ph rename before after
1586 | 
1587 |     Example: cat a.csv | ph rename x a | ph rename y b
1588 | 
1589 |     """
1590 |     pipeout(pipein().rename(columns={before: after}))
1591 | 
1592 | 
1593 | @register
1594 | def columns(*cols, **kwargs):
1595 |     """ph columns serves two purposes.
1596 | 
1597 |     Called without any arguments, it lists the names of the columns in
1598 |     the stream.
1599 | 
1600 |     Called with arguments, it streams out the csv data from the given columns
1601 |     in the prescribed order.
1602 | 
1603 |     Also takes the arguments --startswith=the_prefix and --endswith=the_suffix,
1604 |     which select all columns matching the given pattern.
1605 | 
1606 | 
1607 |     Usage: cat a.csv | ph columns # will list all column names
1608 |            cat a.csv | ph columns y x # select only columns y and x
1609 |            cat a.csv | ph columns --startswith=sepal
1610 | 
1611 |     """
1612 |     cols = list(cols)
1613 |     df = pipein()
1614 |     if "startswith" in kwargs:
1615 |         q = kwargs["startswith"]
1616 |         for col in df.columns:
1617 |             if col.startswith(q) and col not in cols:
1618 |                 cols.append(col)
1619 |     if "endswith" in kwargs:
1620 |         q = kwargs["endswith"]
1621 |         for col in df.columns:
1622 |             if col.endswith(q) and col not in cols:
1623 |                 cols.append(col)
1624 | 
1625 |     _assert_cols(df, cols, "columns")
1626 | 
1627 |     if not cols and not kwargs:
1628 |         print("columns")
1629 |         print("\n".join(list(df.columns)))
1630 |     else:
1631 |         pipeout(df[cols])
1632 | 
1633 | 
1634 | @register
1635 | def spencer(*cols):
1636 |     """Compute Spencer's 15-point weighted moving average.
1637 | 
1638 |     Usage: cat a.csv | ph spencer
1639 | 
1640 |     Experimental feature for computing Spencer's 15-point weighted average.
1641 |     Smooths out curves by removing high-frequency noise.  Will
1642 |     ultimately lose some data on each end of the timeseries.
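# Illustrative sketch (not part of ph): how __tryparse above coerces
# command-line strings; int where possible, then float, otherwise the
# original string, and the literal "None" becomes None.
assert __tryparse("8") == 8 and isinstance(__tryparse("8"), int)
assert __tryparse("2.5") == 2.5
assert __tryparse("None") is None
assert __tryparse("abc") == "abc"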
1643 | 1644 | """ 1645 | _SPENCER = (-3, -6, -5, 3, 21, 46, 67, 74, 67, 46, 21, 3, -5, -6, -3) 1646 | _SPENCER_SUM = sum(_SPENCER) 1647 | 1648 | def spencer_(lst): 1649 | for i in range(7, len(lst) - 8): 1650 | seq = lst[i - 7 : i + 8] 1651 | yield sum(seq[i] * _SPENCER[i] / _SPENCER_SUM for i in range(15)) 1652 | 1653 | df = pipein() 1654 | _assert_cols(df, cols, "spencer") 1655 | prefix = [float("nan")] * 7 1656 | suffix = [float("nan")] * 8 1657 | if not cols: 1658 | cols = list(df.columns) 1659 | for col in cols: 1660 | lst = list(df[col]) 1661 | s = list(spencer_(lst)) 1662 | ncol = prefix + s + suffix 1663 | df[col] = ncol 1664 | pipeout(df) 1665 | 1666 | 1667 | def _parse_slice(slicestr): 1668 | pattern = ": | : | : | ::" 1669 | error = "Input to slice is {} _not_ {}".format(pattern, slicestr) 1670 | 1671 | assert ":" in slicestr, error 1672 | start = None 1673 | end = None 1674 | step = None 1675 | tup = slicestr.split(":") 1676 | if len(tup) > 3: 1677 | sys.exit(error) 1678 | start = tup[0] or None 1679 | if start is not None: 1680 | start = int(start) 1681 | end = tup[1] or None 1682 | if end is not None: 1683 | end = int(end) 1684 | if len(tup) == 3: 1685 | step = tup[2] or None 1686 | if step is not None: 1687 | step = int(step) 1688 | return start, end, step 1689 | 1690 | 1691 | @registerx("slice") 1692 | def slice_(slicestr): 1693 | """Slice a dataframe with Python slice pattern. 1694 | 1695 | Usage: cat a.csv | ph slice :10 # head 1696 | cat a.csv | ph slice -10: # tail 1697 | cat a.csv | ph slice ::2 # every even row 1698 | cat a.csv | ph slice 1::2 # every odd row 1699 | cat a.csv | ph slice ::-1 # reverse file 1700 | 1701 | """ 1702 | pattern = ": | : | : | ::" 1703 | error = "Input to slice is {} _not_ {}".format(pattern, slicestr) 1704 | df = pipein() 1705 | if isinstance(slicestr, int) or ":" not in slicestr: 1706 | sys.exit(error) 1707 | start, end, step = _parse_slice(slicestr) 1708 | retval = df[start:end:step] 1709 | pipeout(retval) 1710 | 1711 | 1712 | @register 1713 | def drop(*columns, **kwargs): 1714 | """Drop specified labels from rows or columns. 1715 | 1716 | Remove rows or columns by specifying label names and corresponding 1717 | axis, or by specifying directly index or column names. 1718 | 1719 | Usage: cat a.csv | ph drop 'x' --axis=columns 1720 | cat a.csv | ph drop 0 --axis=index 1721 | 1722 | """ 1723 | for opt in ("axis", "levels"): 1724 | if opt in kwargs: 1725 | kwargs[opt] = __tryparse(kwargs[opt]) 1726 | if "inplace" in kwargs: 1727 | sys.exit("inplace is nonsensical in ph") 1728 | 1729 | df = pipein() 1730 | 1731 | if kwargs.get("axis") in (None, 0, "index"): 1732 | columns = [__tryparse(col) for col in columns] 1733 | elif kwargs.get("axis") in (1, "columns"): 1734 | _assert_cols(df, columns, "drop") 1735 | else: 1736 | sys.exit( 1737 | "--axis=index (or 0) or --axis=columns (or 1), not {}".format( 1738 | kwargs.get("axis") 1739 | ) 1740 | ) 1741 | 1742 | ndf = df.drop(list(columns), **kwargs) 1743 | pipeout(ndf) 1744 | 1745 | 1746 | @register 1747 | def shape(): 1748 | """Print the shape of the csv file, i.e. num cols and num rows. 1749 | 1750 | The output will have two rows and two columns, with header "rows,columns". 1751 | 1752 | """ 1753 | print("rows,columns\n" + ",".join([str(x) for x in pipein().shape])) 1754 | 1755 | 1756 | @register 1757 | def empty(): 1758 | """Print a csv file with one column containing True or False. 1759 | 1760 | The output depends on whether the csv input was empty. 
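# Illustrative sketch (not part of ph): what _parse_slice above returns for
# the usage patterns listed in slice_.
assert _parse_slice(":10") == (None, 10, None)    # head
assert _parse_slice("-10:") == (-10, None, None)  # tail
assert _parse_slice("1::2") == (1, None, 2)       # every odd row
assert _parse_slice("::-1") == (None, None, -1)   # reverse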
1761 | 
1762 |     """
1763 |     print("empty\n{}".format(pipein().empty))
1764 | 
1765 | 
1766 | @register
1767 | def index():
1768 |     """Reset the index to a 0..n-1 counter.
1769 | 
1770 |     Usage: cat a.csv | ph index
1771 | 
1772 |     Adds a left-most column `index`.
1773 |     """
1774 |     pipeout(pipein().reset_index())
1775 | 
1776 | 
1777 | @register
1778 | def sort(*col):
1779 |     """Sort csv input by column(s).
1780 | 
1781 |     Since the sort is not stable, passing several columns here is the only way to sort on multiple columns.
1782 | 
1783 |     Usage: cat iris.csv | ph sort setosa
1784 |            cat iris.csv | ph sort setosa virginica
1785 | 
1786 |     """
1787 |     df = pipein()
1788 |     _assert_cols(df, col, "sort")
1789 |     pipeout(df.sort_values(list(col)))
1790 | 
1791 | 
1792 | @register
1793 | def polyfit(x, y, deg=1):
1794 |     """Perform linear/polynomial regression.
1795 | 
1796 |     Usage: cat a.csv | ph polyfit x y
1797 |            cat a.csv | ph polyfit x y --deg=1 # default
1798 |            cat a.csv | ph polyfit x y --deg=2 # quadratic
1799 | 
1800 |     Outputs a column polyfit_{deg} containing the fitted polynomial evaluated at each x.
1801 | 
1802 |     """
1803 |     df = pipein()
1804 |     _assert_cols(df, (x, y), "polyfit")
1805 |     deg = __tryparse(deg)
1806 |     if not isinstance(deg, int) or deg <= 0:
1807 |         sys.exit("deg={} should be a positive int".format(deg))
1808 |     try:
1809 |         import numpy
1810 |     except ImportError:
1811 |         sys.exit("numpy needed for polyfit. pip install numpy")
1812 | 
1813 |     polynomial = numpy.polynomial.Polynomial.fit(df[x], df[y], deg=deg)
1814 |     df["polyfit_{}".format(deg)] = df[x].map(polynomial)
1815 |     pipeout(df)
1816 | 
1817 | 
1818 | def __process(attr):
1819 |     if attr in COMMANDS:
1820 |         return False
1821 |     if attr.startswith("_"):
1822 |         return False
1823 |     if attr.startswith("to_"):
1824 |         return False
1825 |     if attr == "T":
1826 |         return False
1827 |     return True
1828 | 
1829 | 
1830 | for attr in dir(pd.DataFrame):
1831 |     if __process(attr):
1832 |         register_forward(attr)
1833 | 
1834 | 
1835 | def _main(argv):
1836 |     if len(argv) < 2:
1837 |         sys.exit("Usage: ph command [args]\n ph help")
1838 |     cmd = argv[1]
1839 |     if cmd in ("-v", "--version"):
1840 |         print_version()
1841 |         sys.exit()
1842 |     if cmd in ("-h", "--h", "--help"):
1843 |         cmd = "help"
1844 |     if cmd not in COMMANDS:
1845 |         sys.exit("Unknown command {}.".format(cmd))
1846 | 
1847 |     # Self-implemented parsing of arguments.
1848 |     # Arguments of type "abc" and "--abc" go into args
1849 |     # Arguments of type "--abc=def" go into kwargs as key, value pairs
1850 |     args = []
1851 |     kwarg = {}
1852 |     for a in argv[2:]:
1853 |         if KWARG.match(a):
1854 |             args.append(a)
1855 |         elif KWARG_WITH_VALUE.match(a):
1856 |             split = a.index("=")
1857 |             k = a[2:split]
1858 |             v = a[split + 1 :]
1859 |             kwarg[k] = __tryparse(v)
1860 |         else:
1861 |             args.append(__tryparse(a))
1862 |     try:
1863 |         COMMANDS[cmd](*args, **kwarg)
1864 |     except TypeError as err:
1865 |         sys.exit(err)
1866 | 
1867 | 
1868 | def main():
1869 |     _main(sys.argv)
1870 | 
1871 | 
1872 | if __name__ == "__main__":
1873 |     main()
1874 | 
--------------------------------------------------------------------------------
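# Illustrative sketch (not part of ph): how the hand-rolled argument parsing
# in _main splits a command line, assuming KWARG matches "--flag" and
# KWARG_WITH_VALUE matches "--key=value" (both are defined earlier in the
# module and are not shown here).
#
#   ph groupby animal --how=mean
#       cmd    = "groupby"
#       args   = ["animal"]
#       kwargs = {"how": "mean"}
#
#   ph slice ::2
#       cmd    = "slice"
#       args   = ["::2"]  # __tryparse leaves non-numeric strings untouched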