├── drake
├── csv2arff
├── arff2csv
├── servewd
├── Rio-pca
├── Rio-mds
├── README.md
├── weka-cluster
├── pbc
├── Rio-scatter
├── dseq
├── body
├── explain
├── LICENSE
├── cols
├── unpack
├── trim
├── header
├── weka
├── .gitignore
├── dumbplot
├── scrape
├── Rio
└── csv2vw

/drake:
--------------------------------------------------------------------------------
#!/bin/bash
# drake: run the Drake data workflow tool via drip (for a faster JVM start-up).
# Expects DRAKEPATH to point at the directory containing drake.jar.
drip -cp ${DRAKEPATH}/drake.jar drake.core "$@"
--------------------------------------------------------------------------------
/csv2arff:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# csv2arff: convert CSV on stdin to ARFF on stdout using Weka's CSVLoader.
weka core.converters.CSVLoader /dev/stdin
--------------------------------------------------------------------------------
/arff2csv:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# arff2csv: convert ARFF on stdin to CSV on stdout using Weka's CSVSaver.
weka core.converters.CSVSaver -i /dev/stdin
--------------------------------------------------------------------------------
/servewd:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# servewd: serve the current working directory over HTTP in the background
# (python3's http.server defaults to port 8000).
ARGS="$@"
python3 -m http.server ${ARGS} 2>/dev/null &
--------------------------------------------------------------------------------
/Rio-pca:
--------------------------------------------------------------------------------
#!/bin/bash
# Rio-pca: run PCA on the numeric columns of a CSV and keep the non-numeric
# columns alongside the principal components.
Rio -e 'n<-sapply(df,is.numeric);cbind(as.data.frame(prcomp(df[n],scale=T)$x),df[!n])'
--------------------------------------------------------------------------------
/Rio-mds:
--------------------------------------------------------------------------------
#!/bin/bash
# Rio-mds: project a CSV onto two dimensions with classical multidimensional
# scaling and keep the non-numeric columns.
Rio -e 'n<-sapply(df,is.numeric);fit<-cmdscale(dist(df),eig=TRUE,k=2);points<-as.data.frame(fit$points);cbind(points,df[!n])'
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# dsutils
Command-line tools for doing data science


## Notes

* The `sample` tool has moved to its own repository at https://github.com/jeroenjanssens/sample-stream/
--------------------------------------------------------------------------------
/weka-cluster:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# weka-cluster: run a Weka clusterer on CSV from stdin and return the input with
# the cluster assignments appended as an extra column.
ALGO="$@"
IN=$(mktemp --tmpdir weka-cluster-XXXXXXXX).arff

finish () {
  rm -f $IN
}
trap finish EXIT

csv2arff > $IN
weka filters.unsupervised.attribute.AddCluster -W "weka.${ALGO}" -i $IN -o /dev/stdout | arff2csv
--------------------------------------------------------------------------------
/pbc:
--------------------------------------------------------------------------------
#!/bin/bash
# pbc: parallel bc. First column of input CSV is mapped to {1}, second to {2}, and so forth.
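# A second, hypothetical example (same mechanism, just a different expression):
# compute the row-wise mean of two columns.
# $ paste -d, <(seq 5) <(seq 5) | pbc '({1}+{2})/2'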
#
# Example usage: paste -d, <(seq 100) <(seq 100 -1 1) | ./pbc 'sqrt({1}*{2})'
#
# Dependency: GNU parallel
#
# Author: http://jeroenjanssens.com

parallel -C, -k -j100% "echo '$1' | bc -l"
--------------------------------------------------------------------------------
/Rio-scatter:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Rio-scatter: create scatter plot from CSV
#
# Default colour is 1 (blue)
#
# Example usage:
# curl 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' | Rio-scatter SepalLength SepalWidth Name | display
#
# Dependency: Rio
#
# Author: Jeroen Janssens (http://jeroenjanssens.com)

X="$1"
Y="$2"
COLOR="${3:-1}"
Rio -ge "g+geom_point(aes(x=${X},y=${Y},color=${COLOR}))"
--------------------------------------------------------------------------------
/dseq:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# dseq: generate sequence of dates relative to today.
#
# Usage: dseq LAST
#    or: dseq FIRST LAST
#    or: dseq FIRST INCREMENT LAST
#
# Example usage:
# $ dseq 1        # tomorrow
# $ dseq 0 0      # today
# $ dseq 7        # next 7 days
# $ dseq -2 0     # day before yesterday till today
# $ dseq 1 7 365  # tomorrow and then every week for a year
#
# Author: Jeroen Janssens

seq -f "%g day" "$@" | date --file - +%F
--------------------------------------------------------------------------------
/body:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#
# body: apply expression to all but the first line.
# Use multiple times in case the header spans more than one line.
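# How it works: the header line is read verbatim with `IFS= read -r` and printed,
# after which the given command is run with the remaining lines as its stdin.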
#
# Example usage:
# $ seq 10 | header -a 'values' | body sort -nr
# $ seq 10 | header -a 'multi\nline\nheader' | body body body sort -nr
# $ printf "first_name\njim\nbob\nmary\n" | body ruby -nle 'puts $_.capitalize'
#
# From: https://unix.stackexchange.com/a/11859
#
# See also: header (https://github.com/jeroenjanssens/dsutils)
IFS= read -r header
printf '%s\n' "$header"
"$@"
--------------------------------------------------------------------------------
/explain:
--------------------------------------------------------------------------------
#!/bin/bash
# explain: Command-line wrapper for explainshell.com
#
# Example usage: explain tar xzvf
#
# Dependency: scrape (from: https://github.com/jeroenjanssens/dsutils)
#
# Author: http://jeroenjanssens.com


COMMAND="$@"
URL="http://explainshell.com/explain?cmd=${COMMAND}"
SYSTEM=$(uname)

if [[ "$SYSTEM" == "Linux" ]]
then
  curl -s "${URL}" | scrape -e 'span.dropdown > a, pre' | sed -re 's/<(\/?)[^>]*>//g'
elif [[ "$SYSTEM" == "Darwin" ]]
then
  curl -s "${URL}" | scrape -e 'span.dropdown > a, pre' | sed -Ee 's/<(\/?)[^>]*>//g'
fi

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Jeroen Janssens

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/cols:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# cols: apply a command to a subset of the columns and merge back with the remaining columns.
#
# Assumes that the input data is comma-delimited and that it has a header.
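# How it works: the input is tee'd to a temporary file, the selected columns are
# piped through the given command, and the result is pasted back onto the columns
# that were left out.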
# Depends on csvcut, which is part of csvkit: http://csvkit.readthedocs.org
#
# Example usage 1: reverse sort column 'a'
# $ echo -e 'a,b\n1,2\n3,4\n5,6' | cols -c a body sort -nr
#
# Example usage 2: apply PCA (using tapkee) to all numerical features (-C selects all but the specified columns) of the Iris data set:
# $ < iris.csv cols -C species body tapkee --method pca | header -r x,y,species
#
# See also: header and body (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
#
# Author: http://jeroenjanssens.com

ARG="$1"
ARG_INV="$(tr cC Cc <<< ${ARG})"
shift
COLUMNS="$1"
shift
EXPR="$@"

finish() {
  rm -f $OTHER_COLUMNS
}
trap finish EXIT

if [ -z "$TMPDIR" ]; then
  TMPDIR=/tmp
fi
OTHER_COLUMNS=$(mktemp ${TMPDIR}/cols-XXXXXXXX)

tee $OTHER_COLUMNS | csvcut $ARG "$COLUMNS" | eval ${EXPR} | paste -d, - <(csvcut ${ARG_INV} "$COLUMNS" $OTHER_COLUMNS)
--------------------------------------------------------------------------------
/unpack:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# unpack: Extract common file formats

# Dependencies: unrar, unzip, p7zip-full

# Author: Patrick Brisbin
# From: http://linuxtidbits.wordpress.com/2009/08/04/week-of-bash-scripts-extract/

# Display usage if no parameters given
if [[ -z "$@" ]]; then
  echo " ${0##*/} - extract common file formats"
  exit
fi

# Required program(s)
req_progs=(7z unrar unzip)
for p in ${req_progs[@]}; do
  hash "$p" 2>&- || \
    { echo >&2 " Required program \"$p\" not installed."; exit 1; }
done

# Test if file exists
if [ ! -f "$@" ]; then
  echo "File "$@" doesn't exist"
  exit
fi

# Extract file by using extension as reference
case "$@" in
  *.7z )      7z x "$@" ;;
  *.tar.bz2 ) tar xvjf "$@" ;;
  *.bz2 )     bunzip2 "$@" ;;
  *.deb )     ar vx "$@" ;;
  *.tar.gz )  tar xvf "$@" ;;
  *.gz )      gunzip "$@" ;;
  *.tar )     tar xvf "$@" ;;
  *.tbz2 )    tar xvjf "$@" ;;
  *.tar.xz )  tar xvf "$@" ;;
  *.tgz )     tar xvzf "$@" ;;
  *.rar )     unrar x "$@" ;;
  *.zip )     unzip "$@" ;;
  *.Z )       uncompress "$@" ;;
  * )         echo " Unsupported file format" ;;
esac
--------------------------------------------------------------------------------
/trim:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#
# NAME
#   trim -- trim output to a given height and width.
#
# USAGE
#   trim [height] [width]
#
# By default, output is trimmed to 10 lines and the width of the terminal.
# Pass a negative number to disable trimming the height and/or width.
# Before trimming, tabs are expanded to spaces.
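# How it works: a single awk pass prints the first `height` lines, truncating any
# line longer than `width` (with a trailing '…'), and then reports how many lines
# were left out.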
#
# EXAMPLES
#   seq 100 | trim
#   seq 100 | trim 20
#   seq 100 | trim 20 40
#   seq 100 | trim -1 40
#   seq 100 | trim 20 -1
#
# Author: Jeroen Janssens (https://jeroenjanssens.com)
#
# LICENSE: MIT (2021)

set -euf -o pipefail

HEIGHT="${1:-10}"
WIDTH="${2:-$(tput cols)}"

expand |
awk -v height="${HEIGHT}" -v width="${WIDTH}" \
'function tprint() {
  if (width > 0 && length($0) > width) {
    print substr($0, 1, width - 1) "…"
  } else {
    print
  }
}

height <= 0 || NR <= height {
  tprint()
}

END {
  if (height > 0) {
    if (NR == height + 1) {
      tprint()
    } else if (NR > height + 1) {
      print "… with " (NR - height) " more lines"
    }
  }
}'
--------------------------------------------------------------------------------
/header:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# header: add, replace, and delete header lines.
#
# Example usage:
# $ seq 10 | header -a 'values'
# $ seq 10 | header -a 'VALUES' | header -e 'tr "[:upper:]" "[:lower:]"'
# $ seq 10 | header -a 'values' | header -d
# $ seq 10 | header -a 'multi\nline' | header -n 2 -e "paste -sd_"
#
# See also: body (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
#
# Author: http://jeroenjanssens.com

usage () {
cat << EOF
header: add, replace, and delete header lines.

usage: header OPTIONS

OPTIONS:
  -n      Number of lines to consider as header [default: 1]
  -a      Add header
  -r      Replace header
  -e      Apply expression to header
  -d      Delete header
  -h      Show this message

Example usage:
  $ seq 10 | header -a 'values'
  $ seq 10 | header -a 'VALUES' | header -e 'tr "[:upper:]" "[:lower:]"'
  $ seq 10 | header -a 'values' | header -d
  $ seq 10 | header -a 'multi\nline' | header -n 2 -e "paste -sd_"

See also: body
EOF
}

get_header () {
  for i in $(seq $NUMROWS); do
    IFS= read -r LINE
    OLDHEADER="${OLDHEADER}${LINE}\n"
  done
}

print_header () {
  echo -ne "$1"
}

print_body () {
  cat
}

OLDHEADER=
NUMROWS=1

while getopts "dn:ha:r:e:" OPTION
do
  case $OPTION in
    n)
      NUMROWS=$OPTARG
      ;;
    a)
      print_header "$OPTARG\n"
      print_body
      exit 0
      ;;
    d)
      get_header
      print_body
      exit 0
      ;;
    r)
      get_header
      print_header "$OPTARG\n"
      print_body
      exit 0
      ;;
    e)
      get_header
      print_header "$(echo -ne $OLDHEADER | eval $OPTARG)\n"
      print_body
      exit 0
      ;;
    h)
      usage
      exit 0
      ;;
  esac
done

get_header
print_header "${OLDHEADER}"
--------------------------------------------------------------------------------
/weka:
--------------------------------------------------------------------------------
#!/bin/bash
# weka: run Weka from the command-line
#
# Weka can be obtained from http://www.cs.waikato.ac.nz/ml/weka/downloading.html
# Make sure that WEKAPATH is set to the full path that contains weka.jar in your .bashrc or .zshrc
# The snippets below enable tab completion in Bash and Zsh, respectively.
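# Example usage (hypothetical file name; any class under the weka. namespace works):
# $ weka classifiers.trees.J48 -t iris.arff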
#
# Author: Jeroen Janssens (http://jeroenjanssens.com)
#
# See csv2arff and arff2csv for two examples

java -Xmx1024M -cp ${WEKAPATH}/weka.jar "weka.$@"

#########################################################
#              Tab completion for Bash                  #
#########################################################
#
# export WEKAPATH="/home/joe/bin/"
#
# weka-classes () {
#   unzip -l $WEKAPATH/weka.jar |
#   sed -rne 's/.*(weka)\/([^g])([^$]*)\.class$/\2\3/p' |
#   tr '/' '.'
# }
#
# weka-folders () {
#   unzip -l $WEKAPATH/weka.jar |
#   sed -rne 's/.*(weka)\/([^g])([^$]*)\/$/\2\3\./p' |
#   tr '/' '.'
# }
#
# _completeweka() {
#   local curw=${COMP_WORDS[COMP_CWORD]}
#   local wordlist=$(weka-folders; weka-classes)
#   COMPREPLY=($(compgen -W '${wordlist[@]}' -- "$curw"))
#   return 0
# }
#
# complete -o nospace -F _completeweka weka
#
#########################################################
#              Tab completion for Zsh                   #
#########################################################
#
# export WEKAJAR="/home/joe/bin/weka.jar"
#
# weka-classes () {
#   unzip -l $WEKAJAR |
#   sed -rne 's/.*(weka)\/([^g])([^$]*)\.class$/\2\3/p' |
#   tr '/' '.'
# }
#
# weka-folders () {
#   unzip -l $WEKAJAR |
#   sed -rne 's/.*(weka)\/([^g])([^$]*)\/$/\2\3\./p' |
#   tr '/' '.'
# }
#
# function _completeweka {
#   reply=($(weka-folders; weka-classes))
# }
#
# compctl -K _completeweka weka
#
#########################################################
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/dumbplot:
--------------------------------------------------------------------------------
#!/bin/bash
# dumbplot: Output a plot on the terminal given a list of X,Y coordinates.
#           Can either be a scatter plot or a bar (or bar-like) plot that
#           assumes that the y-coordinate is numeric and just displays the
#           x-coordinate data as-is, in the order that it is fed to the
#           script.
#
# Dependency: gnuplot
#
# Author: http://jeroenjanssens.com

usage () {
cat << EOF
dumbplot: Use gnuplot to quickly get an ASCII plot of x-y data.

usage: dumbplot OPTIONS

OPTIONS:
  -a    As-is. Use the x-coord (first coord) as categorical data for plotting.
  -b    Boxplot. Use vertical boxes rather than just a marker. Autosets the -a switch.
  -w    Terminal width to use. Defaults to actual terminal width.
  -v    Terminal height to use. Defaults to terminal height / 2.
  -h    Show this message

Example usage:
  $ paste -d, <(echo 1 2 4 8 9 | tr ' ' '\n') <(echo 1 2 4 2 1 | tr ' ' '\n') | dumbplot
  $ paste -d, <(echo a b c d e | tr ' ' '\n') <(echo 1 2 4 10 6 | tr ' ' '\n') | dumbplot -a
  $ paste -d, <(echo a b c d e | tr ' ' '\n') <(echo 1 2 4 10 6 | tr ' ' '\n') | dumbplot -b
  $ paste -d, <(echo a b c d e | tr ' ' '\n') <(echo 1 2 4 10 6 | tr ' ' '\n') | dumbplot -b -w 80 -v 22

EOF
}


ASIS=
BARCHART=
PWIDTH=$(tput cols)
PHEIGHT=$(tput lines)

PHEIGHT=$(echo $PHEIGHT / 2 | bc)

while getopts "abw:v:h" OPTION
do
  case $OPTION in
    a)
      ASIS=1
      ;;
    b)
      ASIS=1
      BARCHART=1
      ;;
    w)
      PWIDTH=$OPTARG
      ;;
    v)
      PHEIGHT=$OPTARG
      ;;
    h)
      usage
      exit 0
      ;;
  esac
done


## Decision logic to execute right gnuplot command
if [ ! -z "$ASIS" ] ; then
  if [ ! -z "$BARCHART" ] ; then
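    # A note on the gnuplot pipelines below: `nl` prepends a line number to every
    # record, which serves as the numeric x position; column 2 (the original x value)
    # becomes the tic label via xtic(2) and column 3 is plotted on the y axis.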
-z "$BARCHART" ] ; then 73 | # Categorical data, uses boxes in plot 74 | nl -s, -f nl | gnuplot -e "set term dumb size $PWIDTH,$PHEIGHT" -e 'set datafile separator ","; set nokey; plot "-" using 1:3:xtic(2) with boxes' 75 | else 76 | # Categorical data, do NOT use boxes 77 | nl -s, -f nl | gnuplot -e "set term dumb size $PWIDTH,$PHEIGHT" -e 'set datafile separator ","; set nokey; plot "-" using 1:3:xtic(2)' 78 | fi 79 | else 80 | # Scatterplot of data 81 | gnuplot -e "set term dumb size $PWIDTH,$PHEIGHT" -e 'set datafile separator ","; set nokey; plot "-"' 82 | fi 83 | 84 | -------------------------------------------------------------------------------- /scrape: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # scrape: Extract HTML elements using an XPath query or CSS3 selector. 4 | # 5 | # Example usage: 6 | # $ curl 'https://en.wikipedia.org/wiki/List_of_sovereign_states' -s \ 7 | # | scrape -be 'table.wikitable > tbody > tr > td > b > a' 8 | # 9 | # Dependencies: lxml and optionally cssselect 10 | # 11 | # Author: http://jeroenjanssens.com 12 | 13 | import sys 14 | import argparse 15 | from lxml import etree 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('html', nargs='?', type=argparse.FileType('rb'), 21 | default=sys.stdin, help="HTML", metavar="HTML") 22 | parser.add_argument('-a', '--argument', default="", 23 | help="argument to extract from tag") 24 | parser.add_argument('-b', '--body', action='store_true', default=False, 25 | help="Enclose output with HTML and BODY tags") 26 | parser.add_argument('-e', '--expression', default=[], action='append', 27 | help="XPath query or CSS3 selector") 28 | parser.add_argument('-f', '--file', default='', 29 | help="File to read input from") 30 | parser.add_argument('-x', '--check-existance', action='store_true', default=False, 31 | help="Process return value signifying existance") 32 | parser.add_argument('-r', '--rawinput', action='store_true', default=False, 33 | help="Do not parse HTML before feeding etree (useful" 34 | "for escaping CData)") 35 | args = parser.parse_args() 36 | 37 | args.expression = [e.decode('utf-8') for e in args.expression] 38 | 39 | from cssselect import GenericTranslator 40 | 41 | expression = [e if e.startswith('//') else GenericTranslator().css_to_xpath(e) for e in args.expression] 42 | 43 | html_parser = etree.HTMLParser(encoding='utf-8', recover=True, 44 | strip_cdata=True) 45 | 46 | inp = open(args.file) if args.file else args.html 47 | if args.rawinput: 48 | document = etree.fromstring(inp.read()) 49 | else: 50 | document = etree.parse(inp, html_parser) 51 | 52 | if args.body: 53 | sys.stdout.write("\n\n\n") 54 | 55 | for e in expression: 56 | els = list(document.xpath(e)) 57 | 58 | if args.check_existance: 59 | sys.exit(1 if len(els) == 0 else 0) 60 | 61 | for e in els: 62 | if isinstance(e, basestring): 63 | text = e 64 | elif not args.argument: 65 | text = etree.tostring(e) 66 | else: 67 | text = e.get(args.argument) 68 | if text is not None: 69 | sys.stdout.write(text.encode('utf-8').strip() + "\t") 70 | 71 | if args.body: 72 | sys.stdout.write("\n") 73 | 74 | sys.stdout.write('\n') 75 | sys.stdout.flush() 76 | 77 | if __name__ == "__main__": 78 | exit(main()) 79 | -------------------------------------------------------------------------------- /Rio: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Rio: Load CSV from stdin 
#
# Example usage:
# $ seq 100 | Rio -nf sum          (same as Rio -ne 'sum(df)')
#
# $ curl -s 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' > iris.csv
# $ < iris.csv Rio -e 'df$SepalLength^2'
# $ < iris.csv Rio -f summary
# $ < iris.csv Rio -se 'sqldf("select Name from df where df.SepalLength > 7")'
# $ < iris.csv Rio -ge 'g+geom_point(aes(x=SepalLength,y=SepalWidth,colour=Name))' > iris.png
#
# Dependency: R (optionally with the R packages ggplot2, dplyr, tidyr, sqldf, and ggmap)
#
# Author: http://jeroenjanssens.com

usage() {
cat << EOF
Rio: Load CSV from stdin into R as a data.frame, execute given commands, and get the output as CSV on stdout

usage: Rio OPTIONS

OPTIONS:
  -d      Delimiter
  -e      Commands to execute
  -f      Single command to execute on data.frame
  -g      Import ggplot2
  -h      Show this message
  -m      Import ggmap
  -n      CSV has no header
  -r      Import dplyr and tidyr
  -s      Import sqldf
  -b      Use same settings as used for the book Data Science at the Command Line
  -v      Verbose

EOF
}

finish() {
  rm -f $IN $OUT ${OUT%.png} ${ERR%.err}

  ## Removes error file if error file is empty.
  if [[ ! -s $ERR ]]; then
    rm -f $ERR
  fi

  rm -f Rplots.pdf
}

trap finish EXIT

callR() {
  Rscript --vanilla -e "options(scipen=999);df<-read.csv('${IN}',header=${HEADER},sep='${DELIMITER}',stringsAsFactors=F);${REQUIRES}${SCRIPT}last<-.Last.value;if(is.matrix(last)){last<-as.data.frame(last)};if(is.data.frame(last)){write.table(last,'${OUT}',sep=',',quote=T,qmethod='double',row.names=F,col.names=${HEADER});}else if(is.vector(last)){cat(last,sep='\\\n', file='${OUT}')}else if(exists('is.ggplot')&&is.ggplot(last)){ggsave('${OUT}',last,dpi=${RIO_DPI-72},units='cm',width=20,height=15);}else{sink('${OUT}');print(last);}"
}

SCRIPT=
REQUIRES=
DELIMITER=","
HEADER="T"
VERBOSE=false

# OSX `mktemp' requires a temp file template, but Linux `mktemp' has it as optional.
# This explicitly uses a template, which works for both. The $TMPDIR fallback is for
# when it isn't set as an environment variable; it assumes you have /tmp.
if [ -z "$TMPDIR" ]; then
  TMPDIR=/tmp/
fi
IN=$(mktemp ${TMPDIR}/Rio-XXXXXXXX)
OUT=$(mktemp ${TMPDIR}/Rio-XXXXXXXX).png
ERR=$(mktemp ${TMPDIR}/Rio-XXXXXXXX).err

while getopts "d:hgmnprsve:f:b" OPTION
do
  case $OPTION in
    b)
      RIO_DPI=300
      ;;
    d)
      DELIMITER=$OPTARG
      ;;
    e)
      SCRIPT=$OPTARG
      if ! echo $SCRIPT | grep -qe "; *$"
      then
        SCRIPT="${SCRIPT};"
      fi
      ;;
    f)
      SCRIPT="${OPTARG}(df);"
      ;;
    h)
      usage
      exit 1
      ;;
    g)
      REQUIRES="${REQUIRES}require(ggplot2);g<-ggplot(df);"
      ;;
    n)
      HEADER="F"
      ;;
    r)
      REQUIRES="${REQUIRES}require(dplyr);require(tidyr);"
      ;;
    s)
      REQUIRES="${REQUIRES}require(sqldf);"
      ;;
    m)
      REQUIRES="${REQUIRES}require(ggmap);"
      ;;
    v)
      VERBOSE=true
      ;;
    ?)
      usage
      exit
      ;;
  esac
done

cat /dev/stdin > $IN

if $VERBOSE
then
  callR
else
  callR > $ERR 2>&1
fi

if [[ ! -f $OUT ]]; then
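  # No output file was produced, so the R call itself failed: show the captured log.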
  cat $ERR
else
  RESULT="$(cat $OUT | tr '\0' '\n')"
  if [ "$RESULT" == "NULL" ]; then
    cat $ERR
  else
    cat $OUT
  fi
fi

--------------------------------------------------------------------------------
/csv2vw:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import csv
from sys import stdin, stdout, stderr, exit
import itertools


def main():
    parser = argparse.ArgumentParser(
        epilog="""If both --classes and --auto-relabel are omitted,
label values are left as-is. By default, features with value 0 are not
printed. This can be overridden with --null""",
        usage="""%(prog)s [OPTION]... [FILE]

Convert CSV to Vowpal Wabbit input format.

Examples:

# Leave label values as is:
$ csv2vw spam.csv --label target

# Relabel values 'ham' to 0 and 'spam' to 1:
$ csv2vw spam.csv --label target --classes ham,spam

# Relabel values 'ham' to -1 and 'spam' to +1 (needed for logistic loss):
$ csv2vw spam.csv --label target --classes ham,spam --minus-plus-one

# Relabel first label value to 0, second to 1, and ignore the rest:
$ csv2vw iris.csv -lspecies --auto-relabel --ignore-extra-classes

# Relabel first label value to 1, second to 2, and so on:
$