├── drake
├── csv2arff
├── arff2csv
├── servewd
├── Rio-pca
├── Rio-mds
├── README.md
├── weka-cluster
├── pbc
├── Rio-scatter
├── dseq
├── body
├── explain
├── LICENSE
├── cols
├── unpack
├── trim
├── header
├── weka
├── .gitignore
├── dumbplot
├── scrape
├── Rio
└── csv2vw


/drake:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | drip -cp ${DRAKEPATH}/drake.jar drake.core "$@"
3 | 


--------------------------------------------------------------------------------
/csv2arff:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | weka core.converters.CSVLoader /dev/stdin
3 | 


--------------------------------------------------------------------------------
/arff2csv:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | weka core.converters.CSVSaver -i /dev/stdin
3 | 


--------------------------------------------------------------------------------
/servewd:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | ARGS="$@"
3 | python3 -m http.server ${ARGS} 2>/dev/null &
4 | 


--------------------------------------------------------------------------------
/Rio-pca:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | Rio -e 'n<-sapply(df,is.numeric);cbind(as.data.frame(prcomp(df[n],scale=T)$x),df[!n])'
3 | 


--------------------------------------------------------------------------------
/Rio-mds:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | Rio -e 'n<-sapply(df,is.numeric);fit<-cmdscale(dist(df),eig=TRUE,k=2);points<-as.data.frame(fit$points);cbind(points,df[!n])'
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # dsutils
2 | Command-line tools for doing data science
3 | 
4 | 
5 | ## Notes
6 | 
7 | * The `sample` tool has moved to its own repository at https://github.com/jeroenjanssens/sample-stream/
8 | 


--------------------------------------------------------------------------------
/weka-cluster:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | ALGO="$@"
 3 | IN=$(mktemp --tmpdir weka-cluster-XXXXXXXX).arff
 4 | 
 5 | finish () {
 6 | 	rm -f $IN
 7 | }
 8 | trap finish EXIT
 9 | 
10 | csv2arff > $IN
11 | weka filters.unsupervised.attribute.AddCluster -W "weka.${ALGO}" -i $IN -o /dev/stdout | arff2csv
12 | 


--------------------------------------------------------------------------------
/pbc:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # pbc: parallel bc. First column of input CSV is mapped to {1}, second to {2}, and so forth.
 3 | #
 4 | # Example usage: paste -d, <(seq 100) <(seq 100 -1 1) | ./pbc 'sqrt({1}*{2})'
 5 | #
 6 | # Dependency: GNU parallel
 7 | #
 8 | # Author: http://jeroenjanssens.com
 9 | 
10 | parallel -C, -k -j100% "echo '$1' | bc -l"
11 | 


--------------------------------------------------------------------------------
/Rio-scatter:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Rio-scatter: create scatter plot from CSV
 4 | # 
 5 | # Default colour is 1 (blue)
 6 | #
 7 | # Example usage:
 8 | # curl 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' | Rio-scatter SepalLength SepalWidth Name | display
 9 | #
10 | # Dependency: Rio
11 | #
12 | # Author: Jeroen Janssens (http://jeroenjanssens.com)
13 | 
14 | X="$1"
15 | Y="$2"
16 | COLOR="${3:-1}"
17 | Rio -ge "g+geom_point(aes(x=${X},y=${Y},color=${COLOR}))"
18 | 


--------------------------------------------------------------------------------
/dseq:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # dseq: generate sequence of dates relative to today.
 3 | #
 4 | # Usage: dseq LAST
 5 | #    or: dseq FIRST LAST
 6 | #    or: dseq FIRST INCREMENT LAST
 7 | #
 8 | # Example usage:
 9 | # $ dseq 1       # tomorrow
10 | # $ dseq 0 0     # today
11 | # $ dseq 7       # next 7 days
12 | # $ dseq -2 0	 # day before yesterday till today
13 | # $ dseq 1 7 365 # tomorrow and then every week for a year
14 | #
15 | # Author: Jeroen Janssens
16 | 
17 | seq -f "%g day" "$@" | date --file - +%F
18 | 


--------------------------------------------------------------------------------
/body:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # body: apply expression to all but the first line.
 4 | # Use multiple times in case the header spans more than one line.
 5 | #
 6 | # Example usage:
 7 | # $ seq 10 | header -a 'values' | body sort -nr
 8 | # $ seq 10 | header -a 'multi\nline\nheader' | body body body sort -nr
 9 | # $ printf "first_name\njim\nbob\nmary\n" | body ruby -nle 'puts $_.capitalize'
10 | #
11 | # From: https://unix.stackexchange.com/a/11859
12 | #
13 | # See also: header (https://github.com/jeroenjanssens/dsutils)
14 | IFS= read -r header
15 | printf '%s\n' "$header"
16 | "$@"
17 | 


--------------------------------------------------------------------------------
/explain:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # explain: Command-line wrapper for explainshell.com
 3 | # 
 4 | # Example usage: explain tar xzvf 
 5 | #
 6 | # Dependency: scrape (from: https://github.com/jeroenjanssens/dsutils)
 7 | #
 8 | # Author: http://jeroenjanssens.com
 9 | 
10 | 
11 | COMMAND="$@"
12 | URL="http://explainshell.com/explain?cmd=${COMMAND}"
13 | SYSTEM=$(uname)
14 | 
15 | if [[ "$SYSTEM" == "Linux" ]]
16 | then
17 |   curl -s "${URL}" | scrape -e 'span.dropdown > a, pre' | sed -re 's/<(\/?)[^>]*>//g'
18 | elif [[ "$SYSTEM" == "Darwin" ]]
19 | then
20 |   curl -s "${URL}" | scrape -e 'span.dropdown > a, pre' | sed -Ee 's/<(\/?)[^>]*>//g'
21 | fi
22 | 
23 | 
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Jeroen Janssens
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/cols:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # cols: apply a command to a subset of the columns and merge back with the remaining columns.
 3 | #
 4 | # Assumes that the input data is comma-delimited and that it has a header.
 5 | # Depends on csvcut, which is part of csvkit: http://csvkit.readthedocs.org
 6 | # 
 7 | # Example usage 1: reverse sort column 'a'
 8 | # $ echo 'a,b\n1,2\n3,4\n5,6' | cols -c a body sort -nr
 9 | #
10 | # Example usage 2: apply PCA (using tapkee) to all numerical features (-C selects all but the specified columns) of the Iris data set:
11 | # $ < iris.csv cols -C species body tapkee --method pca | header -r x,y,species
12 | # 
13 | # See also: header and body (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
14 | #
15 | # Author: http://jeroenjanssens.com
16 | 
17 | ARG="$1"
18 | ARG_INV="$(tr cC Cc <<< ${ARG})"
19 | shift
20 | COLUMNS="$1"
21 | shift
22 | EXPR="$@"
23 | 
24 | finish() {
25 | 	rm -f $OTHER_COLUMNS
26 | }
27 | trap finish EXIT
28 | 
29 | if [ -z "$TMPDIR" ]; then
30 |     TMPDIR=/tmp
31 | fi
32 | OTHER_COLUMNS=$(mktemp ${TMPDIR}/cols-XXXXXXXX)
33 | 
34 | tee $OTHER_COLUMNS | csvcut $ARG "$COLUMNS" | eval ${EXPR} | paste -d, - <(csvcut ${ARG_INV} "$COLUMNS" $OTHER_COLUMNS)
35 | 


--------------------------------------------------------------------------------
/unpack:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # unpack: Extract common file formats
 3 |  
 4 | # Dependencies: unrar, unzip, p7zip-full
 5 | 
 6 | # Author: Patrick Brisbin
 7 | # From: http://linuxtidbits.wordpress.com/2009/08/04/week-of-bash-scripts-extract/
 8 | 
 9 | # Display usage if no parameters given
10 | if [[ -z "$@" ]]; then
11 | 	echo " ${0##*/} <archive> - extract common file formats)"
12 | 	exit
13 | fi
14 |  
15 | # Required program(s)
16 | req_progs=(7z unrar unzip)
17 | for p in ${req_progs[@]}; do
18 | 	hash "$p" 2>&- || \
19 | 	{ echo >&2 " Required program \"$p\" not installed."; exit 1; }
20 | done
21 |  
22 | # Test if file exists
23 | if [ ! -f "$@" ]; then
24 | 	echo "File "$@" doesn't exist"
25 | 	exit
26 | fi
27 |  
28 | # Extract file by using extension as reference
29 | case "$@" in
30 | 	*.7z ) 7z x "$@" ;;
31 | 	*.tar.bz2 ) tar xvjf "$@" ;;
32 | 	*.bz2 ) bunzip2 "$@" ;;
33 | 	*.deb ) ar vx "$@" ;;
34 | 	*.tar.gz ) tar xvf "$@" ;;
35 | 	*.gz ) gunzip "$@" ;;
36 | 	*.tar ) tar xvf "$@" ;;
37 | 	*.tbz2 ) tar xvjf "$@" ;;
38 | 	*.tar.xz ) tar xvf "$@" ;;
39 | 	*.tgz ) tar xvzf "$@" ;;
40 | 	*.rar ) unrar x "$@" ;;
41 | 	*.zip ) unzip "$@" ;;
42 | 	*.Z ) uncompress "$@" ;;
43 | 	* ) echo " Unsupported file format" ;;
44 | esac
45 | 


--------------------------------------------------------------------------------
/trim:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # NAME
 4 | #       trim -- trim output to a given height and width.
 5 | #
 6 | # USAGE
 7 | #       trim [height] [width]
 8 | #
 9 | #       By default, output is trimmed to 10 lines and the width of the terminal.
10 | #       Pass a negative number to disable trimming the height and or width. 
11 | #       Before trimming, tabs are expanded to spaces.
12 | #
13 | # EXAMPLES
14 | #       seq 100 | trim
15 | #       seq 100 | trim 20 
16 | #       seq 100 | trim 20 40
17 | #       seq 100 | trim -1 40
18 | #       seq 100 | trim 20 -1
19 | #
20 | # Author: Jeroen Janssens (https://jeroenjanssens.com)
21 | #
22 | # LICENSE: MIT (2021)
23 | 
24 | set -euf -o pipefail
25 | 
26 | HEIGHT="${1:-10}"
27 | WIDTH="${2:-$(tput cols)}"
28 | 
29 | expand |
30 | awk -v height="${HEIGHT}" -v width="${WIDTH}" \
31 | 'function tprint() {
32 |     if (width > 0 && length($0) > width) {
33 |         print substr($0, 1, width - 1) "…"
34 |     } else {
35 |         print
36 |     }
37 | }
38 | 
39 | height <= 0 || NR <= height {
40 |     tprint()
41 | }
42 | 
43 | END {
44 |     if (height > 0) {
45 |         if (NR == height + 1) {
46 |             tprint()
47 |         } else if (NR > height + 1) {
48 |             print "… with " (NR - height) " more lines"
49 |         }
50 |     }
51 | }'
52 | 


--------------------------------------------------------------------------------
/header:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # header: add, replace, and delete header lines.
 3 | # 
 4 | # Example usage:
 5 | # $ seq 10 | header -a 'values'
 6 | # $ seq 10 | header -a 'VALUES' | header -e 'tr "[:upper:]" "[:lower:]"'
 7 | # $ seq 10 | header -a 'values' | header -d
 8 | # $ seq 10 | header -a 'multi\nline' | header -n 2 -e "paste -sd_"
 9 | #
10 | # See also: body (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
11 | #
12 | # Author: http://jeroenjanssens.com
13 | 
14 | usage () {
15 | cat << EOF
16 | header: add, replace, and delete header lines.
17 | 
18 | usage: header OPTIONS
19 | 
20 | OPTIONS:
21 |   -n      Number of lines to consider as header [default: 1]
22 |   -a      Add header
23 |   -r      Replace header
24 |   -e      Apply expression to header
25 |   -d      Delete header
26 |   -h      Show this message
27 | 
28 | Example usage:
29 |   $ seq 10 | header -a 'values'
30 |   $ seq 10 | header -a 'VALUES' | header -e 'tr "[:upper:]" "[:lower:]"'
31 |   $ seq 10 | header -a 'values' | header -d
32 |   $ seq 10 | header -a 'multi\nline' | header -n 2 -e "paste -sd_"
33 | 
34 | See also: body
35 | EOF
36 | }
37 | 
38 | get_header () {
39 | 	for i in $(seq $NUMROWS); do
40 | 		IFS= read -r LINE
41 | 		OLDHEADER="${OLDHEADER}${LINE}\n"
42 | 	done
43 | }
44 | 
45 | print_header () {
46 | 	echo -ne "$1"
47 | }
48 | 
49 | print_body () {
50 | 	cat
51 | }
52 | 
53 | OLDHEADER=
54 | NUMROWS=1
55 | 
56 | while getopts "dn:ha:r:e:" OPTION
57 | do
58 | 	case $OPTION in
59 | 		n)
60 | 			NUMROWS=$OPTARG
61 | 			;;
62 | 		a)
63 | 			print_header "$OPTARG\n"
64 | 			print_body
65 | 			exit 0
66 | 			;;
67 | 		d)
68 | 			get_header
69 | 			print_body
70 | 			exit 0
71 | 			;;
72 | 		r)
73 | 			get_header
74 | 			print_header "$OPTARG\n"
75 | 			print_body
76 | 			exit 0
77 | 			;;
78 | 		e)
79 | 			get_header
80 | 			print_header "$(echo -ne $OLDHEADER | eval $OPTARG)\n"
81 | 			print_body
82 | 			exit 0
83 | 			;;
84 | 		h)
85 | 			usage
86 | 			exit 0
87 | 			;;
88 | 	esac
89 | done
90 | 
91 | get_header
92 | print_header "${OLDHEADER}"
93 | 


--------------------------------------------------------------------------------
/weka:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # weka: run Weka from the command-line
 3 | #
 4 | # Weka can be obtained from http://www.cs.waikato.ac.nz/ml/weka/downloading.html
 5 | # Make sure that WEKAPATH is set to the full path that contains weka.jar in your .bashrc or .zshrc
 6 | # The snippets below enable tab completion in Bash and Zsh, respectively.
 7 | # 
 8 | # Author: Jeroen Janssens (http://jeroenjanssens.com)
 9 | #
10 | # See csv2arff and arff2csv for two examples
11 | 
12 | java -Xmx1024M -cp ${WEKAPATH}/weka.jar "weka.$@"
13 | 
14 | #########################################################
15 | # Tab completion for Bash                               #
16 | #########################################################
17 | # 
18 | # export WEKAPATH="/home/joe/bin/"
19 | #
20 | # weka-classes () {
21 | # 	unzip -l $WEKAPATH/weka.jar |
22 | # 	sed -rne 's/.*(weka)\/([^g])([^$]*)\.class$/\2\3/p' |
23 | # 	tr '/' '.'
24 | # }
25 | # 
26 | # weka-folders () {
27 | # 	unzip -l $WEKAPATH/weka.jar |
28 | # 	sed -rne 's/.*(weka)\/([^g])([^$]*)\/$/\2\3\./p' |
29 | # 	tr '/' '.'
30 | # }
31 | # 
32 | # _completeweka() {
33 | #   local curw=${COMP_WORDS[COMP_CWORD]}
34 | #   local wordlist=$(weka-folders; weka-classes)
35 | #   COMPREPLY=($(compgen -W '${wordlist[@]}' -- "$curw"))
36 | #   return 0
37 | # }
38 | # 
39 | # complete -o nospace -F _completeweka weka
40 | #
41 | #########################################################
42 | # Tab completion for Zsh                                #
43 | #########################################################
44 | #
45 | # export WEKAJAR="/home/joe/bin/weka.jar"
46 | # 
47 | # weka-classes () {
48 | # 	unzip -l $WEKAJAR |
49 | # 	sed -rne 's/.*(weka)\/([^g])([^$]*)\.class$/\2\3/p' |
50 | # 	tr '/' '.'
51 | # }
52 | # 
53 | # weka-folders () {
54 | # 	unzip -l $WEKAJAR |
55 | # 	sed -rne 's/.*(weka)\/([^g])([^$]*)\/$/\2\3\./p' |
56 | # 	tr '/' '.'
57 | # }
58 | # 
59 | # function _completeweka {
60 | # 	reply=($(weka-folders; weka-classes))
61 | # }
62 | # 
63 | # compctl -K _completeweka weka
64 | #
65 | #########################################################
66 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/dumbplot:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # dumbplot: Output plot on the terminal given list of X,Y coordinates.
 3 | #   Can either be a scatter plot or a bar (or bar-like) plot that
 4 | #   assumes that the y-coordinate is numeric and just displays the
 5 | #   x-coordinate data as-is, and in the order that it is fed to the
 6 | #   scripts.
 7 | #
 8 | # Dependency: gnuplot
 9 | #
10 | # Author: http://jeroenjanssens.com
11 | 
12 | usage () {
13 | cat << EOF
14 | dumbplot: Use gnuplot to quickly get ascii plot of x-y data.
15 | 
16 | usage: dumbplot OPTIONS
17 | 
18 | OPTIONS:
19 |   -a      As-is.  Use the x-coord (first coord) as categorical data for plotting.
20 |   -b      Boxplot.  Use vertical boxes rather than just a marker.  Autosets -a switch.
21 |   -w      Terminal width do use.  Defaults to actual terminal width.
22 |   -v      Terminal height to use.  Defaults to terminal height / 2.
23 |   -h      Show this message
24 | 
25 | Example usage:
26 |     $ paste -d, <(echo 1 2 4 8 9 | tr ' ' '\n') <(echo 1 2 4 2 1 | tr ' ' '\n') | dumbplot
27 |     $ paste -d, <(echo a b c d e | tr ' ' '\n') <(echo 1 2 4 10 6 | tr ' ' '\n') | dumbplot -a
28 |     $ paste -d, <(echo a b c d e | tr ' ' '\n') <(echo 1 2 4 10 6 | tr ' ' '\n') | dumbplot -b
29 |     $ paste -d, <(echo a b c d e | tr ' ' '\n') <(echo 1 2 4 10 6 | tr ' ' '\n') | dumbplot -b -w 80 -v 22
30 | 
31 | EOF
32 | }
33 | 
34 | 
35 | 
36 | 
37 | ASIS=
38 | BARCHART=
39 | PWIDTH=$(tput cols)
40 | PHEIGHT=$(tput lines)
41 | 
42 | PHEIGHT=$(echo $PHEIGHT / 2 | bc)
43 | 
44 | while getopts "abw:v:" OPTION
45 | do
46 | 	case $OPTION in
47 | 		a)
48 | 			ASIS=1
49 | 			;;
50 | 		b)
51 | 			ASIS=1
52 | 			BARCHART=1
53 | 			;;
54 | 		w)
55 |             PWIDTH=$OPTARG
56 | 			;;
57 | 		v)
58 |             PHEIGHT=$OPTARG
59 | 			;;
60 | 		h)
61 | 			usage
62 | 			exit 0
63 | 			;;
64 | 	esac
65 | done
66 | 
67 | 
68 | 
69 | 
70 | ## Decision logic to execute right gnuplot command
71 | if [ ! -z "$ASIS" ] ; then
72 |     if [ ! -z "$BARCHART" ] ; then
73 |         # Categorical data, uses boxes in plot
74 |         nl -s, -f nl | gnuplot -e "set term dumb size $PWIDTH,$PHEIGHT" -e 'set datafile separator ","; set nokey;  plot "-" using 1:3:xtic(2) with boxes'
75 |     else
76 |         # Categorical data, do NOT use boxes
77 |         nl -s, -f nl | gnuplot -e "set term dumb size $PWIDTH,$PHEIGHT" -e 'set datafile separator ","; set nokey;  plot "-" using 1:3:xtic(2)'
78 |     fi
79 | else
80 |     # Scatterplot of data
81 |     gnuplot -e "set term dumb size $PWIDTH,$PHEIGHT" -e 'set datafile separator ","; set nokey;  plot "-"'
82 | fi
83 | 
84 | 


--------------------------------------------------------------------------------
/scrape:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # scrape: Extract HTML elements using an XPath query or CSS3 selector.
 4 | #
 5 | # Example usage:
 6 | # $ curl 'https://en.wikipedia.org/wiki/List_of_sovereign_states' -s \
 7 | # | scrape -be 'table.wikitable > tbody > tr  > td > b > a'
 8 | #
 9 | # Dependencies: lxml and optionally cssselect
10 | #
11 | # Author: http://jeroenjanssens.com
12 | 
13 | import sys
14 | import argparse
15 | from lxml import etree
16 | 
17 | 
18 | def main():
19 |     parser = argparse.ArgumentParser()
20 |     parser.add_argument('html', nargs='?', type=argparse.FileType('rb'),
21 |                         default=sys.stdin, help="HTML", metavar="HTML")
22 |     parser.add_argument('-a', '--argument', default="",
23 |                         help="argument to extract from tag")
24 |     parser.add_argument('-b', '--body', action='store_true', default=False,
25 |                         help="Enclose output with HTML and BODY tags")
26 |     parser.add_argument('-e', '--expression', default=[], action='append',
27 |                         help="XPath query or CSS3 selector")
28 |     parser.add_argument('-f', '--file', default='',
29 |                         help="File to read input from")
30 |     parser.add_argument('-x', '--check-existance', action='store_true', default=False,
31 |                         help="Process return value signifying existance")
32 |     parser.add_argument('-r', '--rawinput', action='store_true', default=False,
33 |                         help="Do not parse HTML before feeding etree (useful"
34 |                         "for escaping CData)")
35 |     args = parser.parse_args()
36 | 
37 |     args.expression = [e.decode('utf-8') for e in args.expression]
38 | 
39 |     from cssselect import GenericTranslator
40 | 
41 |     expression = [e if e.startswith('//') else GenericTranslator().css_to_xpath(e) for e in args.expression]
42 | 
43 |     html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
44 |                                    strip_cdata=True)
45 | 
46 |     inp = open(args.file) if args.file else args.html
47 |     if args.rawinput:
48 |         document = etree.fromstring(inp.read())
49 |     else:
50 |         document = etree.parse(inp, html_parser)
51 | 
52 |     if args.body:
53 |         sys.stdout.write("<!DOCTYPE html>\n<html>\n<body>\n")
54 | 
55 |     for e in expression:
56 |         els = list(document.xpath(e))
57 | 
58 |         if args.check_existance:
59 |             sys.exit(1 if len(els) == 0 else 0)
60 | 
61 |         for e in els:
62 |             if isinstance(e, basestring):
63 |                 text = e
64 |             elif not args.argument:
65 |                 text = etree.tostring(e)
66 |             else:
67 |                 text = e.get(args.argument)
68 |             if text is not None:
69 |                 sys.stdout.write(text.encode('utf-8').strip() + "\t")
70 | 
71 |     if args.body:
72 |         sys.stdout.write("</body>\n</html>")
73 | 
74 |     sys.stdout.write('\n')
75 |     sys.stdout.flush()
76 | 
77 | if __name__ == "__main__":
78 |     exit(main())
79 | 


--------------------------------------------------------------------------------
/Rio:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | # Rio: Load CSV from stdin into R as a data.frame, execute given commands, and get the output as CSV or PNG on stdout
  3 | #
  4 | # Example usage:
  5 | # $ < seq 100 | Rio -nf sum (same as Rio -ne 'sum(df)')
  6 | #
  7 | # $ curl -s 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' > iris.csv
  8 | # $ < iris.csv Rio -e 'df$SepalLength^2'
  9 | # $ < iris.csv Rio -f summary
 10 | # $ < iris.csv Rio -se 'sqldf("select Name from df where df.SepalLength > 7")'
 11 | # $ < iris.csv Rio -ge 'g+geom_point(aes(x=SepalLength,y=SepalWidth,colour=Name))' > iris.png
 12 | #
 13 | # Dependency: R (with optionally the R packages ggplot2, dplyr, tidyr, sqldf, and ggmap)
 14 | #
 15 | # Author: http://jeroenjanssens.com
 16 | 
 17 | usage() {
 18 | cat << EOF
 19 | Rio: Load CSV from stdin into R as a data.frame, execute given commands, and get the output as CSV on stdout
 20 | 
 21 | usage: Rio OPTIONS
 22 | 
 23 | OPTIONS:
 24 |    -d      Delimiter
 25 |    -e      Commands to execute
 26 |    -f      Single command to execute on data.frame
 27 |    -g      Import ggplot2
 28 |    -h      Show this message
 29 |    -m      Import ggmap
 30 |    -n      CSV has no header
 31 |    -r      Import dplyr and tidyr
 32 |    -s      Import sqldf
 33 |    -b      Use same settings as used for book Data Science at the Command Line
 34 |    -v      Verbose
 35 | 
 36 | EOF
 37 | }
 38 | 
 39 | finish() {
 40 | 	rm -f $IN $OUT ${OUT%.png} ${ERR%.err}
 41 | 
 42 |         ## Removes error file if error file is empty.
 43 |     if [[ ! -s $ERR ]]; then
 44 |         rm -f $ERR
 45 |     fi
 46 | 
 47 | 	rm -f Rplots.pdf
 48 | }
 49 | 
 50 | trap finish EXIT
 51 | 
 52 | callR() {
 53 | 	Rscript --vanilla -e "options(scipen=999);df<-read.csv('${IN}',header=${HEADER},sep='${DELIMITER}',stringsAsFactors=F);${REQUIRES}${SCRIPT}last<-.Last.value;if(is.matrix(last)){last<-as.data.frame(last)};if(is.data.frame(last)){write.table(last,'${OUT}',sep=',',quote=T,qmethod='double',row.names=F,col.names=${HEADER});}else if(is.vector(last)){cat(last,sep='\\\n', file='${OUT}')}else if(exists('is.ggplot')&&is.ggplot(last)){ggsave('${OUT}',last,dpi=${RIO_DPI-72},units='cm',width=20,height=15);}else{sink('${OUT}');print(last);}"
 54 | }
 55 | 
 56 | SCRIPT=
 57 | REQUIRES=
 58 | DELIMITER=","
 59 | HEADER="T"
 60 | VERBOSE=false
 61 | 
 62 | # OSX `mktemp' requires a temp file template, but Linux `mktemp' has it as optional.
 63 | # This explicitly uses a template, which works for both.  The $TMPDIR is in case
 64 | # it isn't set as an enviroment variable, assumes you have /tmp.
 65 | if [ -z "$TMPDIR" ]; then
 66 |     TMPDIR=/tmp/
 67 | fi
 68 | IN=$(mktemp ${TMPDIR}/Rio-XXXXXXXX)
 69 | OUT=$(mktemp ${TMPDIR}/Rio-XXXXXXXX).png
 70 | ERR=$(mktemp ${TMPDIR}/Rio-XXXXXXXX).err
 71 | 
 72 | while getopts "d:hgmnprsve:f:b" OPTION
 73 | do
 74 | 	case $OPTION in
 75 | 		b)
 76 | 			RIO_DPI=300
 77 | 			;;
 78 | 		d)
 79 | 			DELIMITER=$OPTARG
 80 | 			;;
 81 | 		e)
 82 | 			SCRIPT=$OPTARG
 83 | 			if ! echo $SCRIPT | grep -qe "; *$"
 84 | 			then
 85 | 				SCRIPT="${SCRIPT};"
 86 | 			fi
 87 | 			;;
 88 | 		f)
 89 | 			SCRIPT="${OPTARG}(df);"
 90 | 			;;
 91 | 		h)
 92 | 			usage
 93 | 			exit 1
 94 | 			;;
 95 | 		g)
 96 | 			REQUIRES="${REQUIRES}require(ggplot2);g<-ggplot(df);"
 97 | 			;;
 98 | 		n)
 99 | 			HEADER="F"
100 | 			;;
101 | 		r)
102 | 			REQUIRES="${REQUIRES}require(dplyr);require(tidyr);"
103 | 			;;
104 | 		s)
105 | 			REQUIRES="${REQUIRES}require(sqldf);"
106 | 			;;
107 | 		m)
108 | 			REQUIRES="${REQUIRES}require(ggmap);"
109 | 			;;
110 | 		v)
111 | 			VERBOSE=true
112 | 			;;
113 | 		?)
114 | 			usage
115 | 			exit
116 | 		;;
117 | 	esac
118 | done
119 | 
120 | cat /dev/stdin > $IN
121 | 
122 | if $VERBOSE
123 | then
124 | 	callR
125 | else
126 | 	callR > $ERR 2>&1
127 | fi
128 | 
129 | if [[ ! -f $OUT ]]; then
130 | 	cat $ERR
131 | else
132 | 	RESULT="$(cat $OUT | tr '\0' '\n')"
133 | 	if [ "$RESULT" == "NULL" ]; then
134 | 		cat $ERR
135 | 	else
136 | 		cat $OUT
137 | 	fi 
138 | fi
139 | 
140 | 


--------------------------------------------------------------------------------
/csv2vw:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import argparse
  4 | import csv
  5 | from sys import stdin, stdout, stderr, exit
  6 | import itertools
  7 | 
  8 | 
  9 | def main():
 10 |     parser = argparse.ArgumentParser(
 11 |         epilog="""If both --classes and --auto-relabel are omitted,
 12 |         label values are left as-is. By default, features with value 0 are not
 13 |         printed. This can be overridden with --null""",
 14 |         usage="""%(prog)s [OPTION]... [FILE]
 15 | 
 16 | Convert CSV to Vowpal Wabbit input format.
 17 | 
 18 |   Examples:
 19 | 
 20 |   # Leave label values as is:
 21 |   $ csv2vw spam.csv --label target
 22 | 
 23 |   # Relabel values 'ham' to 0 and 'spam' to 1:
 24 |   $ csv2vw spam.csv --label target --classes ham,spam
 25 | 
 26 |   # Relabel values 'ham' to -1 and 'spam' to +1 (needed for logistic loss):
 27 |   $ csv2vw spam.csv --label target --classes ham,spam --minus-plus-one
 28 | 
 29 |   # Relabel first label value to 0, second to 1, and ignore the rest:
 30 |   $ csv2vw iris.csv -lspecies --auto-relabel --ignore-extra-classes
 31 | 
 32 |   # Relabel first label value to 1, second to 2, and so on:
 33 |   $ <iris.csv csv2vw -lspecies --multiclass --auto-relabel
 34 | 
 35 |   # Relabel 'versicolor' to 1, 'virginica' to 2, and 'setosa' to 3
 36 |   $ <iris.csv csv2vw -lspecies --multiclass -cversicolor,virginica,setosa""")
 37 | 
 38 |     parser.add_argument("file", nargs="?", type=argparse.FileType("r"),
 39 |                         default=stdin,
 40 |                         help="""Input CSV file. If omitted,
 41 |                         read from standard input.""",
 42 |                         metavar="FILE")
 43 |     parser.add_argument("-d", "--delimiter",
 44 |                         help="""Delimiting character of the input CSV file
 45 |                         (default: ,).""",
 46 |                         default=",")
 47 |     parser.add_argument("-l", "--label",
 48 |                         help="""Name of column that contains the class
 49 |                         labels.""")
 50 |     parser.add_argument("-c", "--classes",
 51 |                         help="""Ordered, comma-separated list of possible
 52 |                         class labels to relabel. If not specifying all possible
 53 |                         class labels, use --auto-relabel.""",
 54 |                         nargs="?")
 55 |     parser.add_argument("-n", "--null",
 56 |                         help="""Comma-separated list of null values (default:
 57 |                         '0').""",
 58 |                         nargs="?", default="0")
 59 |     parser.add_argument("-a", "--auto-relabel",
 60 |                         help="""Automatically relabel class labels in the order
 61 |                         in which they appear in the CSV file.""",
 62 |                         action="store_true")
 63 |     parser.add_argument("-m", "--multiclass",
 64 |                         help="""Indicates more than two classes; will start
 65 |                         counting at 1 instead of 0.""",
 66 |                         action="store_true")
 67 |     parser.add_argument("-+", "--minus-plus-one",
 68 |                         help="""Instead of relabeling to integers, relabel to
 69 |                         '-1' and '+1'. Needed when using VW with logistic or
 70 |                         hinge loss.""", action="store_true")
 71 |     parser.add_argument("-i", "--ignore-extra-classes",
 72 |                         help="""If there are more than two classes found, when
 73 |                         not using --multiclass, include the example with no
 74 |                         label instead of giving skipping it.""",
 75 |                         action="store_true")
 76 |     parser.add_argument("-t", "--tag",
 77 |                         help="""Name of column that contains the tags.""")
 78 | 
 79 |     args = parser.parse_args()
 80 | 
 81 |     auto_relabel = args.auto_relabel
 82 |     label_column = args.label
 83 |     tag_column = args.tag
 84 |     null_values = args.null.split(",")
 85 |     multiclass = args.multiclass
 86 |     minus_plus_one = args.minus_plus_one
 87 | 
 88 |     if minus_plus_one:
 89 |         new_classes = iter(["-1", "+1"])
 90 |     elif multiclass:
 91 |         new_classes = (str(i) for i in itertools.count(1))
 92 |     elif args.classes or auto_relabel:
 93 |         new_classes = iter(["0", "1"])
 94 |     else:
 95 |         new_classes = None
 96 | 
 97 |     if args.classes:
 98 |         old_classes = args.classes.split(",")
 99 |         relabel = dict(zip(old_classes, new_classes))
100 |     else:
101 |         relabel = dict()
102 | 
103 |     reader = csv.DictReader(args.file, delimiter=args.delimiter)
104 |     try:
105 |         for row in reader:
106 |             label = row.pop(label_column, "")
107 |             tag = row.pop(tag_column, "")
108 | 
109 |             if auto_relabel or new_classes:
110 |                 if auto_relabel:
111 |                     if label not in relabel:
112 |                         try:
113 |                             relabel[label] = next(new_classes)
114 |                         except StopIteration:
115 |                             if args.ignore_extra_classes:
116 |                                 relabel[label] = ""
117 |                             else:
118 |                                 stderr.write("Found too many different classes;"
119 |                                              " skipping example. Use "
120 |                                              "--multiclass or "
121 |                                              "--ignore-extra-classes.\n")
122 |                                 continue
123 |                 label = relabel[label]
124 | 
125 |             features = " ".join([k + ":" + v for k, v in sorted(row.items())
126 |                                  if v not in null_values])
127 |             line = label + " " + tag + "| " + features + "\n"
128 |             stdout.write(line)
129 |             stdout.flush()
130 |     except (IOError, KeyboardInterrupt, BrokenPipeError):
131 |         stderr.close()
132 | 
133 | if __name__ == "__main__":
134 |     exit(main())
135 | 


--------------------------------------------------------------------------------