├── README.md └── rplot.R /README.md: -------------------------------------------------------------------------------- 1 | ### Example usage 2 | 3 | # Get the script 4 | curl https://raw.githubusercontent.com/geotheory/r-plot/master/rplot.R > rplot.R 5 | 6 | # Help 7 | Rscript rplot.R -h | more 8 | 9 | # Get some data 10 | Rscript -e "write.csv(mtcars, 'mtcars.csv')"; head -6 mtcars.csv 11 | 12 | # Scatterplot - car weight vs fuel efficiency (with correlation) 13 | Rscript rplot.R mtcars.csv wt mpg -R 14 | 15 | # Hashplot - car fuel efficiencies 16 | Rscript rplot.R mtcars.csv 1 mpg 17 | 18 | # Ordering plot data 19 | Rscript rplot.R mtcars.csv 1 mpg -o 20 | 21 | # bash function (e.g. for .bash_profile) 22 | rplot() { Rscript rplot.R "$1" ${*:2}; } 23 | 24 | # Iris dataset 25 | Rscript -e "write.csv(iris, 'iris.csv')"; head -6 iris.csv 26 | 27 | # Aggregating data for categorical averages 28 | rplot iris.csv Species Sepal.Length -am 29 | 30 | # Plotting a single numeric variable 31 | rplot iris.csv Sepal.Width 32 | 33 | # Reordered and without summary 34 | rplot iris.csv Sepal.Width -ox 35 | 36 | # Change scatterplot size 37 | rplot iris.csv Sepal.Width -ox -r 40 -c 80 38 | 39 | # Histogram of a single numeric variable (20 bins) 40 | rplot iris.csv Sepal.Width -F -b 20 41 | 42 | # Single categorical variable frequency (ie. aggregate by length) 43 | rplot iris.csv Species -al 44 | 45 | # Other data formats (eg. semicolon-separated) 46 | Rscript -e "print(names(airquality)); write.table(airquality, 'airquality.csv', sep=';', row.names=T, col.names=F)"; head -3 airquality.csv 47 | 48 | # Specifying separating-character, no-header row, and fields by column index (eg. 
Ozone 'V2' and Temp 'V5') 49 | rplot airquality.csv 2 5 -ns ";" 50 | 51 | # Passing r-plot a bash text object instead of file 52 | rplot "$(cat mtcars.csv)" mpg disp -x 53 | 54 | # Just view aggregated data without plot 55 | rplot mtcars.csv gear mpg -HamzQ 56 | 57 | # Useful functions to report header row - e.g. `headz iris.csv ";"` (specify delimiter if not comma) 58 | 59 | headx() { Rscript -e "l<-scan(text='$(head -1 $1)',what='character',sep=ifelse('$2'=='',',','$2'),quiet=T); for(i in 1:length(l)) cat(i,l[i],'\\\n')"; } 60 | 61 | headz() { Rscript -e "a=commandArgs(T);l=read.table(text=a[1],se=ifelse('$2'=='',',','$2'),strin=F,h=T);for(i in 1:ncol(l)){f=l[,i];cat(i,' ',names(l)[i],' <',class(f),'> ',strtrim(paste(f,collapse=', '),min(80,as.integer(system('tput cols',int=T)))),'\\\n',sep='')}" "$(head -60 $1)"; } 62 | 63 | ----------------------------------------------- 64 | 65 | ### Output from above 66 | 67 | $ curl https://raw.githubusercontent.com/geotheory/r-plot/master/rplot.R > rplot.R 68 | % Total % Received % Xferd Average Speed Time Time Time Current 69 | Dload Upload Total Spent Left Speed 70 | 100 13386 100 13386 0 0 5739 0 0:00:02 0:00:02 --:--:-- 5740 71 | 72 | $ # Help 73 | 74 | $ Rscript rplot.R -h | more 75 | 76 | ********************** 77 | ******* R-PLOT ******* 78 | ********************** 79 | 80 | This library plots a scatterplot or hashbar plot (bars made of hashes!) of a csv or a similarly formatted 81 | file or string in your console. If 2 numeric id_fields are provided a scatterplot will default, else hashbars. 82 | Required arguments: csv file/string, then column name(s)/index(ices) (values-column last for hashbars) 83 | 84 | NB read.table check.names=T so e.g. numeric colnames prepend 'X' and those with spaces have spaces replaced by '.'. 85 | Use '-Pz | head' to suppress the plot and see the colnames that are read in.. 
86 | 87 | USAGE 88 | Example csv call - scatterplot (by column name): 89 | "Rscript rplot.R file.csv num_field1 num_field2" 90 | Example csv call - scatterplot (by column index): 91 | "Rscript rplot.R file.csv col#1 col#3" 92 | Example csv call - hashbar plot: 93 | "Rscript rplot.R file.csv id_field1 id_field2 value_field" 94 | Example csv text string call: 95 | "Rscript rplot.R "$(cat file.csv)" id_field value_field" 96 | For convenience you can set up an alias in .bash_profile or equivalent, e.g. 97 | "rplot() { Rscript /pathto/rplot.R "$1" ${*:2}; }" 98 | and call with: 99 | "rplot file.csv field1 field2 etc.." 100 | 101 | OPTIONS: 102 | Data handling: 103 | -n Specify no header row for input data. Use col indices instead 104 | -s sep character for input data (default `,`). Requires value e.g. ";" "\t" "" (inc. quotes) 105 | -a Aggregate (default `sum`) a hashbar plot data by its categorical variables 106 | -m Aggregate by `mean` if `-a` selected 107 | -l Aggregate by `length` (count instances) if `-a` selected 108 | Plotting: 109 | -o Reorder hashbar chart by value (also reorders data.frames) 110 | -H Override a default scatterplot with hashbar plot 111 | -S Override a default hashbar plot with scatterplot (NA values are removed) 112 | -r Scatterplot rows/height (default 20). Requires following value. 113 | -c Scatterplot cols/width (default 50). Requires following value. 114 | -p pch char (defaults: `#` hashbars, `*` scatterplots without overplotting, 115 | `. : ■ █` scatterplots with o/p). Requires 1 char eg. `-p "."` (eg. with -y) 116 | or a 4 char string eg. ".°*@" to change overplot symbols (inc. quotes) 117 | -R Add r2 correlation (bivariate only)) 118 | -x Suppress summary in case of scatterplot 119 | -y Suppress scatterplot point symbols (that show overplotting) 120 | -z Suppress plot (eg. 
use with -P or -Q) 121 | Other: 122 | -h Call this help (also --help) 123 | -P Output raw data.frame to console (truncated 1000 rows) 124 | -Q Output processed data.frame to console (truncated 1000 rows) 125 | 126 | ^[[B 127 | $ # Get some data 128 | 129 | $ Rscript -e "write.csv(mtcars, 'mtcars.csv')"; head -6 mtcars.csv 130 | "","mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb" 131 | "Mazda RX4",21,6,160,110,3.9,2.62,16.46,0,1,4,4 132 | "Mazda RX4 Wag",21,6,160,110,3.9,2.875,17.02,0,1,4,4 133 | "Datsun 710",22.8,4,108,93,3.85,2.32,18.61,1,1,4,1 134 | "Hornet 4 Drive",21.4,6,258,110,3.08,3.215,19.44,1,0,3,1 135 | "Hornet Sportabout",18.7,8,360,175,3.15,3.44,17.02,0,0,3,2 136 | 137 | $ # Scatterplot - car weight vs fuel efficiency (with correlation) 138 | 139 | $ Rscript rplot.R mtcars.csv wt mpg -R 140 | __________________________________________________ 141 | | . | Points 142 | | . | . 1 (x̄ 1.0) 143 | |.. | : 2 (x̄ 2.0) 144 | | | 145 | | | 146 | | . | 147 | | . | 148 | | . | 149 | | . . | 150 | | | mpg 151 | | . . . . . | 152 | | . . . | 153 | | : | 154 | | . . | 155 | | . . . | 156 | | .: . . | 157 | | . | 158 | | | 159 | | | 160 | | . .| R²=-0.868 161 | |__________________________________________________| 162 | wt 163 | 32 data rows plotted 164 | wt mpg 165 | Min. :1.513 Min. :10.40 166 | 1st Qu.:2.581 1st Qu.:15.43 167 | Median :3.325 Median :19.20 168 | Mean :3.217 Mean :20.09 169 | 3rd Qu.:3.610 3rd Qu.:22.80 170 | Max. :5.424 Max. 
:33.90 171 | 172 | $ # Hashplot - car fuel efficiencies 173 | 174 | $ Rscript rplot.R mtcars.csv 1 mpg 175 | 32 data rows plotted 176 | X mpg 177 | Mazda RX4 21 ########################################## 178 | Mazda RX4 Wag 21 ########################################## 179 | Datsun 710 22.8 ############################################## 180 | Hornet 4 Drive 21.4 ########################################### 181 | Hornet Sportabout 18.7 ###################################### 182 | Valiant 18.1 #################################### 183 | Duster 360 14.3 ############################# 184 | Merc 240D 24.4 ################################################# 185 | Merc 230 22.8 ############################################## 186 | Merc 280 19.2 ####################################### 187 | Merc 280C 17.8 #################################### 188 | Merc 450SE 16.4 ################################# 189 | Merc 450SL 17.3 ################################### 190 | Merc 450SLC 15.2 ############################## 191 | Cadillac Fleetwood 10.4 ##################### 192 | Lincoln Continental 10.4 ##################### 193 | Chrysler Imperial 14.7 ############################# 194 | Fiat 128 32.4 ################################################################# 195 | Honda Civic 30.4 ############################################################# 196 | Toyota Corolla 33.9 ##################################################################### 197 | Toyota Corona 21.5 ########################################### 198 | Dodge Challenger 15.5 ############################### 199 | AMC Javelin 15.2 ############################## 200 | Camaro Z28 13.3 ########################### 201 | Pontiac Firebird 19.2 ####################################### 202 | Fiat X1-9 27.3 ####################################################### 203 | Porsche 914-2 26 #################################################### 204 | Lotus Europa 30.4 ############################################################# 205 | Ford Pantera 
L 15.8 ################################ 206 | Ferrari Dino 19.7 ######################################## 207 | Maserati Bora 15 ############################## 208 | Volvo 142E 21.4 ########################################### 209 | 210 | $ # Ordering plot data 211 | 212 | $ Rscript rplot.R mtcars.csv 1 mpg -o 213 | 32 data rows plotted 214 | X mpg 215 | Toyota Corolla 33.9 ##################################################################### 216 | Fiat 128 32.4 ################################################################# 217 | Honda Civic 30.4 ############################################################# 218 | Lotus Europa 30.4 ############################################################# 219 | Fiat X1-9 27.3 ####################################################### 220 | Porsche 914-2 26 #################################################### 221 | Merc 240D 24.4 ################################################# 222 | Datsun 710 22.8 ############################################## 223 | Merc 230 22.8 ############################################## 224 | Toyota Corona 21.5 ########################################### 225 | Hornet 4 Drive 21.4 ########################################### 226 | Volvo 142E 21.4 ########################################### 227 | Mazda RX4 21 ########################################## 228 | Mazda RX4 Wag 21 ########################################## 229 | Ferrari Dino 19.7 ######################################## 230 | Merc 280 19.2 ####################################### 231 | Pontiac Firebird 19.2 ####################################### 232 | Hornet Sportabout 18.7 ###################################### 233 | Valiant 18.1 #################################### 234 | Merc 280C 17.8 #################################### 235 | Merc 450SL 17.3 ################################### 236 | Merc 450SE 16.4 ################################# 237 | Ford Pantera L 15.8 ################################ 238 | Dodge Challenger 15.5 
############################### 239 | Merc 450SLC 15.2 ############################## 240 | AMC Javelin 15.2 ############################## 241 | Maserati Bora 15 ############################## 242 | Chrysler Imperial 14.7 ############################# 243 | Duster 360 14.3 ############################# 244 | Camaro Z28 13.3 ########################### 245 | Cadillac Fleetwood 10.4 ##################### 246 | Lincoln Continental 10.4 ##################### 247 | 248 | $ # bash function (e.g. for .bash_profile) 249 | 250 | $ rplot() { Rscript rplot.R "$1" ${*:2}; } 251 | 252 | $ # Iris dataset 253 | 254 | $ Rscript -e "write.csv(iris, 'iris.csv')"; head -6 iris.csv 255 | "","Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species" 256 | "1",5.1,3.5,1.4,0.2,"setosa" 257 | "2",4.9,3,1.4,0.2,"setosa" 258 | "3",4.7,3.2,1.3,0.2,"setosa" 259 | "4",4.6,3.1,1.5,0.2,"setosa" 260 | "5",5,3.6,1.4,0.2,"setosa" 261 | 262 | $ # Aggregating data for categorical averages 263 | 264 | $ rplot iris.csv Species Sepal.Length -am 265 | Aggregate function is mean 266 | 150 data rows plotted 267 | Species Sepal.Length 268 | setosa 5.01 ##################################################### 269 | versicolor 5.94 ############################################################### 270 | virginica 6.59 ###################################################################### 271 | 272 | $ # Plotting a single numeric variable 273 | 274 | $ rplot iris.csv Sepal.Width 275 | __________________________________________________ 276 | | . | Points 277 | | . | . 1 (x̄ 1.0) 278 | | . | : 2 (x̄ 2.0) 279 | | . .. | ■ 3 (x̄ 3.0) 280 | | .. .. . . | 281 | | . . . | 282 | | . . . . | 283 | |.... ...:...... . . . | 284 | | . . . . . . | 285 | |. . . . .: . .. . . . | Sepal.Width 286 | |. . . . . . . :: | 287 | |. ... . . . .:: :: ... ■ ::.. . .. .. ...| 288 | | .. .: . . ...:: | 289 | | . . .. . . . . . | 290 | | . .. . . | 291 | | . .. : . . .. . . | 292 | | . . . . | 293 | | . . . | 294 | | | 295 | | . 
| 296 | |__________________________________________________| 297 | Index 298 | 150 data rows plotted 299 | Index Sepal.Width 300 | Min. : 1.00 Min. :2.000 301 | 1st Qu.: 38.25 1st Qu.:2.800 302 | Median : 75.50 Median :3.000 303 | Mean : 75.50 Mean :3.057 304 | 3rd Qu.:112.75 3rd Qu.:3.300 305 | Max. :150.00 Max. :4.400 306 | 307 | $ # Reordered and without summary 308 | 309 | $ rplot iris.csv Sepal.Width -ox 310 | __________________________________________________ 311 | | .| Points 312 | | . | . 1 (x̄ 1.0) 313 | | . | : 2 (x̄ 2.0) 314 | | :. | ■ 3 (x̄ 3.0) 315 | | :■. | █ 4 (x̄ 4.0) 316 | | :. | 317 | | ■. | 318 | | ■■■■■■ | 319 | | ■■ | 320 | | .■■■■ | Sepal.Width 321 | | ■■■: | 322 | | :■■■■■■■■█■■ | 323 | | .■■■■. | 324 | | .■■: | 325 | | ■: | 326 | | :■■■ | 327 | | ■. | 328 | |■ | 329 | | | 330 | |. | 331 | |__________________________________________________| 332 | Index 333 | 334 | $ # Change scatterplot size 335 | 336 | $ rplot iris.csv Sepal.Width -ox -r 40 -c 80 337 | ________________________________________________________________________________ 338 | | .| Points 339 | | | . 1 (x̄ 1.0) 340 | | | : 2 (x̄ 2.0) 341 | | . | 342 | | . | 343 | | | 344 | | . | 345 | | | 346 | | : | 347 | | ::: | 348 | | | 349 | | .: | 350 | | | 351 | | :.. | 352 | | ::: | 353 | | | 354 | | .:.:::: | 355 | | .::. | 356 | | | 357 | | .::.:::. | Sepal.Width 358 | | | 359 | | :::::. | 360 | | :::::.:::::::. | 361 | | | 362 | | .:.::: | 363 | | | 364 | | .::::::. | 365 | | .:::: | 366 | | | 367 | | ::. | 368 | | .::.: | 369 | | | 370 | | :. | 371 | | | 372 | | :: | 373 | |.: | 374 | | | 375 | | | 376 | | | 377 | |. 
| 378 | |________________________________________________________________________________| 379 | Index 380 | 381 | $ # Histogram of a single numeric variable (20 bins) 382 | 383 | $ rplot iris.csv Sepal.Width -F -b 20 384 | 150 data rows plotted 385 | Sepal.Width frequency 386 | -- 2 -- 1 ## 387 | 3 ###### 388 | 4 ######## 389 | -- 2.5 -- 11 ###################### 390 | 5 ########## 391 | 9 ################## 392 | 14 ############################ 393 | -- 3 -- 36 ######################################################################## 394 | 11 ###################### 395 | 13 ########################## 396 | 6 ############ 397 | -- 3.5 -- 18 #################################### 398 | 4 ######## 399 | 3 ###### 400 | 6 ############ 401 | -- 4 -- 3 ###### 402 | 1 ## 403 | 1 ## 404 | 1 ## 405 | 406 | $ # Single categorical variable frequency (ie. aggregate by length) 407 | 408 | $ rplot iris.csv Species -al 409 | Aggregate function is length 410 | 150 data rows plotted 411 | Species length 412 | setosa 50 ############################################################################ 413 | versicolor 50 ############################################################################ 414 | virginica 50 ############################################################################ 415 | 416 | $ # Other data formats (eg. semicolon-seperated) 417 | 418 | $ Rscript -e "print(names(airquality)); write.table(airquality, 'airquality.csv', sep=';', row.names=T, col.names=F)"; head -3 airquality.csv 419 | [1] "Ozone" "Solar.R" "Wind" "Temp" "Month" "Day" 420 | "1";41;190;7.4;67;5;1 421 | "2";36;118;8;72;5;2 422 | "3";12;149;12.6;74;5;3 423 | 424 | $ # Specifying seperating-character, no-header row, and fields by column index (eg. Ozone 'V2' and Temp 'V5') 425 | 426 | $ rplot airquality.csv 2 5 -ns ";" 427 | __________________________________________________ 428 | | . . | Points 429 | | . . . . | . 1 (x̄ 1.0) 430 | | . .: | : 2 (x̄ 2.0) 431 | | . .. . . | ■ 3 (x̄ 3.0) 432 | | . . . :. 
| 433 | | . . ■ . ... . | 434 | | . . .: . | 435 | | . :..:.:..:. . .| 436 | | . . . .. . | 437 | | :.■■.. . .. | V5 438 | | . :.. | 439 | | ■. . : | 440 | | .. . . | 441 | | . .... . | 442 | | . . .. . . | 443 | | : .. | 444 | | ... | 445 | |. . . | 446 | |. .. | 447 | | . | 448 | |__________________________________________________| 449 | V2 450 | 116 data rows plotted. 37 rows with NA values omitted 451 | V2 V5 452 | Min. : 1.00 Min. :57.00 453 | 1st Qu.: 18.00 1st Qu.:71.00 454 | Median : 31.50 Median :79.00 455 | Mean : 42.13 Mean :77.87 456 | 3rd Qu.: 63.25 3rd Qu.:85.00 457 | Max. :168.00 Max. :97.00 458 | 459 | $ # Passing r-plot a bash text object instead of file 460 | 461 | $ rplot "$(cat mtcars.csv)" mpg disp -x 462 | __________________________________________________ 463 | |: | Points 464 | | . | . 1 (x̄ 1.0) 465 | | | : 2 (x̄ 2.0) 466 | | . | 467 | | | 468 | | . . . . | 469 | | | 470 | | : | 471 | | . | 472 | | . . . | disp 473 | | . | 474 | | . | 475 | | | 476 | | | 477 | | . . : | 478 | | . . . | 479 | | .. . | 480 | | . . | 481 | | . . . | 482 | | .| 483 | |__________________________________________________| 484 | mpg 485 | 486 | $ # Just view aggregated data without plot 487 | 488 | $ rplot mtcars.csv gear mpg -HamzQ 489 | Aggregate function is mean 490 | gear mpg 491 | 1 3 16.10667 492 | 2 4 24.53333 493 | 3 5 21.38000 494 | 32 data rows plotted 495 | 496 | -------------------------------------------------------------------------------- /rplot.R: -------------------------------------------------------------------------------- 1 | # A small library for rendering bar or scatterplots of csv data to console using ascii characters. 2 | # Call `Rscript rplot.R -h` for instructions.. 3 | # by @geotheory | geotheory.co.uk 2016 4 | 5 | # manage input arguments 6 | args_in = commandArgs(trailingOnly=T) 7 | 8 | # arguments that require a following value (e.g. 
#   "-p '+'")
# Each entry is c(flag, default). Element 1 is the command-line flag; element 2 is the
# value used unless the loop further down overwrites it with the token that follows the
# flag. c() coerces every entry to character, so numeric defaults are stored as strings.
pars = list(sep=c('-s',','), quote=c('-q',"\"'"), pch=c('-p','*'), x=c('-c',50),
            y=c('-r',20), bins=c('-b',15), X=c('-X','%'), size=c('-d',NA), asp=c('-A',1))

# split up combined arguments (e.g. '-am' for aggregate by mean)
# Any token starting with '-' is exploded into one '-<char>' flag per character after
# the dash (at most 99 characters are read); all other tokens pass through unchanged.
args = c(unlist(sapply(args_in, function(a) {
  if(substr(a,1,1)=='-') return(paste0('-',strsplit(substr(a,2,100),'')[[1]])) else a
}), use.names = F), '--') # '--' added in case -A included without an argument

# update pars with any values received on the command line
# For each value-taking flag found in args, the token right after its first occurrence
# is taken as the value (unless that token itself starts with '-') and is then removed
# from args so it is not later treated as a field name.
for(i in 1:length(pars)) {
  p = pars[[i]]
  if(p[1] %in% args) {
    n = match(p[1], args) + 1
    if(substr(args[n],1,1) != '-') {
      if(p[1] == '-s') {
        if(is.na(args[n])) args[n] = "" # whitespace separator
        if(args[n] == "\\t") args[n] = "\t" # string to tab char
      }
      pars[[i]][2] = args[n]
      args = args[-n]
    }
  }
}

# Tokens that still start with '-' are option flags; everything else is positional
# (the data file/string first, then field names or column indices).
plot_args = args[substr(args, 1, 1) == '-']
field_args = args[!substr(args, 1, 1) == '-']
# NOTE(review): the default for asp in pars is already 1, so this guard looks
# redundant - confirm before removing.
if('-A' %in% plot_args) if(is.na(pars$asp[2])) pars$asp[2] = 1 # when no argument provided

# Print the usage text and exit when help is requested ('--help' is matched against the
# raw arguments because the splitting above breaks it into single-letter flags).
if(any(c('-h','--help') %in% args_in | '-h' %in% plot_args)) {
  cat("**********************
****** R-PLOT ******
**********************

This library plots a scatterplot or hashbar plot (bars made of hashes!) of a csv or a similarly formatted
file or string in your console. If 2 numeric id_fields are provided a scatterplot will default, else hashbars.
Required arguments: csv file/string, then column name(s)/index(ices) (values-column last for hashbars)

NB read.table check.names=T so e.g. numeric colnames prepend 'X' and those with spaces have spaces replaced by '.'.
Use '-Pz | head' or '-O' to suppress plots and investigate data read-in.

USAGE
Example csv call - scatterplot (by column name):
\"Rscript rplot.R file.csv num_field1 num_field2\"
Example csv call - scatterplot (by column index):
\"Rscript rplot.R file.csv col#1 col#3\"
Example csv call - hashbar plot:
\"Rscript rplot.R file.csv id_field1 id_field2 value_field\"
Example csv text string call:
\"Rscript rplot.R \"$(cat file.csv)\" id_field value_field\"
For convenience you can set up an alias in .bash_profile or equivalent, e.g.
\"rplot() { Rscript /pathto/rplot.R \"$1\" ${*:2}; }\"
and call with:
\"rplot file.csv field1 field2 etc..\"

OPTIONS:
Data handling:
-n Specify no header row for input data. Use col indices instead
-s sep character for input data (default `,`). Requires value e.g. \";\" \"\t\" \"\" (inc. quotes)
-q quote character for input data. Requires value e.g. \"'\" or '\"' (inc. outer quotes)
-X Non-numeric characters to remove from numbers (other than \"{space} , $ £ € %\")
-a Aggregate (default `sum`) a hashbar plot data by its categorical variables
-m Aggregate by `mean` if `-a` selected
-M Aggregate by `median` if `-a` selected
-l Aggregate by `length` (count instances) if `-a` selected
-b Histogram bins (default 15) if `-F` selected. Requires following value
Plotting:
-o Reorder hashbar chart by value (also reorders data.frames)
-H Override default scatterplot with hashbar plot
-S Override default hashbar plot with scatterplot (NA values are removed)
-F Override default scatter/hash plot with frequency histogram (requires single numeric field)
-r Scatterplot rows/height (default 20). Requires following value
-c Scatterplot cols/width (default 50). Requires following value
-d Quick plot-size tool. Requires argument: 'l'/'s' large/small
-A Fix y/x aspect ratio.
Without argument defaults to 1, otherwise value given
-p pch char (defaults: `#` hashbars, `*` scatterplots without overplotting,
`. : ■ █` scatterplots with o/p). Requires 1 char eg. `-p \".\"` (eg. with -y)
or a 4 char string eg. \".°*@\" to change overplot symbols (inc. quotes)
-R Add r2 correlation (bivariate only))
-w Remove ID col max width constraints in hash plots, and scale to full console width
-x Suppress summary in case of scatterplot
-y Suppress scatterplot point symbols (that show overplotting)
-z Suppress plot (eg. use with -P or -Q)
Other:
-h Call this help (also --help)
-O Inspect data.frame before and after numeric parsing and quit
-P Output raw data.frame to console and quit (truncated 1000 rows)
-Q Output processed data.frame to console and quit (truncated 1000 rows)
")
  quit()
}

# 2 functions from {scales}: included as R's library path sometimes isn't accessible from a console call

# zero_range: TRUE when the range `x` (length 1 or 2) has effectively zero width.
#   x   - a single value, or a c(lower, upper) pair
#   tol - relative tolerance applied to |x1 - x2| / min(|x|)
# Returns TRUE for a single value or equal endpoints, NA when either endpoint is NA,
# FALSE for two infinite endpoints that differ or when the smaller absolute endpoint
# is 0; stops when `x` has any other length.
zero_range = function (x, tol = 1000 * .Machine$double.eps) {
  if (length(x) == 1) return(TRUE)
  if (length(x) != 2) stop("x must be length 1 or 2")
  if (any(is.na(x))) return(NA)
  if (x[1] == x[2]) return(TRUE)
  if (all(is.infinite(x))) return(FALSE)
  m <- min(abs(x))
  if (m == 0) return(FALSE)
  abs((x[1] - x[2])/m) < tol
}

# rescale: linearly map `x` from the interval `from` onto the interval `to`.
#   x      - numeric vector
#   to     - target c(lower, upper)
#   from   - source c(lower, upper); defaults to the observed range of x (NAs ignored)
#   finite - accepted for signature compatibility; not used in this body
# When either interval has zero width every non-NA element becomes mean(to).
rescale = function (x, to=c(0,1), from, finite=T) {
  if(missing(from)) from = range(x, na.rm=T)
  if(zero_range(from) || zero_range(to)) return(ifelse(is.na(x), NA, mean(to)))
  (x - from[1])/diff(from) * diff(to) + to[1]
}

# map: rescale `x` onto [1, n] and floor it, giving whole-number grid positions.
map = function(x, n) floor(rescale(x, to=c(1,n)))

# coerce set to numeric if possible, else return FALSE
#   n - a vector (typically one data column, or a single field argument)
# Numeric input is returned untouched. Otherwise spaces, commas, currency signs
# ($ £ €), percent signs and the user-supplied `-X` pattern (pars$X[2], interpreted as
# a regular expression) are stripped; if every non-NA element then parses as a number
# the numeric vector is returned, else FALSE.
num = function(n){
  if(is.numeric(n)) return(n)
  # '$' must be escaped: unescaped it is the end-of-string anchor in a regex and
  # would never match a literal dollar sign.
  strip_pattern = paste(c(' ', ',', '\\$', '£', '€', '%', pars$X[2]), collapse='|')
  n = gsub(strip_pattern, '', n)
  is.num = all(!is.na(suppressWarnings(as.numeric(na.omit(n)))))
  if(is.num) return(as.numeric(n))
  return(FALSE)
}

# because formatC can't quite cut it
# format_num: format a numeric vector for display, choosing the notation per element
# by magnitude (scientific for >= 1e7 and for non-zero values below 0.001, thousands
# separators and 0-3 decimals in between, '0' for zero). If all values print with the
# same number of characters and are all whole numbers (e.g. years) the input is
# returned unchanged.
format_num = function(x) {
  if(length(unique(nchar(x)))==1 & sum(x%%1) == 0) return(x) # year
  f1 = abs(x) >= 10000000
  f2 = abs(x) >= 100 & abs(x) < 10000000
  f3 = abs(x) >= 1 & abs(x) < 100
  f4 = abs(x) >= 0.001 & abs(x) < 1
  f5 = x == 0
  f6 = !f1 & !f2 & !f3 & !f4 & !f5
  out = x
  out[f1] = formatC(x[f1], digits=2, format = "e")
  out[f2] = formatC(round(x[f2], 0), digits=1, big.mark=',', format = "f", drop0trailing=T)
  out[f3] = formatC(round(x[f3], 2), digits=2, big.mark=',', format = "f", drop0trailing=T)
  out[f4] = formatC(x[f4], digits=3, format = "f", drop0trailing=T)
  out[f5] = '0'
  out[f6] = formatC(x[f6], digits=2, format = "e")
  out
}

# scatter_plot: draw an ASCII scatterplot of y against x on a cols x rows character grid.
#   x, y       - coordinates; when x is missing the index 1..length(y) is used
#   cols, rows - grid width / height in characters
#   pch        - 1 character (plain point symbol) or 4 characters (overplot symbols)
#   xlab, ylab - axis labels; when they are equal the x axis is labelled "Index"
# Reads the script-level objects plot_args, pars, nrows and d_orig. With '-Q' the
# processed data is printed and the script quits; with '-z' it quits without plotting.
# Stops if pch is neither 1 nor 4 characters long.
scatter_plot = function(x, y, cols=50, rows=20, pch="*", xlab="x", ylab="Y") {
  y0 = y # unsorted copy, used for the correlation
  if('-o' %in% plot_args) y = sort(as.numeric(y))
  # resolve x before anything reads it (the aspect-ratio branch below needs it)
  if(missing(x)) x <- seq_along(y)
  else x <- as.numeric(x)
  if('-A' %in% plot_args){
    data_asp = diff(range(y)) / diff(range(x))
    rows = ceiling(cols * data_asp * as.numeric(pars$asp[2]) * 2/5)
  }
  if(xlab == ylab) xlab = "Index"
  symbs = c('.', ':', '■', '█')
  if(nchar(pch)==4) symbs = strsplit(pch, '')[[1]]
  if(nchar(pch)!=1 && nchar(pch)!=4) stop("pch must be 1 or 4 characters long")

  # output processed data.frame to console
  orig_dat = data.frame(x, y, stringsAsFactors=FALSE)
  names(orig_dat) = c(xlab, ylab)
  if('-Q' %in% plot_args) {
    print(head(orig_dat,1000))
    quit()
  }
  if('-z' %in% plot_args) quit()

  # quick plot-size presets; identical() keeps a missing value (NA) from breaking `if`
  if('-d' %in% plot_args){
    size = pars$size[2]
    if(identical(size, 'l')){
      rows = 40; cols = 100
    } else if(identical(size, 's')){
      rows = 10; cols = 25
    } else warning('-d parameter requires either \'l\' or \'s\' input')
  }

  # rescale to grid and count point overplotting: one row per occupied cell
  cell_counts = as.data.frame(table(paste(map(x,cols), map(-y,rows))), stringsAsFactors=FALSE) # summarise
  cell_xy = strsplit(cell_counts[[1]], split=' ')
  # columns are parsed one by one so a single occupied cell still yields a 1-row frame
  grid = data.frame(x = as.numeric(vapply(cell_xy, `[`, character(1), 1)),
                    y = as.numeric(vapply(cell_xy, `[`, character(1), 2)),
                    freq = as.numeric(cell_counts$Freq)) # parse
  op = max(grid$freq)
  pr_labs = FALSE; labs = NULL

  if(op > 1 && !'-y' %in% plot_args) { # overplotting and not manually over-riden
    if(nchar(pch)==1 && '-p' %in% plot_args) warning('Single character argument for -p is ignored when point overplotting is present except when -y selected.')

    # cluster overplots to map to symbols
    grid$grp = grid$freq
    if(length(unique(grid$freq)) > 4) {
      f = grid$grp != 1 # ignore 1s (over-plotting)
      centres = unique(as.numeric(quantile(grid$grp[f], 0:2/2)))
      grid$grp[f] = kmeans(grid$grp[f], centres)$cluster + 1
    } else{
      grid$grp = as.numeric(factor(grid$freq))
    }

    # symbol labels
    freqs = sort(unique(grid$freq))
    if(!identical(freqs, c(1,2)) && !identical(freqs, 1)) { # ie. not points representable literally by comb of '.' and ':'
      pr_labs = TRUE
      # data break points
      op_data = unique(grid[ order(grid$freq), c('freq','grp') ])
      op_data_rev = op_data[order(op_data$freq, decreasing=TRUE), ]
      n = length(unique(op_data$grp))
      labs = data.frame(p0 = 1:n, p1 = 1:n, lab = '', x=0, stringsAsFactors=FALSE)
      for(g in unique(op_data$grp)) {
        labs$p0[g] = op_data$freq[match(g, op_data$grp)] # first match in group
        labs$p1[g] = op_data_rev$freq[match(g, op_data_rev$grp)] # last match in group
        labs$x[g] = mean(grid$freq[grid$grp == g]) # mean of grp frequencies
      }
      for(i in seq_len(nrow(labs))) {
        labs$lab[i] = ifelse(labs$p0[i] == labs$p1[i], labs$p0[i], paste0(labs$p0[i], '-', labs$p1[i]))
        lab_mean = ifelse(length(grep('-', labs$lab[i]))>0, paste0(' (x̄ ', format(round(labs$x[i],1),nsmall=1), ')'), '')
        labs$lab[i] = paste0(symbs[i], ' ', labs$lab[i], lab_mean)
      }
      labs = c('Points', labs$lab)
    }
  } else { # only 2 types of point
    symbs = pch
    grid$grp = 1
  }

  # output scatterplot
  blank_row = rep(' ', cols)
  cat(' ', rep('_',cols), ' \n', sep='')
  k = 0
  for(i in seq_len(rows)) {
    dat = grid[which(grid$y == i), ]
    ln = blank_row
    ln[dat$x] = symbs[dat$grp]
    cat('|', ln, '| ', sep='')
    if(pr_labs && i <= length(labs)) cat(' ', labs[i], sep='') # point symbol key
    k = k + 1
    if(k == ceiling(rows/2)) cat('', ylab) # y label
    # NOTE(review): the value printed under the 'R²' label is the signed correlation
    # cor(x, y0), not its square - confirm which one is intended.
    if(k == rows) if('-R' %in% plot_args && xlab!='Index') cat(' R²=', round(cor(x,y0),3), sep='')
    cat('\n')
  }
  cat('|', rep('_',cols), '|\n', sep='')
  xlab_mar = max(0, 1 + cols/2 - (nchar(xlab)/2))
  cat(rep(' ', xlab_mar), xlab, '\n', sep="") # x label

  # add summary unless overriden
  if(!'-x' %in% plot_args) {
    cat(nrows, 'data rows plotted')
    if(nrow(d_orig) > nrows) cat('.', nrow(d_orig) - nrows, 'rows with NA values omitted')
    cat('\n')
    print(summary(orig_dat))
  }
}

# report d.f. column classes
# inspect_df: print the dimensions of `obj`, a named vector holding the class of each
# column (multiple classes of one column are joined with '/'), and the first 3 rows.
inspect_df = function (obj) {
  cat('d.f. dimensions: ', dim(obj), '\n')
  col_classes = vapply(obj, function(col) paste(class(col), collapse='/'), character(1))
  print(col_classes)
  print(head(obj, 3))
}

# read in data
cons_width = as.integer(system('tput cols', intern=T)) - 10
cons_width = ifelse("-w" %in% plot_args, cons_width, min(100, cons_width))
txt = field_args[1]
rows = length(strsplit(txt, split='\n')[[1]])

# data from text blob argument or csv file
if('-n' %in% plot_args) header = F else header = T
if(rows == 1) d = read.table(txt, sep=pars$sep[2], stringsAsFactors=F, header=header, row.names=NULL, quote=pars$quote[2])
if(rows > 1) d = read.table(text=txt, header=header, sep=pars$sep[2], stringsAsFactors=F, quote=pars$quote[2])

if("-O" %in% plot_args){
  cat('\nRaw data as read-in:\n')
  inspect_df(d)
}

# parse numerics
for(i in 1:ncol(d)){
  nums = num( d[,i] ) # check if numeric/coercible
  if(is.numeric(nums[1])) d[,i] = nums
}
d_orig = d # backup as is

if("-O" %in% plot_args){
  cat('\nNumerically parsed data:\n')
  inspect_df(d)
  cat('\n')
  quit()
}

# output data.frame to console
if('-P' %in% plot_args){
  print(head(d, 1000))
  quit()
}

field_names = field_args[2:(length(field_args))]

# interpret field names - check if valid as name or column index
for(i in length(field_names):1) {
  f = field_names[i]
  badfield = F
  match = pmatch(f, names(d))
  if(is.na(match)){ # not a valid col name
    if(is.numeric(num(f))){ # is possible number
      f = as.numeric(f)
      if(f <= ncol(d)){ # is within col index range
        field_names[i] = names(d)[f] # change to col name
      } else
badfield = T 315 | } else badfield = T 316 | } else field_names[i] = names(d)[match] # in case partial match 317 | if(badfield) { 318 | warning(paste('fieldname not valid name or column index:', f)) 319 | quit() 320 | field_names = field_names[-i] 321 | } 322 | } 323 | 324 | id_fields = field_names[1:(length(field_names)-1)] 325 | values_field = field_names[length(field_names)] 326 | 327 | # omit rows with NA in plotting columns 328 | d = na.omit(d[,c(id_fields, values_field), drop=F]) 329 | nrows = nrow(d) # to calc NA removals 330 | 331 | # scatterplot if 2 fully numeric/NA variables or manually specified 332 | if(!'-F' %in% plot_args & length(id_fields) == 1) { 333 | v = d[[id_fields]] 334 | numvals = suppressWarnings(as.numeric( v[!is.na(v)] )) 335 | all_numeric = all(!is.na(numvals)) 336 | plot_scatter = F 337 | if(all_numeric & !'-H' %in% plot_args) plot_scatter = T 338 | if(!all_numeric & '-S' %in% plot_args) plot_scatter = T 339 | if(plot_scatter) { 340 | if(values_field == id_fields) { # ie. 
only a single field supplied 341 | dat = na.omit(data.frame(x = 1:length(v), y = d[[values_field]], stringsAsFactors=F)) 342 | } else dat = na.omit(data.frame(x = suppressWarnings(as.numeric(v)), y = d[[values_field]], stringsAsFactors=F)) 343 | scatter_plot(dat$x, dat$y, cols=as.numeric(pars$x[2]), rows=as.numeric(pars$y[2]), pch=pars$pch[2], xlab=id_fields, ylab=values_field) 344 | quit() 345 | } 346 | } 347 | 348 | # aggregate 349 | if('-a' %in% plot_args) { 350 | fun = 'sum' 351 | if('-m' %in% plot_args) fun = 'mean' 352 | if('-M' %in% plot_args) fun = 'median' 353 | if('-l' %in% plot_args) fun = 'length' 354 | cat('Aggregate function is', fun, '\n') 355 | if(length(id_fields) > 1) agg_list = as.list(d[,id_fields]) else agg_list = list(d[,id_fields]) 356 | if(fun == 'length') { 357 | d = aggregate(rep(1,nrow(d)), by=agg_list, FUN=sum, na.rm=T, simplify=T) 358 | } else d = aggregate(d[[values_field]], by=agg_list, FUN=fun, na.rm=T, simplify=T) 359 | if(length(unique(c(id_fields, values_field))) == 1) { 360 | values_field = fun # ie. 
'length' 361 | d[[id_fields]] = 1:nrow(d) 362 | } 363 | } 364 | 365 | # rename fields if they've changed 366 | names(d) = c(id_fields, values_field) 367 | 368 | # reorder data hashbars 369 | if('-o' %in% plot_args) d = d[order(d[[values_field]], decreasing=T),] 370 | 371 | # output processed data.frame to console 372 | if('-Q' %in% plot_args){ 373 | print(head(d,1000)) 374 | quit() 375 | } 376 | 377 | # histogram for single numeric variable 378 | if('-F' %in% plot_args){ 379 | ran = range(d[[values_field]]) 380 | brks = seq(ran[1], ran[2], length.out = as.numeric(pars$bins[2])) 381 | cuts = cut(d[[values_field]], brks, include.lowest=T) 382 | cats = as.numeric(cuts) 383 | labs = levels(cuts) 384 | grps = as.data.frame(table(cats), stringsAsFactors=F) 385 | grps$cats = as.numeric(grps$cats) 386 | fullset = min(grps$cats):max(grps$cats) 387 | missing = fullset[!fullset %in% grps$cats] 388 | for(m in missing) grps = rbind(grps, data.frame(cats=m, Freq=0)) 389 | grps = grps[order(grps$cats),] 390 | for(i in c('\\[', '\\]', '\\(')) labs = gsub(i, '', labs) 391 | grps$means = as.numeric(lapply(sapply(labs, strsplit, split=','), function(i) mean(as.numeric(i)))) 392 | pretty_labs = pretty(grps$means, 4) 393 | pretty_labs = pretty_labs[pretty_labs >= ran[1] & pretty_labs <= ran[2]] 394 | ids = sapply(pretty_labs, function(x){ which(abs(grps$means-x)==min(abs(grps$means-x)))[1]} ) # closest group to assign label 395 | grps$lab = '' 396 | grps$lab[ids] = paste('--', pretty_labs, '--') 397 | d = grps[,c(4,2)] 398 | id_fields = values_field; values_field = 'frequency' 399 | names(d) = c(id_fields, values_field) 400 | } 401 | 402 | # calculate column widths 403 | field_data = list() 404 | pos_x = 1 405 | for(f in c(id_fields, values_field)) { 406 | n = length(field_data) + 1 407 | vals = d[[f]] 408 | numerics = !is.na(suppressWarnings(as.numeric(vals))) 409 | vals[numerics] = format_num(as.numeric(vals[numerics])) 410 | maxlen = max(nchar(f), nchar(vals)) 411 | maxlen = 
ifelse("-w" %in% plot_args, maxlen, min(maxlen, 30)) 412 | vals = substr(vals, 1, maxlen) 413 | padstr = paste0("%-", maxlen, "s") 414 | field_data[n] = list(list(name = sprintf(padstr, substr(f, 1, maxlen)), values = sprintf(padstr, vals), 415 | pos_start = pos_x, pos_end = pos_x + maxlen + 2)) 416 | pos_x = pos_x + maxlen + 3 417 | char_deficit = maxlen - nchar(field_data[[n]]$values) # fix for sprintf bug that ignores special characters when padding 418 | field_data[[n]]$values = paste0(field_data[[n]]$values, sapply(char_deficit, function(i) paste(rep(' ',i), collapse=''))) 419 | } 420 | 421 | plot_width = cons_width - field_data[[length(field_data)]]$pos_end 422 | values = d[[values_field]] 423 | 424 | # scale and spacing 425 | plot_ind = field_data[[ length(field_data) ]]$pos_end + 2 426 | 427 | # whether scale to zero or positive/negative extreme 428 | if(min(values) < 0 & max(values) > 0) { 429 | ran = range(values) # scale positive to negative 430 | } else if(min(values) >= 0) { 431 | ran = c(0, max(values)) # all positive, scale to zero 432 | } else{ 433 | ran = c(min(values), 0) # still plot hashbars from left axis 434 | } 435 | 436 | fact = (cons_width - plot_ind) / diff(ran) 437 | #plot_values = as.integer(rescale(values, to = fact * ran)) # old 438 | plot_values = as.integer(values * fact) 439 | minvalue = min(values) 440 | spaces = pmax(0, pmin(plot_values - fact * minvalue, fact * -minvalue)) 441 | hashes = fact * abs(values) 442 | 443 | # print hashbar plot 444 | cat(nrows, 'data rows plotted') 445 | if('-z' %in% plot_args) quit() 446 | if('-p' %in% plot_args) pch = pars$pch[2] else pch = '#' 447 | if(nrow(d_orig) > nrows) cat('.', nrow(d_orig) - nrows, 'rows with NA values omitted') 448 | cat('\n'); for(f in field_data) cat(f$name, ' '); cat('\n') 449 | 450 | for(i in 1:length(values)) { 451 | # id_fields 452 | for(f in field_data) cat(f$values[i], ' ') 453 | # hashes 454 | cat(rep(' ', spaces[i]), sep='') 455 | cat(rep(pch, hashes[i]), '\n', 
sep='') 456 | } 457 | --------------------------------------------------------------------------------