├── DESCRIPTION ├── MD5 ├── NAMESPACE ├── R ├── acquireAuditData.R ├── ada.R ├── ada_gui.R ├── appendLibLog.R ├── asRules.rpart.R ├── associate.R ├── biclust.R ├── binning.R ├── clara.R ├── cluster.R ├── comcat.R ├── ctree.R ├── data.R ├── errorMatrix.R ├── evaluate.R ├── ewkm.R ├── execute.R ├── executeBoxPlot2.R ├── executeExploreGGRaptR.R ├── executeHistPlot2.R ├── executeLogTab.R ├── executeModelAda.R ├── executeModelGlm.R ├── executeModelRF.R ├── executeModelRxBTrees.r ├── executeModelRxDForest.R ├── executeModelRxDTree.R ├── executeModelRxGlm.r ├── executeModelXGB.R ├── executePairsPlotSelect2.R ├── explore.R ├── export.R ├── fancyRpartPlot.R ├── ggVarImp.R ├── hclust.R ├── help.R ├── kmeans.R ├── loadLibs.R ├── loadTooltips.R ├── log.R ├── model.R ├── nnet.R ├── normVarNames.R ├── projects.R ├── psfchart.R ├── random_forest.R ├── rattle.R ├── rattleInfo.R ├── report.R ├── riskchart.R ├── rocChart.R ├── rpart.R ├── survival.R ├── test.R ├── textminer.R ├── textview.R ├── transform.R ├── unloadLibs.R ├── xgb.R ├── xgboostFormula.R └── zzz.R ├── build └── vignette.rds ├── data ├── audit.RData ├── locationsAUS.RData ├── weather.RData ├── weatherAUS.RData └── wine.RData ├── inst ├── CITATION ├── NEWS ├── arff │ ├── audit.arff │ └── weather.arff ├── csv │ ├── audit.csv │ ├── dvdtrans.csv │ └── weather.csv ├── doc │ ├── rattle.R │ ├── rattle.Rnw │ └── rattle.pdf ├── etc │ ├── Rlogo.png │ ├── gpl-license │ ├── rattle.glade │ ├── rattle.ui │ ├── rattle_macosx.ui │ ├── textviews.xml │ └── tooltips.xml ├── extdata │ └── audit.xlsx ├── odt │ └── data_summary.odt └── po │ ├── de │ └── LC_MESSAGES │ │ └── R-rattle.mo │ ├── es │ └── LC_MESSAGES │ │ └── R-rattle.mo │ ├── fr │ └── LC_MESSAGES │ │ └── R-rattle.mo │ ├── id │ └── LC_MESSAGES │ │ └── R-rattle.mo │ ├── ja │ └── LC_MESSAGES │ │ └── R-rattle.mo │ ├── no │ └── LC_MESSAGES │ │ └── R-rattle.mo │ └── zh_CN │ └── LC_MESSAGES │ └── R-rattle.mo ├── man ├── acquireAuditData.Rd ├── asRules.Rd ├── asRules.rpart.Rd ├── 
audit.Rd ├── binning.Rd ├── calcInitialDigitDistr.Rd ├── calculateAUC.Rd ├── centers.hclust.Rd ├── comcat.Rd ├── drawTreeNodes.Rd ├── drawTreesAda.Rd ├── errorMatrix.Rd ├── evaluateRisk.Rd ├── fancyRpartPlot.Rd ├── genPlotTitleCmd.Rd ├── ggVarImp.Rd ├── grouper.Rd ├── internal.Rd ├── listAdaVarsUsed.Rd ├── listTreesAda.Rd ├── listVersions.Rd ├── modalvalue.Rd ├── plotOptimalLine.Rd ├── plotRisk.Rd ├── printRandomForests.Rd ├── randomForest2Rules.Rd ├── rattle.Rd ├── rattle.print.summary.multinom.Rd ├── rattleInfo.Rd ├── riskchart.Rd ├── savePlotToFile.Rd ├── setupDataset.Rd ├── treeset.randomForest.Rd ├── weather.Rd ├── weatherAUS.Rd ├── whichNumerics.Rd └── wine.Rd └── vignettes └── rattle.Rnw /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: rattle 2 | Type: Package 3 | Title: Graphical User Interface for Data Science in R 4 | Version: 5.5.1 5 | Date: 2022-03-20 6 | Authors@R: c(person("Graham", "Williams", 7 | role=c("aut", "cph", "cre"), 8 | email="Graham.Williams@togaware.com"), 9 | person("Mark", "Vere Culp", role="cph"), 10 | person("Ed", "Cox", role="ctb"), 11 | person("Anthony", "Nolan", role="ctb"), 12 | person("Denis", "White", role="cph"), 13 | person("Daniele", "Medri", role="ctb"), 14 | person("Akbar", "Waljee", role="ctb", 15 | comment="OOB AUC for Random Forest"), 16 | person("Brian", "Ripley", role="cph", 17 | comment="print.summary.nnet"), 18 | person("Jose", "Magana", role="ctb", 19 | comment="ggpairs plots"), 20 | person("Surendra", "Tipparaju", role="ctb", 21 | comment="initial RevoScaleR/XDF"), 22 | person("Durga", "Prasad Chappidi", role="ctb", 23 | comment="initial RevoScaleR/XDF"), 24 | person("Dinesh", "Manyam Venkata", role="ctb", 25 | comment="initial RevoScaleR/XDF"), 26 | person("Mrinal", "Chakraborty", role="ctb", 27 | comment="initial RevoScaleR/XDF"), 28 | person("Fang", "Zhou", role="ctb", 29 | comment="initial xgboost"), 30 | person("Cameron", "Chisholm", role="ctb", 31 | 
comment="risk plot on risk chart")) 32 | Depends: R (>= 3.5.0), tibble, bitops 33 | Imports: stats, utils, ggplot2, grDevices, graphics, magrittr, methods, 34 | stringi, stringr, tidyr, dplyr, XML, rpart.plot 35 | Suggests: pmml (>= 1.2.13), colorspace, ada, amap, arules, arulesViz, 36 | biclust, cairoDevice, cba, cluster, corrplot, descr, doBy, 37 | e1071, ellipse, fBasics, foreign, fpc, gdata, ggdendro, gplots, 38 | grid, gridExtra, gtools, Hmisc, janitor, kernlab, Matrix, mice, 39 | nnet, party, plyr, psych, RGtk2, randomForest, RColorBrewer, 40 | readxl, reshape, ROCR, RODBC, rpart, scales, SnowballC, 41 | survival, timeDate, tm, xgboost 42 | Description: The R Analytic Tool To Learn Easily (Rattle) provides a 43 | collection of utility functions for the data scientist. A 44 | Gnome (RGtk2) based graphical interface is included with 45 | the aim to provide a simple and intuitive introduction to R 46 | for data science, allowing a user to quickly load data from a CSV file 47 | (or via ODBC), transform and explore the data, 48 | build and evaluate models, and export models as PMML (predictive 49 | modelling markup language) or as scores. A key aspect of the GUI 50 | is that all R commands are logged and commented through the log tab. 51 | This can be saved as a standalone R script file and as 52 | an aid for the user to 53 | learn R or to copy-and-paste directly into R itself. 54 | Note that RGtk2 and cairoDevice have been archived on CRAN. 55 | See <https://rattle.togaware.com> for installation instructions. 
56 | License: GPL (>= 2) 57 | LazyLoad: yes 58 | LazyData: yes 59 | URL: https://rattle.togaware.com/ 60 | NeedsCompilation: no 61 | Packaged: 2022-03-20 00:54:54 UTC; gjw 62 | Author: Graham Williams [aut, cph, cre], 63 | Mark Vere Culp [cph], 64 | Ed Cox [ctb], 65 | Anthony Nolan [ctb], 66 | Denis White [cph], 67 | Daniele Medri [ctb], 68 | Akbar Waljee [ctb] (OOB AUC for Random Forest), 69 | Brian Ripley [cph] (print.summary.nnet), 70 | Jose Magana [ctb] (ggpairs plots), 71 | Surendra Tipparaju [ctb] (initial RevoScaleR/XDF), 72 | Durga Prasad Chappidi [ctb] (initial RevoScaleR/XDF), 73 | Dinesh Manyam Venkata [ctb] (initial RevoScaleR/XDF), 74 | Mrinal Chakraborty [ctb] (initial RevoScaleR/XDF), 75 | Fang Zhou [ctb] (initial xgboost), 76 | Cameron Chisholm [ctb] (risk plot on risk chart) 77 | Maintainer: Graham Williams 78 | Repository: CRAN 79 | Date/Publication: 2022-03-21 13:10:02 UTC 80 | -------------------------------------------------------------------------------- /MD5: -------------------------------------------------------------------------------- 1 | dfde49a87bd8d229ac4b67c30ed5cb83 *DESCRIPTION 2 | c65d174f5ca6dd0ff225e5fe1d189228 *NAMESPACE 3 | 7150923073a603c1114d5f79c4c833bc *R/acquireAuditData.R 4 | af4de58c7fee7bd5e3d4f5b359683d0a *R/ada.R 5 | a563bb6edf034934e75f2885223f9663 *R/ada_gui.R 6 | 6250a0cc8caa2346890b9871aeb5258b *R/appendLibLog.R 7 | f13cbeb275e4a56c93eee9b96f14eaab *R/asRules.rpart.R 8 | 00622b1bf950a37914a6761790eca6d7 *R/associate.R 9 | 4ca37efa0619973496b4a89425d129dd *R/biclust.R 10 | 35b51e36a0bff1343963391c9e4d69b0 *R/binning.R 11 | 027d5c4a56fc74c6989fba15417e44a9 *R/clara.R 12 | 69aded1b1021fac79a1114be2513fffa *R/cluster.R 13 | d5310cdbd25ce45b7e723ae79a375277 *R/comcat.R 14 | 4e90fe79e27d3b193b94461b4f9a01a7 *R/ctree.R 15 | 405e9db3188bffa143b782eabea02c14 *R/data.R 16 | f56a37bc43d7fc103d8805d9e60266c6 *R/errorMatrix.R 17 | 032dd5633630e1862c5e3dc58e037a8f *R/evaluate.R 18 | b22d75b6e5b04d59de03eaa79f8a2124 *R/ewkm.R 19 
| 45bf5b579f45c2214dc1596d8006765e *R/execute.R 20 | 4f8b94bc38a249628cfa73aa55c0e4d6 *R/executeBoxPlot2.R 21 | 40d678b83004a4e8439f79bf09271a90 *R/executeExploreGGRaptR.R 22 | 7bb54b05b9081d5b7fa0e17a721db7fc *R/executeHistPlot2.R 23 | ab1d71b812c5b403fd91d0259373ba9e *R/executeLogTab.R 24 | 5f0572134fd0b9086acafed51477caf3 *R/executeModelAda.R 25 | 9c66f88ae6ce325842942937dfcdaf41 *R/executeModelGlm.R 26 | 6576b5d8f639e570ed5d76c13350e3a7 *R/executeModelRF.R 27 | 7554b8864fbbbb5478f1c7591c87091f *R/executeModelRxBTrees.r 28 | c25a6a6e2008d3bb96d072171ab86906 *R/executeModelRxDForest.R 29 | 751dd36be37e49c314e94245dac18b79 *R/executeModelRxDTree.R 30 | b04967b2339bc546d47b1e427e91d031 *R/executeModelRxGlm.r 31 | 9d6a4f8979894a32ebc7cb1a181e9b05 *R/executeModelXGB.R 32 | 0715f6946819c12b80888bf7bce83d41 *R/executePairsPlotSelect2.R 33 | 7d9d7b0fc05c868182e09b0f4a31928b *R/explore.R 34 | eb49e240027d0576ed599c5d87c515b8 *R/export.R 35 | 887c99b05df4aa3af4fb34224b893537 *R/fancyRpartPlot.R 36 | d75adcc7b03ec298fd468c409a6f3b44 *R/ggVarImp.R 37 | e67b4ad415dc83bf9c94dcd68ebf5570 *R/hclust.R 38 | f9059408ed78651ca318db13174c23c0 *R/help.R 39 | 88fadcb1dc8d16d1b58c48f676d23648 *R/kmeans.R 40 | f84fce3659bd1b80082c7ad3353f87b5 *R/loadLibs.R 41 | ab71d282d13a6cf9078a306b63c6318f *R/loadTooltips.R 42 | cc707ff9d9c132f04d6b444b9cdf5300 *R/log.R 43 | 2aa8a9465bdbcfed998821e04005ede9 *R/model.R 44 | 2fdd806efa77d6985bdf8883e1481d17 *R/nnet.R 45 | ceafaa22c93f40fdbdf494c71bfaba22 *R/normVarNames.R 46 | 12e4d56b8d2a89e568a5583980581afe *R/projects.R 47 | 6b48c4bcc350deca1732d5eafb5302eb *R/psfchart.R 48 | f3dbc0f3e5a897c94663210714aec205 *R/random_forest.R 49 | d81b242b056dfd868ad361a1bf9d77de *R/rattle.R 50 | 73e575345a55eb688e4ef3e4f1a76a75 *R/rattleInfo.R 51 | 1287a0651ea622a1dba2c1ee85cfc502 *R/report.R 52 | 00a8b93dcd5f84ef91528eac1aef53e5 *R/riskchart.R 53 | b905bcf42c7d0d01b679913b3435b54b *R/rocChart.R 54 | 6f3de3d34845405536a55cc850ecd70a *R/rpart.R 55 | 
dc7925f6991d9bb6c7ae1545e4fad9ed *R/survival.R 56 | eae67386ff40a92e679a370f4c0645ab *R/test.R 57 | bbaccdfb16e5be19680e3b909a11dc70 *R/textminer.R 58 | 168d61cb2e51eb392b58e469b3474705 *R/textview.R 59 | e723b068b7aaf38721788132cc1a996d *R/transform.R 60 | 637bc5dc741a58b20266569883f429dd *R/unloadLibs.R 61 | 34b5ee0498869b617d68b994aef46797 *R/xgb.R 62 | b908ee73a14897a69baa2568c1602bc7 *R/xgboostFormula.R 63 | 70e0b3f9bffd8d1426d36fc9f5867e10 *R/zzz.R 64 | 82e7b4e845baa408ac394a2612f0f446 *build/vignette.rds 65 | 7d4cd53924a0de7e823fcab1f79eb9f3 *data/audit.RData 66 | e61807f15364f42b919b7a7aac8bf50e *data/locationsAUS.RData 67 | f4ee9d0d3a0e5cd54e7b247e6b293a67 *data/weather.RData 68 | 223e0030ea8656c55213a3c4958f81cf *data/weatherAUS.RData 69 | b238b94883a795bdb34beca7a3200109 *data/wine.RData 70 | 07b206873b335f3b59e1b339946a101b *inst/CITATION 71 | 9b0c1eea2aa96fa37b6936311ea39ba8 *inst/NEWS 72 | 7bb0f7665aff9ba1cd4615a7b3130f04 *inst/arff/audit.arff 73 | 8fcf1db9883917a9682d315200d75d3b *inst/arff/weather.arff 74 | 074d01593d19414e3b04ed5e5540b697 *inst/csv/audit.csv 75 | a2f3aac92bd6389bf4f6404fab1f25f9 *inst/csv/dvdtrans.csv 76 | 42eb6df45078d0da4a619279f93e6cfb *inst/csv/weather.csv 77 | eb51a60ca3f95d06477c88e4a9fee7f2 *inst/doc/rattle.R 78 | 097dc50ec6cd37e023ded2a67d2e650a *inst/doc/rattle.Rnw 79 | a3de9948a0b4e025a8e6a2aa246a9e92 *inst/doc/rattle.pdf 80 | 7381224c65138a2acdf3a8346f8275c4 *inst/etc/Rlogo.png 81 | 7d7c232d655fd1c91af00d34b00de5df *inst/etc/gpl-license 82 | 7633af88abaa7b6df08c0895f08bd4bb *inst/etc/rattle.glade 83 | 957f06081b8c8ed9840f9a52ab88ef09 *inst/etc/rattle.ui 84 | 77d262eb3e1f817d9a1e97f9d644c9a9 *inst/etc/rattle_macosx.ui 85 | 8cdab95c921b90ea2d12042fce84bc89 *inst/etc/textviews.xml 86 | 4d22c4dd38e4afd82f3edcc697deb669 *inst/etc/tooltips.xml 87 | 556dd7c7897ebe95cdf24c642639f9c9 *inst/extdata/audit.xlsx 88 | 6a187fbea9822787879c33c899b4a679 *inst/odt/data_summary.odt 89 | f7f970509860caafd2cd8af603186ecd 
*inst/po/de/LC_MESSAGES/R-rattle.mo 90 | a9e5b6844d8ed5c9139c553a865f8572 *inst/po/es/LC_MESSAGES/R-rattle.mo 91 | 755e570e0af45f1203f0846976dc2f2e *inst/po/fr/LC_MESSAGES/R-rattle.mo 92 | 313b8ddd254a47c8b37b21fdcedb5b0a *inst/po/id/LC_MESSAGES/R-rattle.mo 93 | 334e3682ee7587c54d737bda46151722 *inst/po/ja/LC_MESSAGES/R-rattle.mo 94 | c02e1f16560a877a4d138c7f8ae60fe0 *inst/po/no/LC_MESSAGES/R-rattle.mo 95 | 8898f72e15a7589f2828758bfb36f231 *inst/po/zh_CN/LC_MESSAGES/R-rattle.mo 96 | 9438af1222ef0076dfe7747e33fb4996 *man/acquireAuditData.Rd 97 | f98da230d559a00d983e1c5c3f1a2dfd *man/asRules.Rd 98 | f4ccc84132a7ff6ade803da6d6f744d7 *man/asRules.rpart.Rd 99 | 480efc0108c5ee421792bba2a19d12a4 *man/audit.Rd 100 | cc1f637becd94aa98175655ef7ca3500 *man/binning.Rd 101 | 2da47f56b82c0828936e9d92f28f7a92 *man/calcInitialDigitDistr.Rd 102 | 6ba7d6537415ff497777e4ff1e7360e0 *man/calculateAUC.Rd 103 | fd670c34ad0561b3b0108dfc9f0803b1 *man/centers.hclust.Rd 104 | 1a1f6e6d51246367a8ad79e775d9fb3f *man/comcat.Rd 105 | 9fb42f8c02fff5ae04f3ca38bc13e7fc *man/drawTreeNodes.Rd 106 | 4c7bb92a1f56761f2cfaf4e96def953c *man/drawTreesAda.Rd 107 | 6d2ec516d9a4fbd28218da1bfe97e94c *man/errorMatrix.Rd 108 | 38290cd7f879eceb2819a042dd156b8c *man/evaluateRisk.Rd 109 | 98a3f1e88289c663442312eb7666df51 *man/fancyRpartPlot.Rd 110 | 705ba3e6c5adf0311e8c51047bbaa786 *man/genPlotTitleCmd.Rd 111 | 53ca4d75c3cfaf9fd2d28d8d256f2921 *man/ggVarImp.Rd 112 | 2d1ce0ede5cd159ccbba46fa048ca62c *man/grouper.Rd 113 | 0001b09aaa6b511106242d26bb274132 *man/internal.Rd 114 | 63016b3c880bc270ab2ee753ea46bbfe *man/listAdaVarsUsed.Rd 115 | 40ea64e53023df0338c5581e3d8fe6b5 *man/listTreesAda.Rd 116 | 22fd24014699705d8183ddeb4d435f45 *man/listVersions.Rd 117 | eb1992d89f735861c40024bbb35efb63 *man/modalvalue.Rd 118 | 8338e98d812f479868a014df53f07460 *man/plotOptimalLine.Rd 119 | c4523d09988e2b0224e37c10f1b910a2 *man/plotRisk.Rd 120 | bf0c085ccb878f91d150d16399b258dc *man/printRandomForests.Rd 121 | 
6c9885484c1360a411772d1b076a02f1 *man/randomForest2Rules.Rd 122 | eac079318b7a129830ad7e854489a4a7 *man/rattle.Rd 123 | 311fb9a850423fb53b797d85df0ae196 *man/rattle.print.summary.multinom.Rd 124 | 28f9ebcf0637ab7e101e72631db50571 *man/rattleInfo.Rd 125 | 956b40566e48f36850c8424bed8cd2a1 *man/riskchart.Rd 126 | 5d7020f0d56c4afea3cf7beb3335afe9 *man/savePlotToFile.Rd 127 | 57d7c001abceeeabffd0686e7763b282 *man/setupDataset.Rd 128 | 909e484a0497278ee205266ddee4c1bd *man/treeset.randomForest.Rd 129 | 2b539c44caa6c21d76c4c2243ffaf3f2 *man/weather.Rd 130 | 30d907ae3dbb53cfe6734f719c759214 *man/weatherAUS.Rd 131 | 881626c19b22ed6589d33ab010c82560 *man/whichNumerics.Rd 132 | be3dda89cea8d00f6502bbaff814778c *man/wine.Rd 133 | 097dc50ec6cd37e023ded2a67d2e650a *vignettes/rattle.Rnw 134 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | export( 2 | acquireAuditData, 3 | asRules, 4 | benfordDistr, 5 | binning, 6 | calcInitialDigitDistr, 7 | calculateAUC, 8 | centers.hclust, 9 | comcat, 10 | copyPlotToClipboard, 11 | digitDistr, 12 | drawTreesAda, 13 | drawTreeNodes, 14 | errorMatrix, 15 | evaluateRisk, 16 | fancyRpartPlot, 17 | generateAprioriSummary, 18 | genPlotTitleCmd, 19 | ggVarImp, 20 | importance, 21 | listAdaVarsUsed, 22 | listTreesAda, 23 | listVersions, 24 | modalvalue, 25 | normVarNames, 26 | plotDigitFreq, 27 | plotOptimalLine, 28 | plotRisk, 29 | predict.kmeans, 30 | predict.hclust, 31 | printPlot, 32 | printRandomForests, 33 | print.summary.nnet, 34 | psfchart, 35 | randomForest2Rules, 36 | rattle, 37 | rattleInfo, 38 | rattle.print.summary.multinom, 39 | rescale.by.group, 40 | riskchart, 41 | rocChart, 42 | # 120117 Remove for now - could be harmful. 
43 | # overwritePackageFunction, 44 | savePlotToFile, 45 | setupDataset, 46 | toga, 47 | treeset.randomForest, 48 | whichNumerics, 49 | xgboost, 50 | crs,crv 51 | ) 52 | 53 | exportPattern("_") # Needed for the Glade interface 54 | 55 | S3method(asRules, rpart) 56 | S3method(ggVarImp, randomForest) 57 | S3method(ggVarImp, rpart) 58 | S3method(ggVarImp, rxDForest) 59 | S3method(ggVarImp, xgb.Booster) 60 | S3method(ggVarImp, xgb.formula) 61 | S3method(predict, xgb.formula) 62 | S3method(print, xgb.formula) 63 | S3method(importance, xgb.formula) 64 | S3method(xgboost, formula) 65 | S3method(predict, hclust) 66 | S3method(predict, kmeans) 67 | S3method(print.summary, nnet) 68 | 69 | #import(RGtk2) # Not required but will be used if available. 70 | import(stats) 71 | import(utils) 72 | import(grDevices) 73 | import(graphics) 74 | import(methods) # For possibleExtends() formal classes 75 | import(tibble) 76 | 77 | importFrom(magrittr, "%>%") 78 | importFrom(magrittr, "%<>%") 79 | importFrom(stringi, "%s+%") 80 | 81 | importFrom(bitops, cksum) 82 | -------------------------------------------------------------------------------- /R/acquireAuditData.R: -------------------------------------------------------------------------------- 1 | # Rattle: A GUI for Data Mining in R 2 | # 3 | # AUDIT DATASET 4 | # 5 | # Time-stamp: 6 | # 7 | # Copyright (c) 2009-2014 Togaware Pty Ltd 8 | # 9 | # This file is part of Rattle. 10 | # 11 | # Rattle is free software: you can redistribute it and/or modify it 12 | # under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 2 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # Rattle is distributed in the hope that it will be useful, but 17 | # WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 | # General Public License for more details. 
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.
#
########################################################################
#
# Generate an audit dataset that is fictional but illustrates the typical
# financial audit.
#
# acquireAuditData(write.to.file=FALSE)
#
# Downloads adult.data from the UCI repository (cached in the working
# directory as survey.data / survey.csv), reshapes and renames its
# columns into a fictional audit dataset, samples 2000 cases and
# prepends an ID column.
#
# Arguments:
#   write.to.file  When TRUE also write audit.csv, audit.RData,
#                  audit.arff and audit_missing.csv into the working
#                  directory.
#
# Value:
#   The 2000 row audit data frame (returned invisibly when
#   write.to.file is TRUE).

acquireAuditData <- function(write.to.file=FALSE)
{
  # Download and cache the raw data only if it is not already here.

  if (!file.exists('survey.csv'))
  {
    UCI <- "https://archive.ics.uci.edu/ml"
    REPOS <- "machine-learning-databases"
    survey.url <- sprintf("%s/%s/adult/adult.data", UCI, REPOS)
    download.file(survey.url, "survey.data")
    survey <- read.csv("survey.data", header=FALSE, strip.white=TRUE,
                       na.strings="?",
                       col.names=c("Age", "Workclass", "fnlwgt",
                         "Education", "Education.Num", "Marital.Status",
                         "Occupation", "Relationship", "Race", "Gender",
                         "Capital.Gain", "Capital.Loss",
                         "Hours.Per.Week", "Native.Country",
                         "Salary.Group"))
    write.table(survey, "survey.csv", sep=",", row.names=FALSE)
  }

  # Everything below relies on the categoric columns being factors
  # (as.integer() of the target, as.numeric() of Relationship, and the
  # levels()[k] renames). Since R 4.0.0 read.csv() no longer converts
  # strings to factors by default, so ask for it explicitly.

  survey <- read.csv("survey.csv", stringsAsFactors=TRUE)

  audit <- survey[,c(1,2,4,6,7,8,10,12,13,14,11,15)]

  colnames(audit)[2] <- "Employment"
  colnames(audit)[4] <- "Marital"
  colnames(audit)[6] <- "Income"
  colnames(audit)[8] <- "Deductions"
  colnames(audit)[9] <- "Hours"
  colnames(audit)[10] <- "Accounts"
  colnames(audit)[11] <- "Adjustment"
  colnames(audit)[12] <- "Adjusted"

  # Factor codes are 1-based, so this maps the two salary groups to 0/1.

  audit$Adjusted <- as.integer(audit$Adjusted)-1

  # Make sure most productive cases have an adjustment

  adj <- audit[audit$Adjusted==0 & audit$Adjustment != 0, 'Adjustment']
  a <- length(adj)
  m <- length(audit[audit$Adjusted==1 & audit$Adjustment==0,'Adjusted'])
  r <- m%/%a*a  # Largest multiple of a not exceeding m so adj recycles evenly.

  set.seed(12345)
  audit[audit$Adjusted==1 & audit$Adjustment==0, 'Adjustment'][sample(m, r)] <-
    as.integer(adj*(rnorm(r) + 2))

  # Make sure no nonproductive case has an adjustment

  audit[audit$Adjusted==0 & audit$Adjustment!=0,'Adjustment'] <- 0

  # NOTE(review): all of the renames below address levels by position,
  # so they presumably assume the alphabetical level order of the UCI
  # data - confirm if the source data or the locale changes.

  # Tidyup ForeignAccounts

  levels(audit$Accounts)[6] <- "NewZealand"
  levels(audit$Accounts)[8] <- "Singapore"
  levels(audit$Accounts)[15] <- "Holand"
  levels(audit$Accounts)[28] <- "Fiji"
  levels(audit$Accounts)[33] <- "Malaysia"
  levels(audit$Accounts)[35] <- "Vietnam"
  levels(audit$Accounts)[38] <- "Indonesia"
  levels(audit$Accounts)[39] <- "UnitedStates"

  # Tidyup Employment

  levels(audit$Employment)[1] <- "PSFederal"
  levels(audit$Employment)[2] <- "PSLocal"
  levels(audit$Employment)[3] <- "Unemployed"
  levels(audit$Employment)[5] <- "SelfEmp"
  levels(audit$Employment)[6] <- "Consultant"
  levels(audit$Employment)[7] <- "PSState"
  levels(audit$Employment)[8] <- "Volunteer"

  # Tidyup Marital
  #
  # Assigning "Married" to a second level merges the two levels, which
  # shifts the positions of the levels that follow. The statement order
  # here is therefore significant - do not reorder.

  levels(audit$Marital)[2] <- "Married"
  levels(audit$Marital)[3] <- "Married"
  levels(audit$Marital)[4] <- "Absent"
  levels(audit$Marital)[5] <- "Unmarried"

  # Tidyup Occupation

  levels(audit$Occupation)[1] <- "Clerical"
  levels(audit$Occupation)[2] <- "Military"
  levels(audit$Occupation)[3] <- "Repair"
  levels(audit$Occupation)[4] <- "Executive"
  levels(audit$Occupation)[5] <- "Farming"
  levels(audit$Occupation)[6] <- "Cleaner"
  levels(audit$Occupation)[7] <- "Machinist"
  levels(audit$Occupation)[8] <- "Service"
  levels(audit$Occupation)[9] <- "Home"
  levels(audit$Occupation)[10] <- "Professional"
  levels(audit$Occupation)[11] <- "Protective"
  levels(audit$Occupation)[12] <- "Sales"
  levels(audit$Occupation)[13] <- "Support"
  levels(audit$Occupation)[14] <- "Transport"

  # Tidyup Education

  levels(audit$Education)[1] <- "Yr10"
  levels(audit$Education)[2] <- "Yr11"
  levels(audit$Education)[3] <- "Yr12"
  levels(audit$Education)[4] <- "Yr1t4"
  levels(audit$Education)[5] <- "Yr5t6"
  levels(audit$Education)[6] <- "Yr7t8"
  levels(audit$Education)[7] <- "Yr9"
  levels(audit$Education)[8] <- "Associate"
  levels(audit$Education)[9] <- "Vocational"
  levels(audit$Education)[10] <- "Bachelor"
  levels(audit$Education)[11] <- "Doctorate"
  levels(audit$Education)[12] <- "HSgrad"
  levels(audit$Education)[13] <- "Master"
  levels(audit$Education)[14] <- "Preschool"
  levels(audit$Education)[15] <- "Professional"
  levels(audit$Education)[16] <- "College"

  # Turn Relationship into Income: the factor code scales a random
  # draw from N(35000, 15000).

  set.seed(12345)
  audit$Income <- round(abs(as.numeric(audit$Income)*rnorm(length(audit$Income),
                                                           35000, 15000)), 2)

  # Make deductions look more 0 for the non-productive cases!

  audit[audit$Adjusted==0,'Deductions'] <-
    audit[audit$Adjusted==0,'Deductions']/1.5

  # Sample just 2000 cases and add an Identifier - always the same

  set.seed(12345)
  cases <- sample(nrow(audit), 2000)
  set.seed(12345)
  idents <- as.integer(sort(runif(2000, 1000000, 9999999)))
  audit <- cbind(ID=idents, audit[cases,])

  # Use standard prefixes

  colnames(audit)[11] <- "IGNORE_Accounts" # randomForest can't handle
  colnames(audit)[12] <- "RISK_Adjustment"
  colnames(audit)[13] <- "TARGET_Adjusted"

  # Keep the clean dataset for the return value; the copy in audit is
  # modified below when writing the variants out.

  audit.orig <- audit

  # Write out the data

  if (write.to.file)
  {
    # Write the freshly generated data first and only then read it
    # back, so that the RData and ARFF versions hold exactly what a
    # user loading audit.csv would get. (Reading before writing would
    # pick up a stale audit.csv, or fail when there is none.)

    write.table(audit, "audit.csv", sep=",", row.names=FALSE)
    audit <- read.csv("audit.csv", stringsAsFactors=TRUE)
    save(audit, file="audit.RData", compress=TRUE)

    arff <- audit
    arff$TARGET_Adjusted <- as.factor(arff$TARGET_Adjusted)
    foreign::write.arff(arff, "audit.arff")

    # Create a dataset with special variable names.
    # 080709 I now do this as default.

    # colnames(audit)[11] <- "IGNORE_Accounts"
    # colnames(audit)[12] <- "RISK_Adjustment"
    # write.table(audit, "audit_auto.csv", sep=",", row.names=FALSE)

    # Create a dataset with many more missing values: blank out one
    # randomly chosen cell (never the ID nor the last column) for each
    # sampled row.

    mr <- sample(1:nrow(audit), nrow(audit)/4, replace=TRUE)
    mc <- sample(2:(ncol(audit)-1), nrow(audit)/4, replace=TRUE)

    for (i in seq_along(mr))
    {
      audit[mr[i], mc[i]] <- NA
    }
    write.table(audit, "audit_missing.csv", sep=",", row.names=FALSE)
  }
  if (write.to.file)
    invisible(audit.orig)
  else
    return(audit.orig)
}


# ==== /R/appendLibLog.R ===============================================

#' Append a command to the Log tab dealing with namespaces
#'
#' Time-stamp: <2016-09-19 11:30:05 Graham Williams>
#'
#' Report a command to the rattle Log tab textview. We check the
#' commands for any namespace usage and then include an appropriate
#' library() call for each and remove them from the commands
#' themselves.
#'
#' Each command will be printed on a new line.
#'
#' NOTE: since 160722 this simply forwards to appendLog() and returns;
#' the namespace rewriting described above is retained below for
#' reference but is never executed.
#'
#' @param comment A message to include as a comment.
#' @param ... The command(s) to report in the log.
#' @param include.libs Include any required library() calls.
#'
appendLibLog <- function(comment, ..., include.libs=TRUE)
{
  ## 160722 PLEASE NOTE
  #
  # I decided to revert to using appendLog() and exposing the ::
  # operator since I do so in my book and users can get familiar with
  # it and choose, but it is also more succinct.
  #
  appendLog(comment, ...)
  return()

  ######################################################################
  # UNREACHABLE - everything below is the retired implementation.
  ######################################################################

  ## 150828
  # This started as the old appendLog but with a simplified parameter
  # list and added in the extraction of namespaces and then rewrite
  # the commands to not include the namespace.

  # Only continue if this is called from inside Rattle.

  if (is.null(crv$rattleGUI)) return()

  # Identify namespace string and namespace string with function.

  ns <- '([a-zA-Z0-9_\\.]+)::'
  nsf <- stringr::str_c(ns, '([a-zA-Z0-9_\\.]+)')

  cmds <-
    list(...) %>%
    unlist() %>%
    stringr::str_c(collapse="\n")

  libs <-
    cmds %>%
    stringr::str_extract_all(nsf) %>%
    unlist() %>%
    unique() %>%
    stringr::str_split('::') %>%
    unlist()

  # 150917 Keep make check quiet....
  pkg <- fun <- funs <- "." <- NULL

  if (is.null(libs))
    include.libs <- FALSE
  else
    libs %<>%
      matrix(, ncol=2, byrow=TRUE) %>%
      data.frame(stringsAsFactors=FALSE) %>%
      magrittr::set_names(c("pkg", "fun")) %>%
      dplyr::group_by(pkg) %>%
      dplyr::summarise(funs=paste(fun, collapse="(), ")) %>%
      dplyr::group_by(pkg) %>%
      dplyr::summarise(cmd=sprintf("library(%s) # Provides %s().", pkg, funs)) %>%
      magrittr::extract2(2) %>%
      stringr::str_c(collapse="\n")

  cmds %<>%
    stringr::str_replace_all(ns, "")

  msg <-
    (if (include.libs) libs %s+% "\n\n" else "") %s+%
    cmds %>%
    paste(sep="", crv$start.log.comment, comment, crv$end.log.comment, .)

  # Always place the text at the end of the Log tab textview
  # irrespective of where the cursor is.

  log.buf <-
    theWidget("log_textview") $
    getBuffer()

  location <-
    log.buf $
    getEndIter() $
    iter

  log.buf $ insert(location, msg)
}


# ==== /R/asRules.rpart.R ==============================================

# Rattle: A GUI for Data Mining in R
#
# RPART RULES
#
# Time-stamp: <2020-05-13 11:42:26 Graham Williams>
#
# Copyright (c) 2009-2014 Togaware Pty Ltd
#
# This file is part of Rattle.
#
# Rattle is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.

# Generic: print a model as a set of rules. Dispatches on class(model).

asRules <- function(model, compact=FALSE, ...) UseMethod("asRules")

# asRules.rpart(model, compact=FALSE, classes=NULL, ...)
#
# Print every leaf of an rpart tree as a rule (the path of conditions
# from the root to that leaf).
#
# Arguments:
#   model    An rpart object; anything else is an error.
#   compact  When TRUE print each rule on one line as
#            "R<node> [<cover%>,<prob or value>] <conditions>";
#            otherwise print a multi-line description per rule.
#   classes  Optional vector of predicted values/classes; when given
#            only rules predicting one of these are printed.
#
# Value:
#   Invisibly, the row indices of model$frame in the order in which the
#   nodes were considered (by decreasing coverage for a regression
#   tree, otherwise by decreasing value of column 5 of yval2).

asRules.rpart <- function(model, compact=FALSE, classes=NULL, ...)
{
  if (!inherits(model, "rpart")) stop(Rtxt("Not a legitimate rpart tree"))
  # if (model$method != "class")) stop("Model method needs to be class")
  #
  # Get some information.
  #
  rtree <- length(attr(model, "ylevels")) == 0  # No class levels: regression.
  target <- as.character(attr(model$terms, "variables")[2])
  frm <- model$frame
  names <- row.names(frm)                       # Node numbers.
  ylevels <- attr(model, "ylevels")
  ds.size <- model$frame[1,]$n                  # Root node covers all cases.
  #
  # Print each leaf node as a rule.
  #
  if (rtree)
    # Sort rules by coverage
    ordered <- rev(sort(frm$n, index.return=TRUE)$ix)
  else
    # Sort rules by probability of second class (usually the last in
    # binary class). The [,5] is hardwired and works on one example....
    # NOTE(review): column 5 of yval2 is presumably only the second
    # class probability for a two-class target - confirm for multiclass.
    ordered <- rev(sort(frm$yval2[,5], index.return=TRUE)$ix)
  for (i in ordered)
  {
    # Only terminal nodes are rules; rpart labels them "<leaf>" in the
    # var column (column 1) of the frame.
    if (frm[i,1] == "<leaf>")
    {
      if (rtree)
        yval <- frm[i,]$yval
      else
        yval <- ylevels[frm[i,]$yval]
      if (is.null(classes) || yval %in% classes)
      {
        cover <- frm[i,]$n
        pcover <- round(100*cover/ds.size)
        if (! rtree) prob <- frm[i,]$yval2[,5]
        cat("\n")
        pth <- rpart::path.rpart(model, nodes=as.numeric(names[i]), print.it=FALSE)
        pth <- unlist(pth)[-1]              # Drop the leading "root" entry.
        if (! length(pth)) pth <- "True"    # A tree that is a single node.
        if (compact)
        {
          cat(sprintf("R%03s ", names[i]))
          if (rtree)
            # There is no class probability for a regression tree
            # (prob is never assigned), so report the predicted value.
            cat(sprintf("[%2.0f%%,%0.2f]", pcover, yval))
          else
            cat(sprintf("[%2.0f%%,%0.2f]", pcover, prob))
          cat(sprintf(" %s", pth), sep="")
        }
        else
        {
          cat(sprintf(Rtxt(" Rule number: %s "), names[i]))
          if (rtree)
            cat(sprintf("[%s=%s cover=%d (%.0f%%)]\n",
                        target, yval, cover, pcover))
          else
            cat(sprintf("[%s=%s cover=%d (%.0f%%) prob=%0.2f]\n",
                        target, yval, cover, pcover, prob))
          cat(sprintf(" %s\n", pth), sep="")
        }
      }
    }
  }
  cat("\n")
  invisible(ordered)
}


# ==== /R/biclust.R ====================================================

# Gnome R Data Miner: GNOME interface to R for Data Mining
#
# Time-stamp: <2017-09-10 10:08:08 Graham Williams>
#
# Implement biclust functionality.
#
# Copyright (c) 2010 Togaware Pty Ltd
#
# This file is part of Rattle.
#
# Rattle is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.

########################################################################
# ToDo 100121
#
# Graphical display of output.
# Allow choice of methods and options

########################################################################
# Callbacks

# Radio button handler: when the biclust button becomes the active one
# switch the cluster notebook to the biclust page. The status bar is
# reset on every toggle, whether or not the button is now active.

on_biclust_radiobutton_toggled <- function(button)
{
  now.selected <- button$getActive()
  if (now.selected)
  {
    crv$CLUSTER$setCurrentPage(crv$CLUSTER.BICLUST.TAB)
  }
  setStatusBar()
}

########################################################################
# Execution

# Build a biclust model over the columns given by include (text that is
# spliced in as the column index of crs$dataset), logging each command
# before it is run and showing the printed model in the biclust
# textview. Returns FALSE if the biclust package is unavailable or the
# build fails, otherwise TRUE.

executeClusterBiclust <- function(include)
{
  view.id <- "biclust_textview"

  # Restrict to the training rows when a training sample is defined.

  row.index <- if (not.null(crs$train)) "crs$train" else ""

  # Interface information - currently fixed rather than user selectable.

  bc.method <- "BCCC"
  seed.expr <- "crv$seed"

  # Begin this model's section of the log.

  startLog(commonName(crv$BICLUST))

  # The package must be installed; log the library() call and run it.

  load.cmd <- "library(biclust, quietly=TRUE)"
  if (! packageIsAvailable("biclust", Rtxt("perform bicluster analysis")))
  {
    return(FALSE)
  }
  appendLog(packageProvides('biclust', 'biclust'), load.cmd)
  eval(parse(text=load.cmd))

  # Fix the seed so that a rerun gives the same model.

  reseed.cmd <- sprintf('set.seed(%s)', seed.expr)
  appendLog(Rtxt("Reset the random number seed to obtain the same results each time."),
            reseed.cmd)
  eval(parse(text=reseed.cmd))

  # Compose the model building command, log it, then run and time it.

  build.template <- paste0('crs$biclust <- biclust(',
                           'as.matrix(na.omit(crs$dataset[%s, %s])),',
                           'method=%s)')
  build.cmd <- sprintf(build.template, row.index, include, bc.method)

  appendLog(sprintf(Rtxt("Generate %s using method '%s'."),
                    commonName(crv$BICLUST), bc.method),
            build.cmd)

  began <- Sys.time()
  outcome <- try(eval(parse(text=build.cmd)), silent=TRUE)
  elapsed <- Sys.time()-began

  # A failed build is reported through a dialog rather than an error.

  if (inherits(outcome, "try-error"))
  {
    errorDialog(errorMessageFun("biclust", outcome))
    return(FALSE)
  }

  # Present the printed model in the textview.

  show.cmd <- "print(crs$biclust)"

  appendLog(sprintf(Rtxt("Generate a textual view of the %s model."),
                    commonName(crv$BICLUST)),
            show.cmd)

  resetTextview(view.id)
  setTextview(view.id,
              sprintf(Rtxt("Summary of the %s model (built using '%s'):"),
                      commonName(crv$BICLUST), "biclust"),
              "\n",
              collectOutput(show.cmd))

  reportTimeTaken(view.id, elapsed, model=commonName(crv$BICLUST))

  return(TRUE)
}


# ==== /R/binning.R ====================================================

# Rattle: A GUI for Data Mining in R
#
# BIN DATA
#
# Gnome R Data Miner: GNOME interface to R for Data Mining
#
# Time-stamp: <2014-09-05 21:27:32 gjw>
#
# Copyright (c) 2009-2014 Togaware Pty Ltd
#
# This file is part of Rattle.
#
# Rattle is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.
#
#-----------------------------------------------------------------------
#
# 070131 From Daniele Medri.
# 111025 Support wtd.quantiles suggested by Brenton R. Stone.

# Bin a numeric vector into a factor.
#
# Arguments:
#   x        vector to bin (factors and data frames are rejected).
#   bins     requested number of bins.
#   method   "quantile", "wtd.quantile" (needs Hmisc) or "kmeans".
#   labels   passed through to cut().
#   ordered  when TRUE the result is an ordered factor.
#   weights  passed to Hmisc::wtd.quantile for "wtd.quantile".
#
# Returns a factor carrying the break points in attribute "breaks", or
# NULL (after printing a warning line) when fewer than two distinct
# break points are available.

binning <- function (x, bins=4,
                     method=c("quantile", "wtd.quantile", "kmeans"),
                     labels=NULL, ordered=TRUE,
                     weights=NULL)
{
  # Set ordered to FALSE in Rattle since randomForests don't work when
  # the factor is ordered, for some reason (080406).

  # Best k for natural breaks: run kmeans once plus num.seeds further
  # times and keep the fit with the smallest total within-cluster sum
  # of squares.

  varkmeans <- function (x, centers, iter.max=10, num.seeds=bins)
  {
    if (mode(x) == "numeric")
    {
      x <- data.frame(new.x=x)
    }
    KM <- kmeans(x=x, centers=centers, iter.max=iter.max)
    for (i in seq_len(num.seeds))
    {
      newKM <- kmeans(x=x, centers=centers, iter.max=iter.max)
      if (sum(newKM$withinss) < sum(KM$withinss))
      {
        KM <- newKM
      }
    }
    KM$tot.withinss <- sum(KM$withinss)
    xmean <- apply(x, 2, mean)
    centers <- rbind(KM$centers, xmean)
    bss1 <- as.matrix(dist(centers)^2)
    KM$betweenss <- sum(as.vector(bss1[nrow(bss1), ]) * c(KM$size, 0))
    return(KM)
  }

  method <- match.arg(method)
  if (is.factor(x)) stop(Rtxt("This variable is already a factor."))

  # A data frame cannot be binned. The previous message claimed the
  # opposite ("An object of class data.frame is required."). The new
  # text has no entry in the translation catalogues, so it will be
  # shown in English until they are regenerated.

  if (is.data.frame(x)) stop(Rtxt("A vector is required, not a data.frame."))
  if (length(x) < bins) stop(Rtxt("There are more bins than observations."))
  if (method == "wtd.quantile" &&
      ! packageIsAvailable("Hmisc", Rtxt("weighted quantile binning")))
    stop(Rtxt("wtd.quantile requires the Hmisc package."))

  # Determine the break points.

  if (method == "kmeans")
  {
    xx <- na.omit(x)

    # There cannot be more clusters than distinct values.

    maxbins <- nlevels(as.factor(xx))
    if (maxbins < bins)
    {
      bins <- maxbins
    }
    breaks <- c(min(xx), tapply(xx, varkmeans(xx, bins)$cluster, max))
    cut.breaks <- unique(breaks)
  }
  else
  {
    probs <- seq(0, 1, 1/bins)
    breaks <- if (method == "quantile")
      c(quantile(x, probs=probs, na.rm=TRUE, type=8))
    else
      c(Hmisc::wtd.quantile(x, weights=weights, probs=probs,
                            na.rm=TRUE, type="quantile"))

    # Quantiles can be non-unique, which cut() doesn't like, hence the
    # unique(). The function cut2() in Hmisc handles this situation
    # gracefully and it could be used, but it is not necessary.

    breaks <- unique(breaks)
    breaks[1] <- min(x, na.rm=TRUE)
    breaks[length(breaks)] <- max(x, na.rm=TRUE)
    cut.breaks <- breaks
  }

  # Binning. At least two break points are needed to form one interval.

  if (length(cut.breaks) < 2)
  {
    cat(Rtxt("Warning: the variable is not considered.\n"))
    return(NULL)
  }
  x <- cut(x, cut.breaks, include.lowest = TRUE, labels = labels)

  if (ordered == TRUE)
    result <- ordered(factor(x))
  else
    result <- factor(x)

  attr(result, "breaks") <- breaks
  return(result)
}

# --------------------------------------------------------------------------------
# /R/clara.R:
# --------------------------------------------------------------------------------
# Gnome R Data Miner: GNOME interface to R for Data Mining
#
# Time-stamp: <2011-06-23 21:17:19 Graham Williams>
#
# Implement clara functionality.
#
# Copyright (c) 2010 Togaware Pty Ltd
#
# This file is part of Rattle.
#
# Rattle is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.

########################################################################
# ToDo 100121
#
# Execute.
# Graphical display of output.
# Allow choice of methods.

########################################################################
# Callbacks

# Radio button callback: when the button becomes active, switch the
# cluster notebook to the clara page. The status bar is reset on every
# toggle, whether the button was activated or deactivated.

on_clara_radiobutton_toggled <- function(button)
{
  is.selected <- button$getActive()
  if (is.selected)
  {
    crv$CLUSTER$setCurrentPage(crv$CLUSTER.CLARA.TAB)
  }
  setStatusBar()
}

# --------------------------------------------------------------------------------
# /R/cluster.R:
# --------------------------------------------------------------------------------
# Gnome R Data Miner: GNOME interface to R for Data Mining
#
# Time-stamp: <2014-07-18 15:08:01 gjw>
#
# Implement cluster functionality.
#
# Copyright (c) 2009 Togaware Pty Ltd
#
# This file is part of Rattle.
#
# Rattle is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.

########################################################################
# EXECUTION

# Execute handler for the Cluster tab: run the precondition checks and
# then dispatch to the builder for whichever cluster radio button is
# active. Returns nothing useful; problems are reported through dialogs.

executeClusterTab <- function()
{
  # Can not cluster without a dataset.

  if (noDatasetLoaded()) return()

  # If it looks like the VARIABLES page has not been executed, complain.

  if (variablesHaveChanged(Rtxt("building clusters"))) return()

  # Check if sampling needs executing.

  if (sampleNeedsExecute()) return()

  # 091216 Automatically handle any selected categorics by converting
  # them to numeric, so they then become included variables. This
  # works, but it risks surprising the user with the addition of new
  # variables outside their control. So let's leave it to the user to
  # do the transforms, or use clara.

  # factors <- crs$input[sapply(crs$input, function(x)
  #   is.factor(crs$dataset[[x]]))]
  # sapply(factors, executeTransformRemapPerform, action="indicator",
  #   remap.prefix="TIN")

  # Kmeans and hclust only work for numeric data, so identify
  # variables to include. Only work with the INPUT/TARGET/RISK
  # variables. That is, only exclude the IGNORE and IDENT variables.

  # `include` is the text of the column index that is spliced into the
  # generated commands. Its own length is always 1, so the emptiness
  # test has to look at the variable it names; testing the string
  # meant the dialog below could never be shown.

  include <- "crs$numeric" # 20110102 getNumericVariables()
  if (! length(crs$numeric))
  {
    errorDialog(Rtxt("Clusters are currently calculated only for numeric data.",
                     "No numeric variables were found in the dataset",
                     "from amongst those having an input/target/risk role."))
    return()
  }

  # Dispatch. A successful build ticks the matching checkbutton on the
  # Evaluate tab (kmeans and ewkm share the kmeans one).

  if (theWidget("kmeans_radiobutton")$getActive())
  {
    if (executeClusterKMeans(include))
      theWidget("evaluate_kmeans_checkbutton")$setActive(TRUE)
  }
  else if (theWidget("ewkm_radiobutton")$getActive())
  {
    if (executeClusterEwkm(include))
      theWidget("evaluate_kmeans_checkbutton")$setActive(TRUE)
  }
  ## else if (theWidget("clara_radiobutton")$getActive())
  ## {
  ##   infoDialog(Rtxt("Not yet implemented."))
  ##   if (executeClusterClara(include))
  ##     theWidget("evaluate_clara_checkbutton")$setActive(TRUE)
  ## }
  ## else if (theWidget("pam_radiobutton")$getActive())
  ## {
  ##   infoDialog(Rtxt("Not yet implemented."))
  ##   if (executeClusterPam(include))
  ##     theWidget("evaluate_pam_checkbutton")$setActive(TRUE)
  ## }
  else if (theWidget("hclust_radiobutton")$getActive())
  {
    if (executeClusterHClust(include))
      theWidget("evaluate_hclust_checkbutton")$setActive(TRUE)
  }
  else if (theWidget("biclust_radiobutton")$getActive())
  {
    executeClusterBiclust(include)
    # theWidget("evaluate_biclust_checkbutton")$setActive(TRUE)
  }
}

########################################################################
# EXPORT

# Export handler for the Cluster tab: dispatch to the exporter of the
# active cluster type, or report that export is not available for it.

exportClusterTab <- function()
{

  if (noDatasetLoaded()) return()

  if (theWidget("kmeans_radiobutton")$getActive())
  {
    exportKMeansTab()
  }
  else if (theWidget("ewkm_radiobutton")$getActive())
  {
    exportEwkmTab()
  }
  else if (theWidget("hclust_radiobutton")$getActive())
  {
    exportHClustTab()
  }
  else
  {
    errorDialog(Rtxt("PMML export for this model is not yet implemented."))
    return()
  }
}

# --------------------------------------------------------------------------------
# /R/comcat.R:
# --------------------------------------------------------------------------------
# 20170129 Convenience
# combining format with comma and cat("\n") to
# return a printed string rather than print().

# Print x formatted with "," as the thousands separator, without
# scientific notation or padding, followed by a newline. Further
# arguments are passed on to format().

comcat <- function(x, ...)
{
  cat(format(x, ..., big.mark=",", scientific=FALSE, trim=TRUE), "\n")
}

# --------------------------------------------------------------------------------
# /R/ctree.R:
# --------------------------------------------------------------------------------
# Gnome R Data Miner: GNOME interface to R for Data Mining
#
# Time-stamp: <2017-09-10 10:08:18 Graham Williams>
#
# CTREE OPTION OF THE TREE TAB
#
# Copyright (c) 2009 Togaware Pty Ltd
#
# This file is part of Rattle.
#
# Rattle is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.

########################################################################
#
# Model -> Tree -> Conditional
#

# 100815 TODO The "partykit" package from R-Forge (only for now)
# includes .list.rules.party() to convert tree into rules:
#
# install.packages("partykit", repos = "https://R-Forge.R-project.org")
# library("partykit")
# Rebuild the ctree as partykit provides new ctree.
# partykit:::.list.rules.party(crs$rpart)

# Build a conditional inference tree with party's ctree() from the
# options on the Tree tab, store it in crs$rpart and show its printed
# form in the rpart textview. Each command is logged before it is
# evaluated. Returns TRUE on success and FALSE when the priors are
# invalid, the package is missing, or the build fails.

executeModelCTree <- function()
{
  # 080815 This is currently just copied from rpart.R, and slowly
  # being tuned for ctree specifically.

  # Initial setup

  TV <- "rpart_textview"

  num.classes <- length(levels(as.factor(crs$dataset[[crs$target]])))
  control <- NULL
  parms <- NULL

  # Scrape the value of the tuning controls

  tune.controls <- theWidget("rpart_tune_entry")$getText()

  # Retrieve the Priors, and check there is the right number and that
  # they add up to 1.
  #
  # NOTE(review): the resulting `parms` string is never added to the
  # ctree() command below, so the priors are validated but otherwise
  # unused here - confirm whether that is intended.

  priors <- theWidget("model_tree_priors_entry")$getText()
  if (nchar(priors) > 0)
  {
    # Anything that is not a number becomes NA; this is reported below
    # rather than left to fail in the sum comparison.

    pr <- suppressWarnings(as.numeric(unlist(strsplit(priors, ","))))
    if (any(is.na(pr)))
    {
      errorDialog(sprintf(Rtxt("The supplied priors (%s)",
                               "must be a comma separated list of numbers."),
                          priors))
      return(FALSE)
    }
    if (length(pr) != num.classes)
    {
      errorDialog(sprintf(Rtxt("The supplied priors (%s)",
                               "need to correspond to the number of classes",
                               "found in the target variable '%s'.",
                               "Please supply exactly %d priors."),
                          priors, crs$target, num.classes))
      return(FALSE)
    }

    # Compare with a tolerance: an exact test rejects valid input such
    # as 0.7,0.2,0.1 whose floating point sum is not exactly 1.

    if (! isTRUE(all.equal(sum(pr), 1)))
    {
      errorDialog(sprintf(Rtxt("The supplied priors (%s)",
                               "add up to %0.2f whereas",
                               "they need to add up 1.00.",
                               "Please provide appropriate priors."),
                          priors, sum(pr)))
      return(FALSE)
    }
    if (is.null(parms))
      parms <- sprintf(", parms=list(prior=c(%s))", priors)
    else
      # Put back the closing parenthesis that the pattern consumes.
      parms <- gsub(")$", sprintf(", prior=c(%s))", priors), parms)
  }

  # Retrieve the Min Split and check if it is different from the
  # default, and if so then use it.

  minsplit <- theWidget("rpart_minsplit_spinbutton")$getValue()
  if (minsplit != crv$rpart.minsplit.default)
  {
    if (is.null(control))
      control <- sprintf(", control=ctree_control(minsplit=%d)", minsplit)
    else
      control <- gsub(")$", sprintf(", minsplit=%d)", minsplit), control)
  }

  # Retrieve the Min Bucket and check if it is different from the
  # default, and if so then use it.

  minbucket <- theWidget("rpart_minbucket_spinbutton")$getValue()
  if (minbucket != crv$rpart.minbucket.default)
  {
    if (is.null(control))
      control <- sprintf(", control=ctree_control(minbucket=%d)", minbucket)
    else
      control <- gsub(")$", sprintf(", minbucket=%d)", minbucket), control)
  }

  # Retrieve the Max Depth and check if it is different from the
  # default, and if so then use it.

  maxdepth <- theWidget("rpart_maxdepth_spinbutton")$getValue()
  if (maxdepth != crv$rpart.maxdepth.default)
  {
    if (is.null(control))
      control <- sprintf(", control=ctree_control(maxdepth=%d)", maxdepth)
    else
      control <- gsub(")$", sprintf(", maxdepth=%d)", maxdepth), control)
  }

  # Build the formula for the model.

  frml <- paste(crs$target, "~ .")

  # Variables to be included --- a string of indices.

  # included <- getIncludedVariables()
  included <- "c(crs$input, crs$target)" # 20110102

  # Some convenience booleans

  sampling <- not.null(crs$train)
  including <- not.null(included)
  subsetting <- sampling || including

  # Commands.

  lib.cmd <- "library(party, quietly=TRUE)"
  if (! packageIsAvailable("party", Rtxt("build conditional trees"))) return(FALSE)

  # An `if` whose condition is false yields NULL, which paste() drops,
  # so only the relevant pieces of the command are emitted.

  fit.cmd <- paste("crs$rpart <- ctree(", frml, ", data=crs$dataset",
                   if (subsetting) "[",
                   if (sampling) "crs$train",
                   if (subsetting) ",",
                   if (including) included,
                   if (subsetting) "]",
                   if (! is.null(crs$weights))
                     sprintf(",\n weights=as.integer(%s)%s",
                             crs$weights,
                             ifelse(sampling, "[crs$train]", "")),
                   ifelse(is.null(control), "", control),
                   ")", sep="")

  print.cmd <- "print(crs$rpart)"

  # Load the required library.

  startLog(Rtxt("Conditional inference tree."))
  appendLog(Rtxt("Build a conditional tree using the party package."), lib.cmd)

  eval(parse(text=lib.cmd))

  # Build the model.

  appendLog(Rtxt("Build a ctree model."), fit.cmd)
  start.time <- Sys.time()
  result <- try(eval(parse(text=fit.cmd)), silent=TRUE)
  time.taken <- Sys.time()-start.time
  if (inherits(result, "try-error"))
  {
    errorDialog(errorMessageFun("ctree", result))
    return(FALSE)
  }

  # Display the resulting model.

  appendLog(Rtxt("Generate summary of the ctree model."), print.cmd)

  resetTextview(TV)
  setTextview(TV,
              sprintf(Rtxt("Summary of the %s model for %s (built using '%s'):\n"),
                      commonName("ctree"),
                      Rtxt("Classification"), # 080604 TODO put the right type
                      "ctree"),
              collectOutput(print.cmd), "\n")

  if (sampling) crs$smodel <- union(crs$smodel, crv$RPART)

  # Now that we have a model, update the state of the rules and plot
  # buttons.

  showModelRPartExists()

  # Finish up.

  reportTimeTaken(TV, time.taken, model=commonName(crv$RPART))

  return(TRUE)
}

# --------------------------------------------------------------------------------
# /R/errorMatrix.R:
# --------------------------------------------------------------------------------
#' @title Generate an error (confusion) matrix.
#'
#' @param actual a vector of true values.
#' @param predicted a vector of predicted values.
#' @param percentage return percentages.
#' @param digits the number of digits to round results.
#' @param count return counts.
#'
#' @return An error matrix (also known as a confusion matrix) is
#'   generated based on the comparison of the actual and predicted
#'   values.
#'   One of three forms is returned: percentages (pc), counts,
#'   or proportions (if both percentage and counts are FALSE).

errorMatrix <- function(actual,
                        predicted,
                        percentage=TRUE,
                        digits=ifelse(percentage,1,3),
                        count=FALSE)
{
  # Preconditions. Only an explicitly supplied percentage=TRUE
  # conflicts with count=TRUE; the default does not.

  if (!missing(percentage) && percentage && count)
    stop("percentages not possible as counts were specified")

  # Data quality checks.
  #
  # If both actual and predicted are factors they must
  # have the same levels in the same order else the table will have
  # rearranged column or row orders - the table is expected to have the
  # labels in the same order column and row wise.
  #
  # If either is a factor and the other a character then convert the
  # character to a factor with the levels of the factor used.
  #
  # If neither is a factor, give both the same levels (the sorted union
  # of the observed values). Otherwise a class present in only one of
  # them yields a non-square table and the class errors computed below
  # would be taken from the wrong cells.

  if (is.factor(actual) && is.factor(predicted))
  {
    # identical() also copes with differing numbers of levels, where
    # an element-wise comparison would recycle.

    if (! identical(levels(actual), levels(predicted)))
      stop("The supplied actual and predicted must have the same levels.")
  }
  else if (is.factor(actual))
  {
    predicted <- factor(predicted, levels=levels(actual))
  }
  else if (is.factor(predicted))
  {
    actual <- factor(actual, levels=levels(predicted))
  }
  else
  {
    classes <- sort(union(actual, predicted))
    actual <- factor(actual, levels=classes)
    predicted <- factor(predicted, levels=classes)
  }

  # Initial table.

  x <- table(actual, predicted)

  # Number of classes.

  nc <- nrow(x)

  # Number of values, ignoring pairs with a missing value.

  nv <- length(actual) - sum(is.na(actual) | is.na(predicted))

  # Calculate proportions.

  if (!count) x <- x/nv

  # Calculate class error. For row r this is the sum of all values in
  # the row minus the r'th value, divided by the sum of all values in
  # the row. If count then the error is returned as a percentage rather
  # than a proportion.

  class.error <- vapply(seq_len(nc),
                        function(r)
                        {
                          y <- sum(x[r,-r])/sum(x[r,])
                          if (count) y <- round(100*y, digits)
                          return(y)
                        },
                        numeric(1))

  tbl <- cbind(x, Error=class.error)

  names(attr(tbl, "dimnames")) <- c("Actual", "Predicted")

  # Round the resulting percentages or proportions unless we are
  # returning count.

  if (!count) tbl <- if (percentage) round(100*tbl, digits) else round(tbl, digits)

  return(tbl)
}

# --------------------------------------------------------------------------------
# /R/execute.R:
# --------------------------------------------------------------------------------
# Gnome R Data Miner: GNOME interface to R for Data Mining
#
# Time-stamp: <2015-05-17 08:55:47 gjw>
#
# Implement functionality associated with the Execute button and Menu.
#
# Copyright (c) 2009-2013 Togaware Pty Ltd
#
# This file is part of Rattle.
#
# Rattle is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.

# Callback for the Execute button: run the action of the current tab
# with a busy cursor, ignoring clicks that arrive while a previous
# execution is still running.

on_execute_button_clicked <- function(action, window)
{
  # 100402 Allow Execute to be running just once, irrespective of the
  # number of times the Execute button is clicked. Otherwise we get a
  # second load of a CSV dataset whilst still loading the first.

  if (! is.null(crv$executing) && crv$executing) return()
  crv$executing <- TRUE
  on.exit(crv$executing <- FALSE)

  # Wrap up the actual call with a "try" so that the watch cursor
  # turns off even on error.

  setStatusBar()

  # 081117 This ensures spinbuttons, for example, lose focus and hence
  # their current value is properly noted. Otherwise I was finding the
  # user had to either press Enter or click somewhere else to ensure
  # the value is noted.

  theWidget("rattle_window")$setFocus()

  # 090102 Set the cursor to busy, and make sure on failure or
  # interrupt we set it back. TODO Currently, I can interrupt with
  # Ctrl-C in the console, and that does interrupt the Rattle process,
  # but I can't work out how to get a Ctrl-C (or perhaps an ESC) in
  # the Rattle GUI to cause an interrupt.

  set.cursor("watch")
  tryCatch(dispatchExecuteButton(),
           interrupt=function(m) setStatusBar(Rtxt("Processing interrupted by user.")),
           finally=set.cursor())

  # library(multicore)
  # set.cursor("watch")
  # crs$process <- parallel(dispatchExecuteButton())
  #                 interrupt=function(m)
  #                 setStatusBar("Processing interrupted by user."),
  #                 finally=set.cursor()))
  # collect()

  # 090103 Return nothing, otherwise we get the results from the
  # tryCatch above.

  return()
}

# Run the execute function that belongs to the notebook tab currently
# showing, or raise an error dialog if the tab label is not recognised.

dispatchExecuteButton <- function()
{
  # Check which tab of notebook and dispatch to appropriate execute action

  ct <- getCurrentPageLabel(crv$NOTEBOOK)
  # REMOVE 100424 No longer required here - this is done earlier now.
  # Encoding(ct) <- "UTF-8" # 100408 For French, but see if it's okay always!

  if (ct == crv$NOTEBOOK.DATA.NAME)
  {
    executeDataTab()
  }
  else if (ct == crv$NOTEBOOK.EXPLORE.NAME)
  {
    executeExploreTab()
  }
  else if (ct == crv$NOTEBOOK.TEST.NAME)
  {
    executeTestTab()
  }
  else if (ct == crv$NOTEBOOK.TRANSFORM.NAME)
  {
    executeTransformTab()
  }
  else if (ct == crv$NOTEBOOK.CLUSTER.NAME)
  {
    executeClusterTab()
  }
  else if (ct == crv$NOTEBOOK.ASSOCIATE.NAME)
  {
    executeAssociateTab()
  }
  else if (ct == crv$NOTEBOOK.MODEL.NAME)
  {
    executeModelTab()
  }
  else if (ct == crv$NOTEBOOK.EVALUATE.NAME)
  {

    # The wrap mode of the confusion_textview may have been set to
    # word wrap when a model was Executed if it had more than 2
    # classes, since a message is printed about ROCR etc not handling
    # any more than 2 classes.

    theWidget("confusion_textview")$setWrapMode("none")
    executeEvaluateTab()
  }
  else if (ct == crv$NOTEBOOK.LOG.NAME)
  {
    executeLogTab()
  }
  else
  {
    errorDialog(Rtxt("'dispatchExecuteButton' has been called with an unknown tab."),
                "\n\n", ct,
                "\n\n", crv$support.msg)
    return()
  }
}

# --------------------------------------------------------------------------------
# /R/executeBoxPlot2.R:
# --------------------------------------------------------------------------------
#' Display boxplots using ggplot2.
#'
#' Time-stamp: <2016-09-19 19:28:44 Graham Williams>
#'
#' One plot command is generated, logged and evaluated per entry of
#' `vars`, creating objects p01, p02, ...; these are then arranged on a
#' single page with gridExtra::grid.arrange(). Returns FALSE when a
#' required package is unavailable.
#'
executeBoxPlot2 <- function(dataset, vars, target, targets, stratify, sampling, pmax)
{
  # Check prerequisite packages.

  if (!packageIsAvailable("ggplot2", Rtxt("build plots using a grammar of graphics"))) return(FALSE)
  if (!packageIsAvailable("gridExtra", Rtxt("arrange plots on a grid"))) return(FALSE)
  if (!packageIsAvailable("dplyr", Rtxt("mutate the supplied dataset"))) return(FALSE)

  # Report to the Log script.

  startLog(Rtxt("Display box plots for the selected variables."))

  # Start a new plot as we could be drawing multiple types of plots.

  newPlot()

  for (i in seq_along(vars))
  {
    title.txt <- genPlotTitleCmd(generateTitleText(vars[i],
                                                   target,
                                                   sampling,
                                                   stratify && length(targets)),
                                 vector=TRUE)

    # The pieces guarded by `if (length(target))` evaluate to NULL when
    # there is no target and are then dropped by str_c().
    #
    # NOTE(review): newer ggplot2 releases prefer `fun=` over `fun.y=`
    # in stat_summary(); left unchanged to keep the generated code
    # working with older installations - confirm the minimum version.

    plot.cmd <- stringr::str_c('# Generate a box plot.\n\n',
                               sprintf("p%02d", i), ' <- crs %>%\n',
                               ' with(', dataset, ') %>%\n',
                               if (length(target))
                                 stringr::str_c(' dplyr::mutate(', target,
                                                '=as.factor(', target, ')) %>%\n'),
                               ' ggplot2::ggplot(ggplot2::aes(y=', vars[i], ')) +\n',
                               ' ggplot2::geom_boxplot(ggplot2::aes(x="All"), ',
                               'notch=TRUE, fill="grey") +\n',
                               ' ggplot2::stat_summary(ggplot2::aes(x="All"), ',
                               'fun.y=mean, geom="point", shape=8) +\n',
                               if (length(target))
                                 stringr::str_c(' ggplot2::geom_boxplot(',
                                                'ggplot2::aes(x=', target, ', ',
                                                'fill=', target, '), notch=TRUE) +\n',
                                                ' ggplot2::stat_summary(',
                                                'ggplot2::aes(x=', target, '), ',
                                                'fun.y=mean, geom="point", ',
                                                'shape=8) +\n'),
                               ' ggplot2::xlab("',
                               if (length(target))
                                 stringr::str_c(target, '\\n\\n'),
                               title.txt[2], '") +\n',
                               ' ggplot2::ggtitle("', title.txt[1], '") +\n',
                               ' ggplot2::theme(legend.position="none")')

    comment <- paste(Rtxt("Use ggplot2 to generate box plot for"), vars[i])
    appendLibLog(comment, plot.cmd, include.libs=(i==1))
    eval(parse(text=plot.cmd))
  }

  # Nothing was generated, so there is nothing to arrange. Previously
  # this case failed because the loop variable was never created.

  if (! length(vars)) return(invisible(NULL))

  display.cmd <-
    "gridExtra::grid.arrange(" %s+%
    paste(sprintf("p%02d", seq_along(vars)), collapse=", ") %s+%
    ")"

  appendLibLog("Display the plots.", display.cmd)
  eval(parse(text=display.cmd))

}

# --------------------------------------------------------------------------------
# /R/executeExploreGGRaptR.R:
# --------------------------------------------------------------------------------
#' Perform the required operations for displaying interactive plot generator.
#'
#' Time-stamp: <2017-08-10 17:07:41 Graham Williams>
#'
#' The data frame `df` is saved to 'ggraptr_df.rds' in the working
#' directory and a separate R process is started, without waiting for
#' it, which reads the file back as `df_name`, removes the file and
#' launches ggraptR on port 5002. Returns FALSE if ggraptR is not
#' available and NULL otherwise.
#'
executeExploreGGRaptR <- function(df_name, df)
{
  # Check prerequisite packages.

  if (!packageIsAvailable("ggraptR",
                          Rtxt("interactively generate ggplot2 graphics")))
    return(FALSE)

  startLog(Rtxt("Display interactive plot builder."))

  df_file <- 'ggraptr_df.rds'
  saveRDS(df, file=df_file)
  r_expr <- sprintf(
    '%s <- readRDS(\'%s\');file.remove(\'%s\');ggraptR::ggraptR(%s, port=5002)',
    df_name, df_file, df_file, df_name)

  appendLog("Initiate the ggraptR application in a browser", r_expr)

  # NOTE(review): df_name is pasted into a shell command line; it is
  # assumed to be a plain R object name - confirm against callers.

  system(sprintf('R -q --vanilla -e "%s"', r_expr), wait=FALSE, intern=FALSE)

  return()
}

# --------------------------------------------------------------------------------
# /R/executeHistPlot2.R:
# --------------------------------------------------------------------------------
#' Perform the required operations for displaying histograms using ggplot2.
#'
#' Time-stamp: <2016-09-19 17:06:40 Graham Williams>
#'
#' One density plot command is generated, logged and evaluated per
#' entry of `vars`, creating objects p01, p02, ...; these are then
#' arranged on a single page with gridExtra::grid.arrange().
#'
executeHistPlot2 <- function(dataset, vars, target, targets, stratify, sampling, pmax)
{
  # Check prerequisite packages.

  if (!packageIsAvailable("ggplot2", Rtxt("build plots using a grammar of graphics"))) return()
  if (!packageIsAvailable("dplyr", Rtxt("mutate the supplied dataset"))) return()
  if (!packageIsAvailable("gridExtra", Rtxt("arrange plots on a grid"))) return()

  startLog(Rtxt("Display histogram plots for the selected variables."))

  # We start a new plot since we could be drawing multiple types of
  # plots.

  newPlot()

  for (i in seq_along(vars))
  {
    title.txt <- genPlotTitleCmd(generateTitleText(vars[i],
                                                   target,
                                                   sampling,
                                                   stratify && length(targets)),
                                 vector=TRUE)

    plot.cmd <- stringr::str_c('# Generate the plot.\n\n',
                               sprintf("p%02d", i), ' <- crs %>%\n',
                               ' with(', dataset, ') %>%\n',
                               if (length(target))
                                 stringr::str_c(' dplyr::mutate(', target,
                                                '=as.factor(', target, ')) %>%\n'),
                               ' dplyr::select(', vars[i],
                               ifelse(length(target), stringr::str_c(", ", target), ""),
                               ') %>%\n',
                               ' ggplot2::ggplot(ggplot2::aes(x=', vars[i], ')) +\n',
                               ' ggplot2::geom_density(lty=3) +\n',
                               ifelse(length(target),
                                      stringr::str_c(' ggplot2::geom_density(ggplot2',
                                                     sprintf("::aes(fill=%s, colour=%s)",
                                                             target, target),
                                                     ', alpha=0.55) +\n'),
                                      ""),
                               ' ggplot2::xlab("', vars[i],
                               '\\n\\n', title.txt[2], '") +\n',
                               ' ggplot2::ggtitle("', title.txt[1], '") +\n',
                               ' ggplot2::labs(',
                               ifelse(length(target),
                                      stringr::str_c('fill="', target, '", '),
                                      ""),
                               'y="Density")')

    ## plot.cmd <- stringr::str_c('# Calculate the variable value range.\n\n',
    ##                            'vrange <- crs %>%\n',
    ##                            ' with(', dataset, ') %>%\n', # Need access to crs vars
    ##                            ' dplyr::select(', vars[i], ') %>%\n',
    ##                            ' range(na.rm=TRUE)\n\n',
    ##                            '# Then determine a good bin width for the bars.\n\n',
    ##                            'bwidth <- crs %>%\n',
    ##                            ' with(', dataset, '$', vars[i], ') %>%\n',
    ##                            ' na.omit() %>%\n',
    ##                            ' nclass.FD() %>%\n',
    ##                            ' magrittr::divide_by(vrange[2]-vrange[1], .)\n\n',
    ##                            '# Generate the plot.\n\n',
    ##                            sprintf("p%02d", i), ' <- crs %>%\n',
    ##                            ' with(', dataset, ') %>%\n',
    ##                            ' dplyr::select(', vars[i],
    ##                            ifelse(length(target), stringr::str_c(", ", target), ""),
    ##                            ') %>%\n',
    ##                            ' ggplot2::ggplot(ggplot2::aes(x=', vars[i], ')) +\n',
    ##                            ' ggplot2::geom_histogram(ggplot2::aes(y=..density..), ',
    ##                            'binwidth=bwidth, fill="grey", colour="black") +\n',
    ##                            ' ggplot2::geom_density(',
    ##                            ifelse(length(target),
    ##                                   sprintf("ggplot2::aes(colour=%s)", target), ""),
    ##                            ') +\n',
    ##                            ' ggplot2::xlab("', vars[i],
    ##                            '\\n\\n', title.txt[2], '") +\n',
    ##                            ' ggplot2::ggtitle("', title.txt[1], '") +\n',
    ##                            ' ggplot2::labs(colour="", y="Density")')

    comment <- paste(Rtxt("Use ggplot2 to generate histogram plot for"), vars[i])
    appendLibLog(comment, plot.cmd, include.libs=(i==1))
    eval(parse(text=plot.cmd))
  }

  # Nothing was generated, so there is nothing to arrange. Previously
  # this case failed because the loop variable was never created.

  if (! length(vars)) return(invisible(NULL))

  display.cmd <-
    "gridExtra::grid.arrange(" %s+%
    paste(sprintf("p%02d", seq_along(vars)), collapse=", ") %s+%
    ")"

  appendLibLog("Display the plots.", display.cmd)
  eval(parse(text=display.cmd))

}

# --------------------------------------------------------------------------------
# /R/executeLogTab.R:
# --------------------------------------------------------------------------------
# Gnome R Data Miner: GNOME interface to R for Data Mining
#
# Time-stamp: <2014-07-24 21:30:01 gjw>
#
# Execute Log Tab
#
# Copyright (c) 2014 Togaware Pty Ltd
#
# This file is part of Rattle.
#
# Rattle is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.

# Re-run the R script currently held in the Log tab's textview.
#
# The whole buffer is parsed and then evaluated as R code. Any parse
# or run-time error propagates to the caller.
#
# NOTE(review): the log textview content is evaluated verbatim, so
# whatever text is in that buffer is executed -- confirm this is the
# intended contract.

executeLogTab <- function()
{
  log.text <- getTextviewContent("log_textview")
  eval(parse(text=log.text))
}

# --------------------------------------------------------------------------------
# /R/executeModelAda.R:
# --------------------------------------------------------------------------------
#----------------------------------------------------------------------
#
# MODEL ADA
#

# Build an ada boost model from the tuning values currently set in the
# GUI spin buttons and store it in crs$ada.
#
# dataset, formula: passed straight through to buildModelAda().
#
# Returns TRUE once buildModelAda() has returned.

executeModelAda <- function(dataset, formula)
{
  # Initial setup.

  TV <- "ada_textview"

  # Read the numeric value of a named spin button.

  spin <- function(widget) theWidget(widget)$getValue()

  # Build model.

  crs$ada <- buildModelAda(formula,
                           dataset,
                           tv       = theWidget(TV),
                           maxdepth = spin("ada_maxdepth_spinbutton"),
                           minsplit = spin("ada_minsplit_spinbutton"),
                           cp       = spin("ada_cp_spinbutton"),
                           xval     = spin("ada_xval_spinbutton"),
                           ntree    = spin("ada_ntree_spinbutton"))

  return(TRUE)
}

# --------------------------------------------------------------------------------
# /R/executeModelRxBTrees.r:
# --------------------------------------------------------------------------------
#' Build a boosted trees model (rxBTrees) from the xdf dataset.
#'
#' The formula is assembled from crs$target and crs$input, the build
#' command is written to the log and then evaluated, and the printed
#' model is shown in the ada textview.
#'
#' Returns TRUE when the model was built, FALSE when the build command
#' raised an error (the error text is then shown in the textview).
#'
#' Time-stamp: <2017-08-18 12:13:21 Graham Williams>
#'
executeModelRxBTrees <- function()
{
  # Initial setup.

  TV   <- "ada_textview"
  VAR  <- "crs$ada"
  NAME <- "Boosted Trees"
  FUNC <- "rxBTrees"

  # Formula Creation for the model.

  frml <- paste(crs$target, "~", paste(crs$input, collapse=" + "))
  frml <- strwrap(frml, crv$log_width, 0, 4)
  frml <- paste(frml, collapse="\n")

  # Build the model build command.

  # TODO Need to allow parameters to be set from the GUI.

  build.cmd <- paste0(VAR, " <- ", FUNC, "(\n\n ", frml, ",\n\n",
                      " data = crs$xdf.split[[1]],\n",
                      " maxDepth = 30,\n",
                      " cp = 0.01,\n",
                      " minSplit = 20",
                      ")")

  print.cmd <- paste0("print(", VAR, ")")

  # Build the model.

  appendLog(Rtxt("Build a rxBTrees model."),
            build.cmd, sep="")
  result <- try(eval(parse(text=build.cmd)), silent=TRUE)

  # A failed build used to be ignored, leaving whatever was previously
  # in crs$ada to be printed as if it were the new model. Report the
  # failure instead.

  if (inherits(result, "try-error"))
  {
    resetTextview(TV)
    setTextview(TV,
                sprintf(Rtxt("The %s model build using %s failed:"),
                        NAME, FUNC),
                "\n\n",
                as.character(result))
    return(FALSE)
  }

  # Text view

  resetTextview(TV)
  setTextview(TV,
              sprintf(Rtxt("Boosted Trees built using %s"),
                      FUNC),
              "\n\n",
              collectOutput(print.cmd))
  return(TRUE)
}

# --------------------------------------------------------------------------------
# /R/executeModelRxDForest.R:
# --------------------------------------------------------------------------------
#' Build a random forest based from xdf dataset.
#'
#' The formula is assembled from crs$target and crs$input, the build
#' command is logged and evaluated (storing the model in crs$rf), and
#' the printed model plus the time taken are shown in the rf textview.
#'
#' Returns TRUE when the model was built, FALSE when the required
#' package is unavailable or the build command raised an error.
#'
#' Time-stamp: <2017-08-18 12:13:50 Graham Williams>
#'
executeModelRxDForest <- function()
{
  # Identify the model specific constants.

  TV   <- "rf_textview"
  NAME <- commonName(crv$RXDFOREST)
  PKG  <- "RevoScaleR"
  FUNC <- "rxDForest"
  VAR  <- "crs$rf"
  TYPE <- Rtxt("Classification")
  DESC <- Rtxt("build an xdf based random forest model")

  # Check package prerequisites.

  if (! packageIsAvailable(PKG, DESC)) return(FALSE)

  # Construct the formula for the model build.

  frml <- paste(crs$target, "~", paste(crs$input, collapse=" + "))
  frml <- strwrap(frml, crv$log_width, 0, 4)
  frml <- paste(frml, collapse="\n")

  # Commands.

  build.cmd <- paste0(VAR, " <- ", FUNC, "(\n\n ", frml, ",\n\n",
                      " data = crs$xdf.split[[1]],\n",
                      " importance = TRUE",
                      ")")

  print.cmd <- paste0("print(", VAR, ")")

  startLog(NAME)

  # Build the model.

  appendLog(sprintf(Rtxt("Build the %s model."), NAME), build.cmd)
  start.time <- Sys.time()
  result <- try(eval(parse(text=build.cmd)), silent=TRUE)
  time.taken <- Sys.time() - start.time

  # Do not present a stale or missing crs$rf as a fresh model when the
  # build command failed.

  if (inherits(result, "try-error"))
  {
    resetTextview(TV)
    setTextview(TV,
                sprintf(Rtxt("The %s model build using %s failed:"),
                        NAME, FUNC),
                "\n\n",
                as.character(result))
    return(FALSE)
  }

  # Show the results.

  resetTextview(TV)
  setTextview(TV,
              sprintf(Rtxt("Summary of the %s model for %s (built using '%s'):"),
                      NAME, TYPE, FUNC),
              "\n\n",
              collectOutput(print.cmd))

  # Now that we have a model, make sure the buttons are sensitive.

  showModelRFExists(traditional=TRUE, conditional=FALSE)

  # Finish up.

  reportTimeTaken(TV, time.taken, NAME)

  return(TRUE)
}

# --------------------------------------------------------------------------------
# /R/executeModelXGB.R:
# --------------------------------------------------------------------------------

#----------------------------------------------------------------------
#
# MODEL XGB
#

# Build an xgboost model from the tuning values currently set in the
# GUI widgets and store it in crs$ada (the boost tab shares the ada
# widgets and model slot).
#
# dataset, formula: passed straight through to buildModelXgb().
#
# Returns TRUE once buildModelXgb() has returned.

executeModelXGB <- function(dataset, formula)
{
  # Initial setup.

  TV <- "ada_textview"

  # Read the numeric value of a named spin button.

  spin <- function(widget) theWidget(widget)$getValue()

  # Build model

  crs$ada <- buildModelXgb(formula,
                           dataset,
                           tv=theWidget(TV),
                           max_depth=spin("ada_maxdepth_spinbutton"),
                           eta=spin("ada_learningrate_spinbutton"),
                           #num_parallel_tree=theWidget("ada_ntree_spinbutton")$getValue(),
                           nthread=spin("ada_nthread_spinbutton"),
                           nround=spin("ada_niter_spinbutton"),
                           #metrics=theWidget("ada_metrics_combobox")$getActiveText(),
                           objective=theWidget("ada_objective_combobox")$getActiveText()
                           )

  return(TRUE)
}
# --------------------------------------------------------------------------------
# /R/executePairsPlotSelect2.R:
# --------------------------------------------------------------------------------
#' Perform the required operations for displaying a pairs plot.
#'
#' A GGally::ggpairs() command is generated for the selected variables,
#' written to the log and then evaluated. When a target is supplied it
#' is converted to a factor and used for colour and shape.
#'
#' Only dataset, vars and target are used here; targets, stratify,
#' sampling and pmax are accepted for interface compatibility.
#'
#' Time-stamp:
#'
executePairsPlotSelect2 <- function(dataset, vars, target, targets, stratify, sampling, pmax)
{
  startLog(Rtxt("Display a pairs plot for the selected variables."))

  varsi <- getVariableIndicies(vars)

  # v1 <- theWidget("pairs_color_combobox")$getActiveText()
  v1 <- target

  # A missing, empty or blank target means no colouring. The length
  # test guards the comparison against a zero-length value.

  coloured <- !(is.null(v1) || length(v1) == 0 || v1 == " ")

  if (coloured)
    colorStr <- sprintf('mapping=ggplot2::aes(colour=%s, alpha=0.5, shape=%s),', v1, v1)
  else
    colorStr <- '' # No color selected.

  # Only convert the target to a factor when there is one. Emitting the
  # mutate unconditionally produced "dplyr::mutate(=as.factor())",
  # which does not parse.

  if (coloured)
    mutateStr <- paste0(' dplyr::mutate(', v1, '=as.factor(', v1, ')) %>%\n')
  else
    mutateStr <- ''

  plot.cmd <- paste0(dataset, ' %>%\n',
                     mutateStr,
                     ' GGally::ggpairs(columns=c(',
                     paste(varsi, collapse=','), '),\n',
                     if (coloured) paste0(' ', colorStr, "\n"),
                     ' diag=list(continuous="densityDiag",\n',
                     ' discrete="barDiag"),\n',
                     ' upper=list(continuous="cor",\n',
                     ' combo="box",\n',
                     ' discrete="ratio"),\n',
                     ' lower=list(continuous="points",\n',
                     ' combo="denstrip",\n',
                     ' discrete="facetbar"),\n',
                     ' legend=3)',
                     ' +\n ggplot2::theme(panel.grid.major=ggplot2::element_blank(), ',
                     'legend.position="bottom")',
                     # guide="none" replaces the deprecated guide=FALSE.
                     ' +\n ggplot2::scale_alpha_continuous(guide="none")',
                     ' +\n ggplot2::scale_fill_brewer(palette=rattlePalette)',
                     ' +\n ggplot2::scale_colour_brewer(palette=rattlePalette)')
  # When this next blank theme is included we get bad plots???? Some
  # problem with colour.
  #
  # ' panel.grid.minor=ggplot2::element_blank())')

  appendLog(Rtxt("Use GGally's ggpairs() to do the hard work."), plot.cmd)
  newPlot()
  eval(parse(text=sprintf("suppressMessages(print(%s))", plot.cmd)))
}

# --------------------------------------------------------------------------------
# /R/fancyRpartPlot.R:
# --------------------------------------------------------------------------------
# Rattle: A GUI for Data Mining in R
#
# Time-stamp:
#
# Copyright (c) 2009-2014 Togaware Pty Ltd
#
#' Plot rpart decision trees nicely.
#'
#' @param model an rpart object
#' @param main title for the plot
#' @param sub sub title for the plot (default is a Rattle string with
#' date, time and username)
#' @param caption used as the sub title when sub is not supplied
#' @param palettes a list of sequential palettes names as supported by
#' RColorBrewer::brewer.pal including Blues BuGn BuPu
#' GnBu Greens Greys Oranges OrRd PuBu PuBuGn PuRd Purples RdPu Reds
#' YlGn YlGnBu YlOrBr YlOrRd.
#' @param type passed to rpart.plot::prp as its type argument
#' @param ... additional arguments passed on to rpart.plot::prp
#
# This files is part of Rattle.
#
# Rattle is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.

fancyRpartPlot <- function(model,
                           main="",
                           sub,
                           caption,
                           palettes,
                           type=2,
                           ...)
{
  if (!inherits(model, "rpart"))
    stop("The model object must be an rpart object. ",
         "Instead we found: ", paste(class(model), collapse=", "), ".")

  # For new version of rpart.plot (20180710 v3.0.0).

  roundint <- ! is.null(model$model)

  # Migrate to replacing sub with caption in line with ggplot.

  if (missing(sub) && missing(caption))
    sub <- paste("Rattle",
                 format(Sys.time(), "%Y-%b-%d %H:%M:%S"),
                 Sys.info()["user"])
  else if (missing(sub))
    sub <- caption

  num.classes <- length(attr(model, "ylevels"))

  # Generate a colour palette, with a range of 5 (palsize) colours for
  # each of the 6 (numpals) palettes. The palette is collapsed into
  # one list. We index it according to the class. Keep to the lighter
  # end of the palette to ensure printing is okay otherwise the black
  # text is hard to read.

  default.palettes <- c("Greens", "Blues", "Oranges", "Purples", "Reds", "Greys")
  if (missing(palettes))
    palettes <- default.palettes

  # Top up a short (or empty) user list with the defaults in the
  # corresponding positions. seq_along() keeps this correct for a
  # zero-length vector.

  missed <- setdiff(seq_along(default.palettes), seq_along(palettes))
  palettes <- c(palettes, default.palettes[missed])

  numpals <- 6
  palsize <- 5
  pals <- unlist(lapply(palettes[seq_len(numpals)],
                        function(p) RColorBrewer::brewer.pal(9, p)[seq_len(palsize)]))

  # Extract the scores/percentages for each of the nodes for the
  # majority decision. The decisions are in column 1 of yval2 and the
  # percentages are in the final num.classes columns.

  # 121106 Need to handle regression as pointed out by Yana
  # Kane-Esrig, 26 October 2012.

  if (model$method == "class")
  {
    yval2per <- -(1:num.classes)-1

    # drop=FALSE keeps a matrix even for a tree with a single node.

    per <- apply(model$frame$yval2[, yval2per, drop=FALSE], 1,
                 function(x) x[1+x[1]])
  }
  else
  {
    # 130329 This is the deviance relative the the total deviance measured at
    # the root node. We use this to colour the strength of the node -
    # so more intense colour means less relative deviance.

    #per <- 1 - (model$frame$dev/model$frame$dev[1])

    # 130329 Perhaps instead we want to use the yval as the intensity
    # of the predicted value. Currently not handling negative values.

    per <- model$frame$yval/max(model$frame$yval)
  }

  # The conversion of a tree in CORElearn to an rpart tree results in these
  # being character, so ensure we have numerics.

  per <- as.numeric(per)

  # Calculate an index into the combined colour sequence. Once we go
  # above numpals * palsize (30) start over.

  if (model$method == "class")
  {
    shade <- trunc(pmin(1 + (per * palsize), palsize))
    raw.index <- palsize * (model$frame$yval - 1) + shade

    # Wrap into 1..30. A plain %% 30 maps the darkest shade of the 6th
    # class to 0, and a 0 subscript is silently dropped, shifting the
    # colours of all following nodes.

    col.index <- ((raw.index - 1) %% (numpals * palsize)) + 1
  }
  else
  {
    col.index <- round(per * (palsize-1)) + 1

    # Ensure the index is positive. Thanks to John Vorwald, 8 Dec
    # 2014. The bug can arise when model$frame$yval are all
    # negative. The error is:
    #
    # fancyRpartPlot(rtreeFit,main=paste('RPART:',cName))
    # Error in pals[col.index] : only 0's may be mixed with negative subscripts
    #
    # Also keep the index within the first palette so that a 0 or an
    # out of range value cannot drop or blank a node colour.

    col.index <- pmin(pmax(abs(col.index), 1), palsize)
  }

  # Determine the amount of extra information added to the nodes.

  extra <- if (model$method == "class") 104 else 101

  # Generate the plot and title.

  rpart.plot::prp(model, type=type, extra=extra,
                  box.col=pals[col.index],
                  nn=TRUE,
                  varlen=0, faclen=0,
                  shadow.col="grey",
                  fallen.leaves=TRUE,
                  branch.lty=3,
                  roundint=roundint,
                  main=main,
                  sub=sub,
                  ...)
}

# --------------------------------------------------------------------------------
# /R/ggVarImp.R:
# --------------------------------------------------------------------------------
# Generic: plot the variable importance of a model using ggplot2.

ggVarImp <- function(model, ...) UseMethod("ggVarImp")

# Build the variable importance bar chart shared by all methods.
#
# ds      data frame with at least the columns Variable and Importance.
# n       optional single number: keep only the n most important rows.
# title, label, caption  plot title, importance axis label and caption.
# log     if TRUE plot Importance on a log10 scale.
#
# Returns the ggplot object.

ggVarImpPlot <- function(ds,
                         n=NULL,
                         title="Variable Importance",
                         label="Relative Importance",
                         caption=genPlotTitleCmd(vector=TRUE),
                         log=FALSE)
{
  # Expect ds to contain at least the columns Variable and Importance.

  # Rank first and only then truncate, so that n selects the most
  # important rows rather than whichever rows happen to come first.

  ranked <- dplyr::arrange(ds, dplyr::desc(Importance))
  if (length(n) == 1L) ranked <- head(ranked, n)

  p <- ranked %>%
    dplyr::mutate(Variable=factor(Variable, levels=rev(unique(Variable)))) %>%
    ggplot2::ggplot(ggplot2::aes(x = Variable,
                                 y = Importance,
                                 fill = Variable)) +
    ggplot2::geom_bar(stat = "identity",
                      position = "identity",
                      width = 0.1) +
    ggplot2::labs(title = title,
                  y = label,
                  x = "",
                  caption = caption) +
    ggplot2::coord_flip() +
    ggplot2::theme(axis.ticks.x = ggplot2::element_blank(),
                   axis.text.x = ggplot2::element_blank(),
                   axis.title.x = ggplot2::element_blank(),
                   legend.position = "none")

  if (log)
    p <- p + ggplot2::scale_y_continuous(trans="log10")
  else
    p <- p + ggplot2::scale_y_continuous(labels=scales::comma)

  return(p)
}

# Variable importance for a randomForest model, one facet per measure.
# Extra arguments (for example n=) are passed on to ggVarImpPlot().

ggVarImp.randomForest <- function(model,
                                  title="Random Forest Variable Importance",
                                  ...)
{
  # By default randomForest() only returns the MeanDecreaseGini. With
  # importance=TRUE at model build time we also get
  # MeanDecreaseAccuracy and importance relative to the target levels.

  # Each measure is rescaled to 0..1 with the most important variable
  # at 1. title is passed by name: positionally it would bind to the
  # n argument of ggVarImpPlot().

  randomForest::importance(model) %>%
    data.frame() %>%
    dplyr::mutate(Variable=row.names(.)) %>%
    tidyr::gather(Measure, Importance, -Variable) %>%
    dplyr::group_by(Measure) %>%
    dplyr::mutate(Importance=(Importance-min(Importance))/(max(Importance)-min(Importance))) %>%
    ggVarImpPlot(title=title, ...) +
    ggplot2::facet_wrap(~ Measure)
}

# Variable importance for an rpart decision tree.

ggVarImp.rpart <- function(model,
                           title="Decision Tree Variable Importance",
                           ...)
{
  model$variable.importance %>%
    data.frame() %>%
    magrittr::set_names("Importance") %>%
    dplyr::mutate(Variable=row.names(.)) %>%
    # dplyr::arrange(desc(Importance)) %>%
    # dplyr::mutate(Variable=factor(Variable, levels=rev(unique(Variable)))) %>%
    ggVarImpPlot(title=title, ...)
}

# Variable importance for an rxDForest model (uses IncNodePurity).

ggVarImp.rxDForest <- function(model,
                               title="Big Data Random Forest Variable Importance",
                               ...)
{
  model$importance %>%
    data.frame() %>%
    dplyr::mutate(Variable=row.names(.)) %>%
    # dplyr::arrange(desc(IncNodePurity)) %>%
    # dplyr::mutate(Variable=factor(Variable, levels=rev(unique(Variable)))) %>%
    dplyr::rename(Importance=IncNodePurity) %>%
    ggVarImpPlot(title=title, ...)
}

# Variable importance for an xgboost booster (uses Gain).

ggVarImp.xgb.Booster <- function(model,
                                 feature_names=NULL,
                                 title="Extreme Gradient Boost Variable Importance",
                                 ...)
{
  # The model does not include the feature/colnames, so we need to
  # have an option to pass it in.

  xgboost::xgb.importance(feature_names=feature_names, model=model) %>%
    dplyr::rename(Variable=Feature, Importance=Gain) %>%
    dplyr::select(Variable, Importance) %>%
    ggVarImpPlot(title=title, ...)
}

# Variable importance for an xgboost model built through the formula
# interface: strip the wrapper class and dispatch again, using the
# stored dimnames as feature names unless the caller supplied some.

ggVarImp.xgb.formula <- function(model,
                                 feature_names=NULL,
                                 title="Extreme Gradient Boost Variable Importance",
                                 ...)
{
  class(model) <- setdiff(class(model), "xgb.formula")
  if (is.null(feature_names)) feature_names <- model$dimnames
  ggVarImp(model, feature_names=feature_names, title=title, ...)
}

# --------------------------------------------------------------------------------
# /R/loadLibs.R:
# --------------------------------------------------------------------------------
#' Load a list of libraries, reporting to the Rattle Log
#'
#' Only load the package if not already loaded. If already loaded then
#' we don't return the name from the function.
#'
#' @param l Vector of pairs, "package name" "used function".
#' @return returns list of packages that get loaded.
#' @rdname loadLibs
loadLibs <- function(l)
{
  # Walk the vector as (package, function) pairs. seq_len() makes an
  # empty or single element vector safe: seq(1, 0, 2), seq(2, 1, 2)
  # and 1:0 either fail or iterate over positions that do not exist.

  pairs <- seq_len(ceiling(length(l) / 2))
  lname <- l[2 * pairs - 1]
  lfun <- l[2 * pairs]
  libs <- NULL
  for (i in pairs)
  {
    appendLog(packageProvides(lname[i], lfun[i]), sprintf("library(%s)", lname[i]))
    if (!sprintf("package:%s", lname[i]) %in% search())
    {
      suppressPackageStartupMessages(library(lname[i], character.only=TRUE, warn.conflicts=FALSE, quietly=TRUE))
      libs <- c(libs, lname[i])
    }
  }
  return(libs)
}

# --------------------------------------------------------------------------------
# /R/loadTooltips.R:
# --------------------------------------------------------------------------------
# Read tooltips.xml and attach each tooltip to its widget.
#
# The file is looked for in the etc directory of the rattle package,
# falling back to the current directory when the package path cannot
# be determined.
#
# Returns FALSE (with a warning) when the XML package is unavailable,
# otherwise invisibly TRUE.

loadTooltips <- function()
{
  if (! packageIsAvailable("XML", "load GUI tooltips"))
  {
    warning("The XML package is not available. Tooltips will not be available.")
    return(FALSE)
  }

  result <- try(etc <- file.path(path.package(package="rattle")[1], "etc"),
                silent=TRUE)
  if (inherits(result, "try-error"))
    doc <- XML::xmlTreeParse("tooltips.xml", useInternalNodes=TRUE)
  else
    doc <- XML::xmlTreeParse(file.path(etc, "tooltips.xml"), useInternalNodes=TRUE)

  for (tt in XML::getNodeSet(doc, "//tooltip"))
  {
    # 100110 format the tooltip. blank lines are retained, but other
    # line breaks are ignored.

    tip <- gsub("XoX", "\\\n\\\n",
                gsub("\n *", " ",
                     gsub("\n *\n *", "XoX", XML::xmlValue(tt))))
    wd <- theWidget(XML::xmlGetAttr(tt, 'widget'))
    wd["tooltip-text"] <- Rtxt (tip) # 100408 Space after Rtxt is intentional.
  }

  invisible(TRUE)
}

# --------------------------------------------------------------------------------
# /R/log.R:
# --------------------------------------------------------------------------------
# Gnome R Data Miner: GNOME interface to R for Data Mining
#
# Time-stamp: <2017-09-10 09:32:39 Graham Williams>
#
# Implement LOG functionality.
#
# Copyright (c) 2009 Togaware Pty Ltd
#
# This files is part of Rattle.
#
# Rattle is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.

########################################################################
# CALLBACKS

# Enable the rename entry only while the rename check button is active.

on_log_export_rename_checkbutton_toggled <- function(button)
{
  theWidget("log_export_rename_entry")$setSensitive(button$getActive())
}

# Set up the log textview and write the introductory script header.

initiateLog <- function()
{
  # 100407 Change the font to monospace, like all other textviews.

  if (! isJapanese())
    theWidget("log_textview")$modifyFont(RGtk2::pangoFontDescriptionFromString(crv$textview.font))

  if (! is.null(crv$log.intro))
    appendTextview("log_textview",
                   paste0("#", paste0(rep("=", 71), collapse=""),
                          "\n\n", crv$log.intro),
                   tvsep=FALSE)

  # NOTE(review): the strings wrapped in Rtxt() below contain typos
  # ("Th script", "xporting", "exmample"). They are left untouched
  # here because they presumably act as translation lookup keys --
  # correct them together with the message catalogues.

  startLog(paste(sprintf(Rtxt("%s version %s user '%s'"),
                         crv$appname, crv$version, Sys.info()["user"]),
                 #LOG_LICENSE
                 #sprintf("# Started %s by %s\n\n", Sys.time(), Sys.info()["user"]),
                 "\n\n",
                 Rtxt("# This log captures interactions with Rattle as an R script.",
                      "\n\n# For repeatability, export this activity log to a",
                      "\n# file, like 'model.R' using the Export button or",
                      "\n# through the Tools menu. Th script can then serve as a",
                      "\n# starting point for developing your own scripts.",
                      "\n# After xporting to a file called 'model.R', for exmample,",
                      "\n# you can type into a new R Console the command",
                      "\n# \"source('model.R')\" and so repeat all actions. Generally,",
                      "\n# you will want to edit the file to suit your own needs.",
                      "\n# You can also edit this log in place to record additional",
                      "\n# information before exporting the script.",
                      "\n",
                      "\n# Note that saving/loading projects retains this log."),
                 "\n",
                 '\n# We begin most scripts by loading the required packages.',
                 '\n# Here are some initial packages to load and others will be',
                 '\n# identified as we proceed through the script. When writing',
                 '\n# our own scripts we often collect together the library',
                 '\n# commands at the beginning of the script here.\n\n',
                 crv$library.command,
                 ' # Access the weather dataset and utilities.',
                 '\nlibrary(magrittr) # Utilise %>% and %<>% pipeline operators.',
                 "\n\n",
                 Rtxt("# This log generally records the process of building a model.",
                      "\n# However, with very little effort the log can also be used",
                      "\n# to score a new dataset. The logical variable 'building'",
                      "\n# is used to toggle between generating transformations,",
                      "\n# when building a model and using the transformations,",
                      "\n# when scoring a dataset."),
                 "\n\nbuilding <- TRUE",
                 "\nscoring <- ! building",
                 # Removed to avoid loading librarys or suggesting such
                 # Moving to using namespace :: in the script.
                 #ifelse(packageIsAvailable("colorspace"),
                 # paste("\n",
                 # Rtxt("# The colorspace package is used to generate",
                 # "the colours used in plots,",
                 # "if available."),
                 # "\n\n",
                 # "library(colorspace)", sep=""), ""),
                 "\n\n",
                 Rtxt("# A pre-defined value is used to reset the random seed",
                      "\n# so that results are repeatable."),
                 "\n\ncrv$seed <- ", crv$seed,
                 sep=""))

}

# Start a new section in the log.

startLog <- function(msg=NULL)
{
  # Output a suitable separator to the log textview, and if there is
  # an optional MSG, display that message, as an introduction to this
  # section.

  if (is.null(crv$rattleGUI)) return()

  separator <- paste("\n\n#", paste(rep("=", 71), collapse=""), sep="")

  if (not.null(crv$show.timestamp) && crv$show.timestamp)
    separator <- paste(separator,
                       "\n# ", crv$appname, " ", Rtxt("timestamp:"), " ",
                       Sys.time(), " ", version$platform, sep="")

  appendLog(separator, no.start=TRUE)

  if (not.null(msg))
    appendLog(paste(sep="", crv$start.log.comment, msg), no.start=TRUE)
}

# Append text to the end of the log textview.
#
# start     the comment text (or, with no.start=TRUE, the first piece
#           of raw text).
# cont,...  further text, typically the command being logged.
# sep       separator used only when no.start=TRUE.
# no.start  if TRUE the pieces are pasted as is, without the log
#           comment prefix and suffix.

appendLog <- function(start, cont=NULL, ..., sep=" ", no.start=FALSE)
{
  # 100330 cont is used to identify whether there is more than a
  # single string to print. If not, then don't include the
  # crv$end.log.comment otherwise there is too much white space in the
  # log.

  if (is.null(crv$rattleGUI)) return()

  if (no.start)
    msg <- paste(sep=sep, start, cont, ...)
  else if (is.null(cont))
    msg <- paste(sep="", crv$start.log.comment, start)
  else
    msg <- paste(sep="", crv$start.log.comment, start, crv$end.log.comment, cont, ...)
  if (length(msg) == 0) msg <-""

  # 150712 Remove any Rtxt(...), leaving just ... Every occurrence is
  # stripped, not only the first one in each string.

  msg <- stringr::str_replace_all(msg, 'Rtxt\\(([^\\)]*)\\)', '\\1')

  # Always place text at the end, irrespective of where the cursor is.

  log.buf <- theWidget("log_textview")$getBuffer()
  location <- log.buf$getEndIter()$iter

  log.buf$insert(location, msg)
}

# Ask for a file name and write the log textview content to it,
# optionally without comments and with the crs$ prefix renamed.

exportLogTab <- function()
{
  # Obtain filename to the LOG textview to.

  dialog <- RGtk2::gtkFileChooserDialog(Rtxt("Export Log"), NULL, "save",
                                        "gtk-cancel", RGtk2::GtkResponseType["cancel"],
                                        "gtk-save", RGtk2::GtkResponseType["accept"])
  dialog$setDoOverwriteConfirmation(TRUE)

  if(not.null(crs$dataname))
    dialog$setCurrentName(sprintf("%s_script.R", get.stem(crs$dataname)))

  ff <- RGtk2::gtkFileFilterNew()
  ff$setName(Rtxt("R Files"))
  ff$addPattern("*.R")
  dialog$addFilter(ff)

  ff <- RGtk2::gtkFileFilterNew()
  ff$setName(Rtxt("All Files"))
  ff$addPattern("*")
  dialog$addFilter(ff)

  accepted <- dialog$run() == RGtk2::GtkResponseType["accept"]
  if (accepted) save.name <- dialog$getFilename()
  dialog$destroy()
  if (! accepted) return()

  if (get.extension(save.name) != "R")
    save.name <- sprintf("%s.R", save.name)

  save.text <- getTextviewContent("log_textview")
  if (!theWidget("log_export_comments_checkbutton")$getActive())
    save.text <- gsub("\n\n+", "\n", gsub("#[^\n]*\n", "", save.text))
  if (theWidget("log_export_rename_checkbutton")$getActive())
  {
    nm <- theWidget("log_export_rename_entry")$getText()

    # fixed=TRUE so that the user supplied text is inserted literally:
    # in a regular expression replacement a backslash in nm would be
    # interpreted as an escape or back reference.

    save.text <- gsub("crs$", nm, save.text, fixed=TRUE)
  }
  write(save.text, save.name)

  setStatusBar(sprintf(Rtxt("The log has been exported to '%s'."), save.name))
}

# Log comment stating which package provides which function.

packageProvides <- function(pkg, fun)
{
  return(sprintf(Rtxt("The '%s' package provides the '%s' function."), pkg, fun))
}

# --------------------------------------------------------------------------------
# /R/normVarNames.R:
# --------------------------------------------------------------------------------
# Normalise variable names using janitor::make_clean_names().
# The sep argument is accepted but not used by this implementation.

normVarNames <- function(vars, sep="_")
{
  return(janitor::make_clean_names(vars, numerals="right"))
}

# --------------------------------------------------------------------------------
# /R/psfchart.R:
# --------------------------------------------------------------------------------
# Generate a PSF chart

# Gnome R Data Miner: GNOME interface to R for Data Mining
#
# Time-stamp: <2014-09-06 18:51:58 gjw>
#
# Implement evaluate functionality.
#
# Copyright (c) 2009-2013 Togaware Pty Ltd
#
# This files is part of Rattle.
#
# Rattle is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.

psfchart <- function(predicted,
                     actual,
                     bins=100, # Number of bins to use for the plot.
                     threshold=0.5, # The decision threshold.
30 | splits=NULL, # E.g., c(0.2, 0.8) 31 | split.lables=c("Low", "Medium", "High"), 32 | tic.size=0.2, # proportional gap between axis ticks. 33 | gg=TRUE, 34 | verbose=FALSE) 35 | { 36 | dosplits <- ! is.null(splits) 37 | 38 | if (is.factor(actual)) actual <- as.numeric(actual)-1 39 | 40 | doAggregate <- function() 41 | { 42 | 43 | # Bin the scores into "bins" bins and store the bins into 44 | # variable "bin". If there are more bins specified than 45 | # scores, then simply rank the scores. In the end we have for 46 | # each score a "rank", which is an integer in 1:bins or 47 | # 1:length(actual). 48 | 49 | if (length(actual) >= bins) 50 | { 51 | bin <- as.numeric(binning(predicted, bins, method="quantile", 52 | ordered=FALSE, labels=FALSE)) 53 | } 54 | else 55 | { 56 | bin <- reshape::rescaler(predicted, "rank") 57 | } 58 | 59 | # Check whether the pr and target agree. 60 | 61 | agree <- as.numeric(as.numeric(predicted > threshold) == actual) 62 | 63 | # Get the size of each bin (should all be the same +/- 64 | # 1). Also get a count of the positives in each bin, assumming 65 | # a 0/1 value for actual (so sum will work) and the accuracy 66 | # of each bin. 67 | 68 | agg <- aggregate(actual, list(bin), length) 69 | names(agg) <- c("bin", "size") 70 | agg$pos <- aggregate(actual, list(bin), sum)[[2]] 71 | agg$acc <- aggregate(agree, list(bin), sum)[[2]] 72 | agg$max <- aggregate(predicted, list(bin), max)[[2]] 73 | agg$tdiff <- agg$max - threshold 74 | 75 | # Rescale bins to be between 0 and 1 so AUC is more sensible. 76 | 77 | agg$rbin <- agg$bin/bins 78 | 79 | # Calulate the proportion accuracy 80 | 81 | agg$pacc <- agg$acc/agg$size 82 | 83 | return(agg) 84 | } 85 | 86 | # Determine the model's decision based on the score and threshold 87 | # and save that as pr. 
88 | 89 | pr <- as.numeric(predicted > threshold) 90 | 91 | tp <- round(100 * sum(pr==1 & actual==1)/length(actual)) 92 | fp <- round(100 * sum(pr==1 & actual==0)/length(actual)) 93 | tn <- round(100 * sum(pr==0 & actual==0)/length(actual)) 94 | fn <- round(100 * sum(pr==0 & actual==1)/length(actual)) 95 | 96 | if (verbose) 97 | cat("\nData Summary:", 98 | sprintf(" Obs: %s\n", 99 | format(length(actual), big.mark=",")), 100 | sprintf(" Targets: %s; Rate: %0.2f%%\n", 101 | format(sum(actual==1), big.mark=","), 102 | 100*sum(actual==1)/length(actual)), 103 | sprintf(" Model TP: %9s FN: %9s\n", 104 | format(sum(pr==1 & actual==1), big.mark=","), 105 | format(sum(pr==0 & actual==1), big.mark=",")), 106 | sprintf(" FP: %9s TN: %9s\n", 107 | format(sum(pr==1 & actual==0), big.mark=","), 108 | format(sum(pr==0 & actual==0), big.mark=","))) 109 | 110 | agg <- doAggregate() 111 | 112 | if (gg) 113 | { 114 | if (dosplits) 115 | classes <- data.frame(x=c(splits[1]/2, 116 | (splits[1]+splits[2])/2, 117 | (1+splits[2])/2), 118 | lbl=split.labels) 119 | 120 | quads <- data.frame(x=c(0, 1, 0, 1), hj=c(0, 1, 0, 1), 121 | y=c(0, 0, 1, 1), vj=c(0, 0, 1, 1), 122 | lbl=c(sprintf("True Negatives (%s%%)", tn), 123 | sprintf("True Positives (%s%%)", tp), 124 | sprintf("False Negatives (%s%%)", fn), 125 | sprintf("False Positives (%s%%)", fp))) 126 | 127 | xthresh <- agg$rbin[which(abs(agg$tdiff) == min(abs(agg$tdiff)))][1] 128 | 129 | tics <- seq(0, 1, tic.size) 130 | ord <- order(predicted) 131 | scores <- data.frame(x=tics, 132 | score=round(predicted[ord][c(1, 133 | round(tics*length(ord)))], 2)) 134 | 135 | p <- ggplot2::ggplot(agg, ggplot2::aes(rbin, pacc)) 136 | p <- p + ggplot2::geom_line() 137 | p <- p + ggplot2::ggtitle("Proportional Score Function (PSF) Curve") 138 | p <- p + ggplot2::scale_y_continuous("% Accuracy", limits=c(0,1), 139 | labels=100*tics, breaks=tics) 140 | p <- p + ggplot2::scale_x_continuous(paste("Proportion of Cases", 141 | "\nSorted by Increasing Risk 
Scores"), 142 | breaks=tics) 143 | p <- p + ggplot2::geom_text(data=quads, 144 | ggplot2::aes(x=x, y=y, label=lbl, hjust=hj, vjust=vj, size=5)) 145 | p <- p + ggplot2::geom_text(x=xthresh, ggplot2::aes(y=0, size=5), 146 | label=sprintf("Threshold (%s)", threshold), 147 | vjust=2, hjust=1.1) 148 | p <- p + ggplot2::geom_vline(xintercept=xthresh) 149 | p <- p + ggplot2::geom_text(data=scores, ggplot2::aes(x=x, y=1, label=score, size=5), 150 | vjust=-0.5) 151 | p <- p + ggplot2::theme(legend.position="none") 152 | if (dosplits) 153 | { 154 | p <- p + ggplot2::geom_text(data=classes, ggplot2::aes(x=x, y=0.2, label=lbl)) 155 | p <- p + ggplot2::geom_vline(xintercept=low, linetype="twodash", color="grey") 156 | p <- p + ggplot2::geom_vline(xintercept=high, linetype="twodash", color="grey") 157 | } 158 | return(p) 159 | } 160 | else 161 | { 162 | plot(agg$rbin, agg$pacc, type="l", xlim=c(0,1), ylim=c(0,1), 163 | xlab="Proportion of Cases\nSorted by Risk Score", ylab="% Accuracy") 164 | title(main="PSF\n") 165 | 166 | abline(v=0.25, lty=3) 167 | abline(v=0.75, lty=3) 168 | text(0.08, 0.08, "Low") 169 | text(0.5, 0.08, "Medium") 170 | text(0.92, 0.08, "High") 171 | 172 | # Add annotations for the sinlge plot. 
173 | 174 | abline(v=agg$rbin[which(abs(agg$tdiff) == min(abs(agg$tdiff)))], lty=1) 175 | xthresh <- agg$rbin[which(abs(agg$tdiff) == min(abs(agg$tdiff)))] 176 | text(xthresh, 0.15, "Threshold", pos=2) 177 | text(xthresh, 0.1, threshold, pos=2) 178 | 179 | # TODO NEED TO PROGRAMMATICALLY DETERMINE THE LABELS FROM MIN SCORE TO MAX SCORE 180 | ord <- order(predicted) 181 | scores <- predicted[ord] 182 | axis(3, at=seq(0, 1, 0.2), padj=1.5, lwd.ticks=0, 183 | labels=round(scores[c(1, round(seq(0, 1, 0.2)*length(scores)))], 2)) 184 | 185 | text(0.92, 1, "False Positives") 186 | text(0.085, 1, "False Negatives") 187 | text(0.92, 0, "True Positives") 188 | text(0.08, 0, "True Negatives") 189 | 190 | opar <- par(xpd=TRUE) 191 | text(0.5, 1.08,"Scores") 192 | par(opar) 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /R/rattleInfo.R: -------------------------------------------------------------------------------- 1 | rattleInfo <- function(all.dependencies=FALSE, 2 | include.not.installed=FALSE, 3 | include.not.available=FALSE, 4 | include.libpath=FALSE) 5 | { 6 | 7 | # TODO: Add in support for BIOC 8 | 9 | cran.repos <- "https://cran.rstudio.org" 10 | bioc.repos <- "" 11 | 12 | # Using installed.packages() can be a "very slow way to find 13 | # information on one or a small number of packages" (Brian Riply 14 | # 2012). This is stated in the man page and I am very aware of 15 | # it. Brian also note: "In addition, many of you are using it to 16 | # find out if a package is installed, when actually you want to know 17 | # if it is usable (it might for example be installed for a different 18 | # architecture or require a later version of R), for which you need 19 | # to use require()." This was particularly relevant within 20 | # packageIsAvailable() and there I use a better way of checking for 21 | # an installed package. Here I think it might still remain 22 | # appropriate to use installed.packages(). 
23 | 24 | iv <- utils::installed.packages() 25 | av <- available.packages(contriburl=contrib.url(cran.repos)) 26 | have.av <- nrow(av) != 0 27 | # not a cran repos bv <- available.packages(contriburl=contrib.url(cran.repos)) 28 | 29 | riv <- iv["rattle", "Version"] 30 | if (have.av) rav <- av["rattle", "Version"] 31 | 32 | cat(sprintf("Rattle: version %s", riv)) 33 | if (have.av && compareVersion(riv, rav) != 1) cat(sprintf(" CRAN %s", rav)) 34 | cat("\n") 35 | 36 | # Record the packages that can be upgraded 37 | 38 | up <- if (have.av && compareVersion(rav, riv) == 1) "rattle" else NULL 39 | 40 | cat(sprintf("%s\n", sub(" version", ": version", version$version.string))) 41 | 42 | cat("\n") 43 | si <- Sys.info() 44 | for (i in seq_along(si)) 45 | cat(sprintf("%s%s: %s\n", toupper(substr(names(si)[i], 1, 1)), 46 | substring(names(si)[i], 2), si[i])) 47 | 48 | cat("\nInstalled Dependencies\n") 49 | 50 | deps2vec <- function(deps) 51 | { 52 | if (is.na(deps)) return(NULL) 53 | strsplit(gsub("\\n", " ", gsub(' ?\\([^\\)]+\\)', '', deps)), ", ?")[[1]] 54 | } 55 | 56 | if (all.dependencies) 57 | { 58 | if (! "pkgDepTools" %in% rownames(iv)) 59 | { 60 | source("https://bioconductor.org/biocLite.R") 61 | pkg <- "pkgDepTools" 62 | biocLite("pkgDepTools") 63 | } 64 | if (! "Rgraphviz" %in% rownames(iv)) 65 | { 66 | source("https://bioconductor.org/biocLite.R") 67 | biocLite("Rgraphviz") 68 | } 69 | 70 | # 150711 There does not seem to be a way to get both suggest and 71 | # depend links using pkgDepTools::makeDepGraph which I used to 72 | # deploy here. It's either one or the other. Rattle only has 73 | # suggests links. So I want to get what Rattle suggests and then 74 | # find all the depends in cran.deps as the packages that are 75 | # reported on. Instead of going through the repository and build a 76 | # dependency graph, we've already dounloaded the available package 77 | # information so use it here instead. 
78 | 79 | pkg.deps <- function(pkg, pkgs, av) 80 | { 81 | if (pkg %in% pkgs) return(pkgs) 82 | 83 | if (! pkg %in% rownames(av)) return(c(pkg, pkgs)) 84 | 85 | for (p in union(deps2vec(av[pkg, "Suggests"]), deps2vec(av[pkg, "Depends"]))) 86 | { 87 | pkgs <- pkg.deps(p, union(pkg, pkgs), av) 88 | } 89 | return(union(pkg, pkgs)) 90 | } 91 | 92 | if (have.av) 93 | deps <- pkg.deps("rattle", NULL, av) 94 | else 95 | deps <- pkg.deps("rattle", NULL, iv) 96 | } 97 | else 98 | deps <- union(deps2vec(iv["rattle", "Depends"]), deps2vec(iv["rattle", "Suggests"])) 99 | 100 | for (p in sort(setdiff(deps, 'rattle'))) 101 | { 102 | if (have.av && ! p %in% rownames(av)) 103 | { 104 | if (include.not.available) cat(sprintf("%s: not available\n", p)) 105 | } 106 | else if (! p %in% rownames(iv)) 107 | { 108 | if (include.not.installed) cat(sprintf("%s: not installed\n", p)) 109 | } 110 | else 111 | cat(sprintf("%s: version %s%s%s%s", p, iv[p,"Version"], 112 | ifelse(have.av && compareVersion(av[p,"Version"], iv[p,"Version"]) == 1, 113 | { 114 | up <- c(up, p); 115 | sprintf(" upgrade available %s", av[p,"Version"]) 116 | }, 117 | ""), 118 | ifelse(include.libpath, paste("\t", iv[p,"LibPath"]), ""), 119 | "\n")) 120 | } 121 | 122 | cat("\nThat was", 123 | if (include.not.available) 124 | length(deps) 125 | else 126 | sum(sapply(deps, function(p) p %in% 127 | if (have.av && include.not.installed) rownames(av) else rownames(iv))), 128 | "packages.\n") 129 | 130 | if (! 
is.null(up)) 131 | { 132 | cat(sprintf(paste('\nUpdate the packages with either', 133 | 'of the following commands:\n\n ', 134 | '> install.packages(c("%s"))\n\n ', 135 | '> install.packages(rattleInfo(%s%s%s%s%s%s%s))\n\n'), 136 | paste(strwrap(paste(up, collapse='", "'), 137 | width=crv$log_width, exdent=23), collapse="\n"), 138 | ifelse(all.dependencies, "all.dependencies=TRUE", ""), 139 | ifelse(all.dependencies && 140 | (include.not.installed || 141 | include.not.available || 142 | include.libpath), ", ", ""), 143 | ifelse(include.not.installed, "include.not.installed=TRUE", ""), 144 | ifelse(include.not.installed && 145 | (include.not.available || 146 | include.libpath), ", ", ""), 147 | ifelse(include.not.available, "include.not.available=TRUE", ""), 148 | ifelse(include.not.available && 149 | include.libpath, ", ", ""), 150 | ifelse(include.libpath, "include.libpath=TRUE", ""))) 151 | if (isWindows() && "rattle" %in% up) 152 | cat("Detach rattle (and other attached packages) before updating:\n\n ", 153 | '> detach("rattle")\n\n') 154 | cat("Alternatively update all installed packages:\n\n ", 155 | '> update.packages()\n\n') 156 | 157 | } 158 | 159 | invisible(up) 160 | 161 | } 162 | -------------------------------------------------------------------------------- /R/report.R: -------------------------------------------------------------------------------- 1 | # Gnome R Data Miner: GNOME interface to R for Data Mining 2 | # 3 | # Time-stamp: <2018-08-15 20:18:41 Graham.Williams@togaware.com> 4 | # 5 | # Reporting support 6 | # 7 | # Copyright (c) 2009 Togaware Pty Ltd 8 | # 9 | # This files is part of Rattle. 10 | # 11 | # Rattle is free software: you can redistribute it and/or modify it 12 | # under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 2 of the License, or 14 | # (at your option) any later version. 
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.

# TODO
# 100307 Consider moving to using reporttools and using Sweave instead

# GUI callback for the Report toolbar button: show a busy cursor while
# the report is generated and always restore it afterwards.

on_report_toolbutton_clicked <- function(action, window)
{
  # Wrap the actual call with a "try" so that the watch cursor turns
  # off even on error.

  setStatusBar("Generating report.")
  set.cursor("watch")
  try(dispatchReportButton())
  set.cursor()
}

# Generate the report appropriate to the currently selected notebook
# tab. Returns FALSE when no report could be generated (no dataset, the
# user declined, odfWeave is unavailable, or the tab is not supported).

dispatchReportButton <- function()
{
  # Prerequisites: Can not report on data if there is no dataset.

  if (noDatasetLoaded()) return(FALSE)

  if (! questionDialog("The Report button is very experimental.",
                       "Please report issues and updates to",
                       "support@togaware.com.",
                       "\n\nKnown issues:",
                       "\n\n\tAlways saves to the same fixed file",
                       "- need chooser.",
                       "\n\tA plot is displayed on screen - need to suppress.",
                       "\n\tToo much generated to the console - how remove?",
                       "\n\nOtherwise it is safe to use!",
                       "\n\nDo you wish to continue?"))
    return(FALSE)

  startLog("GENERATE A REPORT")

  if (! packageIsAvailable("odfWeave", "generate a report")) return(FALSE)
  lib.cmd <- "library(odfWeave, quietly=TRUE)"
  appendLog("The odfWeave package processes ODT document templates.", lib.cmd)
  eval(parse(text=lib.cmd))

  # Check which tab of the notebook is active and dispatch to the
  # appropriate execute action.

  ct <- getCurrentPageLabel(crv$NOTEBOOK)

  if (ct == crv$NOTEBOOK.DATA.NAME ||
      ct == crv$NOTEBOOK.EXPLORE.NAME)
  {
    # For the DATA or EXPLORE tabs generate a dataset summary.

    reportDataTab()
  }
  else if (ct == crv$NOTEBOOK.MODEL.NAME )
  {
    if (! is.null(crs$rpart))
      reportTreeModel(crs$rpart)
    else
    {
      infoDialog("Report functionality is only available for the Tree",
                 ct, " and no Tree model found.")
      return(FALSE)
    }
  }
  else

  {
    infoDialog("No report functionality is available for the",
               ct, "tab as yet. Nothing done.")
    return(FALSE)
  }
}

#-----------------------------------------------------------------------

# Weave the data summary ODT template into data_summary_rattle.odt in
# the working directory and open the result with oowriter.

reportDataTab <- function()
{
  if (file.exists("../odf/data_summary.odt"))
  {
    summary <- "../odf/data_summary.odt" # For Testing
    warning(Rtxt("Rattle Report is using local template ../odf"), immediate.=TRUE)
  }
  else
    summary <- system.file("odt", "data_summary.odt", package="rattle")

  ofile <- paste(getwd(), "data_summary_rattle.odt", sep="/")

  odf.cmd <- sprintf(paste('odfWeave("%s",',
                           '\n "%s",',
                           '\n control = odfWeaveControl(verbose = FALSE))'),
                     summary, ofile)

  appendLog(Rtxt("Generate a data report."), odf.cmd)

  eval(parse(text=odf.cmd))

  setStatusBar(sprintf("Report written to %s.", ofile))

  # Quote the path so a working directory containing spaces or shell
  # metacharacters is passed to the viewer as a single argument.

  system(paste("oowriter", shQuote(ofile)), wait = FALSE)
}


#-----------------------------------------------------------------------

# Prepare a report for an rpart tree model. The weave step itself is
# currently disabled (see the commented-out call below), so this only
# resolves the template and reports the intended output file.

reportTreeModel <- function(model)
{
  # NOTE(review): the model is published to the global environment,
  # presumably so that code chunks inside the ODT template can refer to
  # `model` - confirm before changing.

  model <<- model
  if (file.exists("../odf/model_rpart_summary.odt"))
  {
    summary <- "../odf/model_rpart_summary.odt" # For Testing
    warning("Rattle Report is using local template ../odf", immediate.=TRUE)
  }
  else
    # Same template name as the local testing copy above (it was
    # misspelt "mode_rpart_summary.odt" here).
    summary <- system.file("odt", "model_rpart_summary.odt", package="rattle")

  ofile <- paste(getwd(), "model_rpart_summary_rattle.odt", sep="/")

  # odfWeave::odfWeave(summary, ofile, control=odfWeave::odfWeaveControl(verbose=FALSE))

  if (! is.null(crv$rattleGUI)) setStatusBar(sprintf("Report written to %s.", ofile))
}

--------------------------------------------------------------------------------
/R/rocChart.R:
--------------------------------------------------------------------------------
# Build a ggplot2 ROC curve for the scores `pr` against the true classes
# `target`. The area under the curve is annotated on the plot and is also
# attached to the returned plot object as attribute "auc".

rocChart <- function(pr, target)
{
  # Build the ROCR prediction object once and reuse it for both
  # performance measures.

  pred <- ROCR::prediction(pr, target)

  # Calculate the true positive and the false
  # positive rates.

  rates <- ROCR::performance(pred, "tpr", "fpr")

  # Calculate the AUC.

  auc <- attr(ROCR::performance(pred, "auc"), "y.values")[[1]]

  # Construct the plot.

  curve <- data.frame(tpr=attr(rates, "y.values")[[1]],
                      fpr=attr(rates, "x.values")[[1]])

  pl <- ggplot2::ggplot(curve, ggplot2::aes(fpr, tpr)) +
    ggplot2::geom_line() +
    ggplot2::annotate("text", x=0.875, y=0.125, vjust=0,
                      label=paste("AUC =", round(100*auc, 2)),
                      family="xkcd") +
    ggplot2::xlab("False Positive Rate (1-Specificity)") +
    ggplot2::ylab("True Positive Rate (Sensitivity)")

  # Return the plot object.

  attr(pl, "auc") <- auc
  return(pl)
}

--------------------------------------------------------------------------------
/R/textminer.R:
--------------------------------------------------------------------------------
# R Data Scientist: GNOME interface to R for Data Science
#
# Time-stamp: <2017-09-10 10:23:54 Graham Williams>
#
# 080921 TEXT MINING DATA
#
# Copyright (c) 2009-2017 Togaware Pty Ltd
#
# This file is part of Rattle.
10 | # 11 | # Rattle is free software: you can redistribute it and/or modify it 12 | # under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 2 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # Rattle is distributed in the hope that it will be useful, but 17 | # WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 | # General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with Rattle. If not, see . 23 | # 24 | ######################################################################## 25 | # 26 | # First some notes: 27 | # 28 | # 29 | 30 | ## > show(corpus) 31 | ## A text document collection with 5 text documents 32 | 33 | ## > summary(corpus) 34 | ## A text document collection with 5 text documents 35 | 36 | ## > inspect(corpus[1]) 37 | 38 | ## tdm <- TermDocMatrix(corpus) 39 | ## findFreqTerms(tdm, 5, Inf) 40 | ## findAssocs(tdm, "ads", 0.97) 41 | 42 | ## ## 43 | ## ## Add in the target 44 | ## ## 45 | 46 | ## target <- c(1, 0, 0, 1, 0) 47 | ## crs$dataset <- as.data.frame(cbind(tdm@.Data, target)) 48 | ## set.seed(123) 49 | ## crs$train <- sample(nrow(crs$dataset), 4) 50 | 51 | ## ## 52 | ## ## Ignore 1 (15th), 61 (_is_), 238 (30%) or get error, probably 53 | ## ## because of their names. 
## ##

## crs$rpart <- rpart(target ~ .,
##                    data=crs$dataset[crs$train,c(2:60,62:237,239:285)],
##                    method="class")

## crs$rf <- randomForest(as.factor(target) ~ .,
##                        data=crs$dataset[crs$train,c(2:60,62:237,239:285)],
##                        importance=TRUE, na.action=na.omit)


## crs$glm <- glm(target ~ .,
##                data=crs$dataset[crs$train,c(2:60,62:237,239:285)],
##                family=binomial(logit))

## ##
## ## The others dont yet work:
## ##


## crs$ada <- ada(target ~ ., data=crs$dataset[crs$train,c(2:60,62:237,239:285)])

## crs$ksvm <- ksvm(as.factor(target) ~ .,
##                  data=crs$dataset[crs$train,c(2:60,62:237,239:285)],
##                  prob.model=TRUE)

# Load the documents found in the folder chosen in the GUI as a tm
# corpus, apply the selected transformations, and store the resulting
# document-by-term count table in crs$dataset (adding a TARGET column
# from ".target.csv" when that file exists). Each step is built as R
# source text so that it can be logged before it is evaluated.
#
# Returns TRUE on success and FALSE if a required package is unavailable
# or the number of targets does not match the number of documents.

executeDataCorpus <- function()
{
  # 080921 Load all documents in the specified corpus as a document
  # corpus except target.csv, if there is one. Load .target.csv if
  # there is one as the target for each document in the corpus. The
  # .target.csv file must have two columns, comma separated. The first
  # row should name the columns, but we don't actually use the column
  # names here. The first column is the document id and must be the
  # filename without its extension. The second column is the
  # classification, for example 0 or 1. I use the name ".target.csv"
  # so that the corpus loader will ignore it as a hidden file.

  # 130310 For now, each time we Execute, reload the dataset. Effect
  # this with the following:

  crs$dataset <- NULL
  theWidget("select_treeview")$getModel()$clear()

  # Obtain interface information.

  location <- theWidget("data_corpus_location_filechooserbutton")$getFilename()
  strip <- theWidget("data_corpus_strip_checkbutton")$getActive()
  lcase <- theWidget("data_corpus_lowercase_checkbutton")$getActive()
  stopw <- theWidget("data_corpus_stopwords_checkbutton")$getActive()
  stemw <- theWidget("data_corpus_stem_checkbutton")$getActive()

  # The location is spliced into R source text below, where a backslash
  # would start an escape sequence. Use forward slashes in every
  # generated command (not only the corpus one) so that Windows style
  # paths still parse.

  location.src <- gsub("\\\\", "/", location)

  # Start the log for this task.

  startLog("LOAD A CORPUS")

  # Ensure the package is available.

  lib.cmd <- "library(tm, quietly=TRUE)"
  if (! packageIsAvailable("tm", "text mining")) return(FALSE)
  appendLog("Use the tm package to support text mining.", lib.cmd)
  eval(parse(text=lib.cmd))

  # This seems to be available somewhere? library(RStem)

  # Load the document corpus.

  corpus.cmd <- sprintf('my.corpus <- Corpus(DirSource("%s"))',
                        location.src)
  appendLog("Load the document corpus.", corpus.cmd)
  setStatusBar(Rtxt("Loading corpus from the documents found in"), location, "...")
  eval(parse(text=corpus.cmd))

  # Process the documents.

  map.cmd <- ""

  if (strip)
    map.cmd <- sprintf("%s\nmy.corpus <- tm_map(my.corpus, stripWhitespace)", map.cmd)
  if (lcase)
    map.cmd <- sprintf("%s\nmy.corpus <- tm_map(my.corpus, content_transformer(tolower))", map.cmd)
  if (stopw)
    map.cmd <- sprintf(paste("%s\nmy.corpus <- tm_map(my.corpus,",
                             'removeWords, stopwords("english"))'), map.cmd)
  if (stemw)
  {
    lib.cmd <- "library(SnowballC, quietly=TRUE)"
    if (! packageIsAvailable("SnowballC", "word stemming")) return(FALSE)
    appendLog(packageProvides("SnowballC", "stemDocument"), lib.cmd)
    eval(parse(text=lib.cmd))

    map.cmd <- sprintf("%s\nmy.corpus <- tm_map(my.corpus, stemDocument)", map.cmd)
  }


  # 111020 For now, always remove punctuation and numbers.

  map.cmd <- sprintf("%s\nmy.corpus <- tm_map(my.corpus, removePunctuation)", map.cmd)
  map.cmd <- sprintf("%s\nmy.corpus <- tm_map(my.corpus, removeNumbers)", map.cmd)

  # 111020 TODO Update and include some more information.

  ## Dictionary(TermDocumentMatrix(my.corpus))

  ## tdm <- TermDocumentMatrix(my.corpus,
  ##                           control = list(removePunctuation = TRUE,
  ##                                          removeNumbers = TRUE,
  ##                                          stopwords = TRUE))

  ## plot(tdm, corThreshold = 0.8, weighting = TRUE,
  ##      attrs = list(graph = list(rankdir = "BT"),
  ##                   node = list(shape = "circle")))


  ## dissimilarity(my.corpus[[1]], my.corpus[[2]], method = "eJaccard")
  ## dissimilarity(tdm, method = "cosine")

  ## rownames(tdm)
  ## colnames(tdm)
  ## dimnames(tdm)
  ## Docs(tdm)
  ## nTerms(tdm)
  ## Terms(tdm)

  ## inspect(my.corpus[1:3])
  ## tdm <- TermDocumentMatrix(my.corpus)[1:10, 1:10]
  ## inspect(tdm)

  ## summary(my.corpus)

  ## findFreqTerms(tdm, 2, 3 )

  ## removeSparseTerms(tdm,0.4)

  ## searchFullText(my.corpus[[3]], "accounts")

  ## termFreq(my.corpus[[1]])



  appendLog("Transform the documents.", sub("^\n", "", map.cmd))
  setStatusBar(Rtxt("Transforming the documents"), "...")
  eval(parse(text=map.cmd))

  # Convert into a keyword count dataset.

  ds.cmd <- "crs$dataset <- as.data.frame(t(as.matrix(TermDocumentMatrix(my.corpus))))"
  appendLog("Convert into a dataset.", ds.cmd)
  eval(parse(text=ds.cmd))

  # Add in targets if they exist.

  target.fname <- paste(location.src, ".target.csv", sep="/")
  if (file.exists(target.fname))
  {
    read.cmd <- sprintf('target <- read.csv("%s", encoding="%s")',
                        target.fname, crv$csv_encoding)
    appendLog("Read in the targets.", read.cmd)
    eval(parse(text=read.cmd))

    if (nrow(crs$dataset) != nrow(target))
    {
      errorDialog(Rtxt("The number of targets is different to the",
                       "number of documents:"),
                  sprintf("%s %s %s.", nrow(target), Rtxt("versus"), nrow(crs$dataset)),
                  Rtxt("You may need to update the file"),
                  target.fname,
                  Rtxt("to match the number of documents in the corpus."))
      return(FALSE)
    }

    target.cmd <- "crs$dataset <- cbind(crs$dataset, TARGET=target[[2]])"
    appendLog("Add the targets to the dataset.", target.cmd)
    eval(parse(text=target.cmd))
  }

  # Set the title and dataname correctly.

  crs$dataname <- basename(location)
  setMainTitle(crs$dataname)

  # For now, always succeed.

  setStatusBar(Rtxt("Corpus has been loaded from the documents in"),
               location,
               ifelse(file.exists(target.fname),
                      paste(Rtxt("with targets from"), ".target.csv"),
                      ""))

  return(TRUE)
}



--------------------------------------------------------------------------------
/R/unloadLibs.R:
--------------------------------------------------------------------------------
#' Unload packages
#'
#' Detach the list of packages, only detaching those that are on the
#' search path.
#'
#' @param l Vector of package names.
#' @return nothing.
#' @rdname unloadLibs
unloadLibs <- function(l)
{
  for (p in l)
  {
    # Packages appear on the search path as "package:<name>".
    pn <- sprintf("package:%s", p)
    if (pn %in% search()) detach(pn, character.only=TRUE)
  }
  invisible()
}

--------------------------------------------------------------------------------
/R/xgboostFormula.R:
--------------------------------------------------------------------------------
###########################################################################
## Title: Define functions to enable the formula form of xgboost model
## Author: Zhou Fang, Data Scientist, Microsoft
## Date: 11-05-2017
## Rework the implementation: Graham Williams
## Date: 20170710
## Function names:
##   xgboost.formula
##   xgb.importance.formula
##   predict.xgboost.formula
###########################################################################

# S3 generic so that a formula as first argument dispatches to
# xgboost.formula().

xgboost <- function(...) UseMethod("xgboost")

# Formula interface to xgboost::xgboost().
#
# Arguments:
#   form       model formula; its first variable is taken as the target.
#   data       data frame holding the variables.
#   nrounds    number of boosting rounds.
#   na.action  function applied to `data` to deal with missing values.
#   ...        passed on to xgboost::xgboost().
#
# Returns the fitted model with the formula and feature names recorded
# and the extra class "xgb.formula".

xgboost.formula <- function(form, data, nrounds=100, na.action=na.omit, ...)
{
  # FOR NOW ASSUME BINARY CLASSIFICATION TASK ONLY FIXME

  # Perform the NA action and note the missing observations.

  nads <- na.action(data)
  miss <- as.vector(attr(nads, "na.action")) # Assume na.omit() FIXME

  # Create a sparse matrix from the supplied dataset. This will turn
  # categoricals into indicator variables.

  sds <- Matrix::sparse.model.matrix(form, data=nads)

  # Identify the target variable: the first variable of the formula.

  target <- all.vars(form)[1]

  # Make sure the target is a factor then convert to 0/1.

  label <- as.integer(as.factor(data[[target]])) - 1

  # Drop the labels of the observations removed by the NA action so the
  # labels line up with the rows of the model matrix.

  if (! is.null(miss)) label <- label[-miss]

  # Train xgboost model. Note the use of print_every_n. I tried
  # verbose=0 but then there is no cb.evaluation.log produced and so
  # don't get the extra information we need. So use a big value for n
  # to aim for first and last iterations.

  model <- xgboost::xgboost(data = sds,
                            label = label,
                            nrounds = nrounds,
                            print_every_n = 1000,
                            ...)

  # Record the actual formula and the final list of features for later
  # usage.

  model$formula <- form
  model$dimnames <- sds@Dimnames[[2]]

  # Add extra class for the formula based model.

  class(model) <- c("xgb.formula", class(model))

  return(model)
}

# S3 generic for variable importance.

importance <- function(...) UseMethod("importance")

# Feature importance of a formula based xgboost model, using the feature
# names recorded when the model was built. `data` is accepted but not
# used.

importance.xgb.formula <- function(model, data, ...)
{
  # Remove the local class so xgboost is not confused.

  class(model) %<>% setdiff("xgb.formula")

  # Calculate the feature importance.

  imp <- xgboost::xgb.importance(feature_names=model$dimnames, model=model, ...)

  return(imp)
}

# Predict from a formula based xgboost model. Observations of `newdata`
# that the model frame drops because of missing values get NA as their
# prediction, so the result has one value per row of `newdata`.

predict.xgb.formula <- function(object, newdata, ...)
{
  # 20171029 FIXME needs to be able to run without providing a target
  # variable column in the dataset.

  # Transform to model matrix of just the variables required based on
  # the formula.

  mf <- model.frame(object$formula, data=newdata)
  x <- model.matrix(attr(mf, "terms"), data=mf)

  # Row positions removed by the model frame's NA handling (NULL if none).

  na <- as.vector(attr(mf, "na.action"))

  # Convert the data into a sparse matrix as required for
  # predict.xgb.Booster().

  x <- Matrix::Matrix(x, sparse=TRUE)

  # Remove our local xgb.formula class so that predict will use the
  # appropriate xgboost:: method. Otherwise xgboost includes a test
  # for == class() rather than %in% class() and fails.

  class(object) %<>% setdiff("xgb.formula")

  # Predict on the new data.

  pr <- predict(object, newdata=x, ...)

  # Splice the missing observations as NA predictions into the result.
  # append() inserts after the given position and simply appends at the
  # end when that position is at or beyond the current length, which
  # covers the boundary conditions.

  for (i in na)
    pr <- append(pr, NA, after=i-1)

  return(pr)
}

# Print a formula based xgboost model using xgboost's own print method.

print.xgb.formula <- function(model, ...)
{
  # Remove the local class so xgboost is not confused.

  class(model) %<>% setdiff("xgb.formula")

  print(model, ...)
}

--------------------------------------------------------------------------------
/build/vignette.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/build/vignette.rds
--------------------------------------------------------------------------------
/data/audit.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/data/audit.RData
--------------------------------------------------------------------------------
/data/locationsAUS.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/data/locationsAUS.RData
--------------------------------------------------------------------------------
/data/weather.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/data/weather.RData
--------------------------------------------------------------------------------
/data/weatherAUS.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/data/weatherAUS.RData -------------------------------------------------------------------------------- /data/wine.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/data/wine.RData -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("Please cite the 'rattle' package in publications using:") 2 | 3 | year <- sub(".*(2[[:digit:]]{3})-.*", "\\1", meta$Date) 4 | vers <- paste("R package version", meta$Version) 5 | 6 | citEntry(entry="Book", 7 | title=paste("Data Mining with {Rattle} and {R}:", 8 | "The art of excavating data for knowledge discovery"), 9 | author=personList(as.person("Graham J. Williams")), 10 | publisher="Springer", 11 | series="Use R!", 12 | year="2011", 13 | url="https://rd.springer.com/book/10.1007/978-1-4419-9890-3", 14 | textVersion= 15 | paste("Williams, G. J. 
(2011), Data Mining with Rattle and R: ", 16 | "The Art of Excavating Data for Knowledge Discovery, ", 17 | "Use R!, Springer.", 18 | sep="")) 19 | 20 | -------------------------------------------------------------------------------- /inst/csv/dvdtrans.csv: -------------------------------------------------------------------------------- 1 | ID,Item 2 | 1,Sixth Sense 3 | 1,LOTR1 4 | 1,Harry Potter1 5 | 1,Green Mile 6 | 1,LOTR2 7 | 2,Gladiator 8 | 2,Patriot 9 | 2,Braveheart 10 | 3,LOTR1 11 | 3,LOTR2 12 | 4,Gladiator 13 | 4,Patriot 14 | 4,Sixth Sense 15 | 5,Gladiator 16 | 5,Patriot 17 | 5,Sixth Sense 18 | 6,Gladiator 19 | 6,Patriot 20 | 6,Sixth Sense 21 | 7,Harry Potter1 22 | 7,Harry Potter2 23 | 8,Gladiator 24 | 8,Patriot 25 | 9,Gladiator 26 | 9,Patriot 27 | 9,Sixth Sense 28 | 10,Sixth Sense 29 | 10,LOTR 30 | 10,Gladiator 31 | 10,Green Mile 32 | -------------------------------------------------------------------------------- /inst/doc/rattle.R: -------------------------------------------------------------------------------- 1 | ### R code from vignette source 'rattle.Rnw' 2 | 3 | ################################################### 4 | ### code chunk number 1: install (eval = FALSE) 5 | ################################################### 6 | ## install.packages("rattle", dependencies=c("Depends", "Suggests")) 7 | 8 | 9 | ################################################### 10 | ### code chunk number 2: install_togaware (eval = FALSE) 11 | ################################################### 12 | ## install.packages("rattle", repos="https://rattle.togaware.com", type="source") 13 | 14 | 15 | ################################################### 16 | ### code chunk number 3: start_up (eval = FALSE) 17 | ################################################### 18 | ## library(rattle) 19 | ## rattle() 20 | 21 | 22 | ################################################### 23 | ### code chunk number 4: rattle.Rnw:149-150 (eval = FALSE) 24 | 
################################################### 25 | ## source("~/weather_script.R") 26 | 27 | 28 | -------------------------------------------------------------------------------- /inst/doc/rattle.Rnw: -------------------------------------------------------------------------------- 1 | % \VignetteIndexEntry{Rattle Quick Start Guide} 2 | % \VignetteDepends{rattle} 3 | % \VignetteKeywords{data mining} 4 | % \VignettePackage{rattle} 5 | \documentclass[12pt]{article} 6 | \usepackage{amsmath} 7 | \usepackage[pdftex]{graphicx} 8 | \usepackage{color} 9 | \usepackage{xspace} 10 | \usepackage{fancyvrb} 11 | \usepackage{fancyhdr} 12 | \usepackage{lastpage} 13 | \usepackage{algorithm2e} 14 | \usepackage[ 15 | colorlinks=true, 16 | linkcolor=blue, 17 | citecolor=blue, 18 | urlcolor=blue] 19 | {hyperref} 20 | \usepackage{Sweave} 21 | 22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 23 | 24 | % define new colors for use 25 | \definecolor{darkgreen}{rgb}{0,0.6,0} 26 | \definecolor{darkred}{rgb}{0.6,0.0,0} 27 | \definecolor{lightbrown}{rgb}{1,0.9,0.8} 28 | \definecolor{brown}{rgb}{0.6,0.3,0.3} 29 | \definecolor{darkblue}{rgb}{0,0,0.8} 30 | \definecolor{darkmagenta}{rgb}{0.5,0,0.5} 31 | 32 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 33 | 34 | \newcommand{\Rattle}{\textsf{Rattle}\xspace} 35 | \newcommand{\pkg}[1]{{\tt #1}\xspace} 36 | 37 | \setlength{\oddsidemargin}{-.25 truein} 38 | \setlength{\evensidemargin}{0truein} 39 | \setlength{\topmargin}{-0.2truein} 40 | \setlength{\textwidth}{7 truein} 41 | \setlength{\textheight}{8.5 truein} 42 | \setlength{\parindent}{0.20truein} 43 | \setlength{\parskip}{0.10truein} 44 | 45 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 46 | \pagestyle{fancy} 47 | \lhead{} 48 | \chead{Rattle} 49 | \rhead{} 50 | \lfoot{} 51 | \cfoot{} 52 | \rfoot{\thepage\ of \pageref{LastPage}} 53 | \renewcommand{\headrulewidth}{1pt} 54 | \renewcommand{\footrulewidth}{1pt} 55 | 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 56 | 57 | \title{The Rattle Package: Quick Start Guide} 58 | \author{Graham Williams \\ Graham.Williams@togaware.com} 59 | 60 | \begin{document} 61 | 62 | \maketitle 63 | 64 | \thispagestyle{empty} 65 | 66 | \section{Introduction} 67 | 68 | \Rattle (Williams, 2011) is a package written in R providing a 69 | graphical user interface to very many other R packages that provide 70 | functionality for data mining. 71 | 72 | This quick start guide is under development. See 73 | \url{https://rattle.togaware.com} for extensive documentation 74 | 75 | \section{Requirements} 76 | 77 | \Rattle depends on over 40 other R packages and a couple of other 78 | software applications/libraries that are independent of R. The first 79 | thing to ensure is that you have installed the GTK+ libraries and the 80 | GGobi application. This is operating system dependent and full 81 | installation instructions are available from \url{https://rattle.togaware.com/}. 82 | 83 | Only a couple of R packages are dependencies for \Rattle. Most are 84 | suggestions, but without them functionality is quite limited. At a 85 | minimum it is useful to ensure you have the 86 | \href{https://cran.r-project.org/package=RGtk2}{\pkg{RGtk2}} package 87 | installed. 
Others that you might like to install include: 88 | \href{https://cran.r-project.org/package=ada}{\pkg{ada}}, 89 | \href{https://cran.r-project.org/package=arules}{\pkg{arules}}, 90 | \href{https://cran.r-project.org/package=doBy}{\pkg{doBy}}, 91 | \href{https://cran.r-project.org/package=ellipse}{\pkg{ellipse}}, 92 | \href{https://cran.r-project.org/package=fBasics}{\pkg{fBasics}}, 93 | \href{https://cran.r-project.org/package=fpc}{\pkg{fpc}}, 94 | \href{https://cran.r-project.org/package=gplots}{\pkg{gplots}}, 95 | \href{https://cran.r-project.org/package=Hmisc}{\pkg{Hmisc}}, 96 | \href{https://cran.r-project.org/package=kernlab}{\pkg{kernlab}}, 97 | \href{https://cran.r-project.org/package=mice}{\pkg{mice}}, 98 | \href{https://cran.r-project.org/package=party}{\pkg{party}}, 99 | \href{https://cran.r-project.org/package=playwith}{\pkg{playwith}}, 100 | \href{https://cran.r-project.org/package=pmml}{\pkg{pmml}}, 101 | \href{https://cran.r-project.org/package=randomForest}{\pkg{randomForest}}, 102 | \href{https://cran.r-project.org/package=reshape}{\pkg{reshape}}, 103 | \href{https://cran.r-project.org/package=rggobi}{\pkg{rggobi}}, 104 | \href{https://cran.r-project.org/package=RGtk2}{\pkg{RGtk2}}, 105 | \href{https://cran.r-project.org/package=ROCR}{\pkg{ROCR}}, 106 | \href{https://cran.r-project.org/package=RODBC}{\pkg{RODBC}}, and 107 | \href{https://cran.r-project.org/package=rpart}{\pkg{rpart}}. 
108 | 109 | The packages will usually be installed with the following command: 110 | 111 | <<install, eval=FALSE>>= 112 | install.packages("rattle", dependencies=c("Depends", "Suggests")) 113 | @ 114 | 115 | The latest beta version of rattle is available from 116 | \url{https://rattle.togaware.com/}: 117 | 118 | <<install_togaware, eval=FALSE>>= 119 | install.packages("rattle", repos="https://rattle.togaware.com", type="source") 120 | @ 121 | 122 | \section{First Steps} 123 | 124 | Start up rattle: 125 | <<start_up, eval=FALSE>>= 126 | library(rattle) 127 | rattle() 128 | @ 129 | 130 | \section{Simple Scenario: Build a Couple of Models} 131 | 132 | \begin{enumerate} 133 | \item Click Execute 134 | \item Click Yes (load the sample weather dataset) 135 | \item Click the Model tab 136 | \item Click Execute (to build a decision tree) 137 | \item Click Draw to display the decision tree (loads other packages as required) 138 | \item Click the Forest radio button 139 | \item Click Execute (to build a random forest - loads packages as required) 140 | \item Click the Evaluate tab 141 | \item Click the Risk radio button (installs packages as required) 142 | \item Click Execute to display two Risk (Cumulative) performance plots 143 | \item Click the Log tab 144 | \item Click the Export button to save script to file weather\_script.R to home folder 145 | \end{enumerate} 146 | 147 | Now exit from R (and rattle) and start R up again. 148 | 149 | <<eval=FALSE>>= 150 | source("~/weather_script.R") 151 | @ 152 | 153 | This will rerun everything that was done in the GUI session but purely as a script. 154 | 155 | \section{References} 156 | 157 | \begin{description} 158 | \item Williams, G. J. (2009). {\em Rattle: A Data Mining GUI for R}. 159 | The R Journal, 1(2), 45-55. URL: 160 | \href{https://journal.r-project.org/archive/2009-2/RJournal_2009-2_Williams.pdf} 161 | {https://journal.r-project.org/archive/2009-2/RJournal\_2009-2\_Williams.pdf}. 162 | \item Williams, G. J. (2011). 
{\em Data Mining with Rattle and R: The 163 | Art of Excavating Data for Knowledge Discovery}. Use R! 164 | series. Springer. \href{https://bit.ly/rattle_data_mining}{https://bit.ly/rattle\_data\_mining}. 165 | \end{description} 166 | 167 | \end{document} 168 | -------------------------------------------------------------------------------- /inst/doc/rattle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/inst/doc/rattle.pdf -------------------------------------------------------------------------------- /inst/etc/Rlogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/inst/etc/Rlogo.png -------------------------------------------------------------------------------- /inst/extdata/audit.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/inst/extdata/audit.xlsx -------------------------------------------------------------------------------- /inst/odt/data_summary.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/inst/odt/data_summary.odt -------------------------------------------------------------------------------- /inst/po/de/LC_MESSAGES/R-rattle.mo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/inst/po/de/LC_MESSAGES/R-rattle.mo -------------------------------------------------------------------------------- /inst/po/es/LC_MESSAGES/R-rattle.mo: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/inst/po/es/LC_MESSAGES/R-rattle.mo -------------------------------------------------------------------------------- /inst/po/fr/LC_MESSAGES/R-rattle.mo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/inst/po/fr/LC_MESSAGES/R-rattle.mo -------------------------------------------------------------------------------- /inst/po/id/LC_MESSAGES/R-rattle.mo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/inst/po/id/LC_MESSAGES/R-rattle.mo -------------------------------------------------------------------------------- /inst/po/ja/LC_MESSAGES/R-rattle.mo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/inst/po/ja/LC_MESSAGES/R-rattle.mo -------------------------------------------------------------------------------- /inst/po/no/LC_MESSAGES/R-rattle.mo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/inst/po/no/LC_MESSAGES/R-rattle.mo -------------------------------------------------------------------------------- /inst/po/zh_CN/LC_MESSAGES/R-rattle.mo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/rattle/3875c10d0ae6c7a499d918bc501e121861067e06/inst/po/zh_CN/LC_MESSAGES/R-rattle.mo -------------------------------------------------------------------------------- /man/acquireAuditData.Rd: -------------------------------------------------------------------------------- 1 | \name{acquireAuditData} 2 | 3 | \alias{acquireAuditData} 4 | 5 | 
\title{Generate the audit dataset.} 6 | 7 | \description{ 8 | 9 | Rattle uses an artificial dataset for demonstration purposes. This 10 | function retrieves the source data 11 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data} 12 | and then transforms the data in a variety of ways. 13 | 14 | } 15 | 16 | \usage{ 17 | acquireAuditData(write.to.file=FALSE) 18 | } 19 | 20 | \arguments{ 21 | 22 | \item{write.to.file}{Whether to generate a collection of files based 23 | on the data. The files generated include: audit.csv, audit.Rdata, 24 | audit.arf, and audit\_missing.csv} 25 | 26 | } 27 | 28 | \details{ 29 | 30 | See the function definition for details of the processing done on the 31 | data downloaded from the UCI repository. 32 | 33 | } 34 | 35 | \value{ 36 | 37 | By default the function returns a data frame containing the audit 38 | dataset. If write.to.file is TRUE then the data frame is returned 39 | invisibly. 40 | 41 | } 42 | 43 | \references{Package home page: \url{https://rattle.togaware.com}} 44 | 45 | \author{\email{Graham.Williams@togaware.com}} 46 | 47 | \seealso{ 48 | 49 | \code{\link{audit}}, \code{\link{rattle}}. 50 | 51 | } 52 | -------------------------------------------------------------------------------- /man/asRules.Rd: -------------------------------------------------------------------------------- 1 | \name{asRules} 2 | \alias{asRules} 3 | \title{ 4 | List the rules corresponding to the rpart decision tree 5 | } 6 | \description{ 7 | 8 | Display a list of rules for an rpart decision tree. 
9 | 10 | } 11 | \usage{ 12 | asRules(model, compact=FALSE, \dots) 13 | } 14 | \arguments{ 15 | 16 | \item{model}{an rpart model.} 17 | 18 | \item{compact}{whether to list categoricals compactly.} 19 | 20 | \item{\dots}{further arguments passed to or from other methods.} 21 | 22 | } 23 | \details{ 24 | 25 | Traverse a decision tree to generate the equivalent set of rules, one 26 | rule for each path from the root node to a leaf node. 27 | 28 | } 29 | \references{Package home page: \url{https://rattle.togaware.com}} 30 | \author{\email{Graham.Williams@togaware.com}} 31 | \examples{ 32 | \dontrun{asRules.rpart(my.rpart)} 33 | } 34 | \keyword{tree} 35 | -------------------------------------------------------------------------------- /man/asRules.rpart.Rd: -------------------------------------------------------------------------------- 1 | \name{asRules.rpart} 2 | \alias{asRules.rpart} 3 | \title{ 4 | List the rules corresponding to the rpart decision tree 5 | } 6 | \description{ 7 | 8 | Display a list of rules for an rpart decision tree. 9 | 10 | } 11 | \usage{ 12 | \method{asRules}{rpart}(model, compact=FALSE, classes=NULL, \dots) 13 | } 14 | \arguments{ 15 | 16 | \item{model}{an rpart model.} 17 | 18 | \item{compact}{whether to list categoricals compactly (default FALSE).} 19 | 20 | \item{classes}{which target classes should be listed (default all).} 21 | 22 | \item{\dots}{further arguments passed to or from other methods.} 23 | 24 | } 25 | \details{ 26 | 27 | Traverse a decision tree to generate the equivalent set of rules, one 28 | rule for each path from the root node to a leaf node. 
29 | 30 | } 31 | \references{Package home page: \url{https://rattle.togaware.com}} 32 | \author{\email{Graham.Williams@togaware.com}} 33 | \examples{ 34 | \dontrun{asRules.rpart(my.rpart)} 35 | } 36 | \keyword{tree} 37 | -------------------------------------------------------------------------------- /man/audit.Rd: -------------------------------------------------------------------------------- 1 | \name{audit} 2 | \docType{data} 3 | \alias{audit} 4 | \title{Sample dataset to illustrate Rattle functionality.} 5 | \description{ 6 | 7 | The audit dataset is an artificially constructed dataset that has some 8 | of the characteristics of a true financial audit dataset for modelling 9 | productive and non-productive audits of a person's financial 10 | statement. A productive audit is one which identifies errors or 11 | inaccuracies in the information provided by a client. A non-productive 12 | audit is usually an audit which found all supplied information to be 13 | in order. 14 | 15 | The audit dataset is used to illustrate binary classification. The 16 | target variable is identified as \code{TARGET\_Adjusted}. 17 | 18 | The dataset is quite small, consisting of just 2000 entities. Its 19 | primary purpose is to illustrate modelling in Rattle, so a minimally 20 | sized dataset is suitable. 21 | 22 | The dataset itself is derived from publicly available data (which has 23 | nothing to do with audits). 24 | 25 | } 26 | 27 | \format{ 28 | 29 | A data frame. In line with data mining terminology we refer to the 30 | rows of the data frame (or the observations) as entities. The columns 31 | are referred to as variables. The entities represent people in this 32 | case. 
We describe the variables here: 33 | 34 | \describe{ 35 | 36 | \item{\code{ID}}{This is a unique identifier for each person.} 37 | 38 | \item{\code{Age}}{The age.} 39 | 40 | \item{\code{Employment}}{The type of employment.} 41 | 42 | \item{\code{Education}}{The highest level of education.} 43 | 44 | \item{\code{Marital}}{Current marital status.} 45 | 46 | \item{\code{Occupation}}{The type of occupation.} 47 | 48 | \item{\code{Income}}{The amount of income declared.} 49 | 50 | \item{\code{Gender}}{The person's gender.} 51 | 52 | \item{\code{Deductions}}{Total amount of expenses that a person 53 | claims in their financial statement.} 54 | 55 | \item{\code{Hours}}{The average hours worked on a weekly basis.} 56 | 57 | \item{\code{IGNORE_Accounts}}{The main country in which the person 58 | has most of their money banked. Note that the variable name is 59 | prefixed with IGNORE. This is recognised by Rattle as the default 60 | role for this variable.} 61 | 62 | \item{\code{RISK_Adjustment}}{This variable records the monetary 63 | amount of any adjustment to the person's financial claims as a 64 | result of a productive audit. This variable, which should not be 65 | treated as an input variable, is thus a measure of the size of the 66 | risk associated with the person.} 67 | 68 | \item{\code{TARGET_Adjusted}}{The target variable for modelling 69 | (generally for classification modelling). This is a numeric field 70 | of class integer, but limited to 0 and 1, indicating 71 | non-productive and productive audits, respectively. 
Productive 72 | audits are those that result in an adjustment being made to a 73 | client's financial statement.} 74 | 75 | } 76 | 77 | } 78 | \keyword{datasets} 79 | -------------------------------------------------------------------------------- /man/binning.Rd: -------------------------------------------------------------------------------- 1 | \name{binning} 2 | \alias{binning} 3 | \title{ 4 | Perform binning over numeric data 5 | } 6 | \description{ 7 | 8 | Perform binning. 9 | 10 | } 11 | \usage{ 12 | binning(x, bins=4, method=c("quantile", "wtd.quantile", "kmeans"), 13 | labels=NULL, ordered=TRUE, weights=NULL) 14 | } 15 | \arguments{ 16 | 17 | \item{x}{the numeric data to bin.} 18 | 19 | \item{bins}{the number of bins to use.} 20 | 21 | \item{method}{whether to use "quantile", weighted quantile 22 | "wtd.quantile" or "kmeans" binning.} 23 | 24 | \item{labels}{the labels or names to use for each of the bins.} 25 | 26 | \item{ordered}{whether to build an ordered factor or not.} 27 | 28 | \item{weights}{vector of numeric weights for each observation for 29 | weighted quantile binning.} 30 | 31 | } 32 | \details{ 33 | 34 | Bin the provided numeric data into the specified number of bins using 35 | one of the supported methods. The bins will have the names specified 36 | by labels, if supplied. The result can optionally be an ordered 37 | factor. 
38 | 39 | } 40 | \value{A factor is returned.} 41 | \references{Package home page: \url{https://rattle.togaware.com}} 42 | \author{Daniele Medri and Graham Williams} 43 | -------------------------------------------------------------------------------- /man/calcInitialDigitDistr.Rd: -------------------------------------------------------------------------------- 1 | \name{calcInitialDigitDistr} 2 | \alias{calcInitialDigitDistr} 3 | \title{ 4 | Generate a frequency count of the initial digits 5 | } 6 | \description{ 7 | 8 | In the context of Benford's Law calculate the distribution of the 9 | frequencies of the first digit of the numbers supplied as the 10 | argument. 11 | 12 | } 13 | \usage{ 14 | calcInitialDigitDistr(l, digit=1, len=1, 15 | sp=c("none", "positive", "negative")) 16 | } 17 | \arguments{ 18 | 19 | \item{l}{a vector of numbers.} 20 | \item{digit}{the digit to generate frequencies for.} 21 | \item{len}{The number of digits.} 22 | \item{sp}{whether and how to split the digits.} 23 | 24 | } 25 | \references{Package home page: \url{https://rattle.togaware.com}} 26 | \author{\email{Graham.Williams@togaware.com}} 27 | \keyword{hplot} 28 | -------------------------------------------------------------------------------- /man/calculateAUC.Rd: -------------------------------------------------------------------------------- 1 | \name{calculateAUC} 2 | \alias{calculateAUC} 3 | \title{ 4 | Determine area under a curve (e.g. a risk or recall curve) of a risk chart 5 | } 6 | \description{ 7 | 8 | Given the evaluation returned by evaluateRisk, for example, calculate 9 | the area under the risk or recall curves, to use as a metric to 10 | compare the performance of a model. 11 | 12 | } 13 | \usage{ 14 | calculateAUC(x, y) 15 | } 16 | \arguments{ 17 | 18 | \item{x}{a vector of values for the x points.} 19 | \item{y}{a vector of values for the y points.} 20 | 21 | } 22 | \details{ 23 | 24 | The area is returned. 
25 | 26 | } 27 | \references{Package home page: \url{https://rattle.togaware.com}} 28 | \author{\email{Graham.Williams@togaware.com}} 29 | \seealso{\code{\link{evaluateRisk}}.} 30 | \examples{ 31 | ## this is usually used in the context of the evaluateRisk function 32 | \dontrun{ev <- evaluateRisk(predicted, actual, risk)} 33 | 34 | ## imitate this output here 35 | ev <- data.frame(Caseload=c(1.0, 0.8, 0.6, 0.4, 0.2, 0), 36 | Precision=c(0.15, 0.18, 0.21, 0.25, 0.28, 0.30), 37 | Recall=c(1.0, 0.95, 0.80, 0.75, 0.5, 0.0), 38 | Risk=c(1.0, 0.98, 0.90, 0.77, 0.30, 0.0)) 39 | 40 | ## Calculate the areas under the Risk and the Recall curves. 41 | calculateAUC(ev$Caseload, ev$Risk) 42 | calculateAUC(ev$Caseload, ev$Recall) 43 | 44 | } 45 | \keyword{hplot} 46 | -------------------------------------------------------------------------------- /man/centers.hclust.Rd: -------------------------------------------------------------------------------- 1 | \name{centers.hclust} 2 | \alias{centers.hclust} 3 | \title{ 4 | List Cluster Centers for a Hierarchical Cluster 5 | } 6 | \description{ 7 | 8 | Generate a matrix of centers from a hierarchical cluster. 9 | 10 | } 11 | \usage{ 12 | centers.hclust(x, object, nclust=10, use.median=FALSE) 13 | } 14 | \arguments{ 15 | 16 | \item{x}{The data used to build the cluster.} 17 | 18 | \item{object}{A hclust object.} 19 | 20 | \item{nclust}{Number of clusters.} 21 | 22 | \item{use.median}{Use median instead of mean.} 23 | 24 | } 25 | \details{ 26 | 27 | For the specified number of clusters, cut the hierarchical cluster 28 | appropriately to that number of clusters, and return the mean (or 29 | median) of each resulting cluster. 
30 | 31 | } 32 | \references{Package home page: \url{https://rattle.togaware.com}} 33 | \author{Daniele Medri and \email{Graham.Williams@togaware.com}} 34 | \keyword{cluster} 35 | -------------------------------------------------------------------------------- /man/comcat.Rd: -------------------------------------------------------------------------------- 1 | \name{comcat} 2 | \alias{comcat} 3 | \title{ 4 | Echo data in a human readable form. 5 | } 6 | \description{ 7 | 8 | Format data in the most appropriate human readable form. 9 | 10 | } 11 | \usage{ 12 | comcat(x, ...) 13 | } 14 | \arguments{ 15 | 16 | \item{x}{object.} 17 | 18 | \item{...}{additional arguments passed on to format.} 19 | 20 | } 21 | \references{Package home page: \url{https://rattle.togaware.com}} 22 | \author{\email{Graham.Williams@togaware.com}} 23 | 24 | \examples{ 25 | comcat(dim(iris)) 26 | } 27 | -------------------------------------------------------------------------------- /man/drawTreeNodes.Rd: -------------------------------------------------------------------------------- 1 | \name{drawTreeNodes} 2 | \alias{drawTreeNodes} 3 | \title{ 4 | Draw nodes of a decision tree 5 | } 6 | \description{ 7 | 8 | Draw the nodes of a decision tree 9 | 10 | } 11 | \usage{ 12 | drawTreeNodes(tree, cex = par("cex"), pch = par("pch"), 13 | size = 4 * cex, col = NULL, nodeinfo = FALSE, 14 | units = "", cases = "obs", 15 | digits = getOption("digits"), 16 | decimals = 2, 17 | print.levels = TRUE, new = TRUE) 18 | } 19 | \arguments{ 20 | 21 | \item{tree}{an rpart decision tree.} 22 | 23 | \item{cex}{.} 24 | 25 | \item{pch}{.} 26 | 27 | \item{size}{.} 28 | 29 | \item{col}{.} 30 | 31 | \item{nodeinfo}{.} 32 | 33 | \item{units}{.} 34 | 35 | \item{cases}{.} 36 | 37 | \item{digits}{.} 38 | 39 | \item{decimals}{the number of decimal digits to include in numeric 40 | split nodes.} 41 | 42 | \item{print.levels}{.} 43 | 44 | \item{new}{.} 45 | 46 | } 47 | \details{ 48 | 49 | A variation of draw.tree() from the 
maptree package. 50 | 51 | } 52 | \references{Package home page: \url{https://rattle.togaware.com}} 53 | \author{\email{Graham.Williams@togaware.com}, Denis White} 54 | \examples{ 55 | ## this is usually used in the context of the plotRisk function 56 | \dontrun{drawTreeNodes(rpart(Species ~ ., iris))} 57 | } 58 | \keyword{hplot} 59 | -------------------------------------------------------------------------------- /man/drawTreesAda.Rd: -------------------------------------------------------------------------------- 1 | \name{drawTreesAda} 2 | \alias{drawTreesAda} 3 | \title{ 4 | Draw trees from an Ada model 5 | } 6 | \description{ 7 | 8 | Using the Rattle drawTreeNodes, draw a selection of Ada trees. 9 | 10 | } 11 | \usage{ 12 | drawTreesAda(model, trees=0, title="") 13 | } 14 | \arguments{ 15 | 16 | \item{model}{an ada model.} 17 | 18 | \item{trees}{The list of trees to draw. Use 0 to draw all trees.} 19 | 20 | \item{title}{An optional title to add.} 21 | 22 | } 23 | \details{ 24 | 25 | Using Rattle's drawTreeNodes underneath, a plot for each of the 26 | specified trees from an Ada model will be displayed. 27 | 28 | } 29 | \references{Package home page: \url{https://rattle.togaware.com}} 30 | \author{\email{Graham.Williams@togaware.com}} 31 | \examples{ 32 | \dontrun{drawTreesAda(ds.ada)} 33 | } 34 | \keyword{hplot} 35 | -------------------------------------------------------------------------------- /man/errorMatrix.Rd: -------------------------------------------------------------------------------- 1 | \name{errorMatrix} 2 | \alias{errorMatrix} 3 | \title{ 4 | Generate an error matrix from actual and predicted data. 5 | } 6 | \description{ 7 | An error matrix reports the true/false positive/negative rates. 
8 | } 9 | \usage{ 10 | errorMatrix(actual, 11 | predicted, 12 | percentage=TRUE, 13 | digits=ifelse(percentage,1,3), 14 | count=FALSE) 15 | } 16 | \arguments{ 17 | 18 | \item{actual}{a vector of true values.} 19 | \item{predicted}{a vector of predicted values.} 20 | \item{percentage}{return percentages.} 21 | \item{digits}{the number of digits to round results.} 22 | \item{count}{return counts.} 23 | 24 | } 25 | \references{Package home page: \url{https://rattle.togaware.com}} 26 | \author{\email{Graham.Williams@togaware.com}} 27 | 28 | \examples{ 29 | \dontrun{errorMatrix(model)} 30 | } 31 | -------------------------------------------------------------------------------- /man/evaluateRisk.Rd: -------------------------------------------------------------------------------- 1 | \name{evaluateRisk} 2 | \alias{evaluateRisk} 3 | \title{ 4 | Summarise the performance of a data mining model 5 | } 6 | \description{ 7 | 8 | By taking predicted values, actual values, and measures of the risk 9 | associated with each case, generate a summary that groups the distinct 10 | predicted values, calculating the accumulative percentage Caseload, 11 | Recall, Risk, Precision, and Measure. 
12 | 13 | } 14 | \usage{ 15 | evaluateRisk(predicted, actual, risks) 16 | } 17 | \arguments{ 18 | 19 | \item{predicted}{a numeric vector of probabilities (between 0 and 1) 20 | representing the probability of each entity being a 1.} 21 | 22 | \item{actual}{a numeric vector of classes (0 or 1).} 23 | 24 | \item{risks}{a numeric vector of risk (e.g., dollar amounts) 25 | associated with each entity that has an actual of 1.} 26 | 27 | } 28 | \references{Package home page: \url{https://rattle.togaware.com}} 29 | \author{\email{Graham.Williams@togaware.com}} 30 | \seealso{\code{\link{plotRisk}}.} 31 | \examples{ 32 | 33 | ## simulate the data that is typical in data mining 34 | 35 | ## we often have only a small number of positive known cases 36 | cases <- 1000 37 | actual <- as.integer(rnorm(cases) > 1) 38 | adjusted <- sum(actual) 39 | nfa <- cases - adjusted 40 | 41 | ## risks might be dollar values associated with adjusted cases 42 | risks <- rep(0, cases) 43 | risks[actual==1] <- round(abs(rnorm(adjusted, 10000, 5000)), 2) 44 | 45 | ## our models will generate a probability of a case being a 1 46 | predicted <- rep(0.1, cases) 47 | predicted[actual==1] <- predicted[actual==1] + rnorm(adjusted, 0.3, 0.1) 48 | predicted[actual==0] <- predicted[actual==0] + rnorm(nfa, 0.1, 0.08) 49 | predicted <- signif(predicted) 50 | 51 | ## call upon evaluateRisk to generate performance summary 52 | ev <- evaluateRisk(predicted, actual, risks) 53 | 54 | ## have a look at the first few and last few 55 | head(ev) 56 | tail(ev) 57 | 58 | ## the performance is usually presented as a Risk Chart 59 | ## under the CRAN MS/Windows this causes a problem, so don't run for now 60 | \dontrun{plotRisk(ev$Caseload, ev$Precision, ev$Recall, ev$Risk)} 61 | } 62 | \keyword{dplot} 63 | -------------------------------------------------------------------------------- /man/fancyRpartPlot.Rd: -------------------------------------------------------------------------------- 1 | \name{fancyRpartPlot} 2 | 
\alias{fancyRpartPlot} 3 | \title{ 4 | A wrapper for plotting rpart trees using prp 5 | } 6 | \description{ 7 | 8 | Plots a fancy RPart decision tree using the pretty rpart plotter. 9 | 10 | } 11 | \usage{ 12 | fancyRpartPlot(model, main="", sub, caption, palettes, type=2, ...) 13 | } 14 | \arguments{ 15 | 16 | \item{model}{an rpart object.} 17 | 18 | \item{main}{title for the plot.} 19 | 20 | \item{sub}{sub title for the plot. The default is a Rattle string with 21 | date, time and username.} 22 | 23 | \item{caption}{caption for bottom right of plot.} 24 | 25 | \item{palettes}{a list of sequential palettes names. As supported by 26 | RColorBrewer::brewer.pal the available names are Blues BuGn BuPu GnBu 27 | Greens Greys Oranges OrRd PuBu PuBuGn PuRd Purples RdPu Reds YlGn 28 | YlGnBu YlOrBr YlOrRd.} 29 | 30 | \item{type}{the type of plot to generate (2).} 31 | 32 | \item{...}{additional arguments passed on to prp.} 33 | 34 | } 35 | \references{Package home page: \url{https://rattle.togaware.com}} 36 | \author{\email{Graham.Williams@togaware.com}} 37 | 38 | \examples{ 39 | ## Use rpart to build a decision tree. 40 | 41 | \dontrun{library(rpart) 42 | 43 | ## Set up the data for modelling. 44 | 45 | set.seed(42) 46 | ds <- weather 47 | target <- "RainTomorrow" 48 | risk <- "RISK_MM" 49 | ignore <- c("Date", "Location", risk) 50 | vars <- setdiff(names(ds), ignore) 51 | nobs <- nrow(ds) 52 | form <- formula(paste(target, "~ .")) 53 | train <- sample(nobs, 0.7*nobs) 54 | test <- setdiff(seq_len(nobs), train) 55 | actual <- ds[test, target] 56 | risks <- ds[test, risk] 57 | 58 | # Fit the model. 59 | 60 | fit <- rpart(form, data=ds[train, vars]) 61 | 62 | ## Plot the model. 63 | 64 | fancyRpartPlot(fit) 65 | 66 | ## Choose different colours. 67 | 68 | fancyRpartPlot(fit, palettes=c("Greys", "Oranges")) 69 | 70 | ## Add a main title to the plot. 
71 | 72 | fancyRpartPlot(fit, main=target) 73 | 74 | }} 75 | \keyword{hplot} 76 | -------------------------------------------------------------------------------- /man/genPlotTitleCmd.Rd: -------------------------------------------------------------------------------- 1 | \name{genPlotTitleCmd} 2 | \alias{genPlotTitleCmd} 3 | \title{ 4 | Generate a string to add a title to a plot 5 | } 6 | \description{ 7 | 8 | Generate a string that is intended to be \code{\link{eval}}'d that 9 | will add a title and sub-title to a plot. The string is a call to 10 | \code{\link{title}}, supplying the given arguments, 11 | \code{\link{paste}}d together, as the main title, and generating a 12 | sub-title that begins with `Rattle' and continues with the current date 13 | and time, and finishes with the current user's username. This is used 14 | internally in Rattle to adorn a plot with relevant information, but 15 | may be useful outside of Rattle. 16 | 17 | } 18 | \usage{ 19 | genPlotTitleCmd(..., vector=FALSE) 20 | } 21 | \arguments{ 22 | \item{...}{one or more strings that will be pasted together to form 23 | the main title.} 24 | \item{vector}{whether to return a vector as the result.} 25 | } 26 | \references{Package home page: \url{https://rattle.togaware.com}} 27 | \author{\email{Graham.Williams@togaware.com}} 28 | \seealso{ 29 | 30 | \code{\link{eval}}, \code{\link{title}}, \code{\link{plotRisk}}. 31 | } 32 | \examples{ 33 | # generate some random plot 34 | plot(rnorm(100)) 35 | 36 | # generate the string representing the command to add titles 37 | tl <- genPlotTitleCmd("Sample Plot of", "No Particular Importance") 38 | 39 | # cause the string to be executed as an R command 40 | eval(parse(text=tl)) 41 | } 42 | \keyword{aplot} 43 | -------------------------------------------------------------------------------- /man/ggVarImp.Rd: -------------------------------------------------------------------------------- 1 | \name{ggVarImp} 2 | \alias{ggVarImp} 3 | \title{ 4 | Model. 
5 | } 6 | \description{ 7 | 8 | Model. 9 | 10 | } 11 | \usage{ 12 | ggVarImp(model, ...) 13 | } 14 | \arguments{ 15 | 16 | \item{model}{object.} 17 | 18 | \item{...}{arguments passed on.} 19 | 20 | } 21 | \references{Package home page: \url{https://rattle.togaware.com}} 22 | \author{\email{Graham.Williams@togaware.com}} 23 | 24 | \examples{ 25 | \dontrun{ggVarImp(model)} 26 | } 27 | -------------------------------------------------------------------------------- /man/grouper.Rd: -------------------------------------------------------------------------------- 1 | \name{rescale.by.group} 2 | \alias{rescale.by.group} 3 | \title{ 4 | 5 | Transform a numeric vector by grouping it according to the values of 6 | the supplied factor and then rescaling within the groups. 7 | 8 | } 9 | \description{ 10 | 11 | The numeric vector is remapped to integers from 0 to max-1, with any 12 | missing values mapped to the midpoint. Original idea from Tony 13 | Nolan. This will eventually be generalised to do the remapping using 14 | any of the rescaling functions. 15 | 16 | } 17 | \usage{ 18 | rescale.by.group(x, by=NULL, type = "irank", itop = 100) 19 | } 20 | \arguments{ 21 | 22 | \item{x}{The numeric vector to rescale.} 23 | 24 | \item{by}{A factor of the same length as x used to define the groups.} 25 | 26 | \item{type}{The type of rescaling to perform.} 27 | 28 | \item{itop}{For an integer remapping this is the number of groups, so 29 | that the numeric values are mapped to the integers from 0 to (max-1).} 30 | 31 | } 32 | \details{ 33 | 34 | This Rattle support function, which is also useful by itself, provides 35 | a simple mechanism to rescale a numeric variable. Several rescalings 36 | are possible. The rescaling is done by first grouping the observations 37 | according to the by argument.
38 | 39 | } 40 | 41 | \references{Package home page: \url{https://rattle.togaware.com}} 42 | 43 | \author{\email{Graham.Williams@togaware.com}} 44 | 45 | \seealso{ 46 | 47 | \code{\link{rattle}}. 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/listAdaVarsUsed.Rd: -------------------------------------------------------------------------------- 1 | \name{listAdaVarsUsed} 2 | \alias{listAdaVarsUsed} 3 | \title{ 4 | List the variables used by an adaboost model 5 | } 6 | \description{ 7 | 8 | Returns a list of the variables used and their frequencies. 9 | 10 | } 11 | \usage{ 12 | listAdaVarsUsed(model) 13 | 14 | } 15 | \arguments{ 16 | 17 | \item{model}{an rpart object.} 18 | 19 | } 20 | \references{Package home page: \url{https://rattle.togaware.com}} 21 | \author{\email{Graham.Williams@togaware.com}} 22 | -------------------------------------------------------------------------------- /man/listTreesAda.Rd: -------------------------------------------------------------------------------- 1 | \name{listTreesAda} 2 | \alias{listTreesAda} 3 | \title{ 4 | List trees from an Ada model 5 | } 6 | \description{ 7 | 8 | Display the textual representation of a selection of Ada trees. 9 | 10 | } 11 | \usage{ 12 | listTreesAda(model, trees=0) 13 | } 14 | \arguments{ 15 | 16 | \item{model}{an ada model.} 17 | 18 | \item{trees}{The list of trees to list. Use 0 to list all trees.} 19 | 20 | } 21 | \details{ 22 | 23 | Using rpart's print method display each of the specified trees from an 24 | Ada model. 
25 | 26 | } 27 | \references{Package home page: \url{https://rattle.togaware.com}} 28 | \author{\email{Graham.Williams@togaware.com}} 29 | \examples{ 30 | \dontrun{listTreesAda(ds.ada)} 31 | } 32 | \keyword{hplot} 33 | -------------------------------------------------------------------------------- /man/listVersions.Rd: -------------------------------------------------------------------------------- 1 | \name{listVersions} 2 | \alias{listVersions} 3 | \title{ 4 | 5 | Versions of Installed Packages 6 | 7 | } 8 | \description{ 9 | 10 | Generate a list of packages installed and their version number. 11 | 12 | } 13 | \usage{ 14 | 15 | listVersions(file="", ...) 16 | 17 | } 18 | \arguments{ 19 | 20 | \item{file}{a character string naming a file or a connection open for 21 | writing. '""' indicates output to the console.} 22 | 23 | \item{...}{arguments to \code{\link{write.csv}}.} 24 | 25 | } 26 | \details{ 27 | 28 | This function is useful in reporting problems or bugs, to ensure there 29 | is a clear match of R package versions between the system exhibiting 30 | the issue and the test system replicating the issue. 31 | 32 | By default the information is written to the console in a comma 33 | separated form, that is ideally designed to be written to a CSV file 34 | for emailing. 35 | 36 | } 37 | \seealso{\code{\link{write.csv}}} 38 | \author{\email{Graham.Williams@togaware.com}} 39 | -------------------------------------------------------------------------------- /man/modalvalue.Rd: -------------------------------------------------------------------------------- 1 | \name{modalvalue} 2 | \alias{modalvalue} 3 | \title{ 4 | Calculate the mode of a vector, array or list. 5 | } 6 | \description{ 7 | 8 | The mode is the most common or modal value of a list. 
9 | 10 | } 11 | \usage{ 12 | modalvalue(x, na.rm=FALSE) 13 | } 14 | \arguments{ 15 | 16 | \item{x}{A vector, array or list.} 17 | 18 | \item{na.rm}{Whether to remove missing values.} 19 | } 20 | \details{ 21 | 22 | This function calculates the mode of a vector, array or list (lists 23 | are flattened). This code originated from an anonymous post on the R 24 | Wiki. 25 | 26 | } 27 | \keyword{hplot} 28 | -------------------------------------------------------------------------------- /man/plotOptimalLine.Rd: -------------------------------------------------------------------------------- 1 | \name{plotOptimalLine} 2 | \alias{plotOptimalLine} 3 | \title{ 4 | Plot three lines on a risk chart, one vertical and two horizontal 5 | } 6 | \description{ 7 | 8 | Plots a vertical line at x up to max of y1 and y2, then horizontal 9 | from this line at y1 and y2. Intended for plotting on a plotRisk. 10 | 11 | } 12 | \usage{ 13 | plotOptimalLine(x, y1, y2, pr = NULL, colour = "plum", label = NULL) 14 | } 15 | \arguments{ 16 | 17 | \item{x}{location of vertical line.} 18 | 19 | \item{y1}{location of one horizontal line.} 20 | 21 | \item{y2}{location of other horizontal line.} 22 | 23 | \item{pr}{print a percentage at this point.} 24 | 25 | \item{colour}{of the line.} 26 | 27 | \item{label}{at bottom of line.} 28 | 29 | } 30 | \details{ 31 | 32 | Intended to plot an optimal line on a Risk Chart as plotted by 33 | plotRisk.
34 | 35 | } 36 | \references{Package home page: \url{https://rattle.togaware.com}} 37 | \author{\email{Graham.Williams@togaware.com}} 38 | \seealso{\code{\link{plotRisk}}.} 39 | \examples{ 40 | ## this is usually used in the context of the plotRisk function 41 | \dontrun{ev <- evaluateRisk(predicted, actual, risk)} 42 | 43 | ## imitate this output here 44 | ev <- NULL 45 | ev$Caseload <- c(1.0, 0.8, 0.6, 0.4, 0.2, 0) 46 | ev$Precision <- c(0.15, 0.18, 0.21, 0.25, 0.28, 0.30) 47 | ev$Recall <- c(1.0, 0.95, 0.80, 0.75, 0.5, 0.0) 48 | ev$Risk <- c(1.0, 0.98, 0.90, 0.77, 0.30, 0.0) 49 | 50 | ## plot the Risk Chart 51 | plotRisk(ev$Caseload, ev$Precision, ev$Recall, ev$Risk, 52 | chosen=60, chosen.label="Pr=0.45") 53 | 54 | ## plot the optimal point 55 | plotOptimalLine(40, 77, 75, colour="maroon") 56 | 57 | } 58 | \keyword{hplot} 59 | -------------------------------------------------------------------------------- /man/plotRisk.Rd: -------------------------------------------------------------------------------- 1 | \name{plotRisk} 2 | \alias{plotRisk} 3 | \title{ 4 | Plot a risk chart 5 | } 6 | \description{ 7 | 8 | Plots a Rattle Risk Chart. Such a chart has been developed in a 9 | practical context to present the performance of data mining models to 10 | clients, plotting a caseload against performance, allowing a client to 11 | see the tradeoff between coverage and performance. 12 | 13 | } 14 | \usage{ 15 | plotRisk(cl, pr, re, ri = NULL, title = NULL, 16 | show.legend = TRUE, xleg = 60, yleg = 55, 17 | optimal = NULL, optimal.label = "", chosen = NULL, chosen.label = "", 18 | include.baseline = TRUE, dev = "", filename = "", show.knots = NULL, 19 | show.lift=TRUE, show.precision=TRUE, 20 | risk.name = "Risk", recall.name = "Recall", 21 | precision.name = "Precision") 22 | } 23 | \arguments{ 24 | 25 | \item{cl}{a vector of caseloads corresponding to different probability 26 | cutoffs. 
Can be either percentages (between 0 and 100) or fractions 27 | (between 0 and 1).} 28 | 29 | \item{pr}{a vector of precision values for each probability 30 | cutoff. Can be either percentages (between 0 and 100) or fractions 31 | (between 0 and 1).} 32 | 33 | \item{re}{a vector of recall values for each probability cutoff. Can 34 | be either percentages (between 0 and 100) or fractions (between 0 35 | and 1).} 36 | 37 | \item{ri}{a vector of risk values for each probability cutoff. Can be 38 | either percentages (between 0 and 100) or fractions (between 0 and 39 | 1).} 40 | 41 | \item{title}{the main title to place at the top of the plot.} 42 | 43 | \item{show.legend}{whether to display the legend in the plot.} 44 | 45 | \item{xleg}{the x coordinate for the placement of the legend.} 46 | 47 | \item{yleg}{the y coordinate for the placement of the legend.} 48 | 49 | \item{optimal}{a caseload (percentage or fraction) that represents an 50 | optimal performance point which is also plotted. If instead the value 51 | is \code{TRUE} then the optimal point is identified internally 52 | (maximum value for \code{(recall-caseload)+(risk-caseload)}) and 53 | plotted.} 54 | 55 | \item{optimal.label}{a string which is added to label the line drawn 56 | as the optimal point.} 57 | 58 | \item{chosen}{a caseload (percentage or fraction) that represents a 59 | user chosen optimal performance point which is also plotted.} 60 | 61 | \item{chosen.label}{a string which is added to label the line drawn as 62 | the chosen point.} 63 | 64 | \item{include.baseline}{if TRUE (the default) then display the 65 | diagonal baseline.} 66 | 67 | \item{dev}{a string which, if supplied, identifies a device type as 68 | the target for the plot. This might be one of \code{wmf} (for 69 | generating a Windows Metafile, but only available on MS/Windows), 70 | \code{pdf}, or \code{png}.} 71 | 72 | \item{filename}{a string naming a file.
If \code{dev} is not given 73 | then the filename extension is used to identify the image format as 74 | one of those recognised by the \code{dev} argument.} 75 | 76 | \item{show.knots}{a vector of caseload values at which a vertical line 77 | should be drawn. These might correspond, for example, to individual 78 | paths through a decision tree, illustrating the impact of each path on 79 | the caseload and performance.} 80 | 81 | \item{show.lift}{whether to label the right axis with lift.} 82 | 83 | \item{show.precision}{whether to show the precision plot.} 84 | 85 | \item{risk.name}{a string used within the plot's legend that gives a 86 | name to the risk. Often the risk is a dollar amount at risk from a 87 | fraud or from a bank loan point of view, so a name such as 88 | \code{Revenue} may be appropriate. The default is \code{Risk}.} 89 | 90 | \item{recall.name}{a string used within the plot's legend that gives a 91 | name to the recall. The recall is often the percentage of cases that 92 | are positive hits, and in practice these might correspond to known 93 | cases of fraud or reviews where some adjustment to perhaps an income tax 94 | return or application for credit had to be made on reviewing the case, 95 | and so a name such as \code{Adjustments} may be appropriate. The default is \code{Recall}.} 96 | 97 | \item{precision.name}{a string used within the plot's legend that gives a 98 | name to the precision. A common name for precision is \code{Strike 99 | Rate}; the default here is \code{Precision}.} 100 | 101 | } 102 | \details{ 103 | 104 | Caseload is the percentage of the entities in the dataset covered by 105 | the model at a particular probability cutoff, so that with a cutoff of 106 | 0, all (100\%) of the entities are covered by the model. With a cutoff 107 | of 1 (0\%) no entities are covered by the model. A diagonal line is 108 | drawn to represent a baseline random performance.
Then the percentage 109 | of positive cases (the recall) covered for a particular caseload is 110 | plotted, and optionally a measure of the percentage of the total risk 111 | that is also covered for a particular caseload may be plotted. Such a 112 | chart allows a user to select an appropriate tradeoff between caseload 113 | and performance. The charts are similar to ROC curves. The precision 114 | (i.e., strike rate) is also plotted. 115 | 116 | } 117 | \references{Package home page: \url{https://rattle.togaware.com}} 118 | \author{\email{Graham.Williams@togaware.com}} 119 | \seealso{\code{\link{evaluateRisk}}, \code{\link{genPlotTitleCmd}}.} 120 | \examples{ 121 | ## this is usually used in the context of the evaluateRisk function 122 | \dontrun{ev <- evaluateRisk(predicted, actual, risk)} 123 | 124 | ## imitate this output here 125 | ev <- NULL 126 | ev$Caseload <- c(1.0, 0.8, 0.6, 0.4, 0.2, 0) 127 | ev$Precision <- c(0.15, 0.18, 0.21, 0.25, 0.28, 0.30) 128 | ev$Recall <- c(1.0, 0.95, 0.80, 0.75, 0.5, 0.0) 129 | ev$Risk <- c(1.0, 0.98, 0.90, 0.77, 0.30, 0.0) 130 | 131 | ## plot the Risk Chart 132 | plotRisk(ev$Caseload, ev$Precision, ev$Recall, ev$Risk, 133 | chosen=60, chosen.label="Pr=0.45") 134 | 135 | ## Add a title 136 | eval(parse(text=genPlotTitleCmd("Sample Risk Chart"))) 137 | } 138 | \keyword{hplot} 139 | -------------------------------------------------------------------------------- /man/printRandomForests.Rd: -------------------------------------------------------------------------------- 1 | \name{printRandomForests} 2 | \alias{printRandomForests} 3 | \title{ 4 | Print a representation of the Random Forest models to the console 5 | } 6 | \description{ 7 | 8 | A randomForest model, by default, consists of 500 decision trees. This 9 | function walks through each tree and generates a set of rules which 10 | are printed to the console. 
This takes a considerable amount of time 11 | and is provided for users to access the actual model, but it is not 12 | yet used within the Rattle GUI. It may be used to display the output 13 | of the RF (but it takes longer to generate than the model itself!). Or 14 | it might only be used on export to PMML or SQL. 15 | 16 | } 17 | \usage{ 18 | printRandomForests(model, models=NULL, include.class=NULL, format="") 19 | } 20 | \arguments{ 21 | 22 | \item{model}{a randomForest model.} 23 | 24 | \item{models}{a list of integers limiting the models in MODEL that are 25 | displayed.} 26 | 27 | \item{include.class}{limit the output to the specific class.} 28 | 29 | \item{format}{possible values are "VB".} 30 | 31 | } 32 | \references{Package home page: \url{https://rattle.togaware.com}} 33 | \author{\email{Graham.Williams@togaware.com}} 34 | \examples{ 35 | ## Display a ruleset for a specific model amongst the 500. 36 | \dontrun{printRandomForests(rfmodel, 5)} 37 | 38 | ## Display a ruleset for specific models amongst the 500. 39 | \dontrun{printRandomForests(rfmodel, c(5,10,15))} 40 | 41 | ## Display a ruleset for each of the 500 models. 42 | \dontrun{printRandomForests(rfmodel)} 43 | } 44 | \keyword{hplot} 45 | -------------------------------------------------------------------------------- /man/randomForest2Rules.Rd: -------------------------------------------------------------------------------- 1 | \name{randomForest2Rules} 2 | \alias{randomForest2Rules} 3 | \title{ 4 | Generate accessible data structure of a randomForest model 5 | } 6 | \description{ 7 | 8 | A randomForest model, by default, consists of 500 decision trees. This 9 | function walks through each tree and generates a set of rules. This 10 | takes a considerable amount of time and is provided for users to 11 | access the actual model, but it is not yet used within the Rattle 12 | GUI. It may be used to display the output of the RF (but it takes 13 | longer to generate than the model itself!). 
Or it might only be used 14 | on export to PMML or SQL. 15 | 16 | } 17 | \usage{ 18 | randomForest2Rules(model, models=NULL) 19 | } 20 | \arguments{ 21 | 22 | \item{model}{a randomForest model.} 23 | 24 | \item{models}{a list of integers limiting the models in MODEL that are 25 | converted.} 26 | 27 | } 28 | \references{Package home page: \url{https://rattle.togaware.com}} 29 | \author{\email{Graham.Williams@togaware.com}} 30 | \examples{ 31 | ## Generate a ruleset for a specific model amongst the 500. 32 | \dontrun{randomForest2Rules(rfmodel, 5)} 33 | 34 | ## Generate a ruleset for specific models amongst the 500. 35 | \dontrun{randomForest2Rules(rfmodel, c(5,10,15))} 36 | 37 | ## Generate a ruleset for each of the 500 models. 38 | \dontrun{randomForest2Rules(rfmodel)} 39 | } 40 | \keyword{hplot} 41 | -------------------------------------------------------------------------------- /man/rattle.Rd: -------------------------------------------------------------------------------- 1 | \name{rattle} 2 | \alias{rattle} 3 | \alias{crs} 4 | \alias{crv} 5 | \title{Display the Rattle User Interface} 6 | \description{ 7 | 8 | The Rattle user interface uses the RGtk2 package to present an 9 | intuitive point and click interface for data mining, extensively 10 | building on the excellent collection of R packages by very many 11 | authors for data manipulation, exploration, analysis, and evaluation. 12 | 13 | } 14 | \usage{ 15 | rattle(csvname=NULL, dataset=NULL, useGtkBuilder=TRUE) 16 | } 17 | \arguments{ 18 | 19 | \item{csvname}{the optional name of a CSV file to load into Rattle on 20 | startup.} 21 | 22 | \item{dataset}{The optional name as a character string of a dataset to 23 | load into Rattle on startup.} 24 | 25 | \item{useGtkBuilder}{if not supplied then automatically determine whether to 26 | use the new GtkBuilder rather than the deprecated libglade. 
A user 27 | can override the heuristic choice with TRUE or FALSE.} 28 | } 29 | \details{ 30 | 31 | Refer to the Rattle home page in the URL below for a growing reference 32 | manual for using Rattle. 33 | 34 | Whilst the underlying functionality of Rattle is built upon a vast 35 | collection of other R packages, Rattle itself provides a collection of 36 | utility functions used within Rattle. These are made available through 37 | loading the rattle package into your R library. The See Also section 38 | lists these utility functions that may be useful outside of Rattle. 39 | 40 | Rattle can initialise some options using a .Rattle file in the folder 41 | in which Rattle is started. The currently supported options are 42 | .RATTLE.DATA, .RATTLE.SCORE.IN, and .RATTLE.SCORE.OUT. 43 | 44 | If the environment variable RATTLE\_DATA is defined then that is set 45 | as the default CSV file name to load. Otherwise, if .RATTLE.DATA is 46 | defined then that will be used as the CSV file to load. Otherwise, if 47 | csvname is provided then that will be used. 48 | 49 | Two environments are exported by Rattle, capturing the current rattle 50 | state (crs) and the current rattle variables (crv). 51 | 52 | } 53 | 54 | \references{Package home page: \url{https://rattle.togaware.com}} 55 | 56 | \author{\email{Graham.Williams@togaware.com}} 57 | 58 | \seealso{ 59 | 60 | \code{\link{evaluateRisk}}, \code{\link{genPlotTitleCmd}}, 61 | \code{\link{plotRisk}}. 62 | 63 | } 64 | \examples{ 65 | # You can start rattle with a path to a csv file to pre-specify the 66 | # dataset. You then need to click Execute to load the data.
67 | 68 | \dontrun{rattle(system.file("csv", "weather.csv", package = "rattle"))} 69 | 70 | } 71 | \keyword{environment} 72 | -------------------------------------------------------------------------------- /man/rattle.print.summary.multinom.Rd: -------------------------------------------------------------------------------- 1 | \name{rattle.print.summary.multinom} 2 | \alias{rattle.print.summary.multinom} 3 | \title{ 4 | Print information about a multinomial model 5 | } 6 | \description{ 7 | 8 | Displays a textual review of the performance of a multinom model. 9 | 10 | } 11 | \usage{ 12 | rattle.print.summary.multinom(x, digits = x$digits, ...) 13 | } 14 | \arguments{ 15 | 16 | \item{x}{An rpart object.} 17 | 18 | \item{digits}{Number of digits to print for numbers.} 19 | 20 | \item{...}{Other arguments.} 21 | 22 | } 23 | \details{ 24 | 25 | Print a summary of a multinom model. This is simply a modification of 26 | the print.summary.multinom function to add the number of entities! 27 | 28 | } 29 | \references{Package home page: \url{https://rattle.togaware.com}} 30 | \author{\email{Graham.Williams@togaware.com}} 31 | 32 | -------------------------------------------------------------------------------- /man/rattleInfo.Rd: -------------------------------------------------------------------------------- 1 | \name{rattleInfo} 2 | \alias{rattleInfo} 3 | \title{ 4 | 5 | Extract Rattle and related package information. 6 | 7 | } 8 | \description{ 9 | 10 | Display system information, including versions of Rattle and R, 11 | operating system, and versions of other packages used by 12 | Rattle. Useful for reporting bugs but also invisibly returns a list of 13 | packages that have updates available and can be passed to 14 | install.packages().
15 | 16 | } 17 | \usage{ 18 | rattleInfo(all.dependencies=FALSE, 19 | include.not.installed=FALSE, 20 | include.not.available=FALSE, 21 | include.libpath=FALSE) 22 | } 23 | \arguments{ 24 | 25 | \item{all.dependencies}{If TRUE then check the full dependency graph 26 | for Rattle and list all of those packages (which may take quite a 27 | few seconds to compute), or else just list those key packages that 28 | Rattle Depends on and Suggests.} 29 | 30 | \item{include.not.installed}{If TRUE then make mention of any packages 31 | that are not installed, but are available.} 32 | 33 | \item{include.not.available}{If TRUE then make mention of any packages 34 | that are not available from CRAN.} 35 | 36 | \item{include.libpath}{If TRUE then list the library location where 37 | each package is installed.} 38 | 39 | } 40 | \details{ 41 | 42 | This is a support function to list useful information to provide the 43 | developers with information about the system environment when running 44 | Rattle. It is intended to provide the information that is useful in 45 | reporting bugs. 46 | 47 | It also lists the currently installed version of a number of packages 48 | that Rattle makes use of as well as checking for any updates available 49 | for those packages. 50 | 51 | If updates are found then a command is generated and printed so that a 52 | user can simply copy and paste the command to update the relevant 53 | packages. The function also invisibly returns the list of packages 54 | that can be updated, so that we can do something like: 55 | install.packages(rattleInfo()). 56 | 57 | } 58 | 59 | \references{Package home page: \url{https://rattle.togaware.com}} 60 | 61 | \author{\email{Graham.Williams@togaware.com}} 62 | 63 | \seealso{ 64 | 65 | \code{\link{rattle}}. 
66 | 67 | } 68 | 69 | \keyword{environment} 70 | -------------------------------------------------------------------------------- /man/riskchart.Rd: -------------------------------------------------------------------------------- 1 | \name{riskchart} 2 | \alias{riskchart} 3 | \title{ 4 | Plot a risk chart 5 | } 6 | \description{ 7 | 8 | Plots a Rattle Risk Chart for binary classification models using 9 | ggplot2. Such a chart has been developed in a practical context to 10 | present the performance of data mining models to clients, plotting a 11 | caseload against performance, allowing a client to see the tradeoff 12 | between coverage and performance. 13 | 14 | } 15 | \usage{ 16 | riskchart(pr, 17 | ac, 18 | ri = NULL, 19 | title = "Risk Chart", 20 | title.size = 10, 21 | subtitle = NULL, 22 | caption = TRUE, 23 | show.legend = TRUE, 24 | optimal = NULL, 25 | optimal.label = "", 26 | chosen = NULL, 27 | chosen.label = "", 28 | include.baseline = TRUE, 29 | dev = "", 30 | filename = "", 31 | show.knots = NULL, 32 | show.lift = TRUE, 33 | show.precision = TRUE, 34 | show.maximal = TRUE, 35 | risk.name = "Risk", 36 | recall.name = "Recall", 37 | precision.name = "Precision", 38 | thresholds = NULL, 39 | legend.horiz = TRUE) 40 | } 41 | \arguments{ 42 | 43 | \item{pr}{The predicted class for each observation.} 44 | 45 | \item{ac}{The actual class for each observation.} 46 | 47 | \item{ri}{The risk class for each observation.} 48 | 49 | \item{title}{the main title to place at the top of the plot.} 50 | 51 | \item{title.size}{font size for the main title.} 52 | 53 | \item{subtitle}{subtitle under the main title.} 54 | 55 | \item{caption}{caption for the bottom right of plot.} 56 | 57 | \item{show.legend}{whether to display the legend in the plot.} 58 | 59 | \item{optimal}{a caseload (percentage or fraction) that represents an 60 | optimal performance point which is also plotted. 
If instead the value 61 | is \code{TRUE} then the optimal point is identified internally 62 | (maximum value for \code{(recall-caseload)+(risk-caseload)}) and 63 | plotted.} 64 | 65 | \item{optimal.label}{a string which is added to label the line drawn 66 | as the optimal point.} 67 | 68 | \item{chosen}{a caseload (percentage or fraction) that represents a 69 | user chosen optimal performance point which is also plotted.} 70 | 71 | \item{chosen.label}{a string which is added to label the line drawn as 72 | the chosen point.} 73 | 74 | \item{include.baseline}{if TRUE (the default) then display the 75 | diagonal baseline.} 76 | 77 | \item{dev}{a string which, if supplied, identifies a device type as 78 | the target for the plot. This might be one of \code{wmf} (for 79 | generating a Windows Metafile, but only available on MS/Windows), 80 | \code{pdf}, or \code{png}.} 81 | 82 | \item{filename}{a string naming a file. If \code{dev} is not given 83 | then the filename extension is used to identify the image format as 84 | one of those recognised by the \code{dev} argument.} 85 | 86 | \item{show.knots}{a vector of caseload values at which a vertical line 87 | should be drawn. These might correspond, for example, to individual 88 | paths through a decision tree, illustrating the impact of each path on 89 | the caseload and performance.} 90 | 91 | \item{show.lift}{whether to label the right axis with lift.} 92 | 93 | \item{show.precision}{whether to show the precision plot.} 94 | 95 | \item{show.maximal}{whether to show the maximal performance line.} 96 | 97 | \item{risk.name}{a string used within the plot's legend that gives a 98 | name to the risk. Often the risk is a dollar amount at risk from a 99 | fraud or from a bank loan point of view, so a name such as 100 | \code{Revenue} may be appropriate. The default is \code{Risk}.} 101 | 102 | \item{recall.name}{a string used within the plot's legend that gives a 103 | name to the recall.
The recall is often the percentage of cases that 104 | are positive hits, and in practice these might correspond to known 105 | cases of fraud or reviews where some adjustment to perhaps an income tax 106 | return or application for credit had to be made on reviewing the case, 107 | and so a name such as \code{Adjustments} may be appropriate. The default is \code{Recall}.} 108 | 109 | \item{precision.name}{a string used within the plot's legend that gives 110 | a name to the precision. A common name for precision is \code{Strike 111 | Rate}; the default here is \code{Precision}.} 112 | 113 | \item{thresholds}{whether to display scores along the top axis.} 114 | 115 | \item{legend.horiz}{whether to display a horizontal legend.} 116 | } 117 | \details{ 118 | 119 | Caseload is the percentage of the entities in the dataset covered by 120 | the model at a particular probability cutoff, so that with a cutoff of 121 | 0, all (100\%) of the entities are covered by the model. With a cutoff 122 | of 1 (0\%) no entities are covered by the model. A diagonal line is 123 | drawn to represent a baseline random performance. Then the percentage 124 | of positive cases (the recall) covered for a particular caseload is 125 | plotted, and optionally a measure of the percentage of the total risk 126 | that is also covered for a particular caseload may be plotted. Such a 127 | chart allows a user to select an appropriate tradeoff between caseload 128 | and performance. The charts are similar to ROC curves. The precision 129 | (i.e., strike rate) is also plotted. 130 | 131 | } 132 | \references{Package home page: \url{https://rattle.togaware.com}} 133 | \author{\email{Graham.Williams@togaware.com}} 134 | \seealso{\code{\link{evaluateRisk}}, \code{\link{genPlotTitleCmd}}.} 135 | \examples{ 136 | \dontrun{ 137 | 138 | ## Use rpart to build a decision tree. 139 | 140 | library(rpart) 141 | 142 | ## Set up the data for modelling.
143 | 144 | set.seed(42) 145 | ds <- weather 146 | target <- "RainTomorrow" 147 | risk <- "RISK_MM" 148 | ignore <- c("Date", "Location", risk) 149 | vars <- setdiff(names(ds), ignore) 150 | nobs <- nrow(ds) 151 | form <- formula(paste(target, "~ .")) 152 | train <- sample(nobs, 0.7*nobs) 153 | test <- setdiff(seq_len(nobs), train) 154 | actual <- ds[test, target] 155 | risks <- ds[test, risk] 156 | 157 | # Build the model. 158 | 159 | model <- rpart(form, data=ds[train, vars]) 160 | 161 | ## Obtain predictions. 162 | 163 | predicted <- predict(model, ds[test, vars], type="prob")[,2] 164 | 165 | ## Plot the Risk Chart. 166 | 167 | riskchart(predicted, actual, risks) 168 | } 169 | } 170 | \keyword{hplot} 171 | -------------------------------------------------------------------------------- /man/savePlotToFile.Rd: -------------------------------------------------------------------------------- 1 | \name{savePlotToFile} 2 | \alias{savePlotToFile} 3 | \alias{copyPlotToClipboard} 4 | \alias{printPlot} 5 | \title{ 6 | Save a plot in some way 7 | } 8 | \description{ 9 | 10 | For the current device, or for the device identified, save the plot 11 | displayed there in some way. This is either saved to file, copied to 12 | the clipboard for pasting into other applications, or sent to the 13 | printer for saving a hard copy. 
14 | 15 | } 16 | \usage{ 17 | savePlotToFile(file.name, dev.num=dev.cur()) 18 | copyPlotToClipboard(dev.num=dev.cur()) 19 | printPlot(dev.num=dev.cur()) 20 | } 21 | \arguments{ 22 | 23 | \item{file.name}{Character string naming the file including the file 24 | name extension which is used to specify the type of file to save.} 25 | 26 | \item{dev.num}{A device number indicating which device to save.} 27 | 28 | } 29 | \references{Package home page: \url{https://rattle.togaware.com}} 30 | \author{\email{Graham.Williams@togaware.com}} 31 | \keyword{hplot} 32 | -------------------------------------------------------------------------------- /man/setupDataset.Rd: -------------------------------------------------------------------------------- 1 | \name{setupDataset} 2 | \alias{setupDataset} 3 | \title{ 4 | Given specific contents of env add other dataset related variables. 5 | } 6 | \description{ 7 | 8 | This rattle support function is used for encapsulating data mining 9 | objects. The supplied environment is augmented with other data derived 10 | from the supplied data, such as a sample training dataset, list of 11 | numeric variables, and a formula for modelling. 12 | 13 | } 14 | \usage{ 15 | setupDataset(env, seed=NULL) 16 | 17 | } 18 | \arguments{ 19 | 20 | \item{env}{the environment to modify.} 21 | 22 | \item{seed}{optionally set the seed for repeatability.} 23 | 24 | } 25 | \details{ 26 | 27 | The supplied object (an environment) is assumed to also contain the 28 | variables data (a data frame), target (a character string naming the 29 | target variable), risk (a character string naming the risk variable), 30 | and inputs (a character vector naming all the input variables). This 31 | function then adds in the variables vars (the variables used for 32 | modelling), numerics (the numeric vars within inputs), nobs (the 33 | number of observations), form (the formula for building models), train 34 | (a 70\% training dataset).
35 | 36 | } 37 | \references{Package home page: \url{https://rattle.togaware.com}} 38 | \author{\email{Graham.Williams@togaware.com}} 39 | -------------------------------------------------------------------------------- /man/treeset.randomForest.Rd: -------------------------------------------------------------------------------- 1 | \name{treeset.randomForest} 2 | \alias{treeset.randomForest} 3 | \title{ 4 | Generate a representation of a tree in a Random Forest 5 | } 6 | \description{ 7 | Often we want to view the actual trees built by a random 8 | forest. Although reviewing all 500 trees might be a bit much, this 9 | function allows us to at least list them. 10 | } 11 | \usage{ 12 | treeset.randomForest(model, n=1, root=1, format="R") 13 | } 14 | \arguments{ 15 | 16 | \item{model}{a randomForest model.} 17 | 18 | \item{n}{a specific tree to list.} 19 | 20 | \item{root}{where to start the tree from, primarily for internal use.} 21 | 22 | \item{format}{one of "R", "VB".} 23 | 24 | } 25 | \references{Package home page: \url{https://rattle.togaware.com}} 26 | \author{\email{Graham.Williams@togaware.com}} 27 | \examples{ 28 | ## Display a treeset for a specific model amongst the 500. 29 | \dontrun{treeset.randomForest(rfmodel, 5)} 30 | } 31 | \keyword{hplot} 32 | -------------------------------------------------------------------------------- /man/weather.Rd: -------------------------------------------------------------------------------- 1 | \name{weather} 2 | \docType{data} 3 | \alias{weather} 4 | \title{Sample dataset of daily weather observations from Canberra 5 | airport in Australia.} 6 | \description{ 7 | 8 | One year of daily weather observations collected from the Canberra 9 | airport in Australia was obtained from the Australian Commonwealth 10 | Bureau of Meteorology and processed to create this sample dataset for 11 | illustrating data mining using R and Rattle. 
12 | 13 | The data has been processed to provide a target variable 14 | \code{RainTomorrow} (whether there is rain on the following day - 15 | No/Yes) and a risk variable \code{RISK_MM} (how much rain recorded in 16 | millimetres). Various transformations were performed on the source 17 | data. The dataset is quite small and is useful only for repeatable 18 | demonstration of various data science operations. 19 | 20 | The source dataset is Copyright by the Australian Commonwealth Bureau 21 | of Meteorology and is provided as part of the rattle package with 22 | permission. 23 | 24 | } 25 | \usage{weather} 26 | \format{ 27 | 28 | The \code{weather} dataset is a data frame containing one year of 29 | daily observations from a single weather station (Canberra). 30 | 31 | \describe{ 32 | 33 | \item{\code{Date}}{The date of observation (a Date object).} 34 | 35 | \item{\code{Location}}{The common name of the location of the 36 | weather station.} 37 | 38 | \item{\code{MinTemp}}{The minimum temperature in degrees celsius. } 39 | 40 | \item{\code{MaxTemp}}{The maximum temperature in degrees celsius. } 41 | 42 | \item{\code{Rainfall}}{The amount of rainfall recorded for the day in mm. } 43 | 44 | \item{\code{Evaporation}}{The so-called Class A pan evaporation (mm) 45 | in the 24 hours to 9am.} 46 | 47 | \item{\code{Sunshine}}{The number of hours of bright sunshine in the day.} 48 | 49 | \item{\code{WindGustDir}}{The direction of the strongest wind gust 50 | in the 24 hours to midnight.} 51 | 52 | \item{\code{WindGustSpeed}}{The speed (km/h) of the strongest wind 53 | gust in the 24 hours to midnight.} 54 | 55 | \item{\code{Temp9am}}{ Temperature (degrees C) at 9am. } 56 | 57 | \item{\code{RelHumid9am}}{ Relative humidity (percent) at 9am. } 58 | 59 | \item{\code{Cloud9am}}{ Fraction of sky obscured by cloud at 60 | 9am. This is measured in "oktas", which are a unit of eighths. It 61 | records how many eighths of the sky are obscured by cloud. 
A 0 62 | measure indicates completely clear sky whilst an 8 indicates that 63 | it is completely overcast. } 64 | 65 | \item{\code{WindSpeed9am}}{ 66 | Wind speed (km/hr) averaged over 10 minutes prior to 9am. 67 | } 68 | 69 | \item{\code{Pressure9am}}{ 70 | Atmospheric pressure (hpa) reduced to mean sea level at 9am. 71 | } 72 | 73 | \item{\code{Temp3pm}}{ Temperature (degrees C) at 3pm. } 74 | 75 | \item{\code{RelHumid3pm}}{ Relative humidity (percent) at 3pm. } 76 | 77 | \item{\code{Cloud3pm}}{ 78 | 79 | Fraction of sky obscured by cloud (in "oktas": eighths) at 80 | 3pm. See Cloud9am for a description of the values. 81 | 82 | } 83 | 84 | \item{\code{WindSpeed3pm}}{ 85 | Wind speed (km/hr) averaged over 10 minutes prior to 3pm. 86 | } 87 | 88 | \item{\code{Pressure3pm}}{ 89 | Atmospheric pressure (hpa) reduced to mean sea level at 3pm. 90 | } 91 | 92 | \item{\code{ChangeTemp}}{ 93 | Change in temperature. 94 | } 95 | 96 | \item{\code{ChangeTempDir}}{ 97 | Direction of change in temperature. 98 | } 99 | 100 | \item{\code{ChangeTempMag}}{ 101 | Magnitude of change in temperature. 102 | } 103 | 104 | \item{\code{ChangeWindDirect}}{ 105 | Direction of wind change. 106 | } 107 | 108 | \item{\code{MaxWindPeriod}}{ 109 | Period of maximum wind. 110 | } 111 | 112 | \item{\code{RainToday}}{ 113 | Integer: 1 if precipitation (mm) in the 24 hours to 9am exceeds 114 | 1mm, otherwise 0. 115 | } 116 | 117 | \item{\code{TempRange}}{ 118 | 119 | Difference between minimum and maximum temperatures (degrees C) in 120 | the 24 hours to 9am. 121 | 122 | } 123 | 124 | \item{\code{PressureChange}}{ 125 | Change in pressure. 126 | } 127 | \item{\code{RISK_MM}}{ 128 | The amount of rain. A kind of measure of the "risk". 129 | } 130 | 131 | \item{\code{RainTomorrow}}{ 132 | 133 | The target variable. Did it rain tomorrow? 134 | 135 | } 136 | } 137 | } 138 | \source{ 139 | 140 | The daily observations are available from 141 | \url{https://www.bom.gov.au/climate/data}. 
Copyright Commonwealth of 142 | Australia 2010, Bureau of Meteorology. 143 | 144 | Definitions adapted from 145 | \url{https://www.bom.gov.au/climate/dwo/IDCJDW0000.shtml} 146 | 147 | } 148 | 149 | \references{ 150 | 151 | Package home page: \url{https://rattle.togaware.com}. Data source: 152 | \url{https://www.bom.gov.au/climate/dwo/} and 153 | \url{https://www.bom.gov.au/climate/data}. 154 | 155 | } 156 | 157 | \author{\email{Graham.Williams@togaware.com}} 158 | 159 | \seealso{ 160 | 161 | \code{\link{weatherAUS}}, \code{\link{audit}}. 162 | 163 | } 164 | 165 | \keyword{datasets} 166 | -------------------------------------------------------------------------------- /man/weatherAUS.Rd: -------------------------------------------------------------------------------- 1 | \name{weatherAUS} 2 | \docType{data} 3 | \alias{weatherAUS} 4 | \alias{locationsAUS} 5 | \title{Daily weather observations from multiple Australian weather stations.} 6 | \description{ 7 | 8 | Daily weather observations from multiple locations around Australia, 9 | obtained from the Australian Commonwealth Bureau of Meteorology and 10 | processed to create this relatively large sample dataset for 11 | illustrating analytics, data mining, and data science using R and 12 | Rattle. 13 | 14 | The data has been processed to provide a target variable 15 | \code{RainTomorrow} (whether there is rain on the following day - 16 | No/Yes) and a risk variable \code{RISK_MM} (how much rain recorded in 17 | millimeters). Various transformations are performed on the data. 18 | 19 | The \code{weatherAUS} dataset is regularly updated and updates of this 20 | package usually correspond to updates to this dataset. The data is 21 | updated from the Bureau of Meteorology web site. 22 | 23 | The \code{locationsAUS} dataset records the location of each weather 24 | station. 25 | 26 | The source dataset comes from the Australian Commonwealth Bureau of 27 | Meteorology. 
The Bureau provided permission to use the data with the 28 | Bureau of Meteorology acknowledged as the source of the data, as per 29 | email from Cathy Toby (C.Toby@bom.gov.au) of the Climate Information 30 | Services of the National Climate Centre, 17 Dec 2008. 31 | 32 | A CSV version of this dataset is available as 33 | \url{https://rattle.togaware.com/weatherAUS.csv}. 34 | 35 | } 36 | \usage{weatherAUS} 37 | \format{ 38 | 39 | The \code{weatherAUS} dataset is a data frame containing over 140,000 40 | daily observations from over 45 Australian weather stations. 41 | 42 | \describe{ 43 | 44 | \item{\code{Date}}{The date of observation (a Date object).} 45 | 46 | \item{\code{Location}}{The common name of the location of the 47 | weather station.} 48 | 49 | \item{\code{MinTemp}}{The minimum temperature in degrees celsius. } 50 | 51 | \item{\code{MaxTemp}}{The maximum temperature in degrees celsius. } 52 | 53 | \item{\code{Rainfall}}{The amount of rainfall recorded for the day in mm. } 54 | 55 | \item{\code{Evaporation}}{The so-called Class A pan evaporation (mm) 56 | in the 24 hours to 9am.} 57 | 58 | \item{\code{Sunshine}}{The number of hours of bright sunshine in the day.} 59 | 60 | \item{\code{WindGustDir}}{The direction of the strongest wind gust 61 | in the 24 hours to midnight.} 62 | 63 | \item{\code{WindGustSpeed}}{The speed (km/h) of the strongest wind 64 | gust in the 24 hours to midnight.} 65 | 66 | \item{\code{Temp9am}}{ Temperature (degrees C) at 9am. } 67 | 68 | \item{\code{RelHumid9am}}{ Relative humidity (percent) at 9am. } 69 | 70 | \item{\code{Cloud9am}}{ Fraction of sky obscured by cloud at 71 | 9am. This is measured in "oktas", which are a unit of eighths. It 72 | records how many eighths of the sky are obscured by cloud. A 0 73 | measure indicates completely clear sky whilst an 8 indicates that 74 | it is completely overcast. } 75 | 76 | \item{\code{WindSpeed9am}}{ 77 | Wind speed (km/hr) averaged over 10 minutes prior to 9am. 
78 | } 79 | 80 | \item{\code{Pressure9am}}{ 81 | Atmospheric pressure (hpa) reduced to mean sea level at 9am. 82 | } 83 | 84 | \item{\code{Temp3pm}}{ Temperature (degrees C) at 3pm. } 85 | 86 | \item{\code{RelHumid3pm}}{ Relative humidity (percent) at 3pm. } 87 | 88 | \item{\code{Cloud3pm}}{ 89 | 90 | Fraction of sky obscured by cloud (in "oktas": eighths) at 91 | 3pm. See Cloud9am for a description of the values. 92 | 93 | } 94 | 95 | \item{\code{WindSpeed3pm}}{ 96 | Wind speed (km/hr) averaged over 10 minutes prior to 3pm. 97 | } 98 | 99 | \item{\code{Pressure3pm}}{ 100 | Atmospheric pressure (hpa) reduced to mean sea level at 3pm. 101 | } 102 | 103 | \item{\code{ChangeTemp}}{ 104 | Change in temperature. 105 | } 106 | 107 | \item{\code{ChangeTempDir}}{ 108 | Direction of change in temperature. 109 | } 110 | 111 | \item{\code{ChangeTempMag}}{ 112 | Magnitude of change in temperature. 113 | } 114 | 115 | \item{\code{ChangeWindDirect}}{ 116 | Direction of wind change. 117 | } 118 | 119 | \item{\code{MaxWindPeriod}}{ 120 | Period of maximum wind. 121 | } 122 | 123 | \item{\code{RainToday}}{ 124 | Integer: 1 if precipitation (mm) in the 24 hours to 9am exceeds 125 | 1mm, otherwise 0. 126 | } 127 | 128 | \item{\code{TempRange}}{ 129 | 130 | Difference between minimum and maximum temperatures (degrees C) in 131 | the 24 hours to 9am. 132 | 133 | } 134 | 135 | \item{\code{PressureChange}}{ 136 | Change in pressure. 137 | } 138 | \item{\code{RISK_MM}}{ 139 | The amount of rain. A kind of measure of the "risk". 140 | } 141 | 142 | \item{\code{RainTomorrow}}{ 143 | 144 | The target variable. Did it rain tomorrow? 145 | 146 | } 147 | } 148 | } 149 | \source{ 150 | 151 | Observations were drawn from numerous weather stations. The daily 152 | observations are available from 153 | \url{https://www.bom.gov.au/climate/data}. Copyright Commonwealth of 154 | Australia 2010, Bureau of Meteorology. 
155 | 156 | Definitions adapted from 157 | \url{https://www.bom.gov.au/climate/dwo/IDCJDW0000.shtml} 158 | 159 | } 160 | 161 | \references{ 162 | 163 | Package home page: \url{https://rattle.togaware.com}. Data source: 164 | \url{https://www.bom.gov.au/climate/dwo/} and 165 | \url{https://www.bom.gov.au/climate/data}. 166 | 167 | } 168 | 169 | \author{\email{Graham.Williams@togaware.com}} 170 | 171 | \seealso{ 172 | 173 | \code{\link{weather}}, \code{\link{audit}}. 174 | 175 | } 176 | 177 | \keyword{datasets} 178 | -------------------------------------------------------------------------------- /man/whichNumerics.Rd: -------------------------------------------------------------------------------- 1 | \name{whichNumerics} 2 | \alias{whichNumerics} 3 | \title{ 4 | Returns a list of the names of the numeric variables in a data frame. 5 | } 6 | \description{ 7 | 8 | A rattle support function. 9 | 10 | } 11 | \usage{ 12 | whichNumerics(data) 13 | 14 | } 15 | \arguments{ 16 | 17 | \item{data}{a data frame.} 18 | 19 | } 20 | \references{Package home page: \url{https://rattle.togaware.com}} 21 | \author{\email{Graham.Williams@togaware.com}} 22 | -------------------------------------------------------------------------------- /man/wine.Rd: -------------------------------------------------------------------------------- 1 | \name{wine} 2 | \docType{data} 3 | \alias{wine} 4 | \title{The wine dataset from the UCI Machine Learning Repository.} 5 | \description{ 6 | 7 | The \code{wine} dataset contains the results of a chemical analysis of 8 | wines grown in a specific area of Italy. Three types of wine are 9 | represented in the 178 samples, with the results of 13 chemical 10 | analyses recorded for each sample. The \code{Type} variable has been 11 | transformed into a categoric variable. 12 | 13 | The data contains no missing values and consists of only numeric data, 14 | with a three class target variable (\code{Type}) for classification. 
15 | 16 | } 17 | \usage{wine} 18 | \format{ 19 | 20 | A data frame containing 178 observations of 14 variables. 21 | 22 | \describe{ 23 | 24 | \item{\code{Type}}{ The type of wine, into one of three classes, 1 25 | (59 obs), 2 (71 obs), and 3 (48 obs).} 26 | 27 | \item{\code{Alcohol}}{Alcohol} 28 | 29 | \item{\code{Malic}}{Malic acid} 30 | 31 | \item{\code{Ash}}{Ash} 32 | 33 | \item{\code{Alcalinity}}{Alcalinity of ash} 34 | 35 | \item{\code{Magnesium}}{Magnesium} 36 | 37 | \item{\code{Phenols}}{Total phenols} 38 | 39 | \item{\code{Flavanoids}}{Flavanoids} 40 | 41 | \item{\code{Nonflavanoids}}{Nonflavanoid phenols} 42 | 43 | \item{\code{Proanthocyanins}}{Proanthocyanins} 44 | 45 | \item{\code{Color}}{Color intensity.} 46 | 47 | \item{\code{Hue}}{Hue} 48 | 49 | \item{\code{Dilution}}{OD280/OD315 of diluted wines.} 50 | 51 | \item{\code{Proline}}{Proline} 52 | 53 | } 54 | } 55 | \source{ 56 | 57 | The data was downloaded from the UCI Machine Learning Repository. 58 | 59 | It was read as a CSV file with no header using 60 | \code{\link{read.csv}}. The columns were then given the appropriate 61 | names using \code{\link{colnames}} and the Type was transformed into a 62 | factor using \code{\link{as.factor}}. The compressed R data file was 63 | saved using \code{\link{save}}: 64 | 65 | \preformatted{ 66 | UCI <- "https://archive.ics.uci.edu/ml" 67 | REPOS <- "machine-learning-databases" 68 | wine.url <- sprintf("%s/%s/wine/wine.data", UCI, REPOS) 69 | wine <- read.csv(wine.url, header=FALSE) 70 | colnames(wine) <- c('Type', 'Alcohol', 'Malic', 'Ash', 71 | 'Alcalinity', 'Magnesium', 'Phenols', 72 | 'Flavanoids', 'Nonflavanoids', 73 | 'Proanthocyanins', 'Color', 'Hue', 74 | 'Dilution', 'Proline') 75 | wine$Type <- as.factor(wine$Type) 76 | save(wine, file="wine.Rdata", compress=TRUE) 77 | } 78 | } 79 | 80 | \references{ 81 | 82 | Asuncion, A. & Newman, D.J. (2007). \emph{UCI Machine Learning 83 | Repository} 84 | [\url{https://www.ics.uci.edu/~mlearn/MLRepository.html}]. 
Irvine, CA: 85 | University of California, School of Information and Computer Science. 86 | 87 | } 88 | 89 | \keyword{datasets} 90 | -------------------------------------------------------------------------------- /vignettes/rattle.Rnw: -------------------------------------------------------------------------------- 1 | % \VignetteIndexEntry{Rattle Quick Start Guide} 2 | % \VignetteDepends{rattle} 3 | % \VignetteKeywords{data mining} 4 | % \VignettePackage{rattle} 5 | \documentclass[12pt]{article} 6 | \usepackage{amsmath} 7 | \usepackage[pdftex]{graphicx} 8 | \usepackage{color} 9 | \usepackage{xspace} 10 | \usepackage{fancyvrb} 11 | \usepackage{fancyhdr} 12 | \usepackage{lastpage} 13 | \usepackage{algorithm2e} 14 | \usepackage[ 15 | colorlinks=true, 16 | linkcolor=blue, 17 | citecolor=blue, 18 | urlcolor=blue] 19 | {hyperref} 20 | \usepackage{Sweave} 21 | 22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 23 | 24 | % define new colors for use 25 | \definecolor{darkgreen}{rgb}{0,0.6,0} 26 | \definecolor{darkred}{rgb}{0.6,0.0,0} 27 | \definecolor{lightbrown}{rgb}{1,0.9,0.8} 28 | \definecolor{brown}{rgb}{0.6,0.3,0.3} 29 | \definecolor{darkblue}{rgb}{0,0,0.8} 30 | \definecolor{darkmagenta}{rgb}{0.5,0,0.5} 31 | 32 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 33 | 34 | \newcommand{\Rattle}{\textsf{Rattle}\xspace} 35 | \newcommand{\pkg}[1]{{\tt #1}\xspace} 36 | 37 | \setlength{\oddsidemargin}{-.25 truein} 38 | \setlength{\evensidemargin}{0truein} 39 | \setlength{\topmargin}{-0.2truein} 40 | \setlength{\textwidth}{7 truein} 41 | \setlength{\textheight}{8.5 truein} 42 | \setlength{\parindent}{0.20truein} 43 | \setlength{\parskip}{0.10truein} 44 | 45 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 46 | \pagestyle{fancy} 47 | \lhead{} 48 | \chead{Rattle} 49 | \rhead{} 50 | \lfoot{} 51 | \cfoot{} 52 | \rfoot{\thepage\ of \pageref{LastPage}} 53 | \renewcommand{\headrulewidth}{1pt} 54 | 
\renewcommand{\footrulewidth}{1pt} 55 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 56 | 57 | \title{The Rattle Package: Quick Start Guide} 58 | \author{Graham Williams \\ Graham.Williams@togaware.com} 59 | 60 | \begin{document} 61 | 62 | \maketitle 63 | 64 | \thispagestyle{empty} 65 | 66 | \section{Introduction} 67 | 68 | \Rattle (Williams, 2011) is a package written in R providing a 69 | graphical user interface to very many other R packages that provide 70 | functionality for data mining. 71 | 72 | This quick start guide is under development. See 73 | \url{https://rattle.togaware.com} for extensive documentation 74 | 75 | \section{Requirements} 76 | 77 | \Rattle depends on over 40 other R packages and a couple of other 78 | software applications/libraries that are independent of R. The first 79 | thing to ensure is that you have installed the GTK+ libraries and the 80 | GGobi application. This is operating system dependent and full 81 | installation instructions are available from \url{https://rattle.togaware.com/}. 82 | 83 | Only a couple of R packages are dependencies for \Rattle. Most are 84 | suggestions, but without them functionality is quite limited. At a 85 | minimum it is useful to ensure you have the 86 | \href{https://cran.r-project.org/package=RGtk2}{\pkg{RGtk2}} package 87 | installed. 
Others that you might like to install include: 88 | \href{https://cran.r-project.org/package=ada}{\pkg{ada}}, 89 | \href{https://cran.r-project.org/package=arules}{\pkg{arules}}, 90 | \href{https://cran.r-project.org/package=doBy}{\pkg{doBy}}, 91 | \href{https://cran.r-project.org/package=ellipse}{\pkg{ellipse}}, 92 | \href{https://cran.r-project.org/package=fBasics}{\pkg{fBasics}}, 93 | \href{https://cran.r-project.org/package=fpc}{\pkg{fpc}}, 94 | \href{https://cran.r-project.org/package=gplots}{\pkg{gplots}}, 95 | \href{https://cran.r-project.org/package=Hmisc}{\pkg{Hmisc}}, 96 | \href{https://cran.r-project.org/package=kernlab}{\pkg{kernlab}}, 97 | \href{https://cran.r-project.org/package=mice}{\pkg{mice}}, 98 | \href{https://cran.r-project.org/package=party}{\pkg{party}}, 99 | \href{https://cran.r-project.org/package=playwith}{\pkg{playwith}}, 100 | \href{https://cran.r-project.org/package=pmml}{\pkg{pmml}}, 101 | \href{https://cran.r-project.org/package=randomForest}{\pkg{randomForest}}, 102 | \href{https://cran.r-project.org/package=reshape}{\pkg{reshape}}, 103 | \href{https://cran.r-project.org/package=rggobi}{\pkg{rggobi}}, 104 | \href{https://cran.r-project.org/package=RGtk2}{\pkg{RGtk2}}, 105 | \href{https://cran.r-project.org/package=ROCR}{\pkg{ROCR}}, 106 | \href{https://cran.r-project.org/package=RODBC}{\pkg{RODBC}}, and 107 | \href{https://cran.r-project.org/package=rpart}{\pkg{rpart}}. 
108 | 109 | The packages will usually be installed with the following command: 110 | 111 | <>= 112 | install.packages("rattle", dependencies=c("Depends", "Suggests")) 113 | @ 114 | 115 | The latest beta version of rattle is available from 116 | \url{https://rattle.togaware.com/}: 117 | 118 | <>= 119 | install.packages("rattle", repos="https://rattle.togaware.com", type="source") 120 | @ 121 | 122 | \section{First Steps} 123 | 124 | Start up rattle: 125 | <>= 126 | library(rattle) 127 | rattle() 128 | @ 129 | 130 | \section{Simple Scenario: Build a Couple of Models} 131 | 132 | \begin{enumerate} 133 | \item Click Execute 134 | \item Click Yes (load the sample weather dataset) 135 | \item Click the Model tab 136 | \item Click Execute (to build a decision tree) 137 | \item Click Draw to display the decision tree (loads other packages as required) 138 | \item Click the Forest radio button 139 | \item Click Execute (to build a random forest - loads packages as required) 140 | \item Click the Evaluate tab 141 | \item Click the Risk radio button (installs packages as required) 142 | \item Click Execute to display two Risk (Cumulative) performance plots 143 | \item Click the Log tab 144 | \item Click the Export button to save script to file weather\_script.R to home folder 145 | \end{enumerate} 146 | 147 | Now exit from R (and rattle) and start R up again. 148 | 149 | <>= 150 | source("~/weather_script.R") 151 | @ 152 | 153 | This will rerun everything that was done in the GUI session but purely as a script. 154 | 155 | \section{References} 156 | 157 | \begin{description} 158 | \item Williams, G. J. (2009). {\em Rattle: A Data Mining GUI for R}. 159 | The R Journal, 1(2), 45-55. URL: 160 | \href{https://journal.r-project.org/archive/2009-2/RJournal_2009-2_Williams.pdf} 161 | {https://journal.r-project.org/archive/2009-2/RJournal\_2009-2\_Williams.pdf}. 162 | \item Williams, G. J. (2011). 
{\em Data Mining with Rattle and R: The 163 | Art of Excavating Data for Knowledge Discovery}. Use R! 164 | series. Springer. \href{https://bit.ly/rattle_data_mining}{https://bit.ly/rattle\_data\_mining}. 165 | \end{description} 166 | 167 | \end{document} 168 | --------------------------------------------------------------------------------