├── .gitignore ├── Makefile ├── README.md ├── package.json ├── s3.exclude ├── tools ├── fuse.coffee ├── server.coffee └── site-src │ ├── 404.jade │ ├── _templates │ ├── default.jade │ ├── footer.jade │ ├── head.jade │ ├── header.jade │ └── scripts.jade │ ├── assets │ ├── images │ │ └── h2o-home.png │ ├── styles │ │ ├── font-awesome.min.css │ │ ├── futura-mock.css │ │ ├── github.css │ │ ├── icomoon.css │ │ └── screen.css │ └── vendor │ │ ├── bootstrap │ │ ├── css │ │ │ ├── bootstrap-theme.css │ │ │ ├── bootstrap-theme.css.map │ │ │ ├── bootstrap-theme.min.css │ │ │ ├── bootstrap.css │ │ │ ├── bootstrap.css.map │ │ │ └── bootstrap.min.css │ │ ├── fonts │ │ │ ├── glyphicons-halflings-regular.eot │ │ │ ├── glyphicons-halflings-regular.svg │ │ │ ├── glyphicons-halflings-regular.ttf │ │ │ └── glyphicons-halflings-regular.woff │ │ └── js │ │ │ ├── bootstrap.js │ │ │ └── bootstrap.min.js │ │ ├── fastclick.min.js │ │ ├── imagesloaded.pkgd.js │ │ ├── jquery-1.11.1.js │ │ ├── jquery-1.11.1.min.js │ │ ├── jquery-1.11.1.min.map │ │ ├── jquery.actual.min.js │ │ ├── jquery.html5support.min.js │ │ ├── jquery.slimscroll.min.js │ │ ├── jquery.touchwipe.min.js │ │ ├── jquery.vticker.min.js │ │ └── modernizr.min.js │ └── index.md └── tutorials ├── Training_img ├── AUC1.png ├── AUC2.png ├── ClusterStatus.png ├── ConfMtx1.png ├── ConfMtx2.png ├── CreateFrame1.png ├── CreateFrame2.png ├── Export.png ├── GLMResults.png ├── GainsLift.png ├── GainsLift1.png ├── GainsLift2.png ├── HitRatio.png ├── IOStatus.png ├── Import.png ├── Import2.png ├── Impute1.png ├── Impute2.png ├── Inspect1.png ├── Inspect2.png ├── Interaction1.png ├── Interaction2.png ├── Jobs.png ├── Logs.png ├── NetworkTest.png ├── PCAScore.png ├── POJO.png ├── Parse.png ├── PerfBar.png ├── Predict1.png ├── PredictModel.png ├── PredictResults.png ├── Profiler.png ├── Profiler2.png ├── Quantiles1.png ├── Quantiles2.png ├── STEAMtabular.png ├── SplitFrame.png ├── SplitFrame2.png ├── StackDump.png ├── Summary1.png ├── Summary2.png ├── Tasks.png ├── Timeline.png ├── Training.md ├── UDPDrop.png ├── Upload1.png ├── Upload2.png └── View.png ├── advanced ├── binaryClassificationHelper.R.md ├── features │ └── features.R.md ├── higgs │ ├── .DS_Store │ ├── higgs.R.md │ └── images │ │ └── higgs.png └── tools │ └── tools.R.md ├── basics └── basics.R.md ├── bigdataenv └── H2OinBigDataEnvironments.pdf ├── devel ├── droplets │ ├── images │ │ ├── 1-OpenProject.png │ │ ├── 2-JavaDroplet.png │ │ ├── 3-RebuildProject.png │ │ ├── 4-RunTest.png │ │ └── 5-TestPassed.png │ └── tutorial.md ├── hacking │ ├── Grep.md │ ├── KMeans.md │ └── Quantiles.md └── sparkling_water │ ├── images │ ├── h2o-ui.png │ ├── rstudio.png │ └── spark-ui.png │ └── tutorial.md ├── extab ├── excel.md ├── images │ ├── excel01.png │ ├── excel02.png │ ├── excel03.png │ ├── excel04.png │ ├── tableau_dashboard.png │ ├── tableau_data_connection1.png │ ├── tableau_data_connection2.png │ ├── tableau_execute.png │ ├── tableau_execute2.png │ ├── tableau_execute3.png │ ├── tableau_execute4.png │ ├── tableau_h2o_parameters.png │ ├── tableau_r_connection1.png │ ├── tableau_r_connection2.png │ └── workflow.png └── tableau.md ├── hive_udf_template ├── GBM-example.R ├── README.md ├── localjars │ └── h2o-model.jar ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── h2o │ │ └── hive │ │ └── udf │ │ ├── GBMPojo.java │ │ └── ScoreDataUDF.java │ └── test │ └── java │ └── com │ └── h2o │ └── hive │ └── udf │ └── UDFExampleTest.java ├── marketing_usecases ├── h2o_training_yan_2014.pdf ├── h2o_world_Vinod.pdf └── 
marketing_usecases.R.md ├── setup ├── images │ ├── 01_virtualbox.png │ ├── 02_vb_menu.png │ ├── 03_select_file.png │ └── 04_import_wizard.png └── install.md ├── streaming └── storm │ ├── H2OStormStarter.java │ ├── README.md │ ├── TestH2ODataSpout.java │ ├── example.R │ ├── images │ ├── cats_n_dogs.png │ ├── h2o_storm.png │ ├── ij_1.png │ ├── ij_10.png │ ├── ij_11.png │ ├── ij_2.png │ ├── ij_3.png │ ├── ij_4.png │ ├── ij_6.png │ ├── ij_7.png │ ├── ij_8.png │ └── ij_9.png │ ├── live_data.csv │ ├── premade_generated_model │ ├── GBMPojo.java │ └── h2o-genmodel.jar │ ├── training_data.csv │ └── web │ ├── cat.png │ ├── cloud.png │ ├── dog.png │ ├── index.html │ └── out ├── supervised ├── classification │ ├── classification.R.md │ └── images │ │ ├── glm_f1_cutoff_0.png │ │ ├── glm_f1_cutoff_1.png │ │ ├── glm_roc_0.png │ │ ├── glm_roc_1.png │ │ └── metrics.png ├── deeplearning │ └── deeplearning.R.md ├── gbm │ └── gbm.R.md ├── glm │ └── glm.R.md ├── randomforest │ └── randomforest.R.md └── regression │ ├── images │ └── rand_glm_coef.png │ └── regression.R.md ├── troubleshooting ├── images │ ├── Clusterstattunnel.png │ ├── TroubleshootingHadoopAmbariNodeMgr.png │ ├── TroubleshootingHadoopAmbariyarnscheduler.png │ ├── TroubleshootingHadoopClouderayarnnodemgr.png │ ├── TroubleshootingHadoopClouderayarnscheduler.png │ └── UpdateR.png └── troubleshooting.md ├── unsupervised ├── anomaly │ ├── anomaly.R.md │ └── images │ │ ├── autoencoder.png │ │ ├── bad_both.png │ │ ├── good_both.png │ │ └── ugly_both.png ├── clustering │ └── clustering.R.md └── dimreduction │ ├── dimreduction.R.md │ └── images │ └── mnist_pca_sdev.png └── web_ui └── tutorial.md /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | build/ 3 | 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: node_modules build/site-src 2 | node_modules/.bin/coffee tools/fuse.coffee build/site-src build/site 3 | 4 | run: node_modules 5 | @echo "Go to http://localhost:8080/" 6 | node_modules/.bin/coffee tools/server.coffee build/site 7 | 8 | # After 'make run' you can do this to get to the web site. 9 | browser_on_mac: 10 | open http://localhost:8080 11 | 12 | clean: 13 | rm -rf build 14 | 15 | node_modules: package.json 16 | npm install 17 | 18 | build/site-src: 19 | rsync -rupE tools/site-src build/ 20 | rsync -rupE tutorials/ build/site-src/ 21 | 22 | mrproper: clean 23 | rm -rf node_modules/ 24 | 25 | install: 26 | rm -rf /opt/h2o-training 27 | cp -r build/site/ /opt/h2o-training 28 | 29 | test: 30 | s3cmd sync --dry-run --delete-removed --acl-public --exclude-from s3.exclude build/site/ s3://train.h2o.ai/ 31 | 32 | push: 33 | s3cmd sync --delete-removed --acl-public --exclude-from s3.exclude build/site/ s3://train.h2o.ai/ 34 | 35 | .PHONY: build run clean test push build/site-src test 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Please go to the h2o-tutorials repository instead! 2 | 3 | ## This repository is now outdated, and contains material for H2O World 2014 training using H2O 2.8. 
4 |
5 | ---
6 |
7 | H2O Training
8 | ============
9 |
10 | ## Follow tutorials
11 |
12 |
13 | ## Build a site
14 |
15 | * Run `make build` to build the site
16 | * Run `make run` to run a web server
17 | * Go to [http://localhost:8080/](http://localhost:8080/) to see the generated site
18 |
-------------------------------------------------------------------------------- /package.json: --------------------------------------------------------------------------------
1 | {
2 |   "name": "0xdata.com",
3 |   "version": "0.0.0",
4 |   "description": "0xdata Public Website",
5 |   "main": "index.js",
6 |   "scripts": {
7 |     "test": "echo \"Error: no test specified\" && exit 1"
8 |   },
9 |   "repository": {
10 |     "type": "git",
11 |     "url": "https://github.com/0xdata/0xdata.com.git"
12 |   },
13 |   "keywords": [
14 |     "0xdata.com",
15 |     "website"
16 |   ],
17 |   "author": "Prithvi Prabhu ",
18 |   "license": "ISC",
19 |   "bugs": {
20 |     "url": "https://github.com/0xdata/0xdata.com/issues"
21 |   },
22 |   "homepage": "https://github.com/0xdata/0xdata.com",
23 |   "devDependencies": {
24 |     "highlight.js": "~8.2.0"
25 |   },
26 |   "dependencies": {
27 |     "escape-html": "~1.0.1",
28 |     "twit": "~1.1.18",
29 |     "coffee-script": "~1.7.1",
30 |     "jade": "~1.5.0",
31 |     "js-yaml": "~3.1.0",
32 |     "marked": "~0.3.2",
33 |     "fs-extra": "~0.10.0",
34 |     "connect": "~3.1.0",
35 |     "serve-static": "~1.5.0",
36 |     "dateformat": "~1.0.8-1.2.3",
37 |     "underscore": "~1.6.0"
38 |   }
39 | }
40 |
-------------------------------------------------------------------------------- /s3.exclude: --------------------------------------------------------------------------------
1 | .DS_Store
2 | .git/*
3 | *.swp
4 |
-------------------------------------------------------------------------------- /tools/fuse.coffee: --------------------------------------------------------------------------------
1 | fs = require 'fs'
2 | fp = require 'path'
3 | fse = require 'fs-extra'
4 | jade = require 'jade'
5 | marked = require 'marked'
6 | yaml = require 'js-yaml'
7 | dateformat = require 'dateformat'
8 | underscore = require 'underscore'
9 | highlight = require 'highlight.js'
10 |
11 | SIM = no
12 |
13 | SITEMAP_XML = '''
14 | <?xml version="1.0" encoding="UTF-8"?>
15 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
16 | {{urls}}
17 | </urlset>
18 | '''
[lines 14-17 reconstructed from the standard sitemap schema; the angle-bracket markup was stripped during extraction. The per-URL <url><loc>...</loc></url> wrapper inside createSitemapXml (fuse.coffee line 55, below) was stripped the same way.]
19 |
20 | marked.setOptions
21 |   smartypants: yes
22 |   highlight: (code, lang) ->
23 |     (highlight.highlightAuto code, [ lang ]).value
24 |
25 | isYaml = (ext) -> ext.toLowerCase() is '.yml'
26 | isJade = (ext) -> ext.toLowerCase() is '.jade'
27 | isMarkdown = (ext) -> ext.toLowerCase() is '.md'
28 | isContent = (ext) -> (isMarkdown ext) or (isJade ext)
29 | readFile = (path) -> fs.readFileSync path, 'utf8'
30 | writeFile = fse.outputFileSync
31 | copyFile = fse.copySync
32 |
33 | formBin = (src) ->
34 |   if isContent src.ext
35 |     if src.dir is '.'
and (src.slug is 'index' or src.slug is '404') 36 | path: fp.join src.dir, "#{src.slug}.html" 37 | else 38 | if 0 < src.slug.indexOf '_' 39 | tokens = src.slug.split /_+/g 40 | tokens.unshift src.dir 41 | tokens.push 'index.html' 42 | path: fp.join.apply null, tokens 43 | else 44 | path: fp.join src.dir, "#{src.slug}.html" 45 | #path: fp.join src.dir, src.slug, 'index.html' 46 | else 47 | path: src.path 48 | 49 | #TODO turn this into a plugin 50 | createSitemapTxt = (targetDir, urls) -> 51 | writeFile (fp.join targetDir, 'sitemap.txt'), urls.join '\n' 52 | 53 | #TODO turn this into a plugin 54 | createSitemapXml = (targetDir, urls) -> 55 | writeFile (fp.join targetDir, 'sitemap.xml'), SITEMAP_XML.replace '{{urls}}', (urls.map (url) -> "#{url}").join '\n' 56 | 57 | createCategorySlug = (category) -> 58 | category 59 | .toLowerCase() 60 | .replace /[^a-z0-9 ]/g, '' 61 | .replace /\s+/g, '-' 62 | 63 | _templates = {} 64 | loadTemplate = (path, cache=yes) -> 65 | if cache and template = _templates[path] 66 | template 67 | else 68 | template = jade.compileFile path, 69 | filename: path 70 | if cache 71 | _templates[path] = template 72 | template 73 | 74 | forEachPage = (node, go) -> 75 | for name, child of node 76 | if child.__fuse__ 77 | go node, child 78 | else 79 | forEachPage child, go 80 | return 81 | 82 | walkSources = (sourceDir, currentDir, node) -> 83 | for name in (fs.readdirSync currentDir) when name[0] isnt '.' 84 | path = fp.join currentDir, name 85 | stat = fs.statSync path 86 | if stat.isDirectory() or stat.isFile() 87 | relpath = fp.relative sourceDir, path 88 | if stat.isDirectory() 89 | if name isnt '_templates' 90 | node[name] = leaf = {} 91 | walkSources sourceDir, path, leaf 92 | else 93 | ext = fp.extname path 94 | slug = fp.basename path, ext 95 | dir = fp.dirname relpath 96 | 97 | src = 98 | dir: dir 99 | slug: slug 100 | ext: ext 101 | path: relpath 102 | 103 | bin = formBin src 104 | path = bin.path.split fp.sep 105 | path.pop() if path[path.length - 1] is 'index.html' 106 | 107 | unless (isContent ext) and slug[0] is '_' and not slug is '_sidebar' 108 | node[name] = 109 | __fuse__: yes 110 | src: src 111 | bin: bin 112 | ext: if ext[0] is '.' then (ext.substr 1).toLowerCase() else ext.toLowerCase() 113 | url: '/' + path.join '/' 114 | path: path 115 | node 116 | 117 | fuse = (context, sourceDir, targetDir) -> 118 | unless fs.existsSync sourceDir 119 | throw new Error 'Source directory does not exist: ' + sourceDir 120 | 121 | unless fs.statSync(sourceDir).isDirectory() 122 | throw new Error 'Not a directory: ' + sourceDir 123 | 124 | tree = walkSources sourceDir, sourceDir, {} 125 | 126 | tree.find = (path) -> 127 | node = tree 128 | for slug in path 129 | unless node = node[slug] 130 | return null 131 | node 132 | 133 | console.log 'Parsing files...' 134 | 135 | forEachPage tree, (parent, item) -> 136 | sourcePath = fp.join sourceDir, item.src.path 137 | if isMarkdown item.src.ext 138 | console.log 'Parsing ' + sourcePath 139 | content = readFile sourcePath 140 | if content[0 ... 
3] is '---' 141 | result = content.match /^-{3,}\s([\s\S]*?)-{3,}(\s[\s\S]*|\s?)$/ 142 | if result?.length is 3 143 | [ match, metadata, markdown ] = result 144 | else 145 | markdown = content 146 | else 147 | markdown = content 148 | 149 | if metadata 150 | properties = yaml.safeLoad metadata 151 | for k, v of properties 152 | item[k] = if k is 'date' then new Date v else v 153 | 154 | if markdown 155 | item.content = marked markdown 156 | 157 | else if isYaml item.src.ext 158 | console.log 'Parsing ' + sourcePath 159 | item.content = yaml.safeLoad readFile sourcePath 160 | 161 | console.log 'Building site...' 162 | 163 | forEachPage tree, (parent, page) -> 164 | if isMarkdown page.src.ext 165 | if page.src.slug[0] isnt '_' 166 | console.log 'Processing: ' + page.src.path 167 | template = loadTemplate fp.join sourceDir, '_templates', "#{page.template or 'default'}.jade" #TODO 168 | html = template 169 | context: context 170 | pages: tree 171 | page: page 172 | 173 | binPath = fp.join targetDir, page.bin.path 174 | console.log "#{page.src.path} --> #{binPath}" 175 | writeFile binPath, html unless SIM 176 | 177 | else if isJade page.src.ext 178 | if page.src.slug[0] isnt '_' 179 | console.log 'Processing: ' + page.src.path 180 | 181 | sourcePath = fp.join sourceDir, page.src.path 182 | render = loadTemplate sourcePath, no 183 | page.content = render 184 | context: context 185 | pages: tree 186 | page: page 187 | 188 | template = loadTemplate fp.join sourceDir, '_templates', 'default.jade' #TODO 189 | html = template 190 | context: context 191 | pages: tree 192 | page: page 193 | 194 | binPath = fp.join targetDir, page.bin.path 195 | console.log "#{page.src.path} --> #{binPath}" 196 | writeFile binPath, html unless SIM 197 | else 198 | console.log 'Copying: ' + page.src.path 199 | srcPath = fp.join sourceDir, page.src.path 200 | binPath = fp.join targetDir, page.bin.path 201 | console.log "#{srcPath} --> #{binPath}" 202 | copyFile srcPath, binPath unless SIM 203 | 204 | if SIM 205 | console.log 'Dumping...' 206 | writeFile 'fuse.dump', JSON.stringify tree, null, 2 207 | 208 | 209 | urls = [] 210 | forEachPage tree, (parent, page) -> 211 | if isContent page.src.ext 212 | urls.push 'http://train.h2o.ai' + page.url 213 | 214 | console.log 'Creating sitemaps...' 215 | createSitemapTxt targetDir, urls 216 | createSitemapXml targetDir, urls 217 | 218 | console.log 'Done!' 219 | 220 | [ runtime, script, sourceDir, targetDir ] = process.argv 221 | 222 | context = 223 | underscore: underscore 224 | formatDate: dateformat 225 | 226 | fuse context, sourceDir, targetDir 227 | 228 | return 229 | -------------------------------------------------------------------------------- /tools/server.coffee: -------------------------------------------------------------------------------- 1 | connect = require 'connect' 2 | serveStatic = require 'serve-static' 3 | connect() 4 | .use serveStatic process.argv[2] 5 | .listen 8080 6 | -------------------------------------------------------------------------------- /tools/site-src/404.jade: -------------------------------------------------------------------------------- 1 | h1 Whoops 2 | h3 Looks like what you're looking for can't be found. 
3 | -------------------------------------------------------------------------------- /tools/site-src/_templates/default.jade: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | html(lang="en") 5 | 6 | head 7 | include head.jade 8 | 9 | body.single 10 | main.content 11 | include header.jade 12 | 13 | .container(class='path-#{page.path[0]}') 14 | if page.title 15 | h1= page.title 16 | != page.content 17 | 18 | include footer.jade 19 | include scripts.jade 20 | 21 | -------------------------------------------------------------------------------- /tools/site-src/_templates/footer.jade: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tools/site-src/_templates/footer.jade -------------------------------------------------------------------------------- /tools/site-src/_templates/head.jade: -------------------------------------------------------------------------------- 1 | meta(charset="UTF-8") 2 | meta(name='viewport', content='width=device-width, initial-scale=1.0') 3 | - var pageTitle = page.title 4 | title!= '0xdata' + (pageTitle ? ' - ' + pageTitle : '') 5 | meta(name='description', content='0xdata, makers of H2O - The Open Source In-Memory Prediction Engine for Big Data Science') 6 | link(rel='publisher', href='https://plus.google.com/109486779212435464374/') 7 | meta(property='og:title', content='0xdata, makers of H2O - The Open Source In-Memory Prediction Engine for Big Data Science') 8 | meta(property='og:type', content='article') 9 | meta(property='og:image', content='http://0xdata.com/assets/images/h2o.png') 10 | meta(property='og:url', content='http://0xdata.com') 11 | meta(property='og:description', content='H2O makes Hadoop do Math! H2O scales statistics, machine learning and math over Big Data. 
H2O keeps familiar interfaces like R, Excel & JSON so that big data enthusiasts and experts can explore, munge, model and score data sets using a range of simple to advanced algorithms.')
12 | link(rel='shortcut icon', href='/img/favicon.ico')
13 | link(rel='stylesheet', type='text/css', href='/assets/vendor/bootstrap/css/bootstrap.min.css', media='screen')
14 | link(rel='stylesheet', type='text/css', href='/assets/styles/icomoon.css', media='screen')
15 | link(rel='stylesheet', type='text/css', href='/assets/styles/font-awesome.min.css', media='screen')
16 | link(rel='stylesheet', type='text/css', href='/assets/styles/screen.css', media='screen')
17 | link(rel='stylesheet', type='text/css', href='/assets/styles/github.css', media='screen')
18 |
19 |
20 | //- Remove this TypeKit is used
21 | //link(rel='stylesheet', type='text/css', href='/assets/styles/futura-mock.css', media='screen')
22 | //- /end Remove this TypeKit is used
23 |
24 |
[head.jade lines 25-28 and 31-39: inline script blocks stripped during extraction]
29 |
30 |
40 |
41 |
-------------------------------------------------------------------------------- /tools/site-src/_templates/header.jade: --------------------------------------------------------------------------------
1 | header.content-header
2 | .site-logo
3 | .container
4 |
5 |
-------------------------------------------------------------------------------- /tools/site-src/_templates/scripts.jade: --------------------------------------------------------------------------------
1 | script(type='text/javascript', src='/assets/vendor/jquery-1.11.1.min.js')
2 | script(type='text/javascript', src='/assets/vendor/modernizr.min.js')
3 |
4 |
[scripts.jade lines 5-6: inline markup stripped during extraction]
7 |
8 | script(type='text/javascript', src='/assets/vendor/bootstrap/js/bootstrap.min.js')
9 |
10 | script(type='text/javascript').
11 |   Modernizr.load([
12 |     {
13 |       test: (Modernizr.touch),
14 |       yep: ['/assets/vendor/fastclick.min.js'],
15 |       complete: function() {
16 |         if (Modernizr.touch) {
17 |           $('body').append(" [rest of the line stripped during extraction]

[Extraction gap: everything between scripts.jade and tutorials/streaming/storm/web/out is missing from this dump. Per the directory tree above, the gap would cover the remainder of scripts.jade, tools/site-src/index.md, and the tutorial sources (Training.md, the advanced/basics/bigdataenv/devel/extab/hive_udf_template/marketing_usecases/setup trees, and the streaming/storm files, including the web app's index.html). Only stray line numbers (7-9, 16-17, 80-99, 207-210) and two bare "0" text nodes - apparently the cat/dog counters from the storm demo's score display - survived the markup stripping.]
-------------------------------------------------------------------------------- /tutorials/streaming/storm/web/out: --------------------------------------------------------------------------------
1 | cat,cat
-------------------------------------------------------------------------------- /tutorials/supervised/classification/images/glm_f1_cutoff_0.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/supervised/classification/images/glm_f1_cutoff_0.png
-------------------------------------------------------------------------------- /tutorials/supervised/classification/images/glm_f1_cutoff_1.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/supervised/classification/images/glm_f1_cutoff_1.png
-------------------------------------------------------------------------------- /tutorials/supervised/classification/images/glm_roc_0.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/supervised/classification/images/glm_roc_0.png
-------------------------------------------------------------------------------- /tutorials/supervised/classification/images/glm_roc_1.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/supervised/classification/images/glm_roc_1.png
-------------------------------------------------------------------------------- /tutorials/supervised/classification/images/metrics.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/supervised/classification/images/metrics.png
-------------------------------------------------------------------------------- /tutorials/supervised/gbm/gbm.R.md: --------------------------------------------------------------------------------
1 | # Introduction to Gradient Boosting Machines in H2O
2 |
3 | ###### This tutorial introduces H2O's Gradient (Tree) Boosting Machines framework in R.
4 |
5 | #### Gradient Boosting Machines (GBM)
6 |
7 | ##### Intuition: Average an ensemble of weakly predicting (small) trees where each tree "adjusts" to the "mistakes" of the preceding trees.
8 |
9 | ##### Important components:
10 | ###### 1. Number of trees
11 | ###### 2. Maximum depth of tree
12 | ###### 3. Learning rate ( *shrinkage* parameter)
13 |
14 | ###### where smaller learning rates tend to require a larger number of trees, and vice versa.
15 |
16 | ### R Documentation
17 |
18 | ###### The `h2o.gbm` function fits H2O's Gradient Boosting Machines from within R.
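###### For instance, a minimal call might look like the following. (This is a sketch, not part of the original tutorial: it assumes a running H2O cloud, an imported frame `train.hex` whose response is the last column, and 2.8-era argument names `n.trees`, `interaction.depth` and `shrinkage` - verify them against `args(h2o.gbm)` below.)

    # hypothetical example: many small trees with a conservative learning rate
    gbm.model <- h2o.gbm(x = 1:(ncol(train.hex) - 1),  # predictor columns
                         y = ncol(train.hex),          # response column
                         data = train.hex,
                         n.trees = 100,                # component 1: number of trees
                         interaction.depth = 5,        # component 2: maximum depth
                         shrinkage = 0.01)             # component 3: learning rate

###### The exact signature can be checked directly: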
19 | 20 | library(h2o) 21 | args(h2o.gbm) 22 | 23 | ###### The R documentation (man page) for H2O's Gradient Boosting Machines can be opened from within R using the `help` or `?` functions: 24 | 25 | help(h2o.gbm) 26 | 27 | ###### We can run the example from the man page using the `example` function: 28 | 29 | example(h2o.gbm) 30 | 31 | ###### And run a longer demonstration from the `h2o` package using the `demo` function: 32 | 33 | demo(h2o.gbm) 34 | -------------------------------------------------------------------------------- /tutorials/supervised/glm/glm.R.md: -------------------------------------------------------------------------------- 1 | # Introduction to Generalized Linear Models in H2O 2 | 3 | ###### This tutorial introduces H2O's Generalized Linear Models (GLM) framework in R. 4 | 5 | ### Generalized Linear Models (GLM) 6 | 7 | #### Intuition: A linear combination of predictors is sufficient for determining an outcome. 8 | 9 | ##### Important components: 10 | ###### 1. Exponential family for error distribution (Gaussian/Normal, Binomial, Poisson, Gamma, Tweedie, etc.) 11 | ###### 2. Link function, whose inverse is used to generate predictions 12 | ###### 3. (Elastic Net) Mixing parameter between the L1 and L2 penalties on the coefficient estimates. 13 | ###### 4. (Elastic Net) Shrinkage parameter for the mixed penalty in 3. 14 | 15 | ### R Documentation 16 | 17 | ###### The `h2o.glm` function fits H2O's Generalized Linear Models from within R. 18 | 19 | library(h2o) 20 | args(h2o.glm) 21 | 22 | ###### The R documentation (man page) for H2O's Generalized Linear Models can be opened from within R using the `help` or `?` functions: 23 | 24 | help(h2o.glm) 25 | 26 | ###### We can run the example from the man page using the `example` function: 27 | 28 | example(h2o.glm) 29 | 30 | ###### And run a longer demonstration from the `h2o` package using the `demo` function: 31 | 32 | demo(h2o.glm) 33 | -------------------------------------------------------------------------------- /tutorials/supervised/randomforest/randomforest.R.md: -------------------------------------------------------------------------------- 1 | # Introduction to Random Forests in H2O 2 | 3 | ###### This tutorial introduces H2O's Random Forest framework in R. 4 | 5 | #### Random Forests 6 | 7 | ##### Intuition: Average an ensemble of weakly predicting (larger) trees where each tree is *de-correlated* from all other trees. 8 | 9 | ##### Important components: 10 | ###### 1. Number of trees 11 | ###### 2. Maximum depth of tree 12 | ###### 3. Number of variables randomly sampled as candidates for splits 13 | ###### 4. Sampling rate for constructing data set to use on each tree 14 | 15 | ### R Documentation 16 | 17 | ###### The `h2o.randomForest` function fits H2O's Random Forest from within R. 
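###### As with GBM, a minimal call might look like the following. (A sketch, not part of the original tutorial: it assumes a running H2O cloud, an imported frame `train.hex` with the response in the last column, and 2.8-era argument names `ntree`, `depth`, `mtries` and `sample.rate` - verify them against `args(h2o.randomForest)` below.)

    # hypothetical example mapping the four components above onto arguments
    rf.model <- h2o.randomForest(x = 1:(ncol(train.hex) - 1), y = ncol(train.hex),
                                 data = train.hex,
                                 ntree = 50,         # component 1: number of trees
                                 depth = 20,         # component 2: maximum depth
                                 mtries = -1,        # component 3: -1 = default split candidates
                                 sample.rate = 2/3)  # component 4: row sampling rate per tree

###### The exact signature is available via: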
18 | 19 | library(h2o) 20 | args(h2o.randomForest) 21 | 22 | ###### The R documentation (man page) for H2O's Random Forest can be opened from within R using the `help` or `?` functions: 23 | 24 | help(h2o.randomForest) 25 | 26 | ###### We can run the example from the man page using the `example` function: 27 | 28 | example(h2o.randomForest) 29 | 30 | ###### And run a longer demonstration from the `h2o` package using the `demo` function: 31 | 32 | demo(h2o.randomForest) 33 | -------------------------------------------------------------------------------- /tutorials/supervised/regression/images/rand_glm_coef.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/supervised/regression/images/rand_glm_coef.png -------------------------------------------------------------------------------- /tutorials/troubleshooting/images/Clusterstattunnel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/troubleshooting/images/Clusterstattunnel.png -------------------------------------------------------------------------------- /tutorials/troubleshooting/images/TroubleshootingHadoopAmbariNodeMgr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/troubleshooting/images/TroubleshootingHadoopAmbariNodeMgr.png -------------------------------------------------------------------------------- /tutorials/troubleshooting/images/TroubleshootingHadoopAmbariyarnscheduler.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/troubleshooting/images/TroubleshootingHadoopAmbariyarnscheduler.png -------------------------------------------------------------------------------- /tutorials/troubleshooting/images/TroubleshootingHadoopClouderayarnnodemgr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/troubleshooting/images/TroubleshootingHadoopClouderayarnnodemgr.png -------------------------------------------------------------------------------- /tutorials/troubleshooting/images/TroubleshootingHadoopClouderayarnscheduler.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/troubleshooting/images/TroubleshootingHadoopClouderayarnscheduler.png -------------------------------------------------------------------------------- /tutorials/troubleshooting/images/UpdateR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/troubleshooting/images/UpdateR.png -------------------------------------------------------------------------------- /tutorials/unsupervised/anomaly/anomaly.R.md: -------------------------------------------------------------------------------- 1 | # Anomaly Detection on MNIST with H2O Deep 
Learning 2 | 3 | ######This tutorial shows how a Deep Learning [Auto-Encoder](http://en.wikipedia.org/wiki/Autoencoder) model can be used to find outliers in a dataset. This file is both valid R and markdown code. 4 | 5 | ######Consider the following three-layer neural network with one hidden layer and the same number of input neurons (features) as output neurons. The loss function is the MSE between the input and the output. Hence, the network is forced to learn the identity via a nonlinear, reduced representation of the original data. Such an algorithm is called a deep autoencoder; these models have been used extensively for unsupervised, layer-wise pretraining of supervised deep learning tasks, but here we consider the autoencoder's application for discovering anomalies in data. 6 | 7 | #####![](images/autoencoder.png) 8 | 9 | ######We use the well-known [MNIST](http://yann.lecun.com/exdb/mnist/) dataset of hand-written digits, where each row contains the 28^2=784 raw gray-scale pixel values from 0 to 255 of the digitized digits (0 to 9). 10 | 11 | ### Start H2O and load the MNIST data 12 | 13 | ######Initialize the H2O server and import the MNIST training/testing datasets. 14 | 15 | library(h2o) 16 | h2oServer <- h2o.init(nthreads=-1) 17 | homedir <- "/data/h2o-training/mnist/" 18 | TRAIN = "train.csv.gz" 19 | TEST = "test.csv.gz" 20 | train_hex <- h2o.importFile(h2oServer, path = paste0(homedir,TRAIN), header = F, sep = ',', key = 'train.hex') 21 | test_hex <- h2o.importFile(h2oServer, path = paste0(homedir,TEST), header = F, sep = ',', key = 'test.hex') 22 | 23 | ######The data consists of 784 (=28^2) pixel values per row, with (gray-scale) values from 0 to 255. The last column is the response (a label in 0,1,2,...,9). 24 | 25 | predictors = c(1:784) 26 | resp = 785 27 | 28 | ######We do unsupervised training, so we can drop the response column. 29 | 30 | train_hex <- train_hex[,-resp] 31 | test_hex <- test_hex[,-resp] 32 | 33 | ### Finding outliers - ugly hand-written digits 34 | ######We train a Deep Learning Auto-Encoder to learn a compressed (low-dimensional) non-linear representation of the dataset, hence learning the intrinsic structure of the training dataset. The auto-encoder model is then used to transform all test set images to their reconstructed images, by passing through the lower-dimensional neural network. We then find outliers in a test dataset by comparing the reconstruction of each scanned digit with its original pixel values. The idea is that a high reconstruction error of a digit indicates that the test set point doesn't conform to the structure of the training data and can hence be called an outlier. 35 | 36 | ####1. Learn what's *normal* from the training data 37 | 38 | ######Train unsupervised Deep Learning autoencoder model on the training dataset. For simplicity, we train a model with 1 hidden layer of 50 Tanh neurons to create 50 non-linear features with which to reconstruct the original dataset. We learned from the Dimensionality Reduction tutorial that 50 is a reasonable choice. For simplicity, we train the auto-encoder for only 1 epoch (one pass over the data). We explicitly include constant columns (all white background) for the visualization to be easier. 
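    # Annotation (added; not in the original): in the call below, autoencoder=T switches
    # the network to input reconstruction (unsupervised) instead of a supervised loss,
    # hidden=c(50) creates the single 50-neuron bottleneck layer whose activations are
    # the learned features, and epochs=1 makes a single pass over the training digits.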
39 |
40 | ae_model <- h2o.deeplearning(x=predictors,
41 |                              y=42, #response (ignored - pick any non-constant column)
42 |                              data=train_hex,
43 |                              activation="Tanh",
44 |                              autoencoder=T,
45 |                              hidden=c(50),
46 |                              ignore_const_cols=F,
47 |                              epochs=1)
48 |
49 | ######Note that the response column is ignored (it is only required because of a shared DeepLearning code framework).
50 |
51 | ####2. Find outliers in the test data
52 | ######The Anomaly app computes the per-row reconstruction error for the test data set: it passes each row through the autoencoder model (built on the training data) and reports the mean squared error (MSE) between the row and its reconstruction.
53 |
54 | test_rec_error <- as.data.frame(h2o.anomaly(test_hex, ae_model))
55 |
56 |
57 | ######In case you wanted to see the lower-dimensional features created by the auto-encoder deep learning model, here's a way to extract them for a given dataset. This is a non-linear dimensionality reduction, similar to PCA, but the values are capped by the activation function (in this case, they range from -1 to 1).
58 |
59 | test_features_deep <- h2o.deepfeatures(test_hex, ae_model, layer=1)
60 | summary(test_features_deep)
61 |
62 | ####3. Visualize the *good*, the *bad* and the *ugly*
63 | ######We will need a helper function for plotting handwritten digits (adapted from http://www.r-bloggers.com/the-essence-of-a-handwritten-digit/). Don't worry if you don't follow this code...
64 |
65 | plotDigit <- function(mydata, rec_error) {
66 |   len<-nrow(mydata)
67 |   N<-ceiling(sqrt(len))
68 |   op <- par(mfrow=c(N,N),pty='s',mar=c(1,1,1,1),xaxt='n',yaxt='n')
69 |   for (i in 1:nrow(mydata)) {
70 |     colors<-c('white','black')
71 |     cus_col<-colorRampPalette(colors=colors)
72 |     z<-array(mydata[i,],dim=c(28,28))
73 |     z<-z[,28:1]
74 |     image(1:28,1:28,z,main=paste0("rec_error: ", round(rec_error[i],4)),col=cus_col(256))
75 |   }
76 |   on.exit(par(op))
77 | }
78 |
79 | plotDigits <- function(data, rec_error, rows) {
80 |   row_idx <- order(rec_error[,1],decreasing=F)[rows]
81 |   my_rec_error <- rec_error[row_idx,]
82 |   my_data <- as.matrix(as.data.frame(data[row_idx,]))
83 |   plotDigit(my_data, my_rec_error)
84 | }
85 |
86 | ######Let's look at test set points with low, median and high reconstruction error, visualizing both the original test set digits and their reconstructions obtained by propagating them through the narrow neural net.
87 |
88 | test_recon <- h2o.predict(ae_model, test_hex)
89 | summary(test_recon)
90 |
91 | ####The good
92 | ######Let's plot the 25 digits with the lowest reconstruction error. First we plot the reconstruction, then the original scanned images.
93 |
94 | plotDigits(test_recon, test_rec_error, c(1:25))
95 | plotDigits(test_hex, test_rec_error, c(1:25))
96 |
97 | #####![](images/good_both.png)
98 | ######Clearly, a well-written digit 1 appears in both the training and testing set, and is easy for the autoencoder to reconstruct with minimal error. Nothing is as easy as a straight line.
99 |
100 | ####The bad
101 | ######Now let's look at the 25 digits with median reconstruction error.
102 |
103 | plotDigits(test_recon, test_rec_error, c(4988:5012))
104 | plotDigits(test_hex, test_rec_error, c(4988:5012))
105 |
106 | #####![](images/bad_both.png)
107 | ######These test set digits look "normal" - it is plausible that they resemble the training digits to a large extent, yet they have some particularities that cause some reconstruction error.
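######(An aside, not in the original tutorial: in practice one would flag outliers by thresholding the reconstruction error rather than eyeballing plots. A minimal sketch in base R, using the `test_rec_error` data frame computed above and an assumed cutoff at the 99th percentile:)

    threshold <- quantile(test_rec_error[,1], probs = 0.99)  # assumed cutoff: top 1%
    outliers <- which(test_rec_error[,1] > threshold)        # row indices of flagged digits
    length(outliers)                                         # roughly 100 of the 10,000 test digits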
108 |
109 | ####The ugly
110 | ######And here are the biggest outliers: the 25 digits with the highest reconstruction error!
111 |
112 | plotDigits(test_recon, test_rec_error, c(9976:10000))
113 | plotDigits(test_hex, test_rec_error, c(9976:10000))
114 |
115 | #####![](images/ugly_both.png)
116 | ######Now here are some pretty ugly digits that are plausibly not commonly found in the training data - some are hard even for humans to classify.
117 |
118 | ###Voila!
119 | #####We were able to find outliers with H2O Deep Learning Auto-Encoder models. We would love to hear your use case for anomaly detection.
120 |
121 | ######*Note:* Every run of DeepLearning produces different results since we use [Hogwild!](http://www.eecs.berkeley.edu/~brecht/papers/hogwildTR.pdf) parallelization with intentional race conditions between threads. To get reproducible results at the expense of speed for small datasets, set reproducible=T and specify a seed.
122 |
123 | #### More information can be found in the [H2O Deep Learning booklet](https://t.co/kWzyFMGJ2S) and in our [slides](http://www.slideshare.net/0xdata/presentations).
124 |
-------------------------------------------------------------------------------- /tutorials/unsupervised/anomaly/images/autoencoder.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/unsupervised/anomaly/images/autoencoder.png
-------------------------------------------------------------------------------- /tutorials/unsupervised/anomaly/images/bad_both.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/unsupervised/anomaly/images/bad_both.png
-------------------------------------------------------------------------------- /tutorials/unsupervised/anomaly/images/good_both.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/unsupervised/anomaly/images/good_both.png
-------------------------------------------------------------------------------- /tutorials/unsupervised/anomaly/images/ugly_both.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/unsupervised/anomaly/images/ugly_both.png
-------------------------------------------------------------------------------- /tutorials/unsupervised/clustering/clustering.R.md: --------------------------------------------------------------------------------
1 | # Unsupervised Learning and Clustering With H2O KMeans
2 |
3 | ###### This tutorial shows how a [KMeans](http://en.wikipedia.org/wiki/K-means_clustering) model is trained. This file is both valid R and markdown code. We will use several well-known datasets from published evaluations of various KMeans implementations.
4 |
5 | ### Start H2O and build a KMeans model on iris
6 |
7 | ###### Initialize the H2O server and import the datasets we need for this session.
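    # Annotation (added; not in the original): h2o.init(nthreads=-1) below uses all
    # available cores, and as.h2o() pushes R's built-in iris data.frame into the H2O
    # cloud as an H2O frame.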
8 |
9 | library(h2o)
10 | h2oServer <- h2o.init(nthreads=-1)
11 | datadir <- "/data"
12 | homedir <- file.path(datadir, "h2o-training", "clustering")
13 | iris.h2o <- as.h2o(h2oServer, iris)
14 |
15 |
16 | ### Our first KMeans model
17 |
18 | ###### Running KMeans is easy: the interface mirrors the `kmeans` function from R's stats package. We'll leave out the `Species` column and cluster on the iris flower attributes.
19 |
20 | km.model <- h2o.kmeans(data = iris.h2o, centers = 5, cols = 1:4, init="furthest")
21 |
22 | ###### Let's look at the model summary:
23 |
24 | km.model
25 | km.model@model$centers # The centers for each cluster
26 | km.model@model$tot.withinss # total within cluster sum of squares
27 | km.model@model$cluster # cluster assignments per observation
28 |
29 | ###### To see the model parameters that were used, access the `model@model$params` slot:
30 |
31 | km.model@model$params
32 |
33 | ###### You can get the R documentation help here:
34 |
35 | ?h2o.kmeans
36 |
37 | ### Use the [Gap Statistic (Beta)](http://web.stanford.edu/~hastie/Papers/gap.pdf) To Find the Optimal Number of Clusters
38 | ###### This is essentially a grid search over KMeans.
39 |
40 | ###### You can get the R documentation help here:
41 |
42 | ?h2o.gapStatistic
43 |
44 | ###### The idea: for each k, generate the within-cluster sum of squares (WCSS) from a reference distribution and examine the gap between the expected and the observed WCSS. To obtain W_k under the reference distribution, B Monte Carlo replicates are drawn from it; for each replicate, a KMeans model is constructed and its WCSS reported back.
45 |
46 | gap_stat <- h2o.gapStatistic(data = iris.h2o, K = 10, B = 100, boot_frac = .1, cols=1:4)
47 |
48 | ###### Let's take a look at the output. The default display shows the number of KMeans models that were run and the optimal value of k:
49 |
50 | gap_stat
51 |
52 | ###### We can also run summary on the gap_stat model:
53 |
54 | summary(gap_stat)
55 |
56 | ###### We can also plot our gap_stat model:
57 |
58 | plot(gap_stat)
59 |
60 | ### Comparison against other KMeans implementations
61 | ###### Let's use the [Census 1990 dataset](https://archive.ics.uci.edu/ml/datasets/US+Census+Data+%281990%29), which has 2.5 million data points with 68 integer features.
62 |
63 | # census1990 <- "Census1990.csv.gz"
64 | # census.1990 <- h2o.importFile(h2oServer, path = file.path(homedir,census1990), header = F, sep = ',', key = 'census.1990.hex')
65 |
66 | # dim(census.1990)
67 | # km.census <- h2o.kmeans(data = census.1990, centers = 12, init="furthest") # NOT RUN: Too long on VM
68 | # km.census@model$tot.withinss
69 |
70 | ###### We can compare the result with the published result from [Fast and Accurate KMeans on Large Datasets](http://papers.nips.cc/paper/4362-fast-and-accurate-k-means-for-large-datasets.pdf), where the cost for k = 12 and ~2GB of RAM was approximately 3.50E+18. That paper implements a streaming KMeans, so accuracy in the streaming case will of course not be as good as a batch job, but the results are comparable within a few orders of magnitude. H2O gives the ability to work on datasets that don't fit in a single box's RAM without having to stream the data from cold storage: simply use distributed H2O.
71 |
72 | ###### We can also compare with [StreamKM++: A Clustering Algorithm for Data Streams](http://www.cs.uni-paderborn.de/uploads/tx_sibibtex/2012_AckermannMRSLS_StreamKMpp.pdf). For various k, we can compare our implementation, but we only do k = 30 here.
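    # Sketch (added; not in the original): with enough memory, the k = 12 and k = 30
    # census runs could be swept in one loop, e.g.:
    # for (k in c(12, 30))
    #   print(h2o.kmeans(data = census.1990, centers = k, init = "furthest")@model$tot.withinss)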
73 |
74 | # km.census <- h2o.kmeans(data = census.1990, centers = 30, init="furthest") # NOT RUN: Too long on VM
75 | # km.census@model$tot.withinss # NOT RUN: Too long on VM
76 |
77 | ##### We can also compare with R's built-in `kmeans` (from the stats package):
78 |
79 | # census.1990.r <- read.csv(file.path(homedir,census1990))
80 | # km.census.r <- kmeans(census.1990.r, centers = 24) # NOT RUN: Quick-TRANSfer stage steps exceeded maximum (= 122914250)
81 | # km.census.r$tot.withinss
82 |
83 | ###### Now let's compare on the [BigCross](http://www.cs.uni-paderborn.de/en/fachgebiete/ag-bloemer/research/clustering/streamkmpp/) dataset, which has 11.6 million data points with 57 integer features.
84 |
85 | # bigcross <- "BigCross.data.gz"
86 | # big.cross <- h2o.importFile(h2oServer, path = file.path(homedir,bigcross), header = F, sep = ',', key = 'big.cross.hex')
87 |
88 | # dim(big.cross) # NOT RUN: Too long on VM
89 | # km.bigcross <- h2o.kmeans(data = big.cross, centers = 24, init="furthest") # NOT RUN: Too long on VM
90 | # km.bigcross@model$tot.withinss # NOT RUN: Too long on VM
91 |
92 | ###### We can compare the result with the published result from [Fast and Accurate KMeans on Large Datasets](http://papers.nips.cc/paper/4362-fast-and-accurate-k-means-for-large-datasets.pdf), where the cost for k = 24 and ~2GB of RAM was approximately 1.50E+14.
93 |
94 | ###### We can also compare with [StreamKM++: A Clustering Algorithm for Data Streams](http://www.cs.uni-paderborn.de/uploads/tx_sibibtex/2012_AckermannMRSLS_StreamKMpp.pdf). For various k, we can compare our implementation, but we only do k = 30 here.
95 |
96 | # km.bigcross <- h2o.kmeans(data = big.cross, centers = 30, init="furthest") # NOT RUN: Too long on VM
97 | # km.bigcross@model$tot.withinss # NOT RUN: Too long on VM
98 |
99 |
-------------------------------------------------------------------------------- /tutorials/unsupervised/dimreduction/dimreduction.R.md: --------------------------------------------------------------------------------
1 | # Dimensionality Reduction of MNIST
2 |
3 | ######This tutorial shows how to reduce the dimensionality of a dataset with H2O. We will use both PCA and Deep Learning. This file is both valid R and markdown code. We use the well-known [MNIST](http://yann.lecun.com/exdb/mnist/) dataset of hand-written digits, where each row contains the 28^2=784 raw gray-scale pixel values from 0 to 255 of the digitized digits (0 to 9).
4 |
5 | ### Start H2O and load the MNIST data
6 |
7 | ######Initialize the H2O server and import the MNIST training dataset.
8 |
9 | library(h2o)
10 | h2oServer <- h2o.init(nthreads=-1)
11 | homedir <- "/data/h2o-training/mnist/"
12 | DATA = "train.csv.gz"
13 | data_hex <- h2o.importFile(h2oServer, path = paste0(homedir,DATA), header = F, sep = ',', key = 'train.hex')
14 |
15 | ######The data consists of 784 (=28^2) pixel values per row, with (gray-scale) values from 0 to 255. The last column is the response (a label in 0,1,2,...,9).
16 |
17 | predictors = c(1:784)
18 | resp = 785
19 |
20 | ######We do unsupervised training, so we can drop the response column.
21 |
22 | data_hex <- data_hex[,-resp]
23 |
24 | ### PCA - Principal Component Analysis
25 |
26 | ###### Let's use [PCA](http://en.wikipedia.org/wiki/Principal_component_analysis) to compute the principal components of the MNIST data, and plot the standard deviations of the principal components (i.e., the square roots of the eigenvalues of the covariance/correlation matrix).
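    # Annotation (added; not in the original): h2o.prcomp computes the full set of
    # principal components; plotting model$sdev below gives the scree plot used to pick
    # ~50 components (the variance explained by component i is sdev[i]^2 / sum(sdev^2)).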
27 |
28 | pca_model <- h2o.prcomp(data_hex)
29 | plot(pca_model@model$sdev)
30 |
31 | #####![](images/mnist_pca_sdev.png)
32 |
33 | ###### We see that the first 50 or 100 principal components cover the majority of the variance of this dataset.
34 |
35 | ###### To reduce the dimensionality of MNIST to its 50 principal components, we use the h2o.predict() function with an extra argument `num_pc`:
36 |
37 | features_pca <- h2o.predict(pca_model, data_hex, num_pc=50)
38 | summary(features_pca)
39 |
40 | ### Deep Learning Autoencoder
41 |
42 | ae_model <- h2o.deeplearning(x=predictors,
43 |                              y=42, #ignored (pick any non-constant predictor)
44 |                              data_hex,
45 |                              activation="Tanh",
46 |                              autoencoder=T,
47 |                              hidden=c(100,50,100),
48 |                              epochs=1,
49 |                              ignore_const_cols = F)
50 |
51 | ###### We can now convert the data with the autoencoder model to 50-dimensional space (the second hidden layer):
52 |
53 | features_ae <- h2o.deepfeatures(data_hex, ae_model, layer=2)
54 | summary(features_ae)
55 |
56 | ###### To get the full reconstruction from the output layer of the autoencoder, use h2o.predict() as follows:
57 |
58 | data_reconstr <- h2o.predict(ae_model, data_hex)
59 | summary(data_reconstr)
-------------------------------------------------------------------------------- /tutorials/unsupervised/dimreduction/images/mnist_pca_sdev.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/h2oai/h2o-world-2014-training/63a7ddebceafd32130e5852108a0e00f035824f0/tutorials/unsupervised/dimreduction/images/mnist_pca_sdev.png
-------------------------------------------------------------------------------- /tutorials/web_ui/tutorial.md: --------------------------------------------------------------------------------
1 | # Data Science Flow from H2O's Web Interface
2 |
3 | You can follow along with our video tutorial:
4 |
5 | [embedded video stripped during extraction]
6 |
7 | ## Step 1: Import Data
8 |
9 | The airlines data set we are importing is a subset of the data made available by [RITA](http://www.transtats.bts.gov/OT_Delay/OT_DelayCause1.asp), with a mix of numeric and factor columns. In the following tutorial we will build multiple classification models predicting flight delays, compare the models, and score with a chosen model.
10 |
11 | * Navigate to [*Data* > *Import File*](http://localhost:54321/2/ImportFiles2.html)
12 | * Input into path `/data/h2o-training/airlines/allyears2k.csv` and hit Submit
13 | * Click the nfs link [*`C:\data\h2o-training\airlines\allyears2k.csv`*](http://localhost:54321/2/Parse2.query?source_key=nfs:\C:\data\h2o-training\airlines\allyears2k.csv)
14 | * Scroll down the page to get a preview of your data before hitting Submit again.
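> For reference, the same import can be scripted from R (a sketch, not part of this Web-UI tutorial; it assumes the 2.8-era `h2o` R package and a cloud running on localhost:54321, and the frame key matches the one used in the steps below):

    library(h2o)
    h2oServer <- h2o.init(ip = "localhost", port = 54321)
    air.hex <- h2o.importFile(h2oServer, path = "/data/h2o-training/airlines/allyears2k.csv",
                              key = "allyears2k.hex")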
15 |
16 | ## Step 2: Data Summary
17 | * On the [data inspect page](http://localhost:54321/2/Inspect2.html?src_key=allyears2k.hex) navigate to the [*Summary*](http://localhost:54321/2/SummaryPage2.query?source=allyears2k.hex), which you can also access via [*Data* > *Summary*](http://localhost:54321/2/SummaryPage2.html)
18 | * Hit Submit to get a summary of all the columns in the data:
19 | * Numeric Columns: Min, Max, and Quantiles
20 | * Factor Columns: Counts of each factor, Cardinality, NAs
21 |
22 | ## Step 3: Split Data into Test and Training Sets
23 | * Navigate back to the data inspect page [*Data* > *View All* > *allyears2k.hex* > *Split Frame*](http://localhost:54321/2/FrameSplitPage.query?source=allyears2k.hex)
24 | * Select *shuffle* and hit Submit
25 | * Select [*allyears2k_shuffled_part0.hex*](http://localhost:54321/2/Inspect2.html?src_key=allyears2k_shuffled_part0.hex) for the training frame
26 |
27 | ## Step 4: Build a GLM model
28 |
29 |
30 | * Go to [*Model* > *Generalized Linear Model*](http://localhost:54321/2/GLM2.html)
31 | * Input for *source*: `allyears2k_shuffled_part0.hex`
32 | * Select for *response*: `IsDepDelayed`
33 | * Select to ignore all columns (Ctrl+A) except for `Year`, `Month`, `DayofMonth`, `DayOfWeek`, `UniqueCarrier`, `Origin`, `Dest`, and `Distance` (Ctrl)
34 | * Select for *family*: `binomial`
35 | * Check *use all factor levels* and *variable importances*
36 | * Hit Submit to start the job
37 |
38 |
39 | ## Step 5: Build a 50-Tree GBM model
40 |
41 |
42 | * Go to [*Model* > *Gradient Boosting Machine*](http://localhost:54321/2/GBM.html)
43 | * Input for *source*: `allyears2k_shuffled_part0.hex`
44 | * Select for *response*: `IsDepDelayed`
45 | * Select to ignore all columns (Ctrl+A) except for `Year`, `Month`, `DayofMonth`, `DayOfWeek`, `UniqueCarrier`, `Origin`, `Dest`, and `Distance` (Ctrl)
46 | * Hit Submit to start the job
47 |
48 |
49 | ## Step 6: Build a simpler 5-Tree GBM model
50 |
51 |
52 | * Go to [*Model* > *Gradient Boosting Machine*](http://localhost:54321/2/GBM.html)
53 | * Input for *source*: `allyears2k_shuffled_part0.hex`
54 | * Select for *response*: `IsDepDelayed`
55 | * Select to ignore all columns (Ctrl+A) except for `Year`, `Month`, `DayofMonth`, `DayOfWeek`, `UniqueCarrier`, `Origin`, `Dest`, and `Distance` (Ctrl)
56 | * Input for *ntrees*: `5`
57 | * Hit Submit to start the job
58 |
59 | > On the model output page, hit the **JSON** tab.
60 | >
61 | > On the model output page, hit the **JAVA** tab.
62 |
63 |
64 | ## Step 7: Deep Learning with Model Grid Search
65 |
66 |
67 | * Go to [*Model* > *Deep Learning*](http://localhost:54321/2/DeepLearning.html)
68 | * Input for *source*: `allyears2k_shuffled_part0.hex`
69 | * Select for *response*: `IsDepDelayed`
70 | * Select to ignore all columns (Ctrl+A) except for `Year`, `Month`, `DayofMonth`, `DayOfWeek`, `UniqueCarrier`, `Origin`, `Dest`, and `Distance` (Ctrl)
71 | * Input for *hidden*: `(10,10), (20,20,20)`
72 | * Hit Submit to start the job
73 |
74 | > The models are sorted by error rates. Scroll all the way to the right to select the first model on the list.
75 |
76 | ## Step 8: Multimodel Scoring Engine
77 |
78 | * Navigate to [*Score* > *Multi model Scoring (beta)*](http://localhost:54321/steam/index.html)
79 | * Select data set `allyears2k.hex`, scroll to the compatible models, and select `VIEW THESE MODELS...`
80 | * Select all the models on the left-hand task bar.
81 | * Hit *SCORE...*, select `allyears2k_shuffled_part1.hex`, and hit *OK*
82 |
83 | > The tabular view lets the user compare all the models side by side.
84 |
85 | ### Creating Visualizations
86 |
87 | * Navigate to the *ADVANCED* tab to see overlaid ROC curves
88 | * Hit *ADD VISUALIZATION...*
89 | * For the *X-Axis Field* choose `Training Time (ms)`
90 | * For the *Y-Axis Field* choose `AUC`
91 |
92 | > Examine the new graph you created, and weigh the extra gain in accuracy against the time taken to train each model before selecting one and copying its key.
93 |
94 |
95 | ## Step 9: Create Frame with Predicted Values
96 |
97 |
98 | * Navigate back to [*Home Page* > *Score* > *Predict*](http://localhost:54321/2/Predict.html)
99 | * Input for *model*: paste the model key you got from Step 8
100 | * Input for *data*: `allyears2k_shuffled_part1.hex`
101 | * Input for *prediction*: `pred`
102 |
103 |
104 | ## Step 10: Export Predicted Values as CSV
105 |
106 | * Inspect the [prediction frame](http://localhost:54321/2/Inspect2.html?src_key=pred)
107 | * Select *Download as CSV*
108 |
109 | or export any frame:
110 |
111 | * Navigate to [*Data* > *Export Files*](http://localhost:54321/2/ExportFiles.html)
112 | * Input for *src key*: `pred`
113 | * Input for *path*: `/data/h2o-training/airlines/pred.csv`
114 |
115 | ## Step 11: Save a model for use later
116 |
117 | * Navigate to [*Data* > *View All*](http://localhost:54321/StoreView.html)
118 | * Choose to filter by the model key
119 | * Hit [*Save Model*](http://localhost:54321/2/SaveModel)
120 | * Input for *path*: `/data/h2o-training/airlines/50TreesGBMmodel`
121 | * Hit Submit
122 |
123 | ## Errors?! Download and send us the log files!
124 |
125 | * Navigate to [*Admin* > *Inspect Log*](http://localhost:54321/LogView.html)
126 | * Hit *Download all logs*
127 |
128 | ## Step 12: Shut down your H2O instance
129 |
130 | * Go to *Admin* > *Shutdown*
131 |
132 | ## Extra Bonus: Reload that saved model
133 |
134 | * In an active H2O session
135 | * Navigate to the [*Load Model*](http://localhost:54321/2/LoadModel.html) page
136 | * Input for *path*: `/data/h2o-training/airlines/50TreesGBMmodel`
137 | * Hit Submit
--------------------------------------------------------------------------------