├── spark
│   └── astro-ml
│       ├── get_data.sh
│       ├── submit_spark.sl
│       ├── spark-astro-ml.py
│       └── spark-astro-ml.ipynb
├── LICENSE
├── DataDay1.ipynb
└── daya_dl
    └── walk_manifold.ipynb


/spark/astro-ml/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -l
2 | 
3 | wget http://portal.nersc.gov/project/dessn/autoscan/autoscan_features.2.csv
4 | 
5 | 
--------------------------------------------------------------------------------
/spark/astro-ml/submit_spark.sl:
--------------------------------------------------------------------------------
1 | #!/bin/bash -l
2 | 
3 | #SBATCH -N 2
4 | #SBATCH -t 30
5 | 
6 | module load spark/2.0.0
7 | start-all.sh
8 | 
9 | spark-submit spark-astro-ml.py
10 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017, The Regents of the University of California, through
2 | Lawrence Berkeley National Laboratory (subject to receipt of any required
3 | approvals from the U.S. Dept. of Energy). All rights reserved.
4 | 
--------------------------------------------------------------------------------
/spark/astro-ml/spark-astro-ml.py:
--------------------------------------------------------------------------------
1 | 
2 | # coding: utf-8
3 | 
4 | # Cite:
5 | # D. A. Goldstein, et al. 2015 "Automated Transient Identification in the Dark Energy Survey" AJ (accepted).
6 | 
7 | # # Background
8 | 
9 | # * We are aiming here to classify two different types of astronomy images: real detections and artificially injected ones.
10 | # 
11 | 
12 | # First things first, let's get the pyspark kernel. Open up a Cori terminal and type "module load spark".
13 | 
14 | # Let's grab the data.
15 | 
16 | # In[ ]:
17 | 
18 | 
19 | 
20 | # In[1]:
21 | 
22 | 
23 | 
24 | # In[ ]:
25 | 
26 | from skimage.io import imread, imshow
27 | 
28 | from matplotlib import pyplot as plt
29 | 
30 | path_to_sample_image = "/project/projectdirs/dasrepo/data_day/astron-images/srch11802308.gif"
31 | 
32 | 
33 | 
34 | # #### Here is a sample astronomy image:
35 | 
36 | # In[ ]:
37 | 
38 | #im = imread(path_to_sample_image)
39 | 
40 | #get an image of the other day
41 | 
42 | #plt.imshow(im,cmap='gray')
43 | 
44 | 
45 | # Instead of running directly on the images, we will run on the 38 physics-based feature columns computed from them. The more discriminating the features we compute, the easier the ML algo's job becomes.
46 | # 
47 | # It would be interesting to see whether a machine learning algorithm could discriminate based solely on the pixels of the image. If you are interested, I can show later how to apply deep learning to classify the raw images.
48 | 
49 | # We have a csv file. Here is what it looks like. Each line represents a single event and has 40 columns: an ID, the class label, and the 38 physically motivated features computed from the image. The first row of the file is the header with the name of each column.
50 | # 
51 | 
52 | # In[4]:
53 | 
54 | 
55 | 
56 | # In[5]:
57 | 
58 | 
59 | 
60 | # Ok, we will use Spark here, so let's load the modules of interest; the comment lines at the beginning of the csv were deleted above.
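
# The empty cells above held shell commands; the sed cleanup at the end of
# this script (it deletes the get_ipython() lines that nbconvert emits)
# stripped them from the .py version. In spark-astro-ml.ipynb they are:
#
#   ! git clone https://github.com/NERSC/data-day-examples.git
#   ! wget http://portal.nersc.gov/project/dessn/autoscan/autoscan_features.2.csv
#   ! head -12 './autoscan_features.2.csv' | grep "^#"
#   ! sed -i.bak '/^#/d' ./autoscan_features.2.csv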
61 | 
62 | # In[6]:
63 | 
64 | from pyspark.sql import SparkSession
65 | 
66 | 
67 | # SparkSession is the main entry point for Spark functionality here
68 | 
69 | # In[7]:
70 | 
71 | spark = SparkSession.builder.getOrCreate()
72 | 
73 | 
74 | # In[ ]:
75 | 
76 | 
77 | 
78 | 
79 | # Now we will read the csv file into a DataFrame
80 | 
81 | # In[8]:
82 | 
83 | df = spark.read.csv('./autoscan_features.2.csv', header=True)
84 | 
85 | 
86 | # In[9]:
87 | 
88 | #ID will not be useful and BAND is non-numerical
89 | df=df.drop('ID')
90 | df=df.drop('BAND')
91 | 
92 | 
93 | # Now let's look at a sample record from the dataset. As we can see, underneath the DataFrame is an RDD of Rows.
94 | 
95 | # In[10]:
96 | 
97 | df.take(1)
98 | 
99 | 
100 | # In[11]:
101 | 
102 | len(df.columns)
103 | 
104 | 
105 | # And the schema. As we can see, 38 columns remain: one label (OBJECT_TYPE) and 37 features.
106 | 
107 | # In[12]:
108 | 
109 | df.printSchema()
110 | 
111 | #describe a couple of the physics features
112 | 
113 | 
114 | # In[13]:
115 | 
116 | df.groupBy('OBJECT_TYPE').count().show()
117 | 
118 | 
119 | # In[14]:
120 | 
121 | from pyspark.mllib.linalg import Vectors
122 | 
123 | 
124 | # In[15]:
125 | 
126 | from pyspark.sql import Row
127 | 
128 | 
129 | # In[16]:
130 | 
131 | from pyspark.ml.linalg import Vectors, Vector, VectorUDT
132 | 
133 | 
134 | # Now the ML algo wants a tuple of the label and a vector of the other features. Let's make a little function to convert rows to vectors
135 | 
136 | # In[17]:
137 | 
138 | def convert_row_to_vector(row, lbl_key='OBJECT_TYPE'):
139 |     row = row.asDict()
140 |     lbl = int(row[lbl_key])
141 |     float_list = [0.0 if str(v) == '' else float(v) for k,v in row.iteritems() if k != lbl_key]  # .iteritems() is Python 2 (this kernel); use .items() on Python 3
142 |     return (lbl, Vectors.dense(float_list))
143 | 
144 | 
145 | 
146 | # Now, we call map on the RDD in the DataFrame, converting each row to a vector
147 | 
148 | # In[18]:
149 | 
150 | lbl_vec_pairs = df.rdd.map(convert_row_to_vector)
151 | 
152 | 
153 | # Now we can create a DataFrame
154 | 
155 | # In[20]:
156 | 
157 | data = spark.createDataFrame(lbl_vec_pairs, ['label', 'features'])
158 | 
159 | 
160 | # In[21]:
161 | 
162 | from pyspark.sql.types import StructField, IntegerType, StructType
163 | 
164 | from pyspark.mllib.feature import LabeledPoint
165 | 
166 | 
167 | # In[43]:
168 | 
169 | #data=lbl_vec_pairs.map(lambda (l,v): LabeledPoint(l,v))
170 | 
171 | 
172 | # In[22]:
173 | 
174 | from pyspark.ml.classification import RandomForestClassifier
175 | 
176 | from pyspark.ml.feature import DecisionTreeParams
177 | from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
178 | 
179 | from pyspark.ml.feature import MinMaxScaler
180 | 
181 | from pyspark.ml import Pipeline
182 | 
183 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
184 | 
185 | 
186 | # In[ ]:
187 | 
188 | 
189 | 
190 | 
191 | # In[23]:
192 | 
193 | from pyspark.ml.tuning import TrainValidationSplitModel
194 | 
195 | 
196 | # In[24]:
197 | 
198 | bce = BinaryClassificationEvaluator(metricName='areaUnderROC')  # valid metrics are 'areaUnderROC' and 'areaUnderPR'
199 | 
200 | 
201 | # In[25]:
202 | 
203 | tr_data, te_data = data.randomSplit([0.8, 0.2])
204 | 
205 | 
206 | # In[26]:
207 | 
208 | rf = RandomForestClassifier()
209 | 
210 | 
211 | # In[27]:
212 | 
213 | paramGrid = ParamGridBuilder() .addGrid(rf.numTrees, [50, 100]) .addGrid(rf.maxDepth, [30, 15]) .build()
214 | 
215 | 
216 | # In[28]:
217 | 
218 | tvs = TrainValidationSplit(estimator=rf,
219 |                            estimatorParamMaps=paramGrid,
220 |                            evaluator=bce,
221 |                            trainRatio=0.8)
222 | 
223 | 
224 | # In[ ]:
225 | 
226 | model = tvs.fit(tr_data)
227 | 
228 | 
229 | # In[ ]:
230 | 
231 | prediction = model.transform(te_data)
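
# A minimal sketch (not part of the original pipeline) of scoring the held-out
# predictions with the evaluator defined above; the random forest produces the
# rawPrediction column that BinaryClassificationEvaluator expects:

auc = bce.evaluate(prediction)
print("Test areaUnderROC: %f" % auc)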
232 | 
233 | 
234 | # In[5]:
235 | 
236 | # Convert to a .py file. Now let's submit it to the queue.
237 | 
238 | 
239 | # HW!
240 | # Items to Work on: 3 Options:
241 | # 
242 | # 1. ML
243 | #     * make a logistic regression model
244 | #     * use cross-validation to search a good space of logistic regression hyperparameters
245 | #     * preprocess all features to mean zero and stdev 1
246 | #     * submit this job to batch (one possible shape for this option is sketched after this list)
247 | # 
248 | # 
249 | # 2. Data Munging / Saving
250 | #     * find the number of columns that have an element over 1
251 | #     * make a new data frame that contains
252 | #         * the sum of the GFLUX, SNR and GAUSS columns
253 | #         * a column with the max value from each row of the original data
254 | #         * the mean value from each row
255 | #         * the median
256 | #     * convert this data frame to pandas
257 | #     * also save this data frame out to JSON
258 | # 
259 | # 
260 | # 3. Deep Learning
261 | #     * Train a convolutional neural network to classify the astronomy images for at least 50 epochs
262 | #     * Submit this job to the queue
263 | #     * Plot the learning curve and an accuracy curve
264 | 
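
# One possible shape for HW option 1, as referenced in the list above. This is
# a sketch, not a tested solution: the grid values are illustrative, and it
# reuses tr_data, Pipeline, ParamGridBuilder and BinaryClassificationEvaluator
# from the script above.

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StandardScaler
from pyspark.ml.tuning import CrossValidator

# scale each feature to mean zero / stdev one, then fit a logistic regression
scaler = StandardScaler(inputCol='features', outputCol='scaled_features',
                        withMean=True, withStd=True)
lr = LogisticRegression(featuresCol='scaled_features', labelCol='label')
lr_pipeline = Pipeline(stages=[scaler, lr])

lr_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5]) \
    .build()

# cross-validate over the small hyperparameter grid and keep the best model
cv = CrossValidator(estimator=lr_pipeline,
                    estimatorParamMaps=lr_grid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3)
cv_model = cv.fit(tr_data)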
--------------------------------------------------------------------------------
/spark/astro-ml/spark-astro-ml.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "Cite:\n",
8 |     "D. A. Goldstein, et al. 2015 \"Automated Transient Identification in the Dark Energy Survey\" AJ (accepted)."
9 |    ]
10 |   },
11 |   {
12 |    "cell_type": "markdown",
13 |    "metadata": {},
14 |    "source": [
15 |     "# Background"
16 |    ]
17 |   },
18 |   {
19 |    "cell_type": "markdown",
20 |    "metadata": {},
21 |    "source": [
22 |     "* We are aiming here to classify two different types of astronomy images: real detections and artificially injected ones\n"
23 |    ]
24 |   },
25 |   {
26 |    "cell_type": "markdown",
27 |    "metadata": {},
28 |    "source": [
29 |     "First things first, let's get the pyspark kernel. Open up a Cori terminal and type \"module load spark\""
30 |    ]
31 |   },
32 |   {
33 |    "cell_type": "markdown",
34 |    "metadata": {},
35 |    "source": [
36 |     "Let's grab the data."
37 |    ]
38 |   },
39 |   {
40 |    "cell_type": "code",
41 |    "execution_count": null,
42 |    "metadata": {
43 |     "collapsed": true
44 |    },
45 |    "outputs": [],
46 |    "source": [
47 |     "! git clone https://github.com/NERSC/data-day-examples.git"
48 |    ]
49 |   },
50 |   {
51 |    "cell_type": "code",
52 |    "execution_count": 1,
53 |    "metadata": {
54 |     "collapsed": false
55 |    },
56 |    "outputs": [
57 |     {
58 |      "name": "stdout",
59 |      "output_type": "stream",
60 |      "text": [
61 |       "--2016-08-23 06:46:24--  http://portal.nersc.gov/project/dessn/autoscan/autoscan_features.2.csv\n",
62 |       "Resolving portal.nersc.gov... 128.55.6.160\n",
63 |       "Connecting to portal.nersc.gov|128.55.6.160|:80... connected.\n",
64 |       "HTTP request sent, awaiting response... 200 OK\n",
65 |       "Length: 448893905 (428M) [text/plain]\n",
66 |       "Saving to: “autoscan_features.2.csv”\n",
67 |       "\n",
68 |       "100%[======================================>] 448,893,905 106M/s   in 4.0s    \n",
69 |       "\n",
70 |       "2016-08-23 06:46:28 (106 MB/s) - “autoscan_features.2.csv” saved [448893905/448893905]\n",
71 |       "\n"
72 |      ]
73 |     }
74 |    ],
75 |    "source": [
76 |     "! wget http://portal.nersc.gov/project/dessn/autoscan/autoscan_features.2.csv"
77 |    ]
78 |   },
79 |   {
80 |    "cell_type": "code",
81 |    "execution_count": null,
82 |    "metadata": {
83 |     "collapsed": true
84 |    },
85 |    "outputs": [],
86 |    "source": [
87 |     "from skimage.io import imread, imshow\n",
88 |     "\n",
89 |     "from matplotlib import pyplot as plt\n",
90 |     "\n",
91 |     "path_to_sample_image = \"/project/projectdirs/dasrepo/data_day/astron-images/srch11802308.gif\"\n",
92 |     "\n",
93 |     "%matplotlib inline"
94 |    ]
95 |   },
96 |   {
97 |    "cell_type": "markdown",
98 |    "metadata": {},
99 |    "source": [
100 |     "#### Here is a sample astronomy image:"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": null,
106 |    "metadata": {
107 |     "collapsed": false
108 |    },
109 |    "outputs": [],
110 |    "source": [
111 |     "#im = imread(path_to_sample_image)\n",
112 |     "\n",
113 |     "#get an image of the other day\n",
114 |     "\n",
115 |     "#plt.imshow(im,cmap='gray')"
116 |    ]
117 |   },
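  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of actually displaying the sample image (an added illustration; it assumes the GIF path above is readable from the kernel):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "im = imread(path_to_sample_image)\n",
    "\n",
    "plt.imshow(im, cmap='gray')"
   ]
  },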
118 |   {
119 |    "cell_type": "markdown",
120 |    "metadata": {},
121 |    "source": [
122 |     "Instead of running directly on the images, we will run on the 38 physics-based feature columns computed from them. The more discriminating the features we compute, the easier the ML algo's job becomes.\n",
123 |     "\n",
124 |     "It would be interesting to see whether a machine learning algorithm could discriminate based solely on the pixels of the image. If you are interested, I can show later how to apply deep learning to classify the raw images."
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "We have a csv file. Here is what it looks like. Each line represents a single event and has 40 columns: an ID, the class label, and the 38 physically motivated features computed from the image. The first row of the file is the header with the name of each column.\n"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 4,
137 |    "metadata": {
138 |     "collapsed": false
139 |    },
140 |    "outputs": [
141 |     {
142 |      "name": "stdout",
143 |      "output_type": "stream",
144 |      "text": [
145 |       "# autoscan training data\r\n",
146 |       "# use the id column to cross-match rows with thumbnails\r\n",
147 |       "# object_type gives the class of the row\r\n",
148 |       "# object_type = 0: artifact\r\n",
149 |       "# object_type = 1: non-artifact\r\n",
150 |       "# remaining 38 columns defined in section 3 and table 2 of companion paper \r\n"
151 |      ]
152 |     }
153 |    ],
154 |    "source": [
155 |     "! head -12 './autoscan_features.2.csv' | grep \"^#\" "
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 5,
161 |    "metadata": {
162 |     "collapsed": true
163 |    },
164 |    "outputs": [],
165 |    "source": [
166 |     "! sed -i.bak '/^#/d' ./autoscan_features.2.csv"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": 2,
172 |    "metadata": {
173 |     "collapsed": false
174 |    },
175 |    "outputs": [],
176 |    "source": [
177 |     "! head -200 ./autoscan_features.2.csv > ./small-autoscan_features.2.csv"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "markdown",
182 |    "metadata": {},
183 |    "source": [
184 |     "Ok, we will use Spark here, so let's load the modules of interest; the comment lines at the beginning of the csv were deleted above."
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": 6,
190 |    "metadata": {
191 |     "collapsed": true
192 |    },
193 |    "outputs": [],
194 |    "source": [
195 |     "from pyspark.sql import SparkSession"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "markdown",
200 |    "metadata": {},
201 |    "source": [
202 |     "SparkSession is the main entry point for Spark functionality here"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": 7,
208 |    "metadata": {
209 |     "collapsed": true
210 |    },
211 |    "outputs": [],
212 |    "source": [
213 |     "spark = SparkSession.builder.getOrCreate()"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": null,
219 |    "metadata": {
220 |     "collapsed": true
221 |    },
222 |    "outputs": [],
223 |    "source": []
224 |   },
225 |   {
226 |    "cell_type": "markdown",
227 |    "metadata": {},
228 |    "source": [
229 |     "Now we will read the csv file into a DataFrame"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": 8,
235 |    "metadata": {
236 |     "collapsed": false
237 |    },
238 |    "outputs": [
239 |     {
240 |      "name": "stdout",
241 |      "output_type": "stream",
242 |      "text": [
243 |       "CPU times: user 5 ms, sys: 2 ms, total: 7 ms\n",
244 |       "Wall time: 8.12 s\n"
245 |      ]
246 |     }
247 |    ],
248 |    "source": [
249 |     "df = spark.read.csv('./small-autoscan_features.2.csv', header=True)"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": 9,
255 |    "metadata": {
256 |     "collapsed": false
257 |    },
258 |    "outputs": [],
259 |    "source": [
260 |     "#ID will not be useful and BAND is non-numerical\n",
261 |     "df=df.drop('ID')\n",
262 |     "df=df.drop('BAND')"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "markdown",
267 |    "metadata": {},
268 |    "source": [
269 |     "Now let's look at a sample record from the dataset. As we can see, underneath the DataFrame is an RDD of Rows."
270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 10, 275 | "metadata": { 276 | "collapsed": false 277 | }, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "CPU times: user 5 ms, sys: 4 ms, total: 9 ms\n", 284 | "Wall time: 1.01 s\n" 285 | ] 286 | }, 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "[Row(OBJECT_TYPE=u'0', AMP=u'0.8083234429359436', A_IMAGE=u'1.5080000162124634', A_REF=u'2.65006947517395', B_IMAGE=u'0.949999988079071', B_REF=u'1.8995014429092407', CCDID=u'10', COLMEDS=u'0.11207699775695801', DIFFSUMRN=u'25.857545852661133', ELLIPTICITY=u'0.37002652883529663', FLAGS=u'0', FLUX_RATIO=u'0.2590300440788269', GAUSS=u'226.4202880859375', GFLUX=u'1.0089635848999023', L1=u'103.80699920654297', LACOSMIC=u'1.736109972000122', MAG=u'23.031299591064453', MAGDIFF=u'-0.4524995982646942', MAGLIM=u'0', MAG_FROM_LIMIT=u'1.6222000122070312', MAG_REF=u'22.578800201416016', MAG_REF_ERR=u'0.11959999799728394', MASKFRAC=u'0.0', MIN_DISTANCE_TO_EDGE_IN_NEW=u'559.7000122070312', N2SIG3=u'0', N2SIG3SHIFT=u'-7', N2SIG5=u'0', N2SIG5SHIFT=u'-8', N3SIG3=u'0', N3SIG3SHIFT=u'-8', N3SIG5=u'0', N3SIG5SHIFT=u'-9', NN_DIST_RENORM=u'0.6749339699745178', NUMNEGRN=u'22', SCALE=u'2.0241222381591797', SNR=u'7.722346305847168', SPREADERR_MODEL=u'0.004628799855709076', SPREAD_MODEL=u'-0.0037175000179558992')]" 291 | ] 292 | }, 293 | "execution_count": 10, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "df.take(1)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 11, 305 | "metadata": { 306 | "collapsed": false 307 | }, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "38" 313 | ] 314 | }, 315 | "execution_count": 11, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "len(df.columns)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "And the schema. 
As we can see, 38 columns remain: one label (OBJECT_TYPE) and 37 features"
329 |    ]
330 |   },
331 |   {
332 |    "cell_type": "code",
333 |    "execution_count": 12,
334 |    "metadata": {
335 |     "collapsed": false
336 |    },
337 |    "outputs": [
338 |     {
339 |      "name": "stdout",
340 |      "output_type": "stream",
341 |      "text": [
342 |       "root\n",
343 |       " |-- OBJECT_TYPE: string (nullable = true)\n",
344 |       " |-- AMP: string (nullable = true)\n",
345 |       " |-- A_IMAGE: string (nullable = true)\n",
346 |       " |-- A_REF: string (nullable = true)\n",
347 |       " |-- B_IMAGE: string (nullable = true)\n",
348 |       " |-- B_REF: string (nullable = true)\n",
349 |       " |-- CCDID: string (nullable = true)\n",
350 |       " |-- COLMEDS: string (nullable = true)\n",
351 |       " |-- DIFFSUMRN: string (nullable = true)\n",
352 |       " |-- ELLIPTICITY: string (nullable = true)\n",
353 |       " |-- FLAGS: string (nullable = true)\n",
354 |       " |-- FLUX_RATIO: string (nullable = true)\n",
355 |       " |-- GAUSS: string (nullable = true)\n",
356 |       " |-- GFLUX: string (nullable = true)\n",
357 |       " |-- L1: string (nullable = true)\n",
358 |       " |-- LACOSMIC: string (nullable = true)\n",
359 |       " |-- MAG: string (nullable = true)\n",
360 |       " |-- MAGDIFF: string (nullable = true)\n",
361 |       " |-- MAGLIM: string (nullable = true)\n",
362 |       " |-- MAG_FROM_LIMIT: string (nullable = true)\n",
363 |       " |-- MAG_REF: string (nullable = true)\n",
364 |       " |-- MAG_REF_ERR: string (nullable = true)\n",
365 |       " |-- MASKFRAC: string (nullable = true)\n",
366 |       " |-- MIN_DISTANCE_TO_EDGE_IN_NEW: string (nullable = true)\n",
367 |       " |-- N2SIG3: string (nullable = true)\n",
368 |       " |-- N2SIG3SHIFT: string (nullable = true)\n",
369 |       " |-- N2SIG5: string (nullable = true)\n",
370 |       " |-- N2SIG5SHIFT: string (nullable = true)\n",
371 |       " |-- N3SIG3: string (nullable = true)\n",
372 |       " |-- N3SIG3SHIFT: string (nullable = true)\n",
373 |       " |-- N3SIG5: string (nullable = true)\n",
374 |       " |-- N3SIG5SHIFT: string (nullable = true)\n",
375 |       " |-- NN_DIST_RENORM: string (nullable = true)\n",
376 |       " |-- NUMNEGRN: string (nullable = true)\n",
377 |       " |-- SCALE: string (nullable = true)\n",
378 |       " |-- SNR: string (nullable = true)\n",
379 |       " |-- SPREADERR_MODEL: string (nullable = true)\n",
380 |       " |-- SPREAD_MODEL: string (nullable = true)\n",
381 |       "\n"
382 |      ]
383 |     }
384 |    ],
385 |    "source": [
386 |     "df.printSchema()\n",
387 |     "\n",
388 |     "#describe a couple of the physics features"
389 |    ]
390 |   },
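  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Every column came in as a string. A minimal sketch (an added illustration) of an alternative to the row-by-row conversion used further below, assuming all remaining columns are numeric, is to cast everything to doubles up front:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import col\n",
    "\n",
    "#cast every (string) column to double; names are kept via alias\n",
    "df_cast = df.select([col(c).cast('double').alias(c) for c in df.columns])"
   ]
  },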
import Vectors, Vector, VectorUDT" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "Now the ML algo wants a tuple of label and a vector of the other features. Let's make a little function to convert rows to vectrs" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 17, 459 | "metadata": { 460 | "collapsed": true 461 | }, 462 | "outputs": [], 463 | "source": [ 464 | "def convert_row_to_vector(row, lbl_key='OBJECT_TYPE'):\n", 465 | " row = row.asDict()\n", 466 | " lbl = int(row[lbl_key])\n", 467 | " float_list = [0.0 if str(v) == '' else float(v) for k,v in row.iteritems() if k!= lbl_key]\n", 468 | " return (lbl, Vectors.dense(float_list))\n", 469 | " " 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "Now, we call map on the rdd in the dataframe, converting each row to a vector" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 18, 482 | "metadata": { 483 | "collapsed": false 484 | }, 485 | "outputs": [], 486 | "source": [ 487 | "lbl_vec_pairs = df.rdd.map(convert_row_to_vector)" 488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": {}, 493 | "source": [ 494 | "Now we can create a dataframe" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 20, 500 | "metadata": { 501 | "collapsed": false 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "data = spark.createDataFrame(lbl_vec_pairs, ['label', 'features'])" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 21, 511 | "metadata": { 512 | "collapsed": false 513 | }, 514 | "outputs": [], 515 | "source": [ 516 | "from pyspark.sql.types import StructField, IntegerType, StructType\n", 517 | "\n", 518 | "from pyspark.mllib.feature import LabeledPoint" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": 43, 524 | "metadata": { 525 | "collapsed": false 526 | }, 527 | "outputs": [], 528 | "source": [ 529 | "#data=lbl_vec_pairs.map(lambda (l,v): LabeledPoint(l,v))" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 22, 535 | "metadata": { 536 | "collapsed": false 537 | }, 538 | "outputs": [], 539 | "source": [ 540 | "from pyspark.ml.classification import RandomForestClassifier\n", 541 | "\n", 542 | "from pyspark.ml.feature import DecisionTreeParams \n", 543 | "from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder\n", 544 | "\n", 545 | "from pyspark.ml.feature import MinMaxScaler\n", 546 | "\n", 547 | "from pyspark.ml import Pipeline\n", 548 | "\n", 549 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": { 556 | "collapsed": true 557 | }, 558 | "outputs": [], 559 | "source": [] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 23, 564 | "metadata": { 565 | "collapsed": true 566 | }, 567 | "outputs": [], 568 | "source": [ 569 | "from pyspark.ml.tuning import TrainValidationSplitModel" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 24, 575 | "metadata": { 576 | "collapsed": true 577 | }, 578 | "outputs": [], 579 | "source": [ 580 | "bce = BinaryClassificationEvaluator(metricName='mse')" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 25, 586 | "metadata": { 587 | "collapsed": true 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "tr_data, te_data = 
data.randomSplit([0.8, 0.2])" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 26, 597 | "metadata": { 598 | "collapsed": false 599 | }, 600 | "outputs": [], 601 | "source": [ 602 | "rf = RandomForestClassifier()" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 27, 608 | "metadata": { 609 | "collapsed": true 610 | }, 611 | "outputs": [], 612 | "source": [ 613 | "paramGrid = ParamGridBuilder() \\\n", 614 | " .addGrid(rf.numTrees, [50, 100]) \\\n", 615 | " .addGrid(rf.maxDepth, [30, 15]) \\\n", 616 | " .build()" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 28, 622 | "metadata": { 623 | "collapsed": true 624 | }, 625 | "outputs": [], 626 | "source": [ 627 | "tvs = TrainValidationSplit(estimator=rf,\n", 628 | " estimatorParamMaps=paramGrid,\n", 629 | " evaluator=bce,\n", 630 | " trainRatio=0.8)" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "metadata": { 637 | "collapsed": false 638 | }, 639 | "outputs": [], 640 | "source": [ 641 | "tvs.fit(tr_data)\n" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": { 648 | "collapsed": false 649 | }, 650 | "outputs": [], 651 | "source": [ 652 | "prediction = tvs.transform(test)" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 6, 658 | "metadata": { 659 | "collapsed": false 660 | }, 661 | "outputs": [ 662 | { 663 | "name": "stdout", 664 | "output_type": "stream", 665 | "text": [ 666 | "[NbConvertApp] Converting notebook spark-astro-ml.ipynb to script\n", 667 | "[NbConvertApp] Writing 5781 bytes to spark-astro-ml.py\n" 668 | ] 669 | } 670 | ], 671 | "source": [ 672 | "# convert to .py file. Now let's submit to queue\n", 673 | "! jupyter nbconvert --to script spark-astro-ml.ipynb\n", 674 | "!sed -i.bak '/ipython*/d' ./*.py\n" 675 | ] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "metadata": {}, 680 | "source": [ 681 | "HW!\n", 682 | "Items to Work on: 3 Options:\n", 683 | "\n", 684 | "1. ML\n", 685 | " * make a logistic regression model\n", 686 | " * use cross-validation to search a good space of logisitc regression hyoerparams\n", 687 | " * preprocess all features to mean zero and stdev 1\n", 688 | " * submit this job to batch\n", 689 | " \n", 690 | " \n", 691 | "2. Data Munging / Saving\n", 692 | " * find number of columns that have an element over 1\n", 693 | " * make a new data frame that contains \n", 694 | " * the sum of GLUX SNR and GAUSS Columns\n", 695 | " * a column with the max value from each row from the original data\n", 696 | " * the mean value from each row\n", 697 | " * the median\n", 698 | " * conver this data frame to pandas \n", 699 | " * also save this data frame out to JSON\n", 700 | "\n", 701 | " \n", 702 | "3. 
Deep Learning\n", 703 | " * Train a convolutional neural network to classify the astronomy images for at least 50 epochs\n", 704 | " * Submit this job to the quueue\n", 705 | " * Plot the learning curve and an accuracy curve" 706 | ] 707 | } 708 | ], 709 | "metadata": { 710 | "anaconda-cloud": {}, 711 | "kernelspec": { 712 | "display_name": "pyspark (2.0.0)", 713 | "language": "python", 714 | "name": "pyspark_2.0.0" 715 | }, 716 | "language_info": { 717 | "codemirror_mode": { 718 | "name": "ipython", 719 | "version": 2 720 | }, 721 | "file_extension": ".py", 722 | "mimetype": "text/x-python", 723 | "name": "python", 724 | "nbconvert_exporter": "python", 725 | "pygments_lexer": "ipython2", 726 | "version": "2.7.12" 727 | } 728 | }, 729 | "nbformat": 4, 730 | "nbformat_minor": 0 731 | } 732 | -------------------------------------------------------------------------------- /DataDay1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Dashboard Example\n", 8 | "For this demo we are using the jupyter-hub demo that runs directly on Cori.\n", 9 | "\n", 10 | "For now, this has the driver process and executor process on the same node. Executor process is limited to only one core so as to not swamp the node.\n", 11 | "\n", 12 | "Most Spark jobs will be done via the batch system, with a single node working as the driver and many other nodes running the executors." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "%matplotlib inline" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Load Spark modules and create a Spark session\n", 31 | "Spark session name is optional" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "from pyspark.sql import SparkSession\n", 43 | "from pyspark.sql.functions import col, size, udf, sum, count\n", 44 | "from pyspark.sql.types import StructType, StructField, LongType, StringType, IntegerType\n", 45 | "import pandas as pd\n", 46 | "import matplotlib.pyplot as plt\n", 47 | "\n", 48 | "#Get a hook to the spark session\n", 49 | "spark = SparkSession.builder.appName(\"DataDash\").getOrCreate()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "### Data Loading\n", 57 | "Read in data file from text." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "-rw-rw-r-- 1 lgerhard ccc 29M Aug 21 2016 /global/project/projectdirs/mpccc/lgerhard/data_day/frac_m1523.txt\n", 72 | "90000 /global/project/projectdirs/mpccc/lgerhard/data_day/frac_m1523.txt\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "infile = \"/global/project/projectdirs/mpccc/lgerhard/data_day/frac_m1523.txt\"\n", 78 | "outdir = \"/global/project/projectdirs/mpccc/lgerhard/out_temp\"\n", 79 | "\n", 80 | "! ls -lh /global/project/projectdirs/mpccc/lgerhard/data_day/frac_m1523.txt\n", 81 | "! 
wc -l /global/project/projectdirs/mpccc/lgerhard/data_day/frac_m1523.txt" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Create a schema for DataFrame and read it in.\n", 89 | "\n", 90 | "The schema can also be inferred from the data file and it can harvest the column names automatically." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "CPU times: user 0 ns, sys: 4.66 ms, total: 4.66 ms\n", 105 | "Wall time: 6.25 s\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "#Define data schema\n", 111 | "dataschema = StructType([ \\\n", 112 | "StructField(\"inode\", LongType(),True), \\\n", 113 | "StructField(\"gnum\", LongType(),True), \\\n", 114 | "StructField(\"sid\", LongType(),True), \\\n", 115 | "StructField(\"white\",StringType(),True), \\\n", 116 | "StructField(\"size\", LongType(),True), \\\n", 117 | "StructField(\"fileset\", StringType(),True), \\\n", 118 | "StructField(\"gnum2\", LongType(),True), \\\n", 119 | "StructField(\"misc\", StringType(),True), \\\n", 120 | "StructField(\"nlink\", LongType(),True), \\\n", 121 | "StructField(\"uid\", LongType(),True), \\\n", 122 | "StructField(\"gid\", LongType(),True), \\\n", 123 | "StructField(\"mode\", StringType(),True), \\\n", 124 | "StructField(\"atime\", LongType(),True), \\\n", 125 | "StructField(\"mtime\", LongType(),True), \\\n", 126 | "StructField(\"bsize\", LongType(),True), \\\n", 127 | "StructField(\"ctime\", LongType(),True), \\\n", 128 | "StructField(\"sep\", StringType(),True), \\\n", 129 | "StructField(\"path\", StringType(),True)])\n", 130 | "\n", 131 | "#Read in data with given schema\n", 132 | "%time dfdir = spark.read.csv(infile,sep=\" \",schema=dataschema)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 5, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "+--------+-----+---+-----+------+----------+-----+----+-----+-----+-----+----------+----------+----------+-------+----------+---+--------------------+\n", 147 | "| inode| gnum|sid|white| size| fileset|gnum2|misc|nlink| uid| gid| mode| atime| mtime| bsize| ctime|sep| path|\n", 148 | "+--------+-----+---+-----+------+----------+-----+----+-----+-----+-----+----------+----------+----------+-------+----------+---+--------------------+\n", 149 | "|61696771|65539|701| | 33650|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1262887452|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 150 | "|61696772|65539|701| | 296|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1091829779|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 151 | "|61696773|65539|701| | 7194|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1269198988|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 152 | "|61696774|65539|701| | 3193|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1202326129|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 153 | "|61696775|65539|701| | 2215|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1203788411|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 154 | "|61696776|65539|701| | 2065|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1271878888|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 155 | "|61696777|65539|701| | 814|paralleldb|65539| FAu| 
1|45277|55576|-rw-r--r--|1368739473|1175621388|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 156 | "|61696778|65539|701| | 6859|paralleldb|65539| FAu| 1|45277|55576|-rwxr-xr-x|1368739473|1227047220|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 157 | "|61696779|65539|701| | 1484|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1217461768|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 158 | "|61696780|65539|701| | 1350|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1213396713|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 159 | "|61696781|65539|701| | 462|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1177522192|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 160 | "|61696782|65539|701| | 982|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1186200618|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 161 | "|61696783|65539|701| | 2556|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1203788411|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 162 | "|61696784|65539|701| | 6543|paralleldb|65539| FAu| 1|45277|55576|-rwxr-xr-x|1368739473|1280814903|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 163 | "|61696785|65539|701| | 29550|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1279657094|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 164 | "|61696786|65539|701| | 60037|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473| 860621101|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 165 | "|61696787|65539|701| | 5645|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1259002473|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 166 | "|61696788|65539|701| | 9802|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1280774455|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 167 | "|61696789|65539|701| | 5316|paralleldb|65539| FAu| 1|45277|55576|-rw-r--r--|1368739473|1207806399|4194304|1382632459| --|%2Fproject%2F.sna...|\n", 168 | "|61696790|65539|701| |131072|paralleldb|65539| D2u| 2|45277|55576|drwxr-sr-x|1468458485|1282655464| 131072|1382632459| --|%2Fproject%2F.sna...|\n", 169 | "+--------+-----+---+-----+------+----------+-----+----+-----+-----+-----+----------+----------+----------+-------+----------+---+--------------------+\n", 170 | "only showing top 20 rows\n", 171 | "\n", 172 | "CPU times: user 1.95 ms, sys: 0 ns, total: 1.95 ms\n", 173 | "Wall time: 1.5 s\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "#what did we get?\n", 179 | "%time dfdir.show()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 6, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "CPU times: user 1.31 ms, sys: 538 µs, total: 1.85 ms\n", 194 | "Wall time: 1.1 s\n" 195 | ] 196 | }, 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "90000" 201 | ] 202 | }, 203 | "execution_count": 6, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "%time dfdir.count()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "### Data cleaning\n", 217 | "Have some directories that look like\n", 218 | "/project/projectdirs/gt\n", 219 | "these are used for system-level stuff we don't care about. 
Toss 'em.\n", 220 | "\n", 221 | "Remember input file uses \"%2F\" in place of \"/\" in directory path" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 7, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "def cleaner(path,fileset):\n", 233 | " # Skip lines with projectdirs/gt in them (this is give/take directory)\n", 234 | " if \"projectdirs%2Fgt\" in path:\n", 235 | " return 0\n", 236 | " return 1" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "Only certain functions pre-canned are available: avg, collect, count, first, group, kurtosis, last, max, mean, min, skewness, stddev, sum\n", 244 | "\n", 245 | "For everything else, can use a UDF.\n", 246 | "\n", 247 | "Note the use of cache. Because we will be using this dataset again below, it tells Spark to keep it in memory." 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 8, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "CPU times: user 1.96 ms, sys: 0 ns, total: 1.96 ms\n", 262 | "Wall time: 323 ms\n", 263 | "CPU times: user 2.27 ms, sys: 423 µs, total: 2.7 ms\n", 264 | "Wall time: 5.37 s\n" 265 | ] 266 | }, 267 | { 268 | "data": { 269 | "text/plain": [ 270 | "89900" 271 | ] 272 | }, 273 | "execution_count": 8, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "#Make UDF to invoke cleaner function\n", 280 | "clean_ = udf(lambda p,q: cleaner(p,q), IntegerType())\n", 281 | "%time cleandir = dfdir.filter(clean_(col(\"path\"),col(\"fileset\"))==1).cache()\n", 282 | "%time cleandir.count()" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "### Data analysis\n", 290 | "Want to loop over all the entries and calculate the total size (in bytes) in each project directory belonging to each user. Also want to count the number of files that each user owns.\n", 291 | "\n", 292 | "Then write this out to a text file. When data is written, each process will write its tiny part of the output. This can be annoying when you have a datset that has 30,000 partitions, so use coalesce to collapse it down to a smaller number. A good value for this is the number of executor nodes, which allows Spark to avoid extra shuffling." 
293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 9, 298 | "metadata": { 299 | "collapsed": false 300 | }, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "CPU times: user 2.38 ms, sys: 970 µs, total: 3.35 ms\n", 307 | "Wall time: 61.3 ms\n", 308 | "CPU times: user 2.26 ms, sys: 0 ns, total: 2.26 ms\n", 309 | "Wall time: 1.91 s\n" 310 | ] 311 | } 312 | ], 313 | "source": [ 314 | "#Group by project and user and sum size and number of inodes\n", 315 | "%time tempdf = cleandir.groupBy([\"fileset\",\"uid\"]).agg({\"size\":\"sum\",\"fileset\":\"count\"})\n", 316 | "%time tempdf.coalesce(10).write.csv(outdir,mode=\"overwrite\")" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 10, 322 | "metadata": { 323 | "collapsed": false 324 | }, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "total 0\r\n", 331 | "-rw-r--r-- 1 lgerhard nstaff 0 Sep 20 09:56 _SUCCESS\r\n", 332 | "-rw-r--r-- 1 lgerhard nstaff 65 Sep 20 09:56 part-r-00000-66c343e9-d637-4a96-95c0-1adf2ac4c6c7.csv\r\n", 333 | "-rw-r--r-- 1 lgerhard nstaff 63 Sep 20 09:56 part-r-00001-66c343e9-d637-4a96-95c0-1adf2ac4c6c7.csv\r\n", 334 | "-rw-r--r-- 1 lgerhard nstaff 134 Sep 20 09:56 part-r-00002-66c343e9-d637-4a96-95c0-1adf2ac4c6c7.csv\r\n", 335 | "-rw-r--r-- 1 lgerhard nstaff 0 Sep 20 09:56 part-r-00003-66c343e9-d637-4a96-95c0-1adf2ac4c6c7.csv\r\n", 336 | "-rw-r--r-- 1 lgerhard nstaff 151 Sep 20 09:56 part-r-00004-66c343e9-d637-4a96-95c0-1adf2ac4c6c7.csv\r\n", 337 | "-rw-r--r-- 1 lgerhard nstaff 101 Sep 20 09:56 part-r-00005-66c343e9-d637-4a96-95c0-1adf2ac4c6c7.csv\r\n", 338 | "-rw-r--r-- 1 lgerhard nstaff 156 Sep 20 09:56 part-r-00006-66c343e9-d637-4a96-95c0-1adf2ac4c6c7.csv\r\n", 339 | "-rw-r--r-- 1 lgerhard nstaff 88 Sep 20 09:56 part-r-00007-66c343e9-d637-4a96-95c0-1adf2ac4c6c7.csv\r\n", 340 | "-rw-r--r-- 1 lgerhard nstaff 95 Sep 20 09:56 part-r-00008-66c343e9-d637-4a96-95c0-1adf2ac4c6c7.csv\r\n", 341 | "-rw-r--r-- 1 lgerhard nstaff 123 Sep 20 09:56 part-r-00009-66c343e9-d637-4a96-95c0-1adf2ac4c6c7.csv\r\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "!ls -lh /global/project/projectdirs/mpccc/lgerhard/out_temp/" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 11, 352 | "metadata": { 353 | "collapsed": false 354 | }, 355 | "outputs": [ 356 | { 357 | "name": "stdout", 358 | "output_type": "stream", 359 | "text": [ 360 | "paralleldb,55710,565,18342399426\r\n", 361 | "paralleldb,57828,1661,603758699\r\n", 362 | "paralleldb,55683,10718,196204722544\r\n", 363 | "paralleldb,59777,24,303732\r\n", 364 | "paralleldb,59408,62,5370754856\r\n", 365 | "paralleldb,56547,1157,1438890701757\r\n", 366 | "paralleldb,55684,5417,74420354020\r\n", 367 | "paralleldb,58362,56,281184410241\r\n", 368 | "paralleldb,59392,20833,104027273701\r\n", 369 | "paralleldb,60023,111,318374310\r\n", 370 | "paralleldb,61017,73,4038780\r\n", 371 | "paralleldb,32331,12,117462364\r\n", 372 | "paralleldb,34359,1,131072\r\n", 373 | "paralleldb,45277,33343,254310489275\r\n", 374 | "paralleldb,61228,2475,60413573602\r\n", 375 | "paralleldb,65141,1,26141060589\r\n", 376 | "paralleldb,57774,48,52605863028\r\n", 377 | "paralleldb,59354,332,3528086616706\r\n", 378 | "paralleldb,52894,2,1024\r\n", 379 | "paralleldb,20896,3202,24442397343\r\n", 380 | "paralleldb,56233,46,3240841809\r\n", 381 | "paralleldb,57833,12,8628847081\r\n", 382 | "paralleldb,65801,1,131072\r\n", 383 | 
"paralleldb,63033,18,1880018155\r\n", 384 | "paralleldb,59776,1103,47939605\r\n", 385 | "paralleldb,58491,2,134411671872\r\n", 386 | "paralleldb,30609,182,4097847417\r\n", 387 | "paralleldb,61504,1024,3025887324\r\n", 388 | "paralleldb,70977,1,36887\r\n", 389 | "paralleldb,43940,7200,2090311669\r\n", 390 | "paralleldb,63028,218,4674096675\r\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "!cat /global/project/projectdirs/mpccc/lgerhard/out_temp/part*" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "### Move between Spark DataFrame and Pandas DataFrame\n", 403 | "Use this with caution!!\n", 404 | "\n", 405 | "It will pull **all** of the data onto the driver node. This can **easily** exceed the available memory of the driver node for a large dataset." 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 12, 411 | "metadata": { 412 | "collapsed": false 413 | }, 414 | "outputs": [ 415 | { 416 | "name": "stdout", 417 | "output_type": "stream", 418 | "text": [ 419 | "CPU times: user 672 ms, sys: 108 ms, total: 781 ms\n", 420 | "Wall time: 2.49 s\n" 421 | ] 422 | } 423 | ], 424 | "source": [ 425 | "#Convert to pandas dataframe\n", 426 | "%time pddir = cleandir.toPandas()" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 13, 432 | "metadata": { 433 | "collapsed": false 434 | }, 435 | "outputs": [ 436 | { 437 | "data": { 438 | "text/plain": [ 439 | "" 440 | ] 441 | }, 442 | "execution_count": 13, 443 | "metadata": {}, 444 | "output_type": "execute_result" 445 | }, 446 | { 447 | "data": { 448 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX4AAAERCAYAAAB8eMxzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEk5JREFUeJzt3X+sZGddx/H3py1UCwZKla506S5aYaVCCshCKD/GFNsV\nkNUC0kXCj0ggRAoREwqBZO8SY0qIClJ+RFk2YNysSICyQuNWcWkqgTa2pZR22/UHbXctCwSMKBFp\n/frHnNudXu69O7/unTmc9yu52TnPzDznu+fO/c4z3/PMc1JVSJK646RZByBJWl8mfknqGBO/JHWM\niV+SOsbEL0kdY+KXpI4x8UtSx5j4Jalj1iTxJ3lOkmuSfDDJs9diH5Kk8azViL+A7wGnAkfWaB+S\npDEMlfiT7E5yLMnNS9q3JTmU5I4kly22V9U1VfV84K3AO6cbsiRpEsOO+PcAFw02JDkJuKJpPxfY\nkWTLkuf9B/DgSYOUJE3PKcM8qKquTbJpSfNW4HBV3QmQZB+wHTiU5DfpvyE8jP6bgyRpTgyV+Fdw\nFnD3wPYR+m8GVNWngE+t9uQkLgsqSWOoqkzy/EkS/8Q2bnz8uu7vwgufy+7d751KXwsLCywsLEyl\nr1kw/tky/tlpc+wAyUQ5H5gs8R8Fzh7Y3ti0De3IkecAT6X5oLDGDnP11e9Yh/1I0vQdPHiQgwcP\nTqWvURJ/mp9F1wPnNLX/e4BLgB2j7f4Doz18IlaWJLVXr9ej1+uxa9euifsaKvEn2Qv0gDOS3AXs\nrKo9SS4FDtCfHbS7qm4bbfcLTbe90Z42B3q93qxDmIjxz5bxz05bY5/miD+zuvRi/+Tueu77Fh79\n6Eu4665b1nGfkjRdSSY+uetaPZLUMTOd1dPmUo8krSdLPWOx1COp/Sz1SJJGZqlHklrAUs9YLPVI\naj9LPZKkkVnqkaQWsNQzFks9ktrPUo8kaWQmfknqGGv8ktQC1vjHYo1fUvtZ45ckjczEL0kdY+KX\npI4x8UtSxzirR5JawFk9Y3FWj6T2c1aPJGlkJn5J6hgTvyR1jIlfkjrGxC9JHWPil6SOcR6/JLWA\n8/jH4jx+Se3nPH5J0shM/JLUMSZ+SeoYE78kdYyJX5I6xsQvSR2zZok/yWlJrk/yvLXahyRpdGs5\n4r8M+Ks17F+SNIahEn+S3UmOJbl5Sfu2JIeS3JHksoH25wK3At8CJvqigSRpuoYd8e8BLhpsSHIS\ncEXTfi6wI8mW5u4e8DTgZcBrphKpJGkqhlqrp6quTbJpSfNW4HBV3QmQZB+wHThUVe9o2l4BfHuK\n8UqSJjTJIm1nAXcPbB+h/2Zwv6r62OpdLAzc7uFibZL0QNNcnG3RHKzOKUlaSa/Xo9fr3b+9a9eu\nifucJPEfBc4e2N7YtI1gAUf6knRiM1mWOclmYH9VPaHZPhm4HbgAuAe4DthRVbcN2Z/LMkvSiNZt\nWeYke4EvAo9NcleSV1fVfcClwAHga8C+YZO+JGl2hp3V87IV2q8Crhp/9wtY6pGkE/MKXGOx1COp\n/aZR6pmDWT09HPFL0uoc8Y/FEb+k9vOau5KkkVnqkaQWsNQzFks9ktrPUo8kaWQmfknqGGv8ktQC\n1vjHYo1fUvtZ45ckjczEL0kdY41fklrAGv9YrPFLaj9r/JKkkZn4JaljTPyS1DEmfknqGGf1SFIL\nOKtnLM7qkdR+zuqRJI3MxC9JHWPil6SOMfFLUseY+CWpY5zOKUkt4HTOsTidU1L7OZ1TkjQyE78k\ndYyJX5I6xsQvSR1j4pekjjHxS1LHmPglqWPW5AtcSbYAbwLOAD5fVR9ai/1Ikka3JiP+qjpUVa8H\nXgo8Yy32IUkaz1CJP8nuJMeS3LykfVuSQ0nuSHLZkvt+Hfgb
4HPTC1eSNKlhR/x7gIsGG5KcBFzR\ntJ8L7GhKPABU1f6qej7w8inFKkmagqFq/FV1bZJNS5q3Aoer6k6AJPuA7cChJM8BLgZOBT47xXgl\nSROa5OTuWcDdA9tH6L8ZUFVfAL5w4i4WBm73cJVOSXqgaa7KuWgOlmWWJK2k1+vR6/Xu3961a9fE\nfU6S+I8CZw9sb2zaRrCAI31JOrGZrMefZDOwv6qe0GyfDNwOXADcA1wH7Kiq24bsz/X4JWlE01iP\nf6gRf5K99IflZyS5C9hZVXuSXAocoD87aPewSf+4BRzxS9KJeQWusTjil9R+XoFLkjSyOZjV08NS\njyStzlLPWCz1SGo/Sz2SpJFZ6pGkFrDUMxZLPZLaz1KPJGlklnokqQUs9YzFUo+k9rPUI0kamYlf\nkjrGxC9JHePJXUlqAU/ujsWTu5Laz5O7kqSRmfglqWNM/JLUMZ7claQW8OTuWDy5K6n9PLkrSRqZ\niV+SOsbEL0kdY+KXpI4x8UtSx5j4JaljnMcvSS3gPP6xOI9fUvs5j1+SNDITvyR1jIlfkjrGxC9J\nHWPil6SOMfFLUses2Tz+JNuB5wM/BXykqq5eq31Jkoa3Zom/qq4ErkzycODdgIlfkubA0KWeJLuT\nHEty85L2bUkOJbkjyWXLPPUdwPsnDVSSNB2j1Pj3ABcNNiQ5CbiiaT8X2JFky8D9lwOfq6qbphCr\nJGkKhk78VXUt8N0lzVuBw1V1Z1X9ENgHbAdIcilwAfDiJK+dUrySpAlNWuM/C7h7YPsI/TcDqup9\nwPsm7F+SNGVzsDrnoh6u0ilJDzTNVTkXjbQ6Z5JNwP6qemKz/XRgoaq2NdtvBaqq3jVEX67OKUkj\nmsbqnKOO+NP8LLoeOKd5Q7gHuATYMXx3CzjSl6QTm8l6/En20s/QZwDHgJ1VtSfJrwHvoX+ieHdV\nXT5kf474JWlE6zrir6qXrdB+FXDVeLtfwBG/JJ2YV+AaiyN+Se03ixr/lC3giF+STswR/1gc8Utq\nP6+5K0kamaUeSWoBSz1jsdQjqf0s9UiSRmbil6SOscYvSS1gjX8s1vgltZ81fknSyEz8ktQx1vgl\nqQWs8Y/FGr+k9rPGL0kamYlfkjrGxC9JHWPil6SOcVaPJLWAs3rG4qweSe3nrB5J0shM/JLUMSZ+\nSeoYE78kdYyJX5I6xsQvSR3jPH5JagHn8Y/FefyS2s95/JKkkZn4JaljTPyS1DEmfknqGBO/JHWM\niV+SOmZNEn+SxyT5cJKPr0X/kqTxrUnir6p/q6rXrEXfkqTJDJX4k+xOcizJzUvatyU5lOSOJJet\nTYiSpGkadsS/B7hosCHJScAVTfu5wI4kW5Y8b6Jvl0nqhg0bNpNkXX82bNg86//2zAyV+KvqWuC7\nS5q3Aoer6s6q+iGwD9gOkOQRST4InOcnAUkncuzYnfSXcFm/n/4+u2mSRdrOAu4e2D5C/82AqvoO\n8PoTd7EwcLuHi7VJ0gNNc3G2RXOwOqckaSW9Xo9er3f/9q5duybuc5LEfxQ4e2B7Y9M2ggUc6UvS\nic1kWeYkm4H9VfWEZvtk4HbgAuAe4DpgR1XdNmR/LsssCegvNby++QAgzGpZ+klMY1nmoUb8SfbS\nH5afkeQuYGdV7UlyKXCA/kni3cMm/eMWcMQvSSfmhVjG4ohfmleO+IfnhVgkSSObg1k9PSz1SNLq\nLPWMxVKPNK8s9QzPUo8kaWSWeiSpBSz1jMVSjzSvLPUMz1KPJGlkJn5J6hhr/JLUAtb4x2KNX5pX\n1viHZ41fkjQyE78kdYw1/jW2YcPmdb/E25lnbuIb3/j6uu5TPz5m8ZrViVnjH8tsavzWLtU2s3rN\n+ncyHGv8kqSRmfglqWNM/JLUMSZ+SeoYZ/VIUgs4q2cszuqRhuGsnvnmrB5J0shM/JLUMSZ+SeoY\nE78kdYyJX5I6xumcktQCTucci9M5pWE4nXO+OZ1TkjQyE78kdYyJX5I6xsQvSR1j4pekjjHxS1LH\nmPglqWPW5AtcSU4DPgD8APhCVe1di/1Ikka3ViP+i4G/rqrXAS9co33M1LS+QTcrxj9bbY8fDs46\ngAkcnHUAMzdU4k+yO8mxJDcvad+W5FCSO5JcNnDXRuDu5vZ9U4p1rrT9D9f4Z6vt8bc7eR6cdQAz\nN+yIfw9w0WBDkpOAK5r2c4EdSbY0d99NP/lD/7vYkqQ5MVTir6prge8uad4KHK6qO6vqh8A+YHtz\n36eAFyd5P7B/WsFKkiY39CJtSTYB+6vqic32i4CLquq1zfbLga1V9cYh+2vf6kiSNAcmXaRtZssy\nTxq4JGk8k8zqOQqcPbC9sWmTJM2xURJ/eOCJ2uuBc5JsSvJg4BLgM9MMTpI0fcNO59wLfBF4bJK7\nkry6qu4DLgUOAF8D9lXVbWsXqiRpGoad1fOyqnpUVZ1aVWdX1Z6m/aqqelxV/UJVXT7sTleZ/z+3\nknw9yVeS3Jjkuqbt9CQHktye5G+TPGzWcS5a7rsXq8Wb5G1JDie5LcmFs4n6uBXi35nkSJIbmp9t\nA/fNTfxJNib5fJKvJflqkjc27a04/svEf2nT3pbjf2qSLzd/q19NsrNpn/vjv0rs0z32VbWuP/Tf\nbP4Z2AQ8CLgJ2LLecYwR978Cpy9pexfwlub2ZcDls45zILZnAucBN58oXuDxwI30T/Zvbn4/mcP4\ndwJvXuaxvzhP8QMbgPOa2w8Fbge2tOX4rxJ/K45/E9Npzb8nA1+iP/28Lcd/udineuxnsUjbavP/\n51n40U9I24GPNrc/CvzGuka0ilr+uxcrxftC+qW6e6vq68Bh+r+nmVkhflj+C4HbmaP4q+obVXVT\nc/u/gNvoT35oxfFfIf6zmrvn/vgDVNX3m5un0k+KRXuO/3KxwxSP/SwS/1kcX84B4AjHX1TzrICr\nk1yf5DVN25lVdQz6fyzAI2cW3XAeuUK8S38nR5nf38kbktyU5MMDH9XnNv4km+l/cvkSK79e2hD/\nl5umVhz/JCcluRH4BnB1VV1PS47/CrHDFI+9yzIP7/yqejLwPOB3kzyL4+/Ei9r2pbS2xfsB4Oeq\n6jz6fxR/NON4VpXkocAngDc1I+dWvV6Wib81x7+q/q+qnkT/k9bWJOfSkuO/TOyPZ8rHfhaJv5Xz\n/6vqnubfbwGfpv9x6liSMwGSbAC+ObsIh7JSvEeBRw88bi5/J1X1rWoKm8Cfc/wj7dzFn+QU+knz\nL6rqyqa5Ncd/ufjbdPwXVdV/0l+VbRstOv7wwNinfexnkfhbN/8/yWnN6IckDwEuBL5KP+5XNQ97\nJXDlsh3MztLvXqwU72eAS5I8OMljgHOA69YryFU8IP7mj3XRxcAtze15jP8jwK1V9d6BtjYd/x+J\nvy3HP8lPL5ZCkvwk8Kv0z1PM/fFfIfZDUz/2MzprvY3+TIHDwFtnEcOI8T6G/uyjG+kn/Lc27Y8A\n/q75vxwAHj7rWAd
i3gv8O/2L4dwFvBo4faV4gbfRnxFwG3DhnMb/MeDm5nfxafo127mLHzif/nLk\ni6+ZG5rX/Iqvl5bE35bj/4Qm5puaeN/etM/98V8l9qke+6EXaZMk/Xjw5K4kdYyJX5I6xsQvSR1j\n4pekjjHxS1LHmPglqWNM/GqlJG9Pckv6S2XfkOSpTfufJdkyYd+vS/8a0qM855wk+5vlca9P8vdJ\nntnc98ok32zivCXJx5P8xCQxSpNwHr9aJ8nT6a9V8pyqujfJI4AHV3/hrVnEcyr9L9e8uao+27Q9\nHvjlqvpYklcCT6mqxXX5/xI4UFUfXbFTaQ3N7GLr0gR+Fvh2Vd0LUFXfWbwjyT8Av09/hcJ30l+I\n6zTgQVX180meQv9N4yHAt4FXVbNi40AfO4HvVdUfN/19GfgV4GHA71TVPy6J57eBLy4m/SamW4Fb\nB7tt+j6l2fdyS05L68JSj9roAHB2+ldxe3+SZy99QFXtr6onVX9F1a8A726S7p8CL6qqpwJ7gD8c\nYn8nV9XTgN8DFpa5/1z6X7NfzUuT3EB/GfLTgf1D7FdaEyZ+tU5V/TfwZOC1wLeAfUlesdxjk7wF\n+H5VfQh4HPBL9K+rcCPwduBRQ+zyk82//0T/ynGrSvLJ5rJ5nxho3ldVT66qDfQX2HrLEPuV1oSl\nHrVS9U9OXQNck+SrwCvoL2R1vyTPBV4EPGuxCbilqs4fcXc/aP69j+X/Zr4G3P+po6oubkpK716h\nv/3AG+hfClBad4741TpJHpvknIGm84A7lzxmE3AF8JKq+t+m+XbgZ5qTwyQ5pTkJO9Lul2nbCzwj\nyQsG2h6yyvOeCfzLiPuVpsYRv9roocD7mnXL76W/JO1rm/sWp6m9kv4yvJ9OEuBoVb0gyUuAP22e\nezLwHh54EnapE161qar+p0n6f5LkPcAx4HvAHww87LeSnN/s826OrwsvrTunc0pSx1jqkaSOMfFL\nUseY+CWpY0z8ktQxJn5J6hgTvyR1jIlfkjrm/wFvobAy04g0AAAAAABJRU5ErkJggg==\n", 449 | "text/plain": [ 450 | "" 451 | ] 452 | }, 453 | "metadata": {}, 454 | "output_type": "display_data" 455 | } 456 | ], 457 | "source": [ 458 | "#Make some plots\n", 459 | "ax = plt.hist(pddir[\"size\"]/(1024.*1024.*1024))\n", 460 | "plt.yscale('log')\n", 461 | "plt.xlabel(\"Size in GB\")" 462 | ] 463 | } 464 | ], 465 | "metadata": { 466 | "anaconda-cloud": {}, 467 | "kernelspec": { 468 | "display_name": "pyspark (2.0.0)", 469 | "language": "python", 470 | "name": "pyspark_2.0.0" 471 | }, 472 | "language_info": { 473 | "codemirror_mode": { 474 | "name": "ipython", 475 | "version": 2 476 | }, 477 | "file_extension": ".py", 478 | "mimetype": "text/x-python", 479 | "name": "python", 480 | "nbconvert_exporter": "python", 481 | "pygments_lexer": "ipython2", 482 | "version": "2.7.12" 483 | } 484 | }, 485 | "nbformat": 4, 486 | "nbformat_minor": 0 487 | } 488 | -------------------------------------------------------------------------------- /daya_dl/walk_manifold.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import time\n", 13 | "from matplotlib import pyplot as plt\n", 14 | "import pickle\n", 15 | "import os\n", 16 | "from sklearn.manifold import TSNE\n", 17 | "from numpy.random import rand\n", 18 | "%matplotlib inline\n", 19 | "%matplotlib nbagg" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": { 26 | "collapsed": false 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "def walk_manifold(ims,ts,lbl=None,rev_label_keys=None):\n", 31 | " ''' Thanks to this example: http://matplotlib.org/examples/event_handling/pick_event_demo.html\n", 32 | " Function that plots low dim embedding and then displays orig high dim image\n", 33 | " based on what point was clicked\n", 34 | " Args:\n", 35 | " ims (numpy array): array of images -> shape is (num_ims x 8 x 24)\n", 36 | " ts (numpy array:): low dim embedding -> shape (num_ims,2)\n", 37 | "'''\n", 38 | " def onpick3(event):\n", 39 | " '''function that executes when point is clicked\n", 40 | " Args\n", 41 | " event (some matplotlib artist): data structure describing point clicked'''\n", 42 | " \n", 43 | " #get index of the point clicked\n", 44 | " ind = event.ind\n", 45 | " \n", 46 | " #pick corresponding image based on the point 
clicked\n", 47 | " im = ims[ind].reshape(8,24)\n", 48 | " \n", 49 | " #show image\n", 50 | " da_im = ax_im.imshow(im, interpolation='none',vmin=-1, vmax=1)\n", 51 | " cax = fig_s.add_axes([0.9, 0.1, 0.03, 0.8])\n", 52 | " fig_s.colorbar(da_im, cax=cax)\n", 53 | " plt.show()\n", 54 | "\n", 55 | " #create plot for scattering points\n", 56 | " fig_s, (ax_s,ax_im) = plt.subplots(1,2)\n", 57 | " if lbl is not None:\n", 58 | " colors=['r','b','g','y','k']\n", 59 | " col= [colors[int(i) - 1 ] for i in lbl]\n", 60 | " ax_s.scatter(ts[:,0],ts[:,1],color=col ,picker=True)\n", 61 | " \n", 62 | " else:\n", 63 | " col = ax_s.scatter(ts[:,0], ts[:,1],picker=True)\n", 64 | " \n", 65 | " #create plot for plotting images\n", 66 | "# fig1,ax_im = plt.subplots()\n", 67 | "\n", 68 | " \n", 69 | " #connect interactivity\n", 70 | " fig_s.canvas.mpl_connect('pick_event', onpick3)\n", 71 | "\n", 72 | " plt.show()\n", 73 | "\n", 74 | " " 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "* Red : ibd prompt\n", 82 | "* Blue: ibd delay\n", 83 | "* Green: muon\n", 84 | "* Yellow: flasher\n", 85 | "* Black: other\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": { 92 | "collapsed": false, 93 | "scrolled": true 94 | }, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "application/javascript": [ 99 | "/* Put everything inside the global mpl namespace */\n", 100 | "window.mpl = {};\n", 101 | "\n", 102 | "mpl.get_websocket_type = function() {\n", 103 | " if (typeof(WebSocket) !== 'undefined') {\n", 104 | " return WebSocket;\n", 105 | " } else if (typeof(MozWebSocket) !== 'undefined') {\n", 106 | " return MozWebSocket;\n", 107 | " } else {\n", 108 | " alert('Your browser does not have WebSocket support.' +\n", 109 | " 'Please try Chrome, Safari or Firefox ≥ 6. ' +\n", 110 | " 'Firefox 4 and 5 are also supported but you ' +\n", 111 | " 'have to enable WebSockets in about:config.');\n", 112 | " };\n", 113 | "}\n", 114 | "\n", 115 | "mpl.figure = function(figure_id, websocket, ondownload, parent_element) {\n", 116 | " this.id = figure_id;\n", 117 | "\n", 118 | " this.ws = websocket;\n", 119 | "\n", 120 | " this.supports_binary = (this.ws.binaryType != undefined);\n", 121 | "\n", 122 | " if (!this.supports_binary) {\n", 123 | " var warnings = document.getElementById(\"mpl-warnings\");\n", 124 | " if (warnings) {\n", 125 | " warnings.style.display = 'block';\n", 126 | " warnings.textContent = (\n", 127 | " \"This browser does not support binary websocket messages. \" +\n", 128 | " \"Performance may be slow.\");\n", 129 | " }\n", 130 | " }\n", 131 | "\n", 132 | " this.imageObj = new Image();\n", 133 | "\n", 134 | " this.context = undefined;\n", 135 | " this.message = undefined;\n", 136 | " this.canvas = undefined;\n", 137 | " this.rubberband_canvas = undefined;\n", 138 | " this.rubberband_context = undefined;\n", 139 | " this.format_dropdown = undefined;\n", 140 | "\n", 141 | " this.image_mode = 'full';\n", 142 | "\n", 143 | " this.root = $('
');\n", 144 | " this._root_extra_style(this.root)\n", 145 | " this.root.attr('style', 'display: inline-block');\n", 146 | "\n", 147 | " $(parent_element).append(this.root);\n", 148 | "\n", 149 | " this._init_header(this);\n", 150 | " this._init_canvas(this);\n", 151 | " this._init_toolbar(this);\n", 152 | "\n", 153 | " var fig = this;\n", 154 | "\n", 155 | " this.waiting = false;\n", 156 | "\n", 157 | " this.ws.onopen = function () {\n", 158 | " fig.send_message(\"supports_binary\", {value: fig.supports_binary});\n", 159 | " fig.send_message(\"send_image_mode\", {});\n", 160 | " fig.send_message(\"refresh\", {});\n", 161 | " }\n", 162 | "\n", 163 | " this.imageObj.onload = function() {\n", 164 | " if (fig.image_mode == 'full') {\n", 165 | " // Full images could contain transparency (where diff images\n", 166 | " // almost always do), so we need to clear the canvas so that\n", 167 | " // there is no ghosting.\n", 168 | " fig.context.clearRect(0, 0, fig.canvas.width, fig.canvas.height);\n", 169 | " }\n", 170 | " fig.context.drawImage(fig.imageObj, 0, 0);\n", 171 | " };\n", 172 | "\n", 173 | " this.imageObj.onunload = function() {\n", 174 | " this.ws.close();\n", 175 | " }\n", 176 | "\n", 177 | " this.ws.onmessage = this._make_on_message_function(this);\n", 178 | "\n", 179 | " this.ondownload = ondownload;\n", 180 | "}\n", 181 | "\n", 182 | "mpl.figure.prototype._init_header = function() {\n", 183 | " var titlebar = $(\n", 184 | " '
');\n", 186 | " var titletext = $(\n", 187 | " '
');\n", 189 | " titlebar.append(titletext)\n", 190 | " this.root.append(titlebar);\n", 191 | " this.header = titletext[0];\n", 192 | "}\n", 193 | "\n", 194 | "\n", 195 | "\n", 196 | "mpl.figure.prototype._canvas_extra_style = function(canvas_div) {\n", 197 | "\n", 198 | "}\n", 199 | "\n", 200 | "\n", 201 | "mpl.figure.prototype._root_extra_style = function(canvas_div) {\n", 202 | "\n", 203 | "}\n", 204 | "\n", 205 | "mpl.figure.prototype._init_canvas = function() {\n", 206 | " var fig = this;\n", 207 | "\n", 208 | " var canvas_div = $('
');\n", 209 | "\n", 210 | " canvas_div.attr('style', 'position: relative; clear: both; outline: 0');\n", 211 | "\n", 212 | " function canvas_keyboard_event(event) {\n", 213 | " return fig.key_event(event, event['data']);\n", 214 | " }\n", 215 | "\n", 216 | " canvas_div.keydown('key_press', canvas_keyboard_event);\n", 217 | " canvas_div.keyup('key_release', canvas_keyboard_event);\n", 218 | " this.canvas_div = canvas_div\n", 219 | " this._canvas_extra_style(canvas_div)\n", 220 | " this.root.append(canvas_div);\n", 221 | "\n", 222 | " var canvas = $('');\n", 223 | " canvas.addClass('mpl-canvas');\n", 224 | " canvas.attr('style', \"left: 0; top: 0; z-index: 0; outline: 0\")\n", 225 | "\n", 226 | " this.canvas = canvas[0];\n", 227 | " this.context = canvas[0].getContext(\"2d\");\n", 228 | "\n", 229 | " var rubberband = $('');\n", 230 | " rubberband.attr('style', \"position: absolute; left: 0; top: 0; z-index: 1;\")\n", 231 | "\n", 232 | " var pass_mouse_events = true;\n", 233 | "\n", 234 | " canvas_div.resizable({\n", 235 | " start: function(event, ui) {\n", 236 | " pass_mouse_events = false;\n", 237 | " },\n", 238 | " resize: function(event, ui) {\n", 239 | " fig.request_resize(ui.size.width, ui.size.height);\n", 240 | " },\n", 241 | " stop: function(event, ui) {\n", 242 | " pass_mouse_events = true;\n", 243 | " fig.request_resize(ui.size.width, ui.size.height);\n", 244 | " },\n", 245 | " });\n", 246 | "\n", 247 | " function mouse_event_fn(event) {\n", 248 | " if (pass_mouse_events)\n", 249 | " return fig.mouse_event(event, event['data']);\n", 250 | " }\n", 251 | "\n", 252 | " rubberband.mousedown('button_press', mouse_event_fn);\n", 253 | " rubberband.mouseup('button_release', mouse_event_fn);\n", 254 | " // Throttle sequential mouse events to 1 every 20ms.\n", 255 | " rubberband.mousemove('motion_notify', mouse_event_fn);\n", 256 | "\n", 257 | " rubberband.mouseenter('figure_enter', mouse_event_fn);\n", 258 | " rubberband.mouseleave('figure_leave', mouse_event_fn);\n", 259 | "\n", 260 | " canvas_div.on(\"wheel\", function (event) {\n", 261 | " event = event.originalEvent;\n", 262 | " event['data'] = 'scroll'\n", 263 | " if (event.deltaY < 0) {\n", 264 | " event.step = 1;\n", 265 | " } else {\n", 266 | " event.step = -1;\n", 267 | " }\n", 268 | " mouse_event_fn(event);\n", 269 | " });\n", 270 | "\n", 271 | " canvas_div.append(canvas);\n", 272 | " canvas_div.append(rubberband);\n", 273 | "\n", 274 | " this.rubberband = rubberband;\n", 275 | " this.rubberband_canvas = rubberband[0];\n", 276 | " this.rubberband_context = rubberband[0].getContext(\"2d\");\n", 277 | " this.rubberband_context.strokeStyle = \"#000000\";\n", 278 | "\n", 279 | " this._resize_canvas = function(width, height) {\n", 280 | " // Keep the size of the canvas, canvas container, and rubber band\n", 281 | " // canvas in synch.\n", 282 | " canvas_div.css('width', width)\n", 283 | " canvas_div.css('height', height)\n", 284 | "\n", 285 | " canvas.attr('width', width);\n", 286 | " canvas.attr('height', height);\n", 287 | "\n", 288 | " rubberband.attr('width', width);\n", 289 | " rubberband.attr('height', height);\n", 290 | " }\n", 291 | "\n", 292 | " // Set the figure to an initial 600x600px, this will subsequently be updated\n", 293 | " // upon first draw.\n", 294 | " this._resize_canvas(600, 600);\n", 295 | "\n", 296 | " // Disable right mouse context menu.\n", 297 | " $(this.rubberband_canvas).bind(\"contextmenu\",function(e){\n", 298 | " return false;\n", 299 | " });\n", 300 | "\n", 301 | " function set_focus () {\n", 302 | " 
canvas.focus();\n", 303 | " canvas_div.focus();\n", 304 | " }\n", 305 | "\n", 306 | " window.setTimeout(set_focus, 100);\n", 307 | "}\n", 308 | "\n", 309 | "mpl.figure.prototype._init_toolbar = function() {\n", 310 | " var fig = this;\n", 311 | "\n", 312 | " var nav_element = $('
')\n", 313 | " nav_element.attr('style', 'width: 100%');\n", 314 | " this.root.append(nav_element);\n", 315 | "\n", 316 | " // Define a callback function for later on.\n", 317 | " function toolbar_event(event) {\n", 318 | " return fig.toolbar_button_onclick(event['data']);\n", 319 | " }\n", 320 | " function toolbar_mouse_event(event) {\n", 321 | " return fig.toolbar_button_onmouseover(event['data']);\n", 322 | " }\n", 323 | "\n", 324 | " for(var toolbar_ind in mpl.toolbar_items) {\n", 325 | " var name = mpl.toolbar_items[toolbar_ind][0];\n", 326 | " var tooltip = mpl.toolbar_items[toolbar_ind][1];\n", 327 | " var image = mpl.toolbar_items[toolbar_ind][2];\n", 328 | " var method_name = mpl.toolbar_items[toolbar_ind][3];\n", 329 | "\n", 330 | " if (!name) {\n", 331 | " // put a spacer in here.\n", 332 | " continue;\n", 333 | " }\n", 334 | " var button = $('