├── CaloDNN-AnalyzePerformance.ipynb ├── CaloDNN-AnalyzeScan.ipynb ├── CaloDNN-Experiment-Walkthrough.ipynb ├── CaloDNN-Experiment.ipynb ├── DLKit-Generators.ipynb ├── DLKit-Models.ipynb ├── Imaging-Detector-Tutorial-Start-Here.ipynb ├── IntroToDLwithKeras-SOS.ipynb ├── IntroToDLwithKeras.ipynb ├── LArIAT-Data-Generator.ipynb ├── LArIAT-HandScan.ipynb ├── LArIAT-Visualization.ipynb ├── LArTPCDNN-Experiment.ipynb ├── LCD-BDT.ipynb ├── LCD-Data-Generator.ipynb ├── LCD-Visualization.ipynb ├── Lab-1.ipynb ├── Lab-2.ipynb ├── Lab-3-scratch.ipynb ├── Lab-3.ipynb ├── Lab-4.ipynb ├── Lab-5.ipynb ├── Lab-6.ipynb ├── NEXT-Data-Generator.ipynb ├── NEXTDNN-Experiment.ipynb ├── ParticleDetectorsIntro.ipynb ├── README.md └── Tutorial Installation.ipynb /CaloDNN-AnalyzePerformance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Single Model Energy-dependent Classification Performance Analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "scrolled": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Using GPU 3\n", 22 | "Found 12 CPUs and 4 GPUs. Using 3 threads. 
max_threads = 12\n", 23 | "HyperParameter Scan: 240 possible combiniations.\n", 24 | "______________________________________\n", 25 | "ScanConfiguration\n", 26 | "______________________________________\n", 27 | "Picked combination: 0\n", 28 | "Combo[0]={'Width': 32, 'Depth': 1, 'lr': 0.01, 'optimizer': \"'RMSprop'\", 'decay': 0.01}\n", 29 | "Model Filename: CaloDNN_32_1_0.01_RMSprop_0.01\n", 30 | "______________________________________\n", 31 | "Couldn't import dot_parser, loading of dot files will not be possible.\n" 32 | ] 33 | }, 34 | { 35 | "name": "stderr", 36 | "output_type": "stream", 37 | "text": [ 38 | "Using Theano backend.\n", 39 | "WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL:\n", 40 | " https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29\n", 41 | "\n", 42 | "Using gpu device 3: GeForce GTX 1080 (CNMeM is disabled, cuDNN 5105)\n" 43 | ] 44 | }, 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Searching in : /data/LCD/*/*.h5\n", 50 | "Found 639 files.\n", 51 | "Train Class Index Map: {'Pi0': 0, 'ChPi': 1, 'Gamma': 2, 'Ele': 3}\n", 52 | "Caching data on disk for faster processing after first epoch. 
Hope you have enough disk space.\n", 53 | "Loading Model From: TrainedModels.Test/CaloDNN_32_1_0.01_RMSprop_0.01_Merged.9\n", 54 | "Output Directory: TrainedModels/CaloDNN_32_1_0.01_RMSprop_0.01_Merged.9\n", 55 | "____________________________________________________________________________________________________\n", 56 | "Layer (type) Output Shape Param # Connected to \n", 57 | "====================================================================================================\n", 58 | "input_1 (InputLayer) (None, 25, 25, 25) 0 \n", 59 | "____________________________________________________________________________________________________\n", 60 | "input_2 (InputLayer) (None, 5, 5, 60) 0 \n", 61 | "____________________________________________________________________________________________________\n", 62 | "flatten_1 (Flatten) (None, 15625) 0 \n", 63 | "____________________________________________________________________________________________________\n", 64 | "flatten_2 (Flatten) (None, 1500) 0 \n", 65 | "____________________________________________________________________________________________________\n", 66 | "activation_1 (Activation) (None, 15625) 0 \n", 67 | "____________________________________________________________________________________________________\n", 68 | "activation_3 (Activation) (None, 1500) 0 \n", 69 | "____________________________________________________________________________________________________\n", 70 | "batch_normalization_1 (BatchNorm (None, 15625) 62500 \n", 71 | "____________________________________________________________________________________________________\n", 72 | "batch_normalization_2 (BatchNorm (None, 1500) 6000 \n", 73 | "____________________________________________________________________________________________________\n", 74 | "dense_1 (Dense) (None, 32) 500032 \n", 75 | "____________________________________________________________________________________________________\n", 76 | "dense_2 (Dense) (None, 32) 48032 
\n", 77 | "____________________________________________________________________________________________________\n", 78 | "activation_2 (Activation) (None, 32) 0 \n", 79 | "____________________________________________________________________________________________________\n", 80 | "activation_4 (Activation) (None, 32) 0 \n", 81 | "____________________________________________________________________________________________________\n", 82 | "dropout_1 (Dropout) (None, 32) 0 \n", 83 | "____________________________________________________________________________________________________\n", 84 | "dropout_2 (Dropout) (None, 32) 0 \n", 85 | "____________________________________________________________________________________________________\n", 86 | "concatenate_1 (Concatenate) (None, 64) 0 \n", 87 | "____________________________________________________________________________________________________\n", 88 | "dense_3 (Dense) (None, 4) 260 \n", 89 | "====================================================================================================\n", 90 | "Total params: 616,824.0\n", 91 | "Trainable params: 582,574.0\n", 92 | "Non-trainable params: 34,250.0\n", 93 | "____________________________________________________________________________________________________\n", 94 | "Compiling Model.\n", 95 | "Warning: optimizer configuration parameter epsilon was not set in configuration file. Will use default.\n", 96 | "Warning: optimizer configuration parameter rho was not set in configuration file. 
Will use default.\n", 97 | "Skipping Training.\n" 98 | ] 99 | } 100 | ], 101 | "source": [ 102 | "%matplotlib inline\n", 103 | "\n", 104 | "# Simply Run the usual experiment, using -L option to specify the model to load\n", 105 | "# Sets up the model and the data\n", 106 | "\n", 107 | "%run -im CaloDNN.ClassificationExperiment -- --NoAnalysis --NoTrain --cpu -L TrainedModels.Test/CaloDNN_32_1_0.01_RMSprop_0.01_Merged.9" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "scrolled": false 115 | }, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "Searching in : /data/LCD/*/*.h5\n", 122 | "Found 639 files.\n", 123 | ". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "TrainSampleList,TestSampleList,Norms,shapes=SetupData(FileSearch,\n", 129 | " ECAL,HCAL,True,NClasses,\n", 130 | " [float(NSamples)/MaxEvents,\n", 131 | " float(NTestSamples)/MaxEvents],\n", 132 | " Particles,\n", 133 | " BatchSize,\n", 134 | " multiplier,\n", 135 | " ECALShape,\n", 136 | " HCALShape,\n", 137 | " ECALNorm,\n", 138 | " HCALNorm)\n", 139 | "\n", 140 | "Test_genC = MakeGenerator(ECAL, HCAL, TestSampleList, NTestSamples, LCDNormalization(Norms),\n", 141 | " Merge=False,\n", 142 | " batchsize=BatchSize,\n", 143 | " shapes=shapes,\n", 144 | " n_threads=n_threads,\n", 145 | " multiplier=multiplier,\n", 146 | " cachefile=\"/tmp/CaloDNN-Analysis.h5\")\n", 147 | "\n", 148 | "print \"Loading Data into Memory:\"\n", 149 | "Test_genC.PreloadData(n_threads_cache)\n", 150 | "Test_X_ECAL, Test_X_HCAL, target, Test_Y = tuple(Test_genC.D)\n" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "scrolled": true 158 | }, 159 | "outputs": [ 160 | { 
161 | "name": "stderr", 162 | "output_type": "stream", 163 | "text": [ 164 | "/home/afarbin/.virtualenvs/keras2/local/lib/python2.7/site-packages/matplotlib/__init__.py:1401: UserWarning: This call to matplotlib.use() has no effect\n", 165 | "because the backend has already been chosen;\n", 166 | "matplotlib.use() must be called *before* pylab, matplotlib.pyplot,\n", 167 | "or matplotlib.backends is imported for the first time.\n", 168 | "\n", 169 | " warnings.warn(_use_error_msg)\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "# Make the ROC Curves\n", 175 | "# Also performs inference on the test data, returning the results\n", 176 | "from DLAnalysis.Classification import *\n", 177 | "result,NewMetaData=MultiClassificationAnalysis(MyModel,[Test_X_ECAL,Test_X_HCAL],Test_Y,BatchSize,\n", 178 | " IndexMap={0:'Pi0', 2:'ChPi', 3:'Gamma', 1:'Ele'})" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "# Bin the data\n", 190 | "Energy=target[:,:,2].flatten()\n", 191 | "\n", 192 | "def AUCvsEnergy(E_min=10.,E_max=510.,E_bins=100.):\n", 193 | " BD,E_binning=BinDataIndex(Energy, E_min, E_max, E_bins)\n", 194 | " # Run the Classification Analysis in Bins\n", 195 | " return BinMultiClassificationAnalysis(MyModel,Test_Y=Test_Y,Y_binning=E_binning,\n", 196 | " bin_indecies=BD, result=result,\n", 197 | " IndexMap={0:'Pi0', 2:'ChPi', 3:'Gamma', 1:'Ele'})" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": true, 205 | "scrolled": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "# Full Energy Range\n", 210 | "Res=AUCvsEnergy(10.,510.,50.)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "# 10 to 100 GeV\n", 222 | 
"Res=AUCvsEnergy(10.,110.,20.)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": { 229 | "collapsed": true 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "# 250 to 350 GeV\n", 234 | "Res=AUCvsEnergy(250.,350.,50.)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": { 241 | "collapsed": true 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "tmp=plt.hist(Energy,bins=100)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": { 252 | "collapsed": true 253 | }, 254 | "outputs": [], 255 | "source": [] 256 | } 257 | ], 258 | "metadata": { 259 | "kernelspec": { 260 | "display_name": "Python 2", 261 | "language": "python", 262 | "name": "python2" 263 | }, 264 | "language_info": { 265 | "codemirror_mode": { 266 | "name": "ipython", 267 | "version": 2 268 | }, 269 | "file_extension": ".py", 270 | "mimetype": "text/x-python", 271 | "name": "python", 272 | "nbconvert_exporter": "python", 273 | "pygments_lexer": "ipython2", 274 | "version": "2.7.10" 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 2 279 | } 280 | -------------------------------------------------------------------------------- /CaloDNN-Experiment-Walkthrough.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Calorimetry with DNNs\n", 8 | "\n", 9 | "The data from highly granular calorimeters can be viewed as 3D images, making them ideal for image classification problems. In this lab, we will apply image classification to simulated calorimeter data from the LCD detector concept for the CLIC accelerator. 
We will use the [CaloDNN](https://github.com/UTA-HEP-Computing/CaloDNN) package to systematically study different neural network architectures, optimizers, loss functions, and other hyperparameters.\n", 10 | "\n", 11 | "The data is compused of 4 particle types: electrons, neutral pions (pi0s), charged pions, and photons (gamma). The LCD calorimeter is composed of electromagnetic (ECAL) and hadronic (HCAL) sections. The simulation shoots a single particle into the calorimeter and stores a 25 by 25 by 25 cell part of the ECAL and 5 by 5 by 60 part of the HCAL around the particle. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## CaloDNN\n", 19 | "\n", 20 | "CaloDNN is a DLKit based package for studying LCD Calorimetry with DNNs.\n", 21 | "\n", 22 | "The package consists of the following files:\n", 23 | "\n", 24 | " **CaloDNN/ClassificationExperiment.py**: This is the “experiment” that drives everything.\n", 25 | "\n", 26 | " **CaloDNN/ClassificationArguments.py**: This is the file where all of the above arguments are defined and parsed. You can add your own options here if need be. Some defaults are defined here.\n", 27 | "\n", 28 | " **CaloDNN/ClassificationScanConfig.py**: This is the configuration file. The model and experiment parameters are set here. This example is setup to allow hyper-parameter scanning. It also contains the list of input files and maps the files and datasets to classes as well as controls what variables are used in the Neural Network.\n", 29 | "\n", 30 | " **CaloDNN/Models.py**: This contains the Keras models, wrapped in a DLKit ModelWrapper class.\n", 31 | "\n", 32 | " **CaloDNN/LCDData.py**: Contains the DLGenerators to read the data.\n", 33 | " \n", 34 | "Typically we run these experiments from the shell command prompt (e.g. 
here getting help):\n", 35 | "\n", 36 | " python -m CaloDNN.ClassificationExperiment --help\n", 37 | " \n", 38 | "But we can also do it in our current Jupyter session as follows:\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "%run -im CaloDNN.ClassificationExperiment -- --help" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "The experiment has 4 steps:\n", 55 | "\n", 56 | " 1. Setup Loading Data\n", 57 | " 2. Load or Build Model\n", 58 | " 3. Train Model\n", 59 | " 4. Run Analysis \n", 60 | "\n", 61 | "You can turn off steps as needed using the flags above. " 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "We will now run a test classification experiment using the --Test flag on the command line. In this mode there are a reduced number of events and epochs run. This is a good test of your setup and to walk-through the code:" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "%matplotlib inline\n", 78 | "import matplotlib.pyplot as plt\n", 79 | "%run -im CaloDNN.ClassificationExperiment -- --Test" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "You will see the evolution of the analysis as a function of epochs (in this test we are only running 20k events with 10 epochs. When we run an experiment the model is saved to CaloDNN/TrainedModels, so you can re-load the model in future analyses or further experiments with more events and epochs. The naming of the model reflects the hyper-parameter settings. \n", 87 | "\n", 88 | "At the end you will see a plot reflecting the 'success' of the model at classifying each of the four types of particles (electron, photon, charged or neutral pions). 
The 'area' (area under the curve) gives a measure of how well this hyper-parameter scan was overall at classifying each particle type... which is pretty great! Even with this short 'test'." 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "### Configuration and Hyperparameter Scanning\n", 96 | "\n", 97 | "Our DNN architecture is defined by the types, number, and dimension of layers. Hyper-parameter\n", 98 | "scanning refers to the process of searching for an optimal architecture that performs well for a\n", 99 | "task and can be trained and applied within reasonable time. Beyond the parameters that define the\n", 100 | "DNN architecture, other configuration parameters allow setting and testing activation and cost\n", 101 | "functions, optimization (e.g. minimization) techniques, and rate other training parameters.\n", 102 | "\n", 103 | "In DLKit, these parameters are set in a configuration file, which defines a single python key/value\n", 104 | "dictionary called **Config**. DLKit puts the contents of this dictionary in the global scope with the\n", 105 | "keys as the variable names. As an example, see `CaloDNN/ClassificationScanConfig.py`:\n", 106 | "\n", 107 | "```\n", 108 | "Config={\n", 109 | " \"MaxEvents\":int(3.e6),\n", 110 | " \"NTestSamples\":100000,\n", 111 | " \"NClasses\":4,\n", 112 | "\n", 113 | " \"Epochs\":1000,\n", 114 | " \"BatchSize\":1024,\n", 115 | "\n", 116 | "...\n", 117 | "\n", 118 | " # Configure Running time callback\n", 119 | " # Set RunningTime to a value to stop training after N seconds.\n", 120 | " \"RunningTime\": 2*3600,\n", 121 | "\n", 122 | " # Load last trained version of this model configuration. (based on Name var below)\n", 123 | " \"LoadPreviousModel\":True\n", 124 | " }\n", 125 | "```\n", 126 | "\n", 127 | "These parameters are fixed and will be used by the Experiment to build the model. 
\n", 128 | "\n", 129 | "An important parameter in this configuration file is the `RunningTime`, which sets duration of the training. Using this parameter, you can train a model for a fix amount of time. You can rerun the job to continue training, which will automatically load the last successful training session, as set by `LoadPreviousModel` parameter.\n", 130 | "\n", 131 | "[`CaloDNN/ClassificationScanConfig.py`](https://github.com/UTA-HEP-Computing/CaloDNN/blob/master/ClassificationScanConfig.py) is well commented. We suggest you read through the comments.\n", 132 | "\n", 133 | "For hyper-parameter scanning, it would be cumbersome to generate a new configuration file for every\n", 134 | "network we would like to try. Instead, **ScanConfig.py** uses a second dictionary to specify\n", 135 | "parameters that you would like to scan, and the **DLTools.Permutator** class to generate all possible\n", 136 | "resulting configurations. For example the following lines:\n", 137 | "\n", 138 | "```\n", 139 | "# Parameters to scan and their scan points.\n", 140 | "Params={ \"optimizer\":[\"'RMSprop'\",\"'Adam'\",\"'SGD'\"],\n", 141 | " \"Width\":[32,64,128,256,512],\n", 142 | " \"Depth\":range(1,5),\n", 143 | " \"lr\":[0.01,0.001],\n", 144 | " \"decay\":[0.01,0.001],\n", 145 | " }\n", 146 | " ```\n", 147 | "\n", 148 | "will generate 3 x 5 x 4 x 2 x 2 = 240 different configurations, which we can enumerate through. To check, we can\n", 149 | "simply run the **ClassificationScanConfig.py** file:" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "%run -m CaloDNN.ClassificationScanConfig" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "This should tell you the number possible configurations. We will select\n", 166 | "specific ones using the **-s** flag when running the experiment." 
167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": true 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "%run -im CaloDNN.ClassificationExperiment -- --Test -s 10" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "collapsed": true 184 | }, 185 | "source": [ 186 | "## Performing a Scan\n", 187 | "\n", 188 | "From above, it should be appearant that in order you can easily try all possible configurations by running the same command with all possible values of the `-s` parameter. And since every configuration is independent, you can run the experiments in parallel. \n", 189 | "\n", 190 | "### PBS/Torque Batch System\n", 191 | "\n", 192 | "On most GPU equipped clusters, like UTA-DL, a batch system allows you to submit \"jobs\" into \"queues\" which will then execute each job when appropriate resources become available. \n", 193 | "\n", 194 | "You can get a list of available queue, using the `qstat -Q` command:" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "!qstat -Q" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "On the UTA-DL cluster, the queues are setup as follows. The `cpu_queue` and `gpu_queue` routing queues send jobs to CPU and GPU resources on each of 5 nodes:\n", 211 | "\n", 212 | " * thecount: 44-core 10 GPU\n", 213 | " * super: 24-core 4 GPU\n", 214 | " * thingone and thingtwo: 6 core 4 GPU each.\n", 215 | " * oscar: 6 core 2 GPU (used for Jupyter sessions).\n", 216 | " \n", 217 | "Submitting to the queue system, requires you to write a script. 
For example, this is the script `CaloDNN/ScanJob.sh`:\n" 218 | ] 219 | }, 220 | { 221 | "cell_type": "raw", 222 | "metadata": {}, 223 | "source": [ 224 | "#PBS -V\n", 225 | "printenv\n", 226 | "mkdir -p ScanLogs\n", 227 | "output=ScanLogs/$PBS_ARRAYID.log\n", 228 | "\n", 229 | "echo $output >> $output\n", 230 | "echo Running on $HOSTNAME >> $output\n", 231 | "echo Array Number: $PBS_ARRAYID >> $output\n", 232 | "echo Queue: $PBS_QUEUE >> $output\n", 233 | "\n", 234 | "cd ~/LCD/DLKit\n", 235 | "source setup.sh\n", 236 | "\n", 237 | "python -m CaloDNN.ClassificationExperiment -s $PBS_ARRAYID &>> $output" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "This scripts creates a directory to store the `stdout/stderr` output of the job. Sets up the environment, and starts the job. To set the `-s` parameter, we use Torque's array job mechanism, which will set the `$PBS_ARRAYID` environment variable, to an interger as specified during submission.\n", 245 | "\n", 246 | "So for example, to run configurations 10-20, we do (don't run this unless you mean it):" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "!qsub -q gpu_queue -t 10-20 CaloDNN/ScanJob.sh" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "You can monitor your jobs using the `qstat` command:" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "!qstat" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "## Analysis\n", 279 | "\n", 280 | "After you jobs start to complete, you can start viewing the performance using:" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": { 287 | "collapsed": true 288 | }, 289 | 
"outputs": [], 290 | "source": [ 291 | "!python -m DLAnalysis.Scan TrainedModels/" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "You can explore the performance of all of the models in your scan using the python notebook [`CaloDNN/AnalyzeScan-OptimizerStudy.ipynb`](https://github.com/UTA-HEP-Computing/CaloDNN/blob/master/AnalyzeScan-OptimizerStudy.ipynb). Simply make a copy of the notebook into your DLKit directory and navigate Jupyter to the notebook:" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": { 305 | "collapsed": true 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "!cp CaloDNN/AnalyzeScan-OptimizerStudy.ipynb ./AnalyzeScan-MyStudy.ipynb" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "Similarly use can use [`CaloDNN/AnalyzerPerformance.ipynb`](https://github.com/UTA-HEP-Computing/CaloDNN/blob/master/AnalyzePerformance.ipynb) to study the performance of a specific model in detail." 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "## The Experiment" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "The main driver of the experiment, [`CaloDNN/ClassificationExperiment.py`](https://github.com/UTA-HEP-Computing/CaloDNN/blob/master/ClassificationExperiment.py), is well commented. In order for you to add you own models and modify things, you should carefully read through this file." 
331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": { 337 | "collapsed": true 338 | }, 339 | "outputs": [], 340 | "source": [] 341 | } 342 | ], 343 | "metadata": { 344 | "kernelspec": { 345 | "display_name": "Python 2", 346 | "language": "python", 347 | "name": "python2" 348 | }, 349 | "language_info": { 350 | "codemirror_mode": { 351 | "name": "ipython", 352 | "version": 2 353 | }, 354 | "file_extension": ".py", 355 | "mimetype": "text/x-python", 356 | "name": "python", 357 | "nbconvert_exporter": "python", 358 | "pygments_lexer": "ipython2", 359 | "version": "2.7.12" 360 | } 361 | }, 362 | "nbformat": 4, 363 | "nbformat_minor": 2 364 | } 365 | -------------------------------------------------------------------------------- /CaloDNN-Experiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "The following runs the Classification Experiment. Remove \"--Test\" to run on full dataset for many epochs." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "scrolled": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%matplotlib inline\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "%run -im CaloDNN.ClassificationExperiment -- --Test" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from DLAnalysis.Scan import *\n", 30 | "MyModels=[MyModel]\n", 31 | "# Rename the Models using Width and Depth\n", 32 | "ResetNames(MyModels,[\"Width\",\"Depth\"])\n", 33 | "\n", 34 | "# Add the number of epochs to MetaData by counting length of history\n", 35 | "GetEpochs(MyModels)\n", 36 | "\n", 37 | "# Pull data from previous trainings into latest MetaData, with \"All_\" suffix\n", 38 | "print \"Historical Parameters:\",GetHistorical(MyModels) \n", 39 | "print \"-------------\"\n", 40 | "print \"MetaData:\",MyModel.MetaData.keys()\n", 41 | "print \"-------------\"\n", 42 | "\n", 43 | "print \"Available Parameters:\", GetGoodParams(MyModels)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "ScanTable(MyModels,['Model Name', 'Width', 'Depth', 'Epochs', 'Ele_AUC', 'Pi0_AUC', 'ChPi_AUC', 'Gamma_AUC'],[1,2,0])" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "PlotMetaDataMany(MyModels,4,\n", 62 | " [[\"History\",\"loss\"],[\"History\",\"val_loss\"]],\n", 63 | " sort=[\"Width\",\"Depth\",\"lr\",\"decay\"],switch=[\"Width\",\"Depth\"],loc=\"center left\")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# Note this will work only if you trained the same model more than once \n", 73 | "PlotMetaData(MyModels,[\"All_History.loss\"])" 74 | ] 75 | }, 76 | { 77 | 
"cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "PlotMetaDataMany(MyModels,4,\n", 83 | " [[\"All_History.loss\"],[\"All_History.val_loss\"]],\n", 84 | " sort=[\"Width\",\"Depth\",\"lr\",\"decay\"],switch=[\"Width\",\"Depth\"],loc=\"center left\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "MyModel.MetaData[\"InputMetaData\"]" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "GetHistorical(MyModels)" 103 | ] 104 | } 105 | ], 106 | "metadata": { 107 | "kernelspec": { 108 | "display_name": "Python 2", 109 | "language": "python", 110 | "name": "python2" 111 | }, 112 | "language_info": { 113 | "codemirror_mode": { 114 | "name": "ipython", 115 | "version": 2 116 | }, 117 | "file_extension": ".py", 118 | "mimetype": "text/x-python", 119 | "name": "python", 120 | "nbconvert_exporter": "python", 121 | "pygments_lexer": "ipython2", 122 | "version": "2.7.12" 123 | } 124 | }, 125 | "nbformat": 4, 126 | "nbformat_minor": 2 127 | } 128 | -------------------------------------------------------------------------------- /DLKit-Generators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# DLKit Generators\n", 8 | "\n", 9 | "DLKit is a lightweight framework for managing a large number of Keras models. It provides a wrapper for models, tools to efficiently represent and read data, and analysis functions. 
This notebook overview how to use the DLKit generators to rapidly read and process data using a large number of processes/threads.\n", 10 | "\n", 11 | "\n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## DLGenerator\n", 19 | "\n", 20 | "For most HEP applications, the data is far too big to keep in memory. So during fitting, the data needs to be loaded on the fly. Keras uses python generators to read data as it trains. Since reading data can take some time, training in this way usually takes significantly longer than loading the data into memory first. To accelerate reading, Keras enables you to read the data using multiple parallel generators, but unfortunately their implementation has several issues that make it inefficient. So `DLKit` provides generators that not only run much faster, make it easy to read data.\n", 21 | "\n", 22 | "Let's try to read the LCD data using a `DLGenerator`. `DLKit` provides a generator which can read any files from various directories and correctly mix examples in the training data. But we can do the mixing before hand to save some time during training. 
A \"premixed\" file is available at `/data/LCD/LCD-Merged-All.h5`.\n", 23 | "\n", 24 | "First lets open the file up by hand to see what is inside:" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "ECAL (3211264, 25, 25, 25)\n", 37 | "HCAL (3211264, 5, 5, 60)\n", 38 | "OneHot (3211264, 4)\n", 39 | "index (3211264,)\n", 40 | "target (3211264, 1, 5)\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "import h5py\n", 46 | "f=h5py.File('/data/LCD/LCD-Merged-All.h5')\n", 47 | "\n", 48 | "for k in f.keys():\n", 49 | " try:\n", 50 | " print k,f[k].shape\n", 51 | " except:\n", 52 | " print k,\"Not a tensor\"\n", 53 | "f.close() " 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "We see 3211264 events, which will require several hundred gigs of ram to load into memory. ECAL and HCAL are as described above. \"OneHot\" and \"index\" encode the true class of each example. \"target\" holds the energy of the particle. 
\n", 61 | "\n", 62 | "Now we can build a DLGenerator to read this file on the fly:" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 2, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "name": "stderr", 72 | "output_type": "stream", 73 | "text": [ 74 | "Using Theano backend.\n" 75 | ] 76 | }, 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "Couldn't import dot_parser, loading of dot files will not be possible.\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "# A function to Normalize the data.\n", 87 | "\n", 88 | "from DLTools.ThreadedGenerator import DLh5FileGenerator\n", 89 | "\n", 90 | "def ConstantNormalization(Norms):\n", 91 | " def NormalizationFunction(Ds):\n", 92 | " out = []\n", 93 | " for i,Norm in enumerate(Norms):\n", 94 | " Ds[i]/=Norm\n", 95 | " out.append(Ds[i])\n", 96 | " return out\n", 97 | " return NormalizationFunction\n", 98 | "\n", 99 | "def MergeInputs():\n", 100 | " def f(X):\n", 101 | " return [X[0],X[1]],X[2]\n", 102 | " return f\n", 103 | "\n", 104 | "def MakePreMixGenerator(InputFile,BatchSize,Norms=[150.,1.], Max=3e6,Skip=0, \n", 105 | " ECAL=True, HCAL=True, Energy=False, **kwargs):\n", 106 | " datasets=[]\n", 107 | "\n", 108 | " if ECAL:\n", 109 | " datasets.append(\"ECAL\")\n", 110 | " if HCAL:\n", 111 | " datasets.append(\"HCAL\")\n", 112 | "\n", 113 | " datasets.append(\"OneHot\")\n", 114 | "\n", 115 | " if Energy:\n", 116 | " datasets.append(\"target\")\n", 117 | " \n", 118 | " if ECAL and HCAL:\n", 119 | " post_f=MergeInputs()\n", 120 | " else:\n", 121 | " post_f=False\n", 122 | " \n", 123 | " pre_f=ConstantNormalization(Norms)\n", 124 | " \n", 125 | " G=DLh5FileGenerator(files=[InputFile], datasets=datasets,\n", 126 | " batchsize=BatchSize,\n", 127 | " max=Max, skip=Skip, \n", 128 | " postprocessfunction=post_f,\n", 129 | " preprocessfunction=pre_f,\n", 130 | " **kwargs)\n", 131 | " \n", 132 | " return G\n", 133 | "\n", 134 | 
"MyGen=MakePreMixGenerator(\"/data/LCD/LCD-Merged-All.h5\",1024,[150.,150.,1.])" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "`DLh5FileGenerator` takes a list of files and keys of objects to read, and delivers `BatchSize` number of examples as requested. Note that we not only read the data, but we use a `preprocessfunction` to normalize the data, and a `postprocessfunction` to format output as needed for Keras to train a ECAL and HCAL model simultaneously.\n", 142 | "\n", 143 | "Let's get some events:" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 3, 149 | "metadata": { 150 | "scrolled": true 151 | }, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "(1024, 25, 25, 25)\n", 158 | "(1024, 5, 5, 60)\n", 159 | "(1024, 4)\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "TheGen=MyGen.Generator()\n", 165 | "Data=TheGen.next()\n", 166 | "\n", 167 | "print Data[0][0].shape\n", 168 | "print Data[0][1].shape\n", 169 | "print Data[1].shape\n" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "# Mixing Generator\n", 177 | "\n", 178 | "The DLKit's mixing generator take data separated into files and appropriately mix them and label them for classification tasks." 
179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 2", 194 | "language": "python", 195 | "name": "python2" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 2 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython2", 207 | "version": "2.7.12" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 2 212 | } 213 | -------------------------------------------------------------------------------- /DLKit-Models.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# DLKit Models\n", 8 | "\n", 9 | "DLKit is a lightweight framework for managing a large number of Keras models. It provides a wrapper for models, tools to efficiently represent and read data, and analysis functions. This notebook introduces the DLKit `ModelWrapper`.\n", 10 | "\n", 11 | "## DLModels\n", 12 | "To create a DLKit model, create a new class that inherits from `DLTools.ModelWrapper` and implement the constructor (`__init__`) and `Build` functions. 
For example:\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "from DLTools.ModelWrapper import ModelWrapper\n", 22 | "\n", 23 | "from keras.layers import Input, Dense\n", 24 | "from keras.models import Model\n", 25 | "\n", 26 | "class MLPClassification(ModelWrapper):\n", 27 | " def __init__(self, Name,\n", 28 | " InputShape=(None,5),\n", 29 | " Widths=[10],\n", 30 | " Activation=\"relu\",\n", 31 | " Loss=\"categorical_crossentropy\",\n", 32 | " Optimizer=\"SGD\"):\n", 33 | "\n", 34 | " super(MLPClassification, self).__init__(Name, Loss, Optimizer)\n", 35 | "\n", 36 | " self.InputShape = self.MetaData[\"InputShape\"] = InputShape\n", 37 | " self.Widths = self.MetaData[\"Widths\"] = Widths\n", 38 | " self.Activation = self.MetaData[\"Activation\"] = Activation\n", 39 | "\n", 40 | " def Build(self):\n", 41 | " myInput = Input(shape=self.InputShape)\n", 42 | " myModel = myInput\n", 43 | "\n", 44 | " for i in range(0, len(self.Widths)-1):\n", 45 | " myModel = Dense(self.Widths[i], activation=self.Activation)(myModel)\n", 46 | "\n", 47 | " # Use softmax activation for final layer for classification\n", 48 | " i+=1\n", 49 | " myModel = Dense(self.Widths[i], activation=\"softmax\")(myModel)\n", 50 | "\n", 51 | " self.Model = Model(inputs=myInput, outputs=myModel)\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "Note that you specify any configurable parameters of you model in the constructor. Also, you can store information in the `MetaData` dictionary. This information, along with the model architecture and weights, will be saved when `ModelWrapper.Save()` is called and restored when `ModelWrapper.Load()` is called. This information will be stored in `TrainedModels/`, where `` is the name of the model you provide when you instantiate the model. 
For example:" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# Instantiate a model which takes in 10 inputs per example, has hidden layers \n", 68 | "# of width 5, 10, and 20, and classifies into 4 classes.\n", 69 | "MyModel=MLPClassification(\"MyFirstModel\",InputShape=(None,10), Widths=[5,10,20,4] )\n", 70 | "\n", 71 | "# Now Build it\n", 72 | "MyModel.Build()\n", 73 | "\n", 74 | "# And Compile\n", 75 | "MyModel.Compile()\n", 76 | "\n", 77 | "# Get Summary from the Keras model\n", 78 | "MyModel.Model.summary()\n", 79 | "\n", 80 | "MyModel.Save()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "Note that `MyModel.Model` is the Keras model, which you can then use to fit or evaluate. You can see the saved model in the `TrainedModels` directory:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "%ls TrainedModels\n", 97 | "%ls TrainedModels/MyFirstModel" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "You can load a model back:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "LoadedModel=ModelWrapper(\"MyFirstModel\")\n", 114 | "LoadedModel.Load()\n", 115 | "print LoadedModel.MetaData" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "Note that by default, `Save()` does not overwrite existing models with the same name. Also note that `MetaData[\"InputMetaData\"]` is a list of all previous saves of the model. 
" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "source": [] 133 | } 134 | ], 135 | "metadata": { 136 | "kernelspec": { 137 | "display_name": "Python 2", 138 | "language": "python", 139 | "name": "python2" 140 | }, 141 | "language_info": { 142 | "codemirror_mode": { 143 | "name": "ipython", 144 | "version": 2 145 | }, 146 | "file_extension": ".py", 147 | "mimetype": "text/x-python", 148 | "name": "python", 149 | "nbconvert_exporter": "python", 150 | "pygments_lexer": "ipython2", 151 | "version": "2.7.12" 152 | } 153 | }, 154 | "nbformat": 4, 155 | "nbformat_minor": 2 156 | } 157 | -------------------------------------------------------------------------------- /Imaging-Detector-Tutorial-Start-Here.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Imaging Detector Tutorial\n", 8 | "\n", 9 | "Particle detectors such as Calorimeters, Time Projection Chambers, and Cherenkov detectors produce 2D or 3D images of particle interactions that are typically 'reconsructed' by algorithms into features that enable identifying particle types and measuring particle energies. This tutorial presents simulated data from three different types of such detectors along with several challenging problems, solutions to which will have significant impact to High Energy Physics. The goal is to establish working groups composed of physicists and machine learning researchers aimed at systematically searching for viable solutions to these problems, which in turn can be applied to running and future particle physics experiments. \n", 10 | "\n", 11 | "Along with these datasets, this tutorial provides software that facilitates rapidly reading these generally large datasets and collaboratively building and studying Deep Neural Networks. 
Most of the suggested problems are either completely setup or a small variation of an existing example, allowing participants to focus on the network architecture and training instead of worrying about data engineering issues. The tutorial will be run on the HEP Deep Learning cluster at the University of Texas Arlington, which provides 80 CPU cores and 22 (mostly Pascal) NVidia GPUs. \n", 12 | "\n", 13 | "The plan for the tutorial is rather ambitious. The goal of the first session is to introduce the datasets, run an existing classification problem, and enable participants to use their own models and perform hyperparameter scans. Participants are encouraged to use the two days between the tutorials to try ideas and perform scans. In the second session, participants will attempt to simulate data using Generative Adversarial Networks and to measure particle energy via regression. In the final day, participants will be presented the problem of inferring 3D spatial information from two 2D images. For the most part, participants will be led through fully setup and running problems that use simple networks on down sampled data and not achieve desired performance. " 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Understanding and Visualizing the Data\n", 21 | "\n", 22 | "[Note: this section is meant to serve as reference material. The information provided here will be presented in the introductory presentations. Participants are encouraged to attempt any exercises presented in this section on their own time, and instead spend the time during the tutorial session to run Deep Neural Networks.]\n", 23 | "\n", 24 | "The [Particle Detector Introduction notebook](ParticleDetectorsIntro.ipynb) introduces the detectors and provides background information that non-physicists may find helpful to better understand the data and problems. Participants have a choice of datasets to use. 
This material is intended to facilitate making the choice of dataset.\n", 25 | "\n", 26 | "The [LCD Visualization notebook](LCD-Visualization.ipynb) introduces the LCD dataset by generating some simple visualizations. For comparison, the [LCD BDT notebook](LCD-BDT.ipynb) trains a Boosted Decision Tree (BDT) using features derived from the LCD images.\n", 27 | "\n", 28 | "The [LArIAT HandScan notebook](LArIAT-HandScan.ipynb) introduces the LArIAT dataset and leads participants to perform a handscan. This tutorial is aimed at participants with little experience in Machine Learning. It introduces basic python, numpy, and h5 file/data structure manipulation, plotting in matplotlib, and the concepts of problem formulation, training, and validation. \n", 29 | "\n", 30 | "The [LArIAT Visualization notebook](LArIAT-Visualization.ipynb) introduces the LArIAT dataset by generating some 2D and 3D visualizations." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## The DLKit Framework\n", 38 | "\n", 39 | "[Note: this section is also meant to serve as reference material. A quick overview of the information provided here will be presented in the introductory presentations. Experienced participants may find this section useful to quickly use the datasets in their own codes. Others are encouraged to refer to this material as their needs become more sophisticated.]\n", 40 | "\n", 41 | "The DLKit is a lightweight Framework built on top of Keras and is intended to facilitate rapidly reading large datasets and easily studying a large number of models and collaborating with others. Experienced participants may choose to work on the datasets using their own tools, though they are encouraged to use the data generators in DLKit for reading the data.\n", 42 | "\n", 43 | "The [DLKit Models Notebook](DLKit-Models.ipynb) introduces the DLKit ModelWrapper. 
\n", 44 | "\n", 45 | "The [DLKit Generators Notebook](DLKit-Generators.ipynb) introduces the data generators in DLKit. \n", 46 | "\n", 47 | "The [LCD Data Generator notebook](LCD-Data-Generator.ipynb) demonstrates how to read, mix, and down sample the LArIAT data using the multi-threaded generators in DLKit. This notebook may be useful for experienced participants who wish to rapidly adapt their own code to run on this data sample.\n", 48 | "\n", 49 | "The [LArIAT Data Generator notebook](LArIAT-Data-Generator.ipynb) demonstrates how to read, mix, and down sample the LArIAT data using the multi-threaded generators in DLKit. This notebook may be useful for experienced participants who wish to rapidly adapt their own code to run on this data sample.\n", 50 | "\n", 51 | "The [NEXT Data Generator notebook](NEXT-Data-Generator.ipynb) demonstrates how to read, mix, and down sample the NEXT data using the multi-threaded generators in DLKit. This notebook may be useful for experienced participants who wish to rapidly adapt their own code to run on this data sample." 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Running an Experiment\n", 59 | "\n", 60 | "In this context, \"experiment\" refers to the process of developing a Deep Neural Network (DNN) to perform a specific task on a specific data set. In DLKit, an experiment sets up a problem, reads the data, builds a DNN, trains it, and assess the performance.\n", 61 | "\n", 62 | "We will learn how to run an experiment by following the [CaloDNN Experiment Walkthrough notebook](CaloDNN-Experiment-Walkthrough.ipynb). Note that all of the other experiments work exactly the same way.\n", 63 | "\n", 64 | "The following three notebooks simply run each corresponding experiment within a notebook. 
Participants are encouraged to copy these notebooks and edit them as needed.\n", 65 | "\n", 66 | " * [CaloDNN Experiment notebook](CaloDNN-Experiment.ipynb)\n", 67 | " * [LArTPCDNN Experiment notebook](LArTPCDNN-Experiment.ipynb)\n", 68 | " * [NEXTDNN Experiment notebook](NEXTDNN-Experiment.ipynb)\n", 69 | " \n", 70 | "After training a single DNN model, you can closely examine its performance by loading the model, applying it to some test data, and making plots. The [CaloDNN Analyze Performance notebook](CaloDNN-AnalyzePerformance.ipynb) demonstrates how to study a trained model in detail.\n", 71 | "\n", 72 | "After training a set of DNN models in a hyperparameter scan, you can compare the performance of models using tools in DLKit. The [CaloDNN Analyze Scan notebook](CaloDNN-AnalyzeScan.ipynb) demonstrates how to compare a large number of models." 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Day 1 Challenge\n", 80 | "\n", 81 | "All of the experiments presented above implement the simplest possible network, a fully connected DNN with width and depth specified or scanned. Since the datasets are composed of 2D and 3D images, Convolutional Neural Networks (CNNs) are probably the best suited. 
\n", 82 | "\n", 83 | "We challenge participants to choose a dataset, implement a more sophisticated model than the fully connected DNN, perform a hyper-parameter scan in the batch queues, compare the performance of the trained models, and show the energy dependence of the performance for the best model.\n" 84 | ] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": "Python 2", 90 | "language": "python", 91 | "name": "python2" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 2 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython2", 103 | "version": "2.7.12" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 2 108 | } 109 | -------------------------------------------------------------------------------- /IntroToDLwithKeras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# An Introduction to Deep Learning with Keras\n", 8 | "\n", 9 | "This tutorial is meant to teach a beginning HEP under-graduate or graduate student who may be unfamiliar with python or data science in python to train Deep Learning models using Keras. Usually, tutorials attempt to familiarize you with a software package by leading you through the steps of a few tasks. Like most tutorials, there are many sections where you can simply follow the instructions and execute example code like a robot (usualy via copy/paste into a terminal, but in this case using Jupyter notebooks). But this tutorial also aims to teach you key concepts in scientific computing, Machine Learning, and Deep Learning in python through exercises that require you to slow down, think critically, and apply what you read. 
The exercises were derived from labs for a Deep Learning in HEP course for undergrads (taught at University of Texas at Arlington by Amir Farbin). \n", 10 | "\n", 11 | "The tutorial is divided into three sections:\n", 12 | "\n", 13 | "A. Basics\n", 14 | " 0. Jupyter\n", 15 | " 1. Python\n", 16 | " 2. Numpy\n", 17 | " 3. HDF5\n", 18 | " \n", 19 | "B. MachineLearning\n", 20 | " 1. Dataset\n", 21 | " 2. Pandas\n", 22 | " 3. Scikit-learn\n", 23 | "\n", 24 | "C. DeepLearning\n", 25 | " 1. Keras\n", 26 | "\n", 27 | "You are very likely to find that the first sections of this tutorial are very basic. If you have some familiarity with data science in python, we suggest you skip what you know and only follow sections B.1, B.2, and C.\n", 28 | "\n", 29 | "Please quickly skim the beginning sections to find the appropriate starting point for you. If there isn't sufficient time for you to finish the exercises during the tutorial session please read through the explanations, execute the cells containing the examples, and think about how you would solve the exercises. You can go back and try the exercises at home." 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "\n", 37 | "# A. Basics\n", 38 | "\n", 39 | "Data Science in python usually starts with loading hdf5 files into numpy tensors for manipulation in an interactive python session. While you can run the session in a terminal, Jupyter provides a nice web-based alternative environment for data analysis. As you can see (since you should be running this in Jupyter Notebbok), it allows you to combine text, code, and results all in one interactive document in your browser. There are many excellent primers already out there for python, numpy, h5py, and jupyter. You are encouraged to study them on your own as needed. " 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "\n", 47 | "## 0. 
Jupyter\n", 48 | "If you are seeing this page, you have successfully connected to a python server via ssh tunnel and navigated to this notebook. Jupyter notebooks consist of cells that can hold text or code (usually python). This text that you are reading, was written into a text cell as simple text \"coding\" language known as mark-down. When this cell is run (either automatically at start of the notebook or manually by pressing shift-enter), the mark-down text is interpreted into nice looking text. Running a code cell will execute the code in that cell and give you the results. If you make a mistake, you can usually simply change the cell and re-run. But be aware that since you ran the mistaken cell already, whatever code was properly executed before your mistake/error, was already executed and has therefore changed your current python environment accordingly. In some cases this situation will be problematic, and you will need to rerun the notebook from the start by pressing the \"reload\" botton (next to the \"stop\" button) above.\n", 49 | "\n", 50 | "You are encouraged to add cells to this notebook (using the \"+\" button on the tool bar) and play around a bit. If you don't want to mess up this notebook, you can work in a copy of this notebook by selecting Make Copy from the File menu." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "\n", 58 | "## 1. Python\n", 59 | "\n", 60 | "Here we are assuming you have some basic level of python knowledge, such as the syntax. There are many great python tutorials available. For an introductory level interactive tutorial you can try this one: http://www.learnpython.org/\n", 61 | "\n", 62 | "\n", 63 | "We will lead you through exercises that show you common fundamental problems you might face when doing a deep learning problem. Lets start with generating some fake random data. 
You can get a random number between 0 and 1 using the python random module as follow: (remember to execute this in Jupyter notebooks click in the cell and hit shift-enter)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "import random\n", 75 | "x=random.random()\n", 76 | "print \"The Value of x is\", x" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "### Exercise A.1.1\n", 84 | "Using the random method (shown above), write a function GenerateData(N, mymin, mymax), that returns a python list containing N random numbers between a specified minimum and maximum value. Note that you may want to quickly work out on paper how to turn numbers between 0 and 1 to other values. " 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "# Skeleton\n", 96 | "def GenerateData(N,min,max):\n", 97 | " out = []\n", 98 | " ### BEGIN SOLUTION\n", 99 | "\n", 100 | " # Fill in your solution here \n", 101 | " \n", 102 | " ### END SOLUTION\n", 103 | " return out\n", 104 | "\n", 105 | "Data=GenerateData(1000,-10,10)\n", 106 | "print \"Data Type:\", type(Data)\n", 107 | "print \"Data Length:\", len(Data)\n", 108 | "if len(Data)>0: \n", 109 | " print \"Type of Data Contents:\", type(Data[0])\n", 110 | " print \"Data Minimum:\", min(Data)\n", 111 | " print \"Data Maximum:\", max(Data)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "### Exercise A.1.2\n", 119 | "\n", 120 | "Write a function that computes the mean of values in a list." 
121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "# Skeleton\n", 132 | "def mean(Data):\n", 133 | " m=0\n", 134 | " ### BEGIN SOLUTION\n", 135 | "\n", 136 | " # Fill in your solution here \n", 137 | " \n", 138 | " ### END SOLUTION \n", 139 | " return m\n", 140 | "\n", 141 | "print \"Mean of Data:\", mean(Data)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "### Exercise A.1.3\n", 149 | "\n", 150 | "Write a function the applies a booling function (that returns true/false) to every element in data, and returns a list of indices of elements where the result was true. Use this function to find the indices of positive entries. (This might be something you want to do if you are applying some selection criteria to your dataset and only want to keep events/entries/examples that pass the criteria.)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "collapsed": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "def where(mylist,myfunc):\n", 162 | " out= []\n", 163 | " ### BEGIN SOLUTION\n", 164 | "\n", 165 | " # Fill in your solution here \n", 166 | " \n", 167 | " ### END SOLUTION \n", 168 | " return out" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "### Exercise A.1.4\n", 176 | "\n", 177 | "The inrange(mymin,mymax) function below returns a function that tests if it's input is between the specified values. Use this function, in conjunction to your solution to Exercise 1.3, to demonstrate that your data is \"flat\". Hint: pick several sub-ranges and show that the number of data point divided by the size of the range is roughly constant. 
" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "def inrange(mymin,mymax):\n", 189 | " def testrange(x):\n", 190 | " return x=mymin\n", 191 | " return testrange\n", 192 | "\n", 193 | "# Examples:\n", 194 | "F1=inrange(0,10)\n", 195 | "F2=inrange(10,20)\n", 196 | "\n", 197 | "print F1(0), F1(1), F1(10), F1(15), F1(20)\n", 198 | "print F2(0), F2(1), F2(10), F2(15), F2(20)\n", 199 | "\n", 200 | "print \"Number of Entries passing F1:\", len(where(Data,F1))\n", 201 | "print \"Number of Entries passing F2:\", len(where(Data,F2))" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "### Exercise A.1.5\n", 209 | "\n", 210 | "Repeat Exercise 1.4 using the built in python functions sum and map instead of your solution to 1.3. " 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "### BEGIN SOLUTION\n", 222 | "\n", 223 | "# Fill in your solution here \n", 224 | " \n", 225 | "### END SOLUTION" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "### Exercise A.1.6\n", 233 | "\n", 234 | "Write a new function called GenerateDataFromFunction(N,mymin,mymax,myfunc), that instead of generating a flat distribution, generates a distribution with a functional form coded in myfunc. Note that for this exercise myfunc should always be > 0. \n", 235 | "\n", 236 | "For this exercise, let us make myfunc a Gaussian distribution, which is given below. Generate 1000 numbers that follow this Gaussian distribution. Confirm that the mean of the generated data is close to mean you specified when building the Gaussian. 
\n", 237 | "\n", 238 | "Hint: A simple, but slow, solution to generate data with a given distribution is to a draw random number, let's call it test_x within the specified range (mymin, mymax) and another number p between the minimum and maximum values of the function myfunc (which you will have to determine). If p<=function(test_x), then place test_x on the output. If not, repeat the process, drawing two new numbers. Repeat until you have the specified number of generated numbers, N, in the output. This method is often called \"the accept and reject sampling method\". For this problem, it's OK to determine the min and max by numerically sampling the function. \n" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "def GenerateDataFromFunction(N,mymin,mymax,myfunc):\n", 250 | " out = []\n", 251 | " ### BEGIN SOLUTION\n", 252 | "\n", 253 | " # Fill in your solution here \n", 254 | " \n", 255 | " ### END SOLUTION \n", 256 | " return out" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "import math\n", 268 | "\n", 269 | "def gaussian(mean, sigma):\n", 270 | " def f(x):\n", 271 | " return (1/math.sqrt(2*math.pi*sigma**2))*math.exp(-( (x-mean)**2)/(2*(sigma**2) ))\n", 272 | " return f\n", 273 | "\n", 274 | "# Example Instantiation\n", 275 | "g1=gaussian(0,1)\n", 276 | "g2=gaussian(10,3)\n", 277 | "\n", 278 | "### BEGIN SOLUTION\n", 279 | "\n", 280 | "# Fill in your solution here \n", 281 | " \n", 282 | "### END SOLUTION\n" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "\n", 290 | "## 2. Numpy\n", 291 | "\n", 292 | "[Numpy](http://www.numpy.org) is the tensor manipulation package most commonly used in python-based scientific computing. 
Numpy tensor interface is also adopted by all packages that provide tensors (e.g. h5py, theano, TensorFlow, ...). " 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "## Exercise A.2.1\n", 300 | "\n", 301 | "Let start with some basic reshape manipulations. Consider a classification task. We can imagine the training data X consisting of N examples each with M inputs, so the shape of X is (M,N). The output of the Neural Network for the training sample encodes the true class of each of the N examples in X, in a \"one-hot\" matrix of shape (N,C), where C is the number of classes and each row corresponds to the true class for the corresponding example in X. So for a given row Y[i], all elements are 0 except for the column corresponding to the true class.\n", 302 | "\n", 303 | "For example consider a classification task of separating between 4 classes. We'll call them A, B, C, and D.\n" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "collapsed": true 311 | }, 312 | "outputs": [], 313 | "source": [ 314 | "import numpy as np\n", 315 | "\n", 316 | "Y=np.array( [ [0, 1, 0, 0], # Class B\n", 317 | " [1, 0, 0, 0], # Class A\n", 318 | " [0, 0, 0, 1], # Class C\n", 319 | " [0, 0, 1, 0] # Class D\n", 320 | " ])\n", 321 | "\n", 322 | "print \"Shape of Y:\", Y.shape" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "Lets imagine that we want to change to 2 classes instead by combining classes A with B and C with D. Use np.reshape and np.sum to create a new vector Y1. Hint: change the shape of Y into (8,2), sum along the correct axes, and change shape to (4,2). 
LH: solution given" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": { 336 | "collapsed": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "print \"Transpose:\", np.transpose(Y)\n", 341 | "print \"Reshape 8,2:\", np.transpose(Y).reshape((8,2))\n", 342 | "print \"Sum:\", np.sum(np.transpose(Y).reshape((8,2)),axis=1)\n", 343 | "\n", 344 | "Y1= np.sum(np.transpose(Y)\n", 345 | " .reshape((8,2)),axis=1).reshape(4,2)\n", 346 | "print \"Answer: \",Y1" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "## Exercise A.2.2\n", 354 | "\n", 355 | "Oftentimes we find that neutral networks work best when their input is mostly between 0,1. Below, we create a random dataset that is normal distributed (mean of 4, sigma of 10). Shift the data so that the mean is 0.5 and 68% of the data lies between 0 and 1. LH: solution given." 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "collapsed": true 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "X=np.random.normal(4,10,1000)\n", 367 | "print np.mean(X)\n", 368 | "print np.min(X)\n", 369 | "print np.max(X)\n", 370 | "print np.var(X)" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "collapsed": true 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "import math\n", 382 | "X1=(X-np.mean(X))/math.sqrt(np.var(X)) # Replace X with your answer\n", 383 | "\n", 384 | "print np.mean(X1)\n", 385 | "print np.var(X1)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "## Exercise A.2.3\n", 393 | "\n", 394 | "Using np.random.random and np.random.normal to generate two datasets. Then use np.where to repeat exercise 1.4 showing that one creates a flat distribution and the other does not by binning the data by hand." 
395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": { 401 | "collapsed": true 402 | }, 403 | "outputs": [], 404 | "source": [ 405 | "X0=np.random.random(1000)\n", 406 | "\n", 407 | "def CheckFlatness(D,steps=10):\n", 408 | " maxD=np.max(D)\n", 409 | " minD=np.min(D)\n", 410 | " i=minD\n", 411 | " stepsize=(maxD-minD)/steps\n", 412 | " while ii) ))\n", 414 | " i+=stepsize\n", 415 | " \n", 416 | "CheckFlatness(X0)\n", 417 | "CheckFlatness(X)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "\n", 425 | "## 3. h5py\n", 426 | "\n", 427 | "[HDF5](https://support.hdfgroup.org/HDF5/) is a \"data model, library, and file format for storing and managing data.\" It is also the most common storage format in data science. [h5py](http://www.h5py.org) provides a python API for HDF5. In most cases, you do not need to know very much about HDF5 or h5py, just how to read/write tensors into/from files, which you can easily pick up from the [h5py Quick Start](http://docs.h5py.org/en/latest/quick.html#quick). We won't be using HDF5 for this tutorial. This section is here as reference for when you do encounter an HDF5 file.\n" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "\n", 435 | "# B. Machine Learning\n", 436 | "\n", 437 | "For the remainder of this tutorial, we will attempt to follow the first paper on Deep Learning in High Energy physics [P. Baldi, et al](https://arxiv.org/pdf/1402.4735.pdf). This paper demonstrates that Deep Neural Networks can learn from raw data the features that are typically used for searches for exotics particles. The authors publically provide the two benchmark scenarios considered in the paper. We will focus on the SUSY benchmark. \n", 438 | "\n", 439 | "\n", 440 | "## 1. The Dataset\n", 441 | "\n", 442 | "The data is distributed as a comma separated values (CSV) file. 
If you are running on lxplus, you can find a local copy to use as input. Otherwise, download the ~ GB compressed file from [UCI's ML Archive](http://archive.ics.uci.edu/ml/datasets/SUSY), use `gunzip` to decompress it, and change the path to the file below accordingly." 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "filename=\"/afs/cern.ch/user/a/afarbin/public/AML-Tutorial/UCI/SUSY.csv\"\n", 452 | "# print out the first 5 lines using unix head command (note in jupyter ! => shell command)\n", 453 | "!head -5 \"/afs/cern.ch/user/a/afarbin/public/AML-Tutorial/UCI/SUSY.csv\"" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "Each row represents a LHC collision event. Each column contains some observable from that event. The variable names are:" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": { 467 | "collapsed": true 468 | }, 469 | "outputs": [], 470 | "source": [ 471 | "VarNames=[\"signal\", \"l_1_pT\", \"l_1_eta\",\"l_1_phi\", \"l_2_pT\", \"l_2_eta\", \"l_2_phi\", \"MET\", \"MET_phi\", \"MET_rel\", \"axial_MET\", \"M_R\", \"M_TR_2\", \"R\", \"MT2\", \"S_R\", \"M_Delta_R\", \"dPhi_r_b\", \"cos_theta_r1\"]" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "Some of these variables represent the \"raw\" kinematics of the observed final state particles, while others are \"features\" that are derived from these raw quantities:" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": { 485 | "collapsed": true 486 | }, 487 | "outputs": [], 488 | "source": [ 489 | "RawNames=[\"l_1_pT\", \"l_1_eta\",\"l_1_phi\", \"l_2_pT\", \"l_2_eta\", \"l_2_phi\"]\n", 490 | "FeatureNames=[ \"MET\", \"MET_phi\", \"MET_rel\", \"axial_MET\", \"M_R\", \"M_TR_2\", \"R\", \"MT2\", \"S_R\", 
\"M_Delta_R\", \"dPhi_r_b\", \"cos_theta_r1\"]" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "\n", 498 | "## 2. Pandas\n", 499 | "\n", 500 | "We will use [pandas](http://pandas.pydata.org) to read in the file, and [matplotlib](https://matplotlib.org) to make plots. Pandas provides \"data structures and data analysis tools for the Python Programming Language\". Many machine learning tasks can be accomplished with [numpy](http://www.numpy.org) tensors and [h5py](http://www.h5py.org) files. In this case, pandas just makes it very easy to read a CSV file.\n", 501 | "\n", 502 | "The following ensures pandas is installed and sets everything up:" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": { 509 | "collapsed": true 510 | }, 511 | "outputs": [], 512 | "source": [ 513 | "import pandas as pd\n", 514 | "import matplotlib.pyplot as plt\n", 515 | "%matplotlib inline" 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": {}, 521 | "source": [ 522 | "Now we can read the data into a pandas dataframe. It's a ~GB file, so be patient." 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "df = pd.read_csv(filename, dtype='float64', names=VarNames)" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": {}, 537 | "source": [ 538 | "Another nice feature of pandas is that you can see the data in Jupyter by just evaluating the dataframe:" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": { 545 | "collapsed": true 546 | }, 547 | "outputs": [], 548 | "source": [ 549 | "df" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "The first column stores the \"truth\" label of whether an event was signal or background. 
Pandas makes it easy to create dataframes that store only the signal or background events:" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": { 563 | "collapsed": true 564 | }, 565 | "outputs": [], 566 | "source": [ 567 | "df_sig=df[df.signal==1]\n", 568 | "df_bkg=df[df.signal==0]" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "The following example plots the signal and background distributions of every variable. Note that we use VarNames[1:] to skip the first variable, which was the true label. \n", 576 | "\n", 577 | "We will use matplotlib for plotting. There are lots of tutorials and primers out there that you can find searching the web. A good tutorial can be found in the [Scipy Lectures](http://www.scipy-lectures.org/intro/matplotlib/matplotlib.html). Look through these on your own time, it is not necessary for doing these exercise. The code below is all you need to know for making histograms with matplotlib." 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": { 584 | "collapsed": true 585 | }, 586 | "outputs": [], 587 | "source": [ 588 | "for var in VarNames[1:]:\n", 589 | " print var\n", 590 | " plt.figure()\n", 591 | " plt.hist(df_sig[var],bins=100,histtype=\"step\", color=\"red\",label=\"background\",stacked=True)\n", 592 | " plt.hist(df_bkg[var],bins=100,histtype=\"step\", color=\"blue\", label=\"signal\",stacked=True)\n", 593 | " plt.legend(loc='upper right')\n", 594 | " plt.show()" 595 | ] 596 | }, 597 | { 598 | "cell_type": "markdown", 599 | "metadata": {}, 600 | "source": [ 601 | "\n", 602 | "## 3. Scikit-learn\n", 603 | "\n", 604 | "[Scikit-learn](http://scikit-learn.org) is a rich python library for data science, including machine learning. As an example, we can easily build a Fisher Discriminant (aka Linear Discriminant Analysis, or LDA). 
The [LDA Documentation](http://scikit-learn.org/stable/modules/lda_qda.html#dimensionality-reduction-using-linear-discriminant-analysis) does a great job explaining this classifier. Here's how we instantiate the classifier: " 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": { 611 | "collapsed": true 612 | }, 613 | "outputs": [], 614 | "source": [ 615 | "import sklearn.discriminant_analysis as DA\n", 616 | "Fisher=DA.LinearDiscriminantAnalysis()" 617 | ] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": {}, 622 | "source": [ 623 | "Let's separate the data into inputs (X) vs outputs (Y) and training vs testing samples:" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "metadata": { 630 | "collapsed": true 631 | }, 632 | "outputs": [], 633 | "source": [ 634 | "N_Train=4000000\n", 635 | "\n", 636 | "Train_Sample=df[:N_Train]\n", 637 | "Test_Sample=df[N_Train:]\n", 638 | "\n", 639 | "X_Train=Train_Sample[VarNames[1:]]\n", 640 | "y_Train=Train_Sample[\"signal\"]\n", 641 | "\n", 642 | "X_Test=Test_Sample[VarNames[1:]]\n", 643 | "y_Test=Test_Sample[\"signal\"]\n", 644 | "\n", 645 | "Test_sig=Test_Sample[Test_Sample.signal==1]\n", 646 | "Test_bkg=Test_Sample[Test_Sample.signal==0]" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "We can train the classifier as follows:" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": null, 659 | "metadata": { 660 | "collapsed": true 661 | }, 662 | "outputs": [], 663 | "source": [ 664 | "Fisher.fit(X_Train,y_Train)" 665 | ] 666 | }, 667 | { 668 | "cell_type": "markdown", 669 | "metadata": {}, 670 | "source": [ 671 | "We can plot the output, comparing signal and background:" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "metadata": { 678 | "collapsed": true 679 | }, 680 | "outputs": [], 681 | "source": [ 682 | 
"plt.figure()\n", 683 | "plt.hist(Fisher.decision_function(Test_sig[VarNames[1:]]),bins=100,histtype=\"step\", color=\"blue\", label=\"signal\",stacked=True)\n", 684 | "plt.hist(Fisher.decision_function(Test_bkg[VarNames[1:]]),bins=100,histtype=\"step\", color=\"red\", label=\"background\",stacked=True)\n", 685 | "plt.legend(loc='upper right')\n", 686 | "plt.show()" 687 | ] 688 | }, 689 | { 690 | "cell_type": "markdown", 691 | "metadata": {}, 692 | "source": [ 693 | "And we can make a ROC curve and evaluate the AUC:" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": null, 699 | "metadata": { 700 | "collapsed": true 701 | }, 702 | "outputs": [], 703 | "source": [ 704 | "from sklearn.metrics import roc_curve, auc\n", 705 | "fpr, tpr, _ = roc_curve(y_Test, Fisher.decision_function(X_Test))\n", 706 | "\n", 707 | "roc_auc = auc(fpr, tpr)\n", 708 | "\n", 709 | "plt.plot(fpr,tpr,color='darkorange',label='ROC curve (area = %0.2f)' % roc_auc)\n", 710 | "plt.legend(loc=\"lower right\")\n", 711 | "plt.xlabel('False Positive Rate')\n", 712 | "plt.ylabel('True Positive Rate')\n", 713 | "\n", 714 | "plt.show()" 715 | ] 716 | }, 717 | { 718 | "cell_type": "markdown", 719 | "metadata": {}, 720 | "source": [ 721 | "## Exercise B.3.1\n", 722 | "\n", 723 | "Train the Fisher performance using the raw, features, and raw+features as input. Compare the performance one a single plot. Add cells to this notebook as needed. Or start new notebooks." 
724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": { 730 | "collapsed": true 731 | }, 732 | "outputs": [], 733 | "source": [ 734 | "X_Train_Raw=Train_Sample[RawNames]\n", 735 | "X_Test_Raw=Test_Sample[RawNames]\n", 736 | "\n", 737 | "X_Train_Features=Train_Sample[FeatureNames]\n", 738 | "X_Test_Features=Test_Sample[FeatureNames]" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": null, 744 | "metadata": { 745 | "collapsed": true 746 | }, 747 | "outputs": [], 748 | "source": [ 749 | "def TrainFisher(X_Train,X_Test,y_Train):\n", 750 | " Fisher=DA.LinearDiscriminantAnalysis()\n", 751 | " Fisher.fit(X_Train,y_Train)\n", 752 | "\n", 753 | " fpr, tpr, _ = roc_curve(y_Test, Fisher.decision_function(X_Test))\n", 754 | " roc_auc = auc(fpr, tpr)\n", 755 | "\n", 756 | " plt.plot(fpr,tpr,color='darkorange',label='ROC curve (area = %0.2f)' % roc_auc)\n", 757 | " plt.legend(loc=\"lower right\")\n", 758 | " plt.xlabel('False Positive Rate')\n", 759 | " plt.ylabel('True Positive Rate')\n", 760 | " plt.show()\n", 761 | " \n", 762 | " return Fisher\n", 763 | "\n", 764 | "RawFisher=TrainFisher(X_Train_Raw,X_Test_Raw,y_Train)\n", 765 | "FeatureFisher=TrainFisher(X_Train_Features,X_Test_Features,y_Train)" 766 | ] 767 | }, 768 | { 769 | "cell_type": "markdown", 770 | "metadata": {}, 771 | "source": [ 772 | "## Exercise B.3.2\n", 773 | "\n", 774 | "Select 3 different classifiers from the techniques listed [here](http://scikit-learn.org/stable/supervised_learning.html#supervised-learning). Note that you can use the multi-layer peceptron to build a deep network, though training may be prohibitively slow, so avoid this technique. Perform the comparison in exercise 1 for each classifier. 
Compare your conclusions for your selected techniques to the paper.\n" 775 | ] 776 | }, 777 | { 778 | "cell_type": "markdown", 779 | "metadata": {}, 780 | "source": [ 781 | "## Exercise B.3.3\n", 782 | "\n", 783 | "The following function calculates the significance of the observation of the signal given the number of expected Signal and Background events, using the simple formula $\\sigma_S= \\frac{N_S}{\\sqrt{N_S+N_B}}$. Read through the code carefully." 784 | ] 785 | }, 786 | { 787 | "cell_type": "code", 788 | "execution_count": null, 789 | "metadata": { 790 | "collapsed": true 791 | }, 792 | "outputs": [], 793 | "source": [ 794 | "def PlotSignificance(N_S,N_B, N_S_min=1):\n", 795 | " plt.figure()\n", 796 | " eff_sig,bins_sig,p_sig=plt.hist(Fisher.decision_function(Test_sig[VarNames[1:]]),bins=100,histtype=\"step\", color=\"blue\", label=\"signal\",cumulative=-1,stacked=True,normed=True)\n", 797 | " eff_bkg,bins_bkg,p_bkg=plt.hist(Fisher.decision_function(Test_bkg[VarNames[1:]]),bins=100,histtype=\"step\", color=\"red\", label=\"background\",cumulative=-1,stacked=True,normed=True)\n", 798 | " plt.legend(loc='upper right')\n", 799 | " plt.show()\n", 800 | " \n", 801 | " good_bins = np.where(eff_sig*N_S>=N_S_min)\n", 802 | "\n", 803 | " print len(good_bins[0])\n", 804 | " if len(good_bins[0])<1:\n", 805 | " print \"Insufficient Signal.\"\n", 806 | " return 0,0,0\n", 807 | " \n", 808 | " significance=(N_S*eff_sig)/np.sqrt((N_B*eff_bkg)+(N_S*eff_sig))\n", 809 | "\n", 810 | " plt.figure()\n", 811 | " plt.plot(bins_sig[:-1],significance)\n", 812 | " \n", 813 | " max_sign=np.max(significance[good_bins])\n", 814 | " max_signI=np.argmax(significance[good_bins])\n", 815 | " \n", 816 | " plt.show()\n", 817 | " print \"Max significance at \", bins_sig[max_signI], \" of\", max_sign\n", 818 | " return bins_sig[max_signI],max_sign, max_signI\n", 819 | " \n", 820 | "PlotSignificance(1000000,1e11)\n" 821 | ] 822 | }, 823 | { 824 | "cell_type": "markdown", 825 | "metadata": {}, 
826 | "source": [ 827 | "Answer the following questions:\n", 828 | " * What are we computing when making a normalized cumulative plot? \n", 829 | " * Assume that the experiment produces 1 signal event for every $10^{11}$ background events. For each of your classifiers, how many signal events need to be produced to be able to make a $5\sigma$ discovery claim?\n", 830 | " " 831 | ] 832 | }, 833 | { 834 | "cell_type": "markdown", 835 | "metadata": {}, 836 | "source": [ 837 | "### Exercise B.3.4\n", 838 | "\n", 839 | "Read the Baldi, et al. paper and attempt to reproduce the results, as closely as possible, using scikit-learn. \n", 840 | "Try using the [multi-layer perceptron](http://scikit-learn.org/stable/modules/neural_networks_supervised.html#multi-layer-perceptron) to build a deep network. Or if you are capable, try it using [Keras Scikit-learn interface](https://keras.io/scikit-learn-api/). \n", 841 | " " 842 | ] 843 | }, 844 | { 845 | "cell_type": "markdown", 846 | "metadata": {}, 847 | "source": [ 848 | "\n", 849 | "# C. Deep Learning\n", 850 | "\n", 851 | "This section is meant to get you started in using Keras to design Deep Neural Networks. 
The goal here is to simply repeat section B with Deep Learning.\n", 852 | "\n", 853 | "If you are starting here and have not run the cells above that load the data, you will need to run the following cell: " 854 | ] 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": null, 859 | "metadata": { 860 | "collapsed": true 861 | }, 862 | "outputs": [], 863 | "source": [ 864 | "import pandas as pd\n", 865 | "import numpy as np\n", 866 | "import matplotlib.pyplot as plt\n", 867 | "%matplotlib inline\n", 868 | "\n", 869 | "filename=\"/afs/cern.ch/user/a/afarbin/public/AML-Tutorial/UCI/SUSY.csv\"\n", 870 | "VarNames=[\"signal\", \"l_1_pT\", \"l_1_eta\",\"l_1_phi\", \"l_2_pT\", \"l_2_eta\", \"l_2_phi\", \"MET\", \"MET_phi\", \"MET_rel\", \"axial_MET\", \"M_R\", \"M_TR_2\", \"R\", \"MT2\", \"S_R\", \"M_Delta_R\", \"dPhi_r_b\", \"cos_theta_r1\"]\n", 871 | "RawNames=[\"l_1_pT\", \"l_1_eta\",\"l_1_phi\", \"l_2_pT\", \"l_2_eta\", \"l_2_phi\"]\n", 872 | "FeatureNames=[ \"MET\", \"MET_phi\", \"MET_rel\", \"axial_MET\", \"M_R\", \"M_TR_2\", \"R\", \"MT2\", \"S_R\", \"M_Delta_R\", \"dPhi_r_b\", \"cos_theta_r1\"]\n", 873 | "\n", 874 | "df = pd.read_csv(filename, dtype='float64', names=VarNames)" 875 | ] 876 | }, 877 | { 878 | "cell_type": "markdown", 879 | "metadata": {}, 880 | "source": [ 881 | "Now lets define training and test samples. Note that DNNs take very long to train, so for testing purposes we will use only about 10% of the 5 million events in the training/validation sample. Once you get everything working, you can go back and make the final version of your plots with the full sample. \n", 882 | "\n", 883 | "Also note that Keras had trouble with the Pandas tensors, so after doing all of the nice manipulation that Pandas enables, we convert the Tensor to a regular numpy tensor." 
884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": null, 889 | "metadata": { 890 | "collapsed": true 891 | }, 892 | "outputs": [], 893 | "source": [ 894 | "N_Max=550000\n", 895 | "N_Train=500000\n", 896 | "\n", 897 | "Train_Sample=df[:N_Train]\n", 898 | "Test_Sample=df[N_Train:N_Max]\n", 899 | "\n", 900 | "X_Train=np.array(Train_Sample[VarNames[1:]])\n", 901 | "y_Train=np.array(Train_Sample[\"signal\"])\n", 902 | "\n", 903 | "X_Test=np.array(Test_Sample[VarNames[1:]])\n", 904 | "y_Test=np.array(Test_Sample[\"signal\"])" 905 | ] 906 | }, 907 | { 908 | "cell_type": "markdown", 909 | "metadata": {}, 910 | "source": [ 911 | "\n", 912 | "## 1. Keras\n", 913 | "\n", 914 | "Training Deep Learning models can take a very long time. If you have access to a GPU, training with the GPU will be about 2 orders of magnitude faster that training with just the CPU. Unforunately, there are no GPUs on lxplus. But, if you are running this notebook on a system with NVidia GPU(s) properly setup, you can tell Keras to use a specific GPU:" 915 | ] 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": null, 920 | "metadata": { 921 | "collapsed": true 922 | }, 923 | "outputs": [], 924 | "source": [ 925 | "# Since lxplus does not have any GPUs, please DO NOT RUN THIS CELL ON LXPLUS.\n", 926 | "# Selecting First GPU in the system\n", 927 | "gpuid= 0\n", 928 | "print \"Using GPU:\", gpuid\n", 929 | "os.environ['THEANO_FLAGS'] = \"mode=FAST_RUN,device=gpu%s,floatX=float32,force_device=True\" % (gpuid)" 930 | ] 931 | }, 932 | { 933 | "cell_type": "markdown", 934 | "metadata": {}, 935 | "source": [ 936 | "There was some problem getting the profiler to work in this notebook, so we turn it off: " 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": null, 942 | "metadata": { 943 | "collapsed": true 944 | }, 945 | "outputs": [], 946 | "source": [ 947 | "import theano\n", 948 | "theano.config.profile=False" 949 | ] 950 | }, 951 | { 952 | 
"cell_type": "markdown", 953 | "metadata": {}, 954 | "source": [ 955 | "Now we will build a simple model. Note that this is a very small model, so things run fast. You should attempt more ambitious models." 956 | ] 957 | }, 958 | { 959 | "cell_type": "code", 960 | "execution_count": null, 961 | "metadata": { 962 | "collapsed": true 963 | }, 964 | "outputs": [], 965 | "source": [ 966 | "from keras.models import Sequential\n", 967 | "from keras.layers import Dense\n", 968 | "\n", 969 | "model = Sequential()\n", 970 | "model.add(Dense(12, input_dim=X_Train.shape[1], init='uniform', activation='relu'))\n", 971 | "model.add(Dense(8, init='uniform', activation='relu'))\n", 972 | "model.add(Dense(1, init='uniform', activation='sigmoid'))" 973 | ] 974 | }, 975 | { 976 | "cell_type": "markdown", 977 | "metadata": {}, 978 | "source": [ 979 | "The model has to be compiled. At this time we set the loss function and the optimizer too:" 980 | ] 981 | }, 982 | { 983 | "cell_type": "code", 984 | "execution_count": null, 985 | "metadata": { 986 | "collapsed": true 987 | }, 988 | "outputs": [], 989 | "source": [ 990 | "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 991 | "model.summary()" 992 | ] 993 | }, 994 | { 995 | "cell_type": "markdown", 996 | "metadata": {}, 997 | "source": [ 998 | "Now we train. We are running only 10 epochs in this example. Models may need hundreds of epochs before they stop improving." 999 | ] 1000 | }, 1001 | { 1002 | "cell_type": "code", 1003 | "execution_count": null, 1004 | "metadata": { 1005 | "collapsed": true 1006 | }, 1007 | "outputs": [], 1008 | "source": [ 1009 | "history=model.fit(X_Train, y_Train, validation_data=(X_Test,y_Test), nb_epoch=10, batch_size=2048)" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "markdown", 1014 | "metadata": {}, 1015 | "source": [ 1016 | "The model history keeps track of the loss and accuracy for each epoch. 
Note that the training above was set up to run on the validation sample at the end of each epoch:" 1017 | ] 1018 | }, 1019 | { 1020 | "cell_type": "code", 1021 | "execution_count": null, 1022 | "metadata": { 1023 | "collapsed": true 1024 | }, 1025 | "outputs": [], 1026 | "source": [ 1027 | "print history.history" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "markdown", 1032 | "metadata": {}, 1033 | "source": [ 1034 | "You can plot the loss versus epoch:" 1035 | ] 1036 | }, 1037 | { 1038 | "cell_type": "code", 1039 | "execution_count": null, 1040 | "metadata": { 1041 | "collapsed": true 1042 | }, 1043 | "outputs": [], 1044 | "source": [ 1045 | "loss_history=history.history[\"loss\"]\n", 1046 | "plt.plot(range(len(loss_history)),loss_history)" 1047 | ] 1048 | }, 1049 | { 1050 | "cell_type": "markdown", 1051 | "metadata": {}, 1052 | "source": [ 1053 | "## Exercise C.1.1\n", 1054 | "\n", 1055 | "You will need to create several models and make sure they are properly trained. Write a function that takes this history and plots the values versus epoch. For every model that you train in the remainder of this lab, assess:\n", 1056 | "\n", 1057 | " * Has your model's performance plateaued? If not, train for more epochs. \n", 1058 | "\n", 1059 | " * Compare the performance on training versus test sample. Are you overtraining?" 
1060 | ] 1061 | }, 1062 | { 1063 | "cell_type": "code", 1064 | "execution_count": null, 1065 | "metadata": { 1066 | "collapsed": true 1067 | }, 1068 | "outputs": [], 1069 | "source": [ 1070 | "## Your Solution Here\n" 1071 | ] 1072 | }, 1073 | { 1074 | "cell_type": "markdown", 1075 | "metadata": {}, 1076 | "source": [ 1077 | "We can evaluate how the trained model does on the test sample as follows:" 1078 | ] 1079 | }, 1080 | { 1081 | "cell_type": "code", 1082 | "execution_count": null, 1083 | "metadata": { 1084 | "collapsed": true 1085 | }, 1086 | "outputs": [], 1087 | "source": [ 1088 | "scores = model.evaluate(X_Test, y_Test)\n", 1089 | "print scores" 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "markdown", 1094 | "metadata": {}, 1095 | "source": [ 1096 | "And we can make ROC curves as before:" 1097 | ] 1098 | }, 1099 | { 1100 | "cell_type": "code", 1101 | "execution_count": null, 1102 | "metadata": { 1103 | "collapsed": true 1104 | }, 1105 | "outputs": [], 1106 | "source": [ 1107 | "from sklearn.metrics import roc_curve, auc\n", 1108 | "fpr, tpr, _ = roc_curve(y_Test, model.predict(X_Test))\n", 1109 | " \n", 1110 | "roc_auc = auc(fpr, tpr)\n", 1111 | "\n", 1112 | "plt.plot(fpr,tpr,color='darkorange',label='ROC curve (area = %0.2f)' % roc_auc)\n", 1113 | "plt.legend(loc=\"lower right\")\n", 1114 | "plt.xlabel('False Positive Rate')\n", 1115 | "plt.ylabel('True Positive Rate')\n", 1116 | "\n", 1117 | "plt.show()" 1118 | ] 1119 | }, 1120 | { 1121 | "cell_type": "markdown", 1122 | "metadata": {}, 1123 | "source": [ 1124 | "## Exercise C.1.2\n", 1125 | "\n", 1126 | "Following section B, make a comparison of the performance between models trained with raw, features, and raw+features data." 
1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "code", 1131 | "execution_count": null, 1132 | "metadata": { 1133 | "collapsed": true 1134 | }, 1135 | "outputs": [], 1136 | "source": [ 1137 | "## Your solution here\n" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "markdown", 1142 | "metadata": {}, 1143 | "source": [ 1144 | "## Exercise C.1.3\n", 1145 | "\n", 1146 | "Again, following section B, design and implement at least 3 different DNN models. Train them and compare performance. You may try different architectures, loss functions, and optimizers to see if there is an effect." 1147 | ] 1148 | }, 1149 | { 1150 | "cell_type": "code", 1151 | "execution_count": null, 1152 | "metadata": { 1153 | "collapsed": true 1154 | }, 1155 | "outputs": [], 1156 | "source": [ 1157 | "## Your solution here" 1158 | ] 1159 | }, 1160 | { 1161 | "cell_type": "markdown", 1162 | "metadata": {}, 1163 | "source": [ 1164 | "## Exercise C.1.4\n", 1165 | "\n", 1166 | "Write a function that evaluates the performance (AUC) as a function of a given input variable. You will need to bin the test data in the variable (i.e. make sub-samples for events which have the particular variable in a range), evaluate the performance in each bin, and plot the results.\n", 1167 | "\n", 1168 | "Apply your function to each input variable." 
1169 | ] 1170 | }, 1171 | { 1172 | "cell_type": "code", 1173 | "execution_count": null, 1174 | "metadata": { 1175 | "collapsed": true 1176 | }, 1177 | "outputs": [], 1178 | "source": [ 1179 | "## Your solution here" 1180 | ] 1181 | } 1182 | ], 1183 | "metadata": { 1184 | "kernelspec": { 1185 | "display_name": "Python 2", 1186 | "language": "python", 1187 | "name": "python2" 1188 | }, 1189 | "language_info": { 1190 | "codemirror_mode": { 1191 | "name": "ipython", 1192 | "version": 2 1193 | }, 1194 | "file_extension": ".py", 1195 | "mimetype": "text/x-python", 1196 | "name": "python", 1197 | "nbconvert_exporter": "python", 1198 | "pygments_lexer": "ipython2", 1199 | "version": "2.7.13" 1200 | } 1201 | }, 1202 | "nbformat": 4, 1203 | "nbformat_minor": 2 1204 | } 1205 | -------------------------------------------------------------------------------- /LArIAT-Data-Generator.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from LArTPCDNN.LoadData import *\n", 10 | "\n", 11 | "# Number of Threads when reading and mixing data\n", 12 | "n_threads=1\n", 13 | "\n", 14 | "# Number of Threads when reading cached mixed data (2nd data pass)\n", 15 | "n_threads2=1\n", 16 | "\n", 17 | "# Number of batches read by each thread\n", 18 | "multiplier=1\n", 19 | "\n", 20 | "BatchSize=16\n", 21 | "FileSearch=\"/data/LArIAT/h5_files_2D_3D/2D_h5/*.h5\"\n", 22 | "\n", 23 | "# Downsampling Factor\n", 24 | "DownSampleSize=8\n", 25 | "\n", 26 | "# Size of window scanned accross data\n", 27 | "ScanWindowSize=256\n", 28 | "\n", 29 | "# Normalize Data?\n", 30 | "Normalize=True\n", 31 | "\n", 32 | "NSamples=10000\n", 33 | "\n", 34 | "# Energy Cut (unnecessary for latest datasets)\n", 35 | "EnergyCut=0.61\n", 36 | "\n", 37 | "# Particle types to read\n", 38 | "Particles= ['electron', 'antielectron',\n", 39 | " 'pion0', \n", 
40 | " 'photon',\n", 41 | " 'pionPlus', 'pionMinus',\n", 42 | " 'muon', 'antimuon',\n", 43 | " 'kaonMinus', 'kaonPlus']\n", 44 | "\n", 45 | "NClasses=len(Particles)\n", 46 | "\n", 47 | "if ScanWindowSize>0:\n", 48 | "# shapes=[(BatchSize*multiplier, 2, 240, ScanWindowSize), (BatchSize*multiplier, NClasses)]\n", 49 | " shapes=[(BatchSize*multiplier, 240, ScanWindowSize),\n", 50 | " (BatchSize*multiplier, 240, ScanWindowSize),\n", 51 | " (BatchSize*multiplier, NClasses)]\n", 52 | " viewshape=(None, 240, ScanWindowSize)\n", 53 | "else:\n", 54 | " shapes=[(BatchSize*multiplier, 240, 4096/DownSampleSize),\n", 55 | " (BatchSize*multiplier, 240, 4096/DownSampleSize),\n", 56 | " (BatchSize*multiplier, NClasses)]\n", 57 | "\n", 58 | " viewshape=(None, 240, 4096/DownSampleSize)\n", 59 | "\n", 60 | "\n", 61 | "# Separate Files into Training and Test samples\n", 62 | "TrainSampleList,TestSampleList=DivideFiles(FileSearch,[.9,.1],\n", 63 | " datasetnames=[u'images'],\n", 64 | " Particles=Particles)\n", 65 | "\n", 66 | "def MakeGenerator(SampleList,NSamples,\n", 67 | " cachefile=\"LArIAT-LoadDataTest-Cache.h5\",**kwargs):\n", 68 | "\n", 69 | " return DLMultiClassFilterGenerator(TrainSampleList, FilterEnergy(EnergyCut), max=NSamples,\n", 70 | " preprocessfunction=ProcessWireData(DownSampleSize,ScanWindowSize,Normalize),\n", 71 | " postprocessfunction=MergeInputs(),\n", 72 | " batchsize=BatchSize,\n", 73 | " shapes=shapes,\n", 74 | " n_threads=n_threads,\n", 75 | " multiplier=multiplier,\n", 76 | " cachefile=cachefile,\n", 77 | " **kwargs)\n", 78 | "\n", 79 | "# Use DLGenerators to read data\n", 80 | "Train_gen = MakeGenerator(TrainSampleList, NSamples,\n", 81 | " cachefile=\"/tmp/LArTPCDNN-LArIAT-TrainEvent-Cache.h5\")\n", 82 | "\n", 83 | "gen=Train_gen.DiskCacheGenerator(n_threads2)\n", 84 | "\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# Get the first batch\n", 94 | 
"Data=gen.next()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "Data[0][1].shape" 104 | ] 105 | } 106 | ], 107 | "metadata": { 108 | "kernelspec": { 109 | "display_name": "Python 2", 110 | "language": "python", 111 | "name": "python2" 112 | }, 113 | "language_info": { 114 | "codemirror_mode": { 115 | "name": "ipython", 116 | "version": 2 117 | }, 118 | "file_extension": ".py", 119 | "mimetype": "text/x-python", 120 | "name": "python", 121 | "nbconvert_exporter": "python", 122 | "pygments_lexer": "ipython2", 123 | "version": "2.7.12" 124 | } 125 | }, 126 | "nbformat": 4, 127 | "nbformat_minor": 2 128 | } 129 | -------------------------------------------------------------------------------- /LArIAT-HandScan.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# LArIAT - 2D Data Examples / Hand Scan Tutorial\n", 8 | "## Training the Deep Neural Network in your head\n", 9 | "\n", 10 | "This notebook introduces the 2D LArIAT data set and leads you through setting up a handscan. The included exercises are meant to introduce basic python, numpy, and h5 file/data structure manipulation, ploting in matplotlib, and the concepts of problem formulation, training, and validation.\n", 11 | "\n", 12 | "## Introduction\n", 13 | "Before High Energy Physicists used computers with automatic reconstruction to turn raw data into features, they relied on hand scans performed by people. In this notebook we will setup a hand scan using the Liquid Argon TPC (LArTPC) data we looked at last time. The task will be to identify the type of particle. You will be the handscanner. 
The steps are as follows:\n", 14 | "\n", 15 | " * Data Engineering: Load data from various files\n", 16 | " * Training: Train the handscanner by presenting images of the data with the labels.\n", 17 | " * Validation: Ask the handscanner to classify some randomly selected images, and see how well they do.\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Data Engineering\n", 25 | "\n", 26 | "Our data is stored in a bunch of files. You can see the files by listing the directory using the unix \"ls\" command. You can call shell commands, like \"ls\", from Jupyter:" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "BaseDir2D=\"/data/LArIAT/h5_files_2D_3D/2D_h5/*\"\n", 36 | "%ls $BaseDir2D" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "That's a lot of files. Lets count how many... in python. There are a variety of ways of getting back a directory listing in python. Here's one:" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import glob\n", 53 | "Files=glob.glob(BaseDir2D)\n", 54 | "print \"Number of Files:\", len(Files)\n", 55 | "print \"First Filename:\", Files[0]" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "Looking at the file names, you notice that they start with the type of particle. Each file contains a samples of \"events\". In each event, we simulated shooting a particle into the detector and stored the response. The name of the file specifies what type of particle was simulated in that file.\n", 63 | "\n", 64 | "Let's try to figure out what types. 
We'll loop over the file names, strip out the first part of the file name, and store it in a dictionary:" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "scrolled": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "import os\n", 76 | "\n", 77 | "FileCount= {} # Store the count here\n", 78 | "FileLists= {} # Organize the files by particle type here.\n", 79 | "\n", 80 | "for aFile in Files:\n", 81 | " # Lets strip the path (everything before the \"/\"s) and get the filename:\n", 82 | " FileName=os.path.basename(aFile)\n", 83 | " \n", 84 | " # Now use everything before the first \"_\" as the particle name\n", 85 | " ParticleName=FileName.split('_')[0]\n", 86 | " \n", 87 | " if ParticleName in FileCount.keys():\n", 88 | " FileCount[ParticleName]+=1\n", 89 | " FileLists[ParticleName].append(aFile)\n", 90 | " else:\n", 91 | " FileCount[ParticleName]=1\n", 92 | " FileLists[ParticleName]= [aFile]\n", 93 | " \n", 94 | "print \"Number of types of particles:\", len(FileCount.keys())\n", 95 | "print \"----------------------------------------------------------\"\n", 96 | "print \"Number of files for each particle type:\", FileCount\n", 97 | "print \"----------------------------------------------------------\"\n", 98 | "print \"First file of each type:\"\n", 99 | "for aFile in FileLists:\n", 100 | " print aFile,\":\",FileLists[aFile][0]" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "We can count how many examples are in each file by open them up in h5py:" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "import h5py\n", 117 | "\n", 118 | "f=h5py.File(FileLists[\"electron\"][0],\"r\")\n", 119 | "\n", 120 | "# Read the First N_Events. 
Data is stored as float16, lets store it as float32 to avoid overflows later when we sum.\n", 121 | "print \"Shape of the data:\", f[\"images\"].shape\n", 122 | "print \"Number of events in file:\", f[\"images\"].shape[0]\n", 123 | "\n", 124 | "#f.close()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "f.keys()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## Training\n", 141 | "\n", 142 | "We will use matplotlib for most of our plotting. There are lots of tutorials and primers out there that you can find searching the web. A good tutorial can be found in the [Scipy Lectures](http://www.scipy-lectures.org/intro/matplotlib/matplotlib.html). Look through these on your own time, it is not necessary for you to do these exercise.\n", 143 | "\n", 144 | "The raw data from a LArTPC detector looks like an image. The LArIAT detector, which we have simulated, has two readout views. The following code gives you an example how to plot these images. " 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "import matplotlib.pyplot as plt\n", 154 | "%matplotlib inline\n", 155 | "import numpy as np\n", 156 | "\n", 157 | "# Load the first electron file\n", 158 | "f=h5py.File(FileLists[\"electron\"][0],\"r\")\n", 159 | "\n", 160 | "# Get the images\n", 161 | "images=f[\"images\"]\n", 162 | "\n", 163 | "print \"Data shape:\", images.shape\n", 164 | "\n", 165 | "def PlotEvent(image):\n", 166 | " # Make two plots. Create a 1 by 2 grid the plots.\n", 167 | " ax1 = plt.subplot(1,2,1)\n", 168 | " ax2 = plt.subplot(1,2,2)\n", 169 | "\n", 170 | " # Plot the first view. 
Note: [EventNumber, View] = [0,0]\n", 171 | " ax1.imshow(image[0])\n", 172 | "\n", 173 | " # Plot the second view \n", 174 | " ax2.imshow(image[1])\n", 175 | "\n", 176 | " # The data is 240 by 4096. Change the aspect ratio so the plot is not squished. \n", 177 | " ax1.set_aspect(16) \n", 178 | " ax2.set_aspect(16) \n", 179 | "\n", 180 | "# Plot the 5th Event\n", 181 | "PlotEvent(np.array(images[4],dtype=\"float32\"))\n", 182 | "\n", 183 | "f.close()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Exercise 3.2.1- Setup Training\n", 191 | "\n", 192 | "Write a function that takes a file, and creates a grid of plots showing the first N events. Use this function to plot the first 9 events in the first file of each particle type in a 3 by 3 grid. You only need to show one view." 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": true 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "def PlotEvents(FileName, N_Events):\n", 204 | " ### BEGIN SOLUTION\n", 205 | "\n", 206 | " # Fill in your solution here \n", 207 | " \n", 208 | " ### END SOLUTION\n", 209 | " pass\n", 210 | "\n", 211 | "N_Events=9\n", 212 | "\n", 213 | "for aFile in FileLists:\n", 214 | " FileName=FileLists[aFile][0]\n", 215 | " ParticleName=os.path.basename(FileName).split('_')[0]\n", 216 | " \n", 217 | " print ParticleName,\":\"\n", 218 | " PlotEvents(FileName,N_Events)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "### Exercise 3.2.2- Train Yourself\n", 226 | "\n", 227 | "By looking closely at each particle type, identify at least one \"feature\" that would allow you to by eye uniquely identify that particle type. 
" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "Type you answer in this box.\n", 235 | "\n", 236 | "### BEGIN SOLUTION\n", 237 | "\n", 238 | "- muon/antimuon: your description here\n", 239 | "- electron/antielectron: your description here\n", 240 | "- pion: your description here\n", 241 | "- pionPlus/pionMinus: your description here\n", 242 | "- kaonPlus/kaonMinus: your description here\n", 243 | "- photon: your description here\n", 244 | "- nue/nuebar: your description here\n", 245 | "- numu/numubar: your description here\n", 246 | "- proton/antiproton: your description here\n", 247 | "\n", 248 | "### END SOLUTION" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "## Validation\n", 256 | "\n", 257 | "Now we have to setup a validation process. We will first assign each particle type a unique index. Then we will load some events of each particle type, mix them while keeping track of the indecies. Finally we will present the images to the handscanner, ask them to classify, and keep track of how well they do.\n", 258 | "\n", 259 | "Read through and try to understand the following code which setups up two dictionaries we will use to uniquely identify particle types. 
" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "import numpy as np\n", 269 | "\n", 270 | "# Assign index to particle type\n", 271 | "ParticleTypesIndexMap = {}\n", 272 | "\n", 273 | "for i,ParticleType in enumerate(FileLists.keys()): \n", 274 | " ParticleTypesIndexMap[ParticleType]=i\n", 275 | " \n", 276 | "# Merge particle/anti-particle\n", 277 | "for ParticleName in ParticleTypesIndexMap:\n", 278 | " if 'bar' in ParticleName:\n", 279 | " try:\n", 280 | " ParticleicleTypesIndexMap[ParticleName]=ParticleTypesIndexMap[ParticleName.split('bar')[0]]\n", 281 | " except:\n", 282 | " pass\n", 283 | " \n", 284 | " if 'anti' in ParticleName:\n", 285 | " try:\n", 286 | " ParticleTypesIndexMap[ParticleName]=ParticleTypesIndexMap[ParticleName.split('anti')[1]]\n", 287 | " except:\n", 288 | " pass\n", 289 | " \n", 290 | " if 'Minus' in ParticleName:\n", 291 | " try:\n", 292 | " ParticleTypesIndexMap[ParticleName]=ParticleTypesIndexMap[ParticleName.split('Minus')[0]+\"Plus\"]\n", 293 | " except:\n", 294 | " pass\n", 295 | " \n", 296 | "print \"Index map:\"\n", 297 | "print ParticleTypesIndexMap\n", 298 | "\n", 299 | "# Reverse Map\n", 300 | "ParticleTypesIndexMapR={}\n", 301 | "\n", 302 | "for p in ParticleTypesIndexMap:\n", 303 | " if ParticleTypesIndexMap[p] not in ParticleTypesIndexMapR:\n", 304 | " ParticleTypesIndexMapR[ParticleTypesIndexMap[p]]=p\n", 305 | "\n", 306 | "print \"Reverse Index map:\"\n", 307 | "print ParticleTypesIndexMapR\n", 308 | "\n" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "Now we load the data and mix them:" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "Data_X = None\n", 325 | "Data_Y = None\n", 326 | "N_Events_perType=10\n", 327 | "\n", 328 | "for ParticleType in FileLists:\n", 329 | " 
# Open the first file\n", 330 | " FileName=FileLists[ParticleType][1] # we will take the 2nd file so we don't use the training sample for validation\n", 331 | " print \"Opening:\",FileName\n", 332 | " f=h5py.File(FileName,\"r\")\n", 333 | " \n", 334 | " # Get the images/features\n", 335 | " images=np.array(f[\"images\"][:N_Events_perType])\n", 336 | " \n", 337 | " # Warn if not enough events\n", 338 | " N_Events_read=images.shape[0]\n", 339 | " if not N_Events_read==N_Events_perType:\n", 340 | " print \"Warning: Sample\", FileName, \"had only\",N_Events_read,\"events.\"\n", 341 | " \n", 342 | " # Assign labels\n", 343 | " labels=np.empty(N_Events_read)\n", 344 | " labels.fill(ParticleTypesIndexMap[ParticleType])\n", 345 | "\n", 346 | " # Store some of them\n", 347 | " try:\n", 348 | " # If we have already read some data, add to it\n", 349 | " Data_X=np.concatenate((Data_X,np.array(images,dtype=\"float32\")))\n", 350 | " Data_Y=np.concatenate((Data_Y,np.array(labels,dtype=\"float32\")))\n", 351 | " except:\n", 352 | " # If we haven't read any data yet\n", 353 | " Data_X=images\n", 354 | " Data_Y=labels\n", 355 | " \n", 356 | " \n", 357 | " f.close()\n", 358 | "\n", 359 | "print Data_X.shape, Data_Y.shape\n", 360 | "\n", 361 | "def shuffle_in_unison_inplace(a, b):\n", 362 | " assert len(a) == len(b)\n", 363 | " p = np.random.permutation(len(a))\n", 364 | " return a[p], b[p]\n", 365 | " \n", 366 | "Data_X,Data_Y=shuffle_in_unison_inplace(Data_X,Data_Y) \n" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "## Exercise 3.3.1\n", 374 | "\n", 375 | "The following code presents images and asks the handscanner for a type. Read through it carefully. Try it out. Then instrument this code so it keeps track of success and failures. The goal is to create a confusion matrix, a table that keeps track of how often each particle type is correctly identified and how often it is misidentified as any other type. 
" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "View=0\n", 387 | "\n", 388 | "for image in Data_X:\n", 389 | " PlotEvent(image)\n", 390 | " plt.show()\n", 391 | " \n", 392 | " print \"Select Type from:\", ParticleTypesIndexMapR\n", 393 | " try:\n", 394 | " answer=int(raw_input('Input:'))\n", 395 | " except ValueError:\n", 396 | " print \"Not a number\"\n", 397 | " \n", 398 | " # Stop loop\n", 399 | " if answer==-1:\n", 400 | " break\n", 401 | " \n", 402 | " print \"You selected:\", ParticleTypesIndexMapR[answer]\n", 403 | " " 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "## Exercise 3.3.2\n", 411 | "\n", 412 | "Make yourself the handscanner. Use above code to go through the full data sample and create a confusion matrix." 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "## Exercise 3.4.1\n", 420 | "Write a function that downsamples all of images by summing samples to reduce the 4096 long dimension of the data.\n", 421 | "\n", 422 | "## Exercise 3.4.2\n", 423 | "Write a function that returns a sub-region in the 4096 long dimention where the total charge is max." 
424 | ] 425 | } 426 | ], 427 | "metadata": { 428 | "kernelspec": { 429 | "display_name": "Python 2", 430 | "language": "python", 431 | "name": "python2" 432 | }, 433 | "language_info": { 434 | "codemirror_mode": { 435 | "name": "ipython", 436 | "version": 2 437 | }, 438 | "file_extension": ".py", 439 | "mimetype": "text/x-python", 440 | "name": "python", 441 | "nbconvert_exporter": "python", 442 | "pygments_lexer": "ipython2", 443 | "version": "2.7.12" 444 | } 445 | }, 446 | "nbformat": 4, 447 | "nbformat_minor": 2 448 | } 449 | -------------------------------------------------------------------------------- /LArTPCDNN-Experiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "%run -im LArTPCDNN.ClassificationExperiment -- --Test" 13 | ] 14 | } 15 | ], 16 | "metadata": { 17 | "kernelspec": { 18 | "display_name": "Python 2", 19 | "language": "python", 20 | "name": "python2" 21 | }, 22 | "language_info": { 23 | "codemirror_mode": { 24 | "name": "ipython", 25 | "version": 2 26 | }, 27 | "file_extension": ".py", 28 | "mimetype": "text/x-python", 29 | "name": "python", 30 | "nbconvert_exporter": "python", 31 | "pygments_lexer": "ipython2", 32 | "version": "2.7.12" 33 | } 34 | }, 35 | "nbformat": 4, 36 | "nbformat_minor": 2 37 | } 38 | -------------------------------------------------------------------------------- /LCD-BDT.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Training a Boosted Decision Tree (BDT) using features from the LCD images\n", 8 | "\n", 9 | "First we import the classes that we need for opening and reading files. 
We use h5py to allow Python to read the h5 file format, and numpy for building arrays. We also have to import matplotlib for plotting on the first line, due to a strangeness in the matplotlib package." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "import h5py\n", 23 | "import numpy as np" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "We will open four different files, each containing 1000 events from a single-particle gun at different energies. There is one file for each of the following particles: charged pions, photons, neutron pions, and electrons. " 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "dataDir = \"/data/LCD/V1/HLF/\"\n", 42 | "dataFileNames = [\"EleEscan_HLF/EleEscan_1_10_HLF.h5\", \"GammaEscan_HLF/GammaEscan_1_10_HLF.h5\", \"ChPiEscan_HLF/ChPiEscan_1_10_HLF.h5\", \"Pi0Escan_HLF/Pi0Escan_1_10_HLF.h5\"]\n", 43 | "dataFiles = []\n", 44 | "for i in range(len(dataFileNames)):\n", 45 | " dataFiles.append(h5py.File(dataDir + dataFileNames[i], \"r\"))\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "This next command tells us all of the features stored in those files. We can make simple plots to look at these distributions." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "print dataFiles[0].keys()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "One of the features is the pdgID (a unique integer number that represents the true identity of the incoming particle). 
You can plot the pdgID of the first data file by running:" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "plt.hist(dataFiles[0]['pdgID']);" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "The first datafile was for incoming electrons (and positrons), and you can see in the distribution above the pdgIDs for every event are either equal to -11 or +11, which are the pdgID values for electrons and positrons (good!).\n", 85 | "\n", 86 | "Now let us plot the distribution of measured energy in the ECAL from the second input file:" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "plt.hist(dataFiles[1]['ECALMeasuredEnergy']);" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Now we will combine the samples, and explicitly label the electrons as class '0', photons as class '1', charged pions as class '2', and neutral pions as class '3'." 
103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "collapsed": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "data = []\n", 114 | "features = dataFiles[0].keys()\n", 115 | "# remove the \"Energy\" feature, which contains truth information about the particle gun\n", 116 | "features.remove('Energy')\n", 117 | "\n", 118 | "for count, feature in enumerate(features):\n", 119 | " \n", 120 | " newFeature = []\n", 121 | " for fileN in range(len(dataFiles)):\n", 122 | " newFeature += dataFiles[fileN][feature]\n", 123 | "\n", 124 | " # use \"pdgID\" as the truth classifier y - all other features go into matrix X\n", 125 | " if feature == 'pdgID':\n", 126 | " y = 0 * np.array([int(abs(x) == 11) for x in newFeature]);\n", 127 | " y = y + 1 * np.array([int(abs(x) == 22) for x in newFeature]);\n", 128 | " y = y + 2 * np.array([int(abs(x) == 211) for x in newFeature]);\n", 129 | " y = y + 3 * np.array([int(abs(x) == 111) for x in newFeature]);\n", 130 | " else:\n", 131 | " data.append(newFeature);\n", 132 | "\n", 133 | "X = np.column_stack(data)\n", 134 | "\n", 135 | "# remove all rows containing NaN and inf (from zero energy deposition, e.g.)\n", 136 | "y = y[np.isfinite(X).all(axis=1)]\n", 137 | "X = X[np.isfinite(X).all(axis=1)]" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "Now we import the sklearn package to perform the BDT training. First, we split the data into 2/3 training data and 1/3 test data." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "from sklearn.cross_validation import train_test_split\n", 154 | "\n", 155 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=492)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "We set up a BDT with a maximum depth of 5 and the [AdaBoost-SAMME](http://algorithm-interest-group.me/assets/slides/AdaBoost.pdf) algorithm. We set 800 estimators and a learning rate of 0.5. If we wanted to, we could further split the training data into training and validation data. This would allow us to compare results from using different training parameters." 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "from sklearn.tree import DecisionTreeClassifier\n", 174 | "from sklearn.ensemble import AdaBoostClassifier\n", 175 | "from sklearn.metrics import classification_report, roc_auc_score\n", 176 | "\n", 177 | "dt = DecisionTreeClassifier(max_depth=5)\n", 178 | "bdt = AdaBoostClassifier(dt,\n", 179 | " algorithm='SAMME',\n", 180 | " n_estimators=800,\n", 181 | " learning_rate=0.5)\n", 182 | "\n", 183 | "bdt.fit(X_train, y_train)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "The result of the training is shown below.\n", 191 | "\n", 192 | "Precision (P) is defined as the number of true positives (T_p) over the number of true positives plus the number of false positives (F_p). E.g. 
the number of correctly identified electrons over all particles identified as electrons:\n", 193 | "\n", 194 | " P = T_p / (T_p+F_p) \n", 195 | "\n", 196 | "Recall (R) is defined as the number of true positives (T_p) over the number of true positives plus the number of false negatives (F_n). E.g. the number of correctly identified electrons over all truth electrons:\n", 197 | "\n", 198 | " R = T_p / (T_p + F_n)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "y_predicted = bdt.predict(X_test)\n", 210 | "target_names = ['electron', 'photon', 'charged pion', 'neutral pion']\n", 211 | "print (classification_report(y_test, y_predicted, target_names=target_names))" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "We see that charged pions are identified very well, followed by electrons, but that the BDT has a bit of trouble distinguishing photons and neutronal pions. We can look at a ROC curve for identifying just these two classes - photons vs. neutral pions." 
219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "collapsed": true 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "y_photon.shape" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "collapsed": true 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "scores[indicesOfInterest][:,1].shape" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "from sklearn.metrics import roc_curve, auc\n", 252 | "\n", 253 | "scores = bdt.decision_function(X_test)\n", 254 | "\n", 255 | "# photons\n", 256 | "indicesOfInterest = np.array([(y == 1 or y == 3) for y in y_test])\n", 257 | "y_photon = np.array([int(y == 1) for y in y_test[indicesOfInterest]])\n", 258 | "print (\"Area under ROC curve: %.4f\"%(roc_auc_score(y_photon, scores[indicesOfInterest][:,1])))\n", 259 | "\n", 260 | "fpr, tpr, thresholds = roc_curve(y_photon, scores[indicesOfInterest][:,1])\n", 261 | "roc_auc = auc(fpr, tpr)\n", 262 | "\n", 263 | "plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)'%(roc_auc))\n", 264 | "\n", 265 | "plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')\n", 266 | "plt.xlim([-0.05, 1.05])\n", 267 | "plt.ylim([-0.05, 1.05])\n", 268 | "plt.xlabel('False Positive Rate')\n", 269 | "plt.ylabel('True Positive Rate')\n", 270 | "plt.title('Receiver operating characteristic')\n", 271 | "plt.legend(loc=\"lower right\")\n", 272 | "plt.grid()\n", 273 | "plt.show()" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "Finally, we can look at signal performance over background for a single class (let's say signal = photons, background = neutral pions) to test for overtraining. 
" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": { 287 | "collapsed": true 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "def compare_train_test(clf, X_train, y_train, X_test, y_test, bins=30):\n", 292 | " decisions = []\n", 293 | " for X,y in ((X_train[indicesOfInterest], y_train[indicesOfInterest]), (X_test[indicesOfInterest], y_test[indicesOfInterest])):\n", 294 | " d1 = clf.decision_function(X[y==1]).ravel()\n", 295 | " d2 = clf.decision_function(X[y==3]).ravel()\n", 296 | " decisions += [d1, d2]\n", 297 | " \n", 298 | " low = min(np.min(d) for d in decisions)\n", 299 | " high = max(np.max(d) for d in decisions)\n", 300 | " low_high = (low,high)\n", 301 | " \n", 302 | " plt.hist(decisions[0], color='r', alpha=0.5, range=low_high, bins=bins, histtype='stepfilled', normed=True, label='S (train)')\n", 303 | " plt.hist(decisions[1], color='b', alpha=0.5, range=low_high, bins=bins, histtype='stepfilled', normed=True, label='B (train)')\n", 304 | "\n", 305 | " hist, bins = np.histogram(decisions[2], bins=bins, range=low_high, normed=True)\n", 306 | " scale = len(decisions[2]) / sum(hist)\n", 307 | " err = np.sqrt(hist * scale) / scale\n", 308 | " \n", 309 | " width = (bins[1] - bins[0])\n", 310 | " center = (bins[:-1] + bins[1:]) / 2\n", 311 | " plt.errorbar(center, hist, yerr=err, fmt='o', c='r', label='S (test)')\n", 312 | " \n", 313 | " hist, bins = np.histogram(decisions[3], bins=bins, range=low_high, normed=True)\n", 314 | " scale = len(decisions[2]) / sum(hist)\n", 315 | " err = np.sqrt(hist * scale) / scale\n", 316 | "\n", 317 | " plt.errorbar(center, hist, yerr=err, fmt='o', c='b', label='B (test)')\n", 318 | "\n", 319 | " plt.xlabel(\"BDT output\")\n", 320 | " plt.ylabel(\"Arbitrary units\")\n", 321 | " plt.legend(loc='best')\n", 322 | " \n", 323 | "compare_train_test(bdt, X_train, y_train, X_test, y_test)" 324 | ] 325 | } 326 | ], 327 | "metadata": { 328 | "kernelspec": { 329 | "display_name": 
"Python 2", 330 | "language": "python", 331 | "name": "python2" 332 | }, 333 | "language_info": { 334 | "codemirror_mode": { 335 | "name": "ipython", 336 | "version": 2 337 | }, 338 | "file_extension": ".py", 339 | "mimetype": "text/x-python", 340 | "name": "python", 341 | "nbconvert_exporter": "python", 342 | "pygments_lexer": "ipython2", 343 | "version": "2.7.12" 344 | } 345 | }, 346 | "nbformat": 4, 347 | "nbformat_minor": 1 348 | } 349 | -------------------------------------------------------------------------------- /LCD-Data-Generator.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from CaloDNN.LoadData import *\n", 12 | "\n", 13 | "FileSearch=\"/data/LCD/V1/*/*.h5\"\n", 14 | "MaxEvents=int(3.e6)\n", 15 | "NTestSamples=100000\n", 16 | "\n", 17 | "Particles=[\"ChPi\",\"Gamma\",\"Pi0\",\"Ele\"]\n", 18 | "NClasses=4\n", 19 | "\n", 20 | "BatchSize=1024\n", 21 | " \n", 22 | "n_threads=n_threads\n", 23 | "n_threads_cache=5\n", 24 | "multiplier:1\n", 25 | " \n", 26 | "ECAL=True\n", 27 | "ECALNorm=\"NonLinear\"\n", 28 | "\n", 29 | "HCAL=True,\n", 30 | "HCALNorm=\"NonLinear\"\n", 31 | "\n", 32 | "ECALShape= None, 25, 25, 25\n", 33 | "HCALShape= None, 5, 5, 60\n", 34 | "\n", 35 | "TrainSampleList,TestSampleList,Norms,shapes=SetupData(FileSearch,\n", 36 | " ECAL,HCAL,True,NClasses,\n", 37 | " [float(NSamples)/MaxEvents,\n", 38 | " float(NTestSamples)/MaxEvents],\n", 39 | " Particles,\n", 40 | " BatchSize,\n", 41 | " multiplier,\n", 42 | " ECALShape,\n", 43 | " HCALShape,\n", 44 | " ECALNorm,\n", 45 | " HCALNorm)\n", 46 | "\n", 47 | "Test_genC = MakeGenerator(ECAL, HCAL, TestSampleList, NTestSamples, LCDNormalization(Norms),\n", 48 | " Merge=False,\n", 49 | " batchsize=BatchSize,\n", 50 | " shapes=shapes,\n", 51 | " n_threads=n_threads,\n", 52 | " 
multiplier=multiplier,\n", 53 | " cachefile=\"/tmp/CaloDNN-Analysis.h5\")\n", 54 | "\n", 55 | "\n", 56 | "gen=Test_gen.DiskCacheGenerator(n_threads_cache)\n", 57 | "\n", 58 | "Data=gen.next()\n", 59 | "\n", 60 | "#print \"Loading Data into Memory:\"\n", 61 | "#Test_genC.PreloadData(n_threads_cache)\n", 62 | "#Test_X_ECAL, Test_X_HCAL, target, Test_Y = tuple(Test_genC.D)\n", 63 | "\n" 64 | ] 65 | } 66 | ], 67 | "metadata": { 68 | "kernelspec": { 69 | "display_name": "Python 2", 70 | "language": "python", 71 | "name": "python2" 72 | }, 73 | "language_info": { 74 | "codemirror_mode": { 75 | "name": "ipython", 76 | "version": 2 77 | }, 78 | "file_extension": ".py", 79 | "mimetype": "text/x-python", 80 | "name": "python", 81 | "nbconvert_exporter": "python", 82 | "pygments_lexer": "ipython2", 83 | "version": "2.7.10" 84 | } 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 2 88 | } 89 | -------------------------------------------------------------------------------- /LCD-Visualization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Linear Collider Detector Calorimeter Data Visualization\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**Calorimetry and the LCD Detector**\n", 15 | "\n", 16 | "In high energy physics detectors are used to take 'images' of particles that may result from controlled particle collisions (such as the Large Hadron Collider), single beams of known particles (such as test beams or neutrino beams), or influxes of particles from natural sources (such as solar neutrinos). These images are taken by advanced technologies exploiting the different methods in which particles interact with matter. \n", 17 | "\n", 18 | "The calorimeter is one class of detectors that is used to capture (aka reconstruct) the energy of passing particles. 
This is done by using materials that the particles will interact with, essentially emitting energy in this exchange, and materials that can tell us the amount of energy that is lost. Through consecutive processes of interactions and energy loss the goal is for particles to effectively stop in the material and for the reconstructed energy to be a measure of the initial energy of the particle. For example in the ATLAS detector, one calorimeter system uses plates of steel (a dense material) to 'slow down and stop' the passing particles inside the calorimeter, alternating with plates of plastic scintillator that ionize as these high energy particles pass. The photons emitted during the ionization process are collected onto photomultiplier tubes which convert the photon into an electrical signal representative of the lost energy. As particles interact with the calorimeter showers of secondary particles are produced, themselves interacting with the material, and so on. In particle physics experiments the purpose of the calorimeter is to precisely reconstruct the energy and assist with identification of the initial particle. \n", 19 | "\n", 20 | "One potential future high energy physics experiment is an electron-positron linear collider. R&D is underway to design and build that accelerator facility and associated detectors. The linear collider detector (LCD) is one of these proposed detectors. The LCD design includes the standard components including inner tracking detectors, calorimeters, and outer tracking systems. \n", 21 | "\n", 22 | "In this analysis we will use simulations of the LCD calorimeter system, made up of an electromagnetic calorimeter (ECAL) and hadronic calorimeter (HCAL). The ECAL aims to fully reconstruct the energy from electrons, positrons, and photons, and to measure the initial energy of showers initiated by hadrons. 
The ECAL is highly granular (in this dataset we save 25x25x25 cell information around the incoming particle) to help differentiate between electron (positron) and photon particles by properties of their shower developments, and to help identify incoming photons that converted into electron-position pairs. The HCAL is highly granular along the longitudinal direction (in this dataset separated into 5x5x60 cells around the incoming particle), giving more information about the longitudinal development of hadronic showers. The LCD calorimeters are non-compensating, meaning there are ways in which hadrons lose energy in the interactions that are not captured in the reconstruction of the electrical signal. Hence downstream software corrections must be applied to hadron showers to accurately reconstuct the initial hadron energy. The amount of the correction depends on several factors, including the initial particle energy and type. " 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "**The LCD Dataset**\n", 30 | "\n", 31 | "In this example simulated particles of either electrons, photons, neutral pions, or charged pions are shot directly into the LCD calorimeter systems. The cell level information (physical positions, energy deposited) is saved for the ECAL (25x25x25) and HCAL (5x5x60), thus saving a 3D image for each 'event' (event = single particle). The goal is to study the use of deep learning methods to improve the identification and reconstruction of these particles. \n", 32 | "\n", 33 | "The dataset is divided into four directories, each storing a large number of h5 files that hold images for one specific particle type. During training, our tools will appropriate mix and label this dataset. For this visualization exercise, we will use a premixed single 22 GB. 
Let's start by opening the dataset and explore its contents:\n", 34 | "\n", 35 | "(Running in Jupyter Notebooks: click on the box below and hit shift-enter.)\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import os\n", 45 | "import sys\n", 46 | "import h5py\n", 47 | "\n", 48 | "FileName='/data/LCD/LCD-Merged-All.h5'\n", 49 | "\n", 50 | "file = h5py.File(FileName, 'r')\n", 51 | "\n", 52 | "for k in file.keys():\n", 53 | " try:\n", 54 | " print k,file[k].shape\n", 55 | " except:\n", 56 | " print k,\"Not a tensor\"\n", 57 | "\n", 58 | " \n", 59 | "#file.close()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "The output shows there are 3211264 events, which will require several hundred GB of RAM to load into memory. The ECAL and HCAL are as described above. The groups \"index\" and \"OneHot\" encode the four particle types as an index or 4 \"bits\". The key \"target\" holds the true energy of the particle." 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "source": [ 75 | "**The Classification Problem**\n", 76 | "\n", 77 | "One function of the LCD calorimeter is to efficiently distinguish between different particle types so reconstruction software can apply the appropriate energy correction factors. We can examine by-eye the properties of the four different particles provided in our simulated dataset.\n", 78 | "\n", 79 | "The script below reads in the LCD dataset and attempts to visualize the 3D images of one electron and one pion event to show how qualitatively these appear differently. Instead of plotting the 3D image, the images are plotted in 2D showing the cells in the x and y position. For fixed x and y values, the values for the energy deposted in each cell along the longitudinal direction (z) are summed and displayed in the 3rd axis. 
The color and size of a circle at a given x and y position represent the total amount of energy summed along the longitudinal direction (approximately the path of the particle). \n", 80 | "\n", 81 | "The ECAL is so highly granular that looping over the 25x25x25 tensor is too memory intensive for this short demonstration, hence for this purpose we are only considering 4x4x25 cells. For an actual analysis one should consider the full tensor to maximize the classification ability.\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "print \"-- Starting LCD calorimeter event visualization --\"\n", 91 | "\n", 92 | "import matplotlib.pyplot as plt\n", 93 | "%matplotlib inline\n", 94 | "import numpy as np\n", 95 | "\n", 96 | "#-- user settings\n", 97 | "minEnergy=400 # find electron and pion event with at least 400 GeV of energy\n", 98 | "#-- end user settings\n", 99 | "\n", 100 | "ecal = file[\"ECAL\"]\n", 101 | "hcal = file[\"HCAL\"]\n", 102 | "onehot = file[\"OneHot\"]\n", 103 | "index = file[\"index\"]\n", 104 | "target = file[\"target\"]\n", 105 | "\n", 106 | "#-- pick one event from each particle type and plot the ECAL and HCAL response maps\n", 107 | "\n", 108 | "# find the index of an *electron* event (with energy greater than the minimum value) and associated energy\n", 109 | "indexElEvt = -999\n", 110 | "energyElEvt = -999\n", 111 | "for evt in range(0,len(index)):\n", 112 | " energy=target[evt][0][1]\n", 113 | " if( (onehot[evt][1] ==1.0) and (energy > minEnergy) ): \n", 114 | " indexElEvt = evt\n", 115 | " energyElEvt = energy\n", 116 | " break # end for loop\n", 117 | " \n", 118 | "# find the index of a *pion* event (with energy greater than the minimum value) \n", 119 | "indexPiEvt = -999\n", 120 | "energyPiEvt = -999\n", 121 | "for evt in range(0,len(index)):\n", 122 | " energy = target[evt][0][1]\n", 123 | " if( (onehot[evt][2] ==1.0) and ( energy > minEnergy) 
):\n", 124 | " indexPiEvt = evt\n", 125 | " energyPiEvt = energy \n", 126 | " break # end for loop\n", 127 | "\n", 128 | " \n", 129 | "print \"Found the following events to visualize:\"\n", 130 | "print \" Electron: evt# = \", indexElEvt, \" energy = \", energyElEvt\n", 131 | "print \" Pion : evt# = \", indexPiEvt, \" energy = \", energyPiEvt\n", 132 | " \n", 133 | "# arguments:\n", 134 | "# evtNum = event index\n", 135 | "# ec=ecal, hc=hcal\n", 136 | "# title= main title for both figures\n", 137 | "def PlotCalorimeters(evtNum, ec, hc, title):\n", 138 | " # make two plots side-by-side showing ECAL and HCAL\n", 139 | " # will collapse longitudinal layers into the scatter plot\n", 140 | " # Note: plotting limited dimensions in Ecal (25x25x25 very slow)\n", 141 | "\n", 142 | " #-- Fill arrays for plotting\n", 143 | " # ECAL\n", 144 | " sumZAxisEcal = []\n", 145 | " xaxisEcal = []\n", 146 | " yaxisEcal = []\n", 147 | "\n", 148 | " startIndexEcal=10 #over x,y\n", 149 | " endIndexEcal=14\n", 150 | "\n", 151 | " for x in range(startIndexEcal,endIndexEcal): \n", 152 | " for y in range (startIndexEcal,endIndexEcal):\n", 153 | " xaxisEcal.append(x)\n", 154 | " yaxisEcal.append(y)\n", 155 | " sumZEcal = 0\n", 156 | " for z in range (0,24): # over all z\n", 157 | " sumZEcal += ecal[evtNum,x,y,z] \n", 158 | " # end for over z\n", 159 | " sumZAxisEcal.append(sumZEcal)\n", 160 | " #end for over y \n", 161 | " # end for over x\n", 162 | "\n", 163 | "\n", 164 | " # HCAL\n", 165 | " sumZAxisHcal = []\n", 166 | " xaxisHcal = []\n", 167 | " yaxisHcal = []\n", 168 | "\n", 169 | " startIndexHcal=0 #over x,y\n", 170 | " endIndexHcal=4\n", 171 | "\n", 172 | " for x in range(startIndexHcal,endIndexHcal): \n", 173 | " for y in range (startIndexHcal,endIndexHcal):\n", 174 | " xaxisHcal.append(x)\n", 175 | " yaxisHcal.append(y)\n", 176 | " sumZHcal = 0\n", 177 | " for z in range (0,60): # over all z\n", 178 | " sumZHcal += hcal[evtNum,x,y,z]\n", 179 | " # end for over z\n", 180 | " 
sumZAxisHcal.append(sumZHcal)\n", 181 | " #end for over y \n", 182 | " # end for over x\n", 183 | "\n", 184 | " #-- Plotting\n", 185 | " from matplotlib.ticker import FormatStrFormatter\n", 186 | "\n", 187 | " fig, (ax1, ax2) = plt.subplots(ncols=2)\n", 188 | " plt.tight_layout(pad=1.0, w_pad=1.0, h_pad=1.0)\n", 189 | "\n", 190 | " scatEcal = ax1.scatter(xaxisEcal,yaxisEcal,s=sumZAxisEcal,c=sumZAxisEcal )\n", 191 | " fig.colorbar(scatEcal, ax=ax1,format='%.1f')\n", 192 | " ax1.set_title('ECAL')\n", 193 | " ax1.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))\n", 194 | " ax1.xaxis.set_major_formatter(FormatStrFormatter('%.1f'))\n", 195 | " ax1.xaxis.set_ticks(np.arange(startIndexEcal, endIndexEcal, 1.0))\n", 196 | " ax1.yaxis.set_ticks(np.arange(startIndexEcal, endIndexEcal, 1.0))\n", 197 | "\n", 198 | " scatHcal = ax2.scatter(xaxisHcal,yaxisHcal,s=sumZAxisHcal,c=sumZAxisHcal )\n", 199 | " fig.colorbar(scatHcal, ax=ax2,format='%.1f')\n", 200 | " ax2.set_title('HCAL')\n", 201 | " ax2.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))\n", 202 | " ax2.xaxis.set_major_formatter(FormatStrFormatter('%.1f'))\n", 203 | " ax2.xaxis.set_ticks(np.arange(startIndexHcal, endIndexHcal, 1.0))\n", 204 | " ax2.yaxis.set_ticks(np.arange(startIndexHcal, endIndexHcal, 1.0))\n", 205 | "\n", 206 | " plt.suptitle(title,size=16)\n", 207 | " plt.show()\n", 208 | "\n", 209 | " \n", 210 | "PlotCalorimeters(indexElEvt, ecal, hcal, 'Electron')\n", 211 | "PlotCalorimeters(indexPiEvt, ecal, hcal, 'Pion')\n", 212 | "\n", 213 | "\n", 214 | "file.close()\n" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "After the notebook is run, the figures on the top show the energy deposits in ECAL (left) and HCAL (right) for an electron event. One can see that even for high energy electrons the bulk of the energy is deposited in the ECAL, with very litte in the HCAL. The bottom figures show the same ECAL and HCAL energy deposits for a pion event. 
In this case pions deposit energy in both the ECAL and HCAL. The electron and pion events selected above have similar initial energies. " 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "It is not feasible to scan by-eye the calorimeter response, so instead algorithms are trained to classify the particle types. In this example we will study deep learning hyper-parameters to determine which parameters best classify these types of events in the LCD calorimeter. " 229 | ] 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "Python 2", 235 | "language": "python", 236 | "name": "python2" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 2 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython2", 248 | "version": "2.7.12" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 2 253 | } 254 | -------------------------------------------------------------------------------- /Lab-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lab 1\n", 8 | "## Introduction\n", 9 | "\n", 10 | "\n", 11 | "Our first two labs are intended to try to force everyone to learn sufficient python, numpy, and HDF5 to perform some basic tasks. As I started writing primers for these topics, I quickly found that there are many excellent primers already out there. So instead, these labs will just present you with various problems and point you to notebooks from other sources that will provide you the necessary background to solve the problems. \n", 12 | "\n", 13 | "If you are seeing this page, you have successfully connected to our python server via ssh tunnel, logged in, and navigated to this notebook. 
Jupyter notebooks consist of cells that can hold text or code (usually python). This text that you are reading, was written into a text cell as simple text \"coding\" language known as mark-down. When this cell is run (either automatically at start of the notebook or manually by pressing control-enter), the mark-down text is interpreted into nice looking text. Running a code cell will execute the code in that cell and give you the results. If you make a mistake, you can usually simply change the cell and re-run. But be aware that since you ran the mistaken cell already, whatever code that was properly executed before your mistake/error, was already executed and has therefore changed your current python environment accordingly. In some cases this situation will be problematic, and you will need to rerun the notebook from the start by pressing the \"reload\" botton (next to the \"stop\" button) above.\n", 14 | "\n", 15 | "\n", 16 | "## Primers\n", 17 | "\n", 18 | "You'll find two other primers in this lab's directory. These are full lecture series, one on python (by Rajath Kumar from Indian Institute of Science, [License](https://creativecommons.org/licenses/by/3.0/) ) and one on scientific computing (by Robert Johansson, [License](https://creativecommons.org/licenses/by/3.0/) ), that I found in github. Feel free to google around for other sources. \n", 19 | "\n", 20 | "My suggestion is that you first read through this lab to get a sense of what you will need to know. This first lab requires just basic python. If you are unfamiliar with python or not comfortable, then take time during this lab to read enough of the following notebooks to get you started with the exercises, and then follow up at home to go through the rest of the material. Your homework, due 10 am Friday morning (so I can take a look before lecture), is to complete this full Lab. I will simply take a snapshot of your lab notebook at that time. 
\n", 21 | "\n", 22 | "For the lab on Wednesday, everyone should at least get started with the numpy notebook below, before coming to class.\n", 23 | "\n", 24 | " * Python Lecture: [Introduction to Python Jupyter Notebook Lecture](jrjohansson-lectures/Lecture-1-Introduction-to-Python-Programming.ipynb)\n", 25 | "\n", 26 | " * Python Course: [Python Lectures](PythonLectures)\n", 27 | "\n", 28 | " * Numpy Lecture: [Numpy Jupyter Notebook Lecture](jrjohansson-lectures/Lecture-2-Numpy.ipynb)\n", 29 | "\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Lab 1- Python\n", 37 | "\n", 38 | "Let start with generating some fake random data. You can get a random number between 0 and 1 using the python random module as follow:" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 24, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "The Value of x is 0.430296569413\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "import random\n", 56 | "x=random.random()\n", 57 | "print \"The Value of x is\", x" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "### Exercise 1.1\n", 65 | "Using random, write a function GenerateData(N, mymin, mymax), that returns a python list containing N random numbers between specified minimum and maximum value. Note that you may want to quickly work out on paper how to turn numbers between 0 and 1 to between other values. 
" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 43, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | " Data Type: \n", 78 | "Data Length: 0\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "# Skeleton\n", 84 | "def GenerateData(N,min,max):\n", 85 | " out = []\n", 86 | " ### BEGIN SOLUTION\n", 87 | "\n", 88 | " # Fill in your solution here \n", 89 | " \n", 90 | " ### END SOLUTION\n", 91 | " return out\n", 92 | "\n", 93 | "Data=GenerateData(1000,-10,10)\n", 94 | "print \"Data Type:\", type(Data)\n", 95 | "print \"Data Length:\", len(Data)\n", 96 | "if len(Data)>0: \n", 97 | " print \"Type of Data Contents:\", type(Data[0])\n", 98 | " print \"Data Minimum:\", min(Data)\n", 99 | " print \"Data Maximum:\", max(Data)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "### Exercise 1.2\n", 107 | "\n", 108 | "Write a function that computes the mean of values in a list." 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 45, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "Mean of Data: 0\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "# Skeleton\n", 126 | "def mean(Data):\n", 127 | " m=0\n", 128 | " \n", 129 | " ### BEGIN SOLUTION\n", 130 | "\n", 131 | " # Fill in your solution here \n", 132 | " \n", 133 | " ### END SOLUTION\n", 134 | " \n", 135 | " return m\n", 136 | "\n", 137 | "print \"Mean of Data:\", mean(Data)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "### Exercise 1.3\n", 145 | "\n", 146 | "Write a function the applies a booling function (that returns true/false) to every element in data, and return a list of indices of elements where the result was true. Use this function to find the indices of positive entries. 
" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 47, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "def where(mylist,myfunc):\n", 164 | " out= []\n", 165 | " \n", 166 | " ### BEGIN SOLUTION\n", 167 | "\n", 168 | " # Fill in your solution here \n", 169 | " \n", 170 | " ### END SOLUTION\n", 171 | " \n", 172 | " return out" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### Exercise 1.4\n", 180 | "\n", 181 | "The inrange(mymin,mymax) function below returns a function that tests if it's input is between the specified values. Use this function, in conjunction to your solution to 1.3, to demonstrate that your data is \"flat\". Hint: pick several sub-ranges and show that the number of data point divided by the size of the range is roughly constant. " 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 49, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "True True False False False\n", 194 | "False False True True False\n", 195 | "Number of Entries passing F1: 0\n", 196 | "Number of Entries passing F2: 0\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "def inrange(mymin,mymax):\n", 202 | " def testrange(x):\n", 203 | " return x=mymin\n", 204 | " return testrange\n", 205 | "\n", 206 | "# Examples:\n", 207 | "F1=inrange(0,10)\n", 208 | "F2=inrange(10,20)\n", 209 | "\n", 210 | "print F1(0), F1(1), F1(10), F1(15), F1(20)\n", 211 | "print F2(0), F2(1), F2(10), F2(15), F2(20)\n", 212 | "\n", 213 | "print \"Number of Entries passing F1:\", len(where(Data,F1))\n", 214 | "print \"Number of Entries passing F2:\", len(where(Data,F2))" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "### Exercise 1.5\n", 222 | 
"\n", 223 | "Repeat Exercise 1.5 using the built in python functions sum and map instead of your solution to 1.3. " 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": true 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "### BEGIN SOLUTION\n", 235 | "\n", 236 | " # Fill in your solution here \n", 237 | " \n", 238 | "### END SOLUTION" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "### Exercise 1.6 (Homework)\n", 246 | "\n", 247 | "Write a new function called GenerateDataFromFunction(N,mymin,mymax,myfunc), that instead of generating a flat distribution, generates a distribution with functional form coded in myfunc. Note that myfunc will always be > 0. \n", 248 | "\n", 249 | "Use your function to generate 1000 numbers that are Gaussian distributed, using the Gaussian function below. Confirm the mean of the data is close to mean you specify when building the Gaussian. \n", 250 | "\n", 251 | "Hint: A simple, but slow, solution is to a draw random number test_x within the specified range and another number p between the min and max of the function (which you will have to determine). If p<=function(test_x), then place test_x on the output. If not, repeat the process, drawing two new numbers. Repeat until you have the specified number of generated numbers, N. For this problem, it's OK to determine the min and max by numerically sampling the function. 
\n", 252 | "\n" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "def GenerateDataFromFunction(N,mymin,mymax,myfunc):\n", 264 | " out = []\n", 265 | " ### BEGIN SOLUTION\n", 266 | "\n", 267 | " # Fill in your solution here \n", 268 | " \n", 269 | " ### END SOLUTION\n", 270 | " \n", 271 | " return out" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 50, 277 | "metadata": { 278 | "collapsed": true 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "import math\n", 283 | "\n", 284 | "def gaussian(mean, sigma):\n", 285 | " def f(x):\n", 286 | " return math.exp(((x-mean)**2)/(2*sigma**2))/math.sqrt(math.pi*sigma)\n", 287 | " return f\n", 288 | "\n", 289 | "# Example Instantiation\n", 290 | "g1=gaussian(0,1)\n", 291 | "g2=gaussian(10,3)\n", 292 | "\n", 293 | "### BEGIN SOLUTION\n", 294 | "\n", 295 | "# Fill in your solution here \n", 296 | " \n", 297 | "### END SOLUTION\n" 298 | ] 299 | } 300 | ], 301 | "metadata": { 302 | "kernelspec": { 303 | "display_name": "Python 2", 304 | "language": "python", 305 | "name": "python2" 306 | }, 307 | "language_info": { 308 | "codemirror_mode": { 309 | "name": "ipython", 310 | "version": 2 311 | }, 312 | "file_extension": ".py", 313 | "mimetype": "text/x-python", 314 | "name": "python", 315 | "nbconvert_exporter": "python", 316 | "pygments_lexer": "ipython2", 317 | "version": "2.7.13" 318 | } 319 | }, 320 | "nbformat": 4, 321 | "nbformat_minor": 2 322 | } 323 | -------------------------------------------------------------------------------- /Lab-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lab 2- Numpy\n", 8 | "\n", 9 | "Read through the following notebook to get an introduction to numpy: [Numpy 
Intro](jrjohansson-lectures/Lecture-2-Numpy.ipynb)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Exercise 2.1\n", 17 | "\n", 18 | "Let start with some basic reshape manipulations. Consider a classification task. We can imagine the training data X consisting of N examples each with M inputs, so the shape of X is (M,N). We usually express the output of the Neural Network, which for the training sample encodes the true class of each of the M examples in X, in a \"one-hot\" matrix of shape (N,C), where C is the number of classes and each row corresponds to the true class for the corresponding example in X. So for a given row Y[i], all elements are 0 except for the column corresponding to the true class.\n", 19 | "\n", 20 | "For example consider a classification task of separating between 4 classes. We'll call them A, B, C, and D.\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import numpy as np\n", 30 | "\n", 31 | "Y=np.array( [ [0, 1, 0, 0], # Class B\n", 32 | " [1, 0, 0, 0], # Class A\n", 33 | " [0, 0, 1, 0], # Class C\n", 34 | " [0, 0, 0, 1] # Class D\n", 35 | " ])\n", 36 | "\n", 37 | "print \"Shape of Y:\", Y.shape" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Lets imagine that we want to change to a 2 classes instead by combining classes A with B and C with D. Use np.reshape and np.sum to create a new vector Y1. Hint: change the shape of Y into (8,2), sum along the correct axes, and change shape to (4,2)." 
45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "print \"Transpose:\", np.transpose(Y)\n", 54 | "print \"Reshape 8,2:\", np.transpose(Y).reshape((8,2))\n", 55 | "print \"Sum:\", np.sum(np.transpose(Y).reshape((8,2)),axis=1)\n", 56 | "\n", 57 | "Y1= np.sum(np.transpose(Y)\n", 58 | " .reshape((8,2)),axis=1).reshape(4,2)\n", 59 | "print \"Answer: \",Y1" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Exercise 2.2\n", 67 | "\n", 68 | "Oftentimes we find that neutral networks work best when their input is mostly between 0,1. Below, we create a random dataset that is normal distributed (mean of 4, sigma of 10). Shift the data so that the mean is 0.5 and 68% of the data lies between 0 and 1." 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "X=np.random.normal(4,10,1000)\n", 78 | "print np.mean(X)\n", 79 | "print np.min(X)\n", 80 | "print np.max(X)\n", 81 | "print np.var(X)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "import math\n", 91 | "X1=(X-np.mean(X))/math.sqrt(np.var(X)) # Replace X with your answer\n", 92 | "\n", 93 | "print np.mean(X1)\n", 94 | "print np.var(X1)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## Exercise 2.3\n", 102 | "\n", 103 | "Using np.random.random and np.random.normal to generate two datasets. Then use np.where to repeat exercise 1.4 showing that one creates a flat distribution and the other does not. 
" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "X0=np.random.random(1000)\n", 113 | "\n", 114 | "def CheckFlatness(D,steps=10):\n", 115 | " maxD=np.max(D)\n", 116 | " minD=np.min(D)\n", 117 | " i=minD\n", 118 | " stepsize=(maxD-minD)/steps\n", 119 | " while ii) ))\n", 121 | " i+=stepsize\n", 122 | " \n", 123 | "CheckFlatness(X0)\n", 124 | "CheckFlatness(X)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## Exercise 2.4\n", 132 | "\n", 133 | "Now lets play with some real data. We will load a file of example Neutrino interactions in LArTPC detector. There are 2 read out planes in the detector with 240 wires each, sampled 4096 times. Shift the images in the same way as exercise 2.2." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "import h5py\n", 143 | "f=h5py.File(\"/data/LArIAT/h5_files/nue_CC_3-1469384613.h5\",\"r\")\n", 144 | "print f.keys()\n", 145 | "images=np.array(f[\"features\"][0:10],dtype=\"float32\")\n", 146 | "print images.shape" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "print images[0]" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "print np.mean(images)\n", 165 | "print np.var(images)\n" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "def DownSample(Data,factor,Nx,Ny,sumabs=False):\n", 175 | " if factor==0:\n", 176 | " return np.reshape(Data,[Nx,Ny]),Ny\n", 177 | "\n", 178 | " # Remove entries at the end so Down Sampling works\n", 179 | " NyNew=Ny-Ny%factor\n", 180 | " 
Data1=np.reshape(Data,[Nx,Ny])[:,0:NyNew]\n", 181 | " \n", 182 | " # DownSample \n", 183 | " if sumabs:\n", 184 | " a=abs(Data1.reshape([Nx*NyNew/factor,factor])).sum(axis=1).reshape([Nx,NyNew/factor])\n", 185 | " else:\n", 186 | " a=Data1.reshape([Nx*NyNew/factor,factor]).sum(axis=1).reshape([Nx,NyNew/factor])\n", 187 | "\n", 188 | " return a,NyNew\n", 189 | "\n", 190 | "\n", 191 | "R,Ny=DownSample(images[0][1],10,240,4096)\n", 192 | "print R.shape" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": true 200 | }, 201 | "outputs": [], 202 | "source": [] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 2", 208 | "language": "python", 209 | "name": "python2" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 2 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython2", 221 | "version": "2.7.13" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 2 226 | } 227 | -------------------------------------------------------------------------------- /Lab-5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lab 5- Deep Learning Model\n", 8 | "\n", 9 | "This lab is meant to get you started in using Keras to design Deep Neural Networks. 
The goal here is to simply repeat Lab 4, but with DNNs.\n", 10 | "\n", 11 | "Let's start with reading the data, like before:" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "%matplotlib inline\n", 24 | "\n", 25 | "filename=\"/data/afarbin/DLClass/SUSY/SUSY.csv\"\n", 26 | "VarNames=[\"signal\", \"l_1_pT\", \"l_1_eta\",\"l_1_phi\", \"l_2_pT\", \"l_2_eta\", \"l_2_phi\", \"MET\", \"MET_phi\", \"MET_rel\", \"axial_MET\", \"M_R\", \"M_TR_2\", \"R\", \"MT2\", \"S_R\", \"M_Delta_R\", \"dPhi_r_b\", \"cos_theta_r1\"]\n", 27 | "RawNames=[\"l_1_pT\", \"l_1_eta\",\"l_1_phi\", \"l_2_pT\", \"l_2_eta\", \"l_2_phi\"]\n", 28 | "FeatureNames=[ \"MET\", \"MET_phi\", \"MET_rel\", \"axial_MET\", \"M_R\", \"M_TR_2\", \"R\", \"MT2\", \"S_R\", \"M_Delta_R\", \"dPhi_r_b\", \"cos_theta_r1\"]\n", 29 | "\n", 30 | "df = pd.read_csv(filename, dtype='float64', names=VarNames)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "Now lets define training and test samples. Note that DNNs take very long to train, so for testing purposes we will use only about 10% of the 5 million events in the training/validation sample. Once you get everything working, make the final version of your plots with the full sample. \n", 47 | "\n", 48 | "Also note that Keras had trouble with the Pandas tensors, so after doing all of the nice manipulation that Pandas enables, we convert the Tensor to a regular numpy tensor." 
49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "N_Max=550000\n", 60 | "N_Train=500000\n", 61 | "\n", 62 | "Train_Sample=df[:N_Train]\n", 63 | "Test_Sample=df[N_Train:N_Max]\n", 64 | "\n", 65 | "X_Train=np.array(Train_Sample[VarNames[1:]])\n", 66 | "y_Train=np.array(Train_Sample[\"signal\"])\n", 67 | "\n", 68 | "X_Test=np.array(Test_Sample[VarNames[1:]])\n", 69 | "y_Test=np.array(Test_Sample[\"signal\"])\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Now lets setup everything. Note that you will need to use a GPU to get reasonable training times. The Jupyter server will have up tp 4 GPUs in it. Your Jupyter session is now running through a batch queue system that effectively assigns a GPU to you. Here we tell Keras's backend, Theano, to use the GPU assigned to you." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "import os \n", 86 | "print \"Using Queue:\", os.environ[\"PBS_QUEUE\"]\n", 87 | "gpuid=int(os.environ[\"PBS_QUEUE\"][3:4])\n", 88 | "print \"Using GPU:\", gpuid\n", 89 | "os.environ['THEANO_FLAGS'] = \"mode=FAST_RUN,device=gpu%s,floatX=float32,force_device=True\" % (gpuid)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "There was some problem getting the profiler to work in this notebook, so we turn it off: " 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "import theano\n", 106 | "theano.config.profile=False" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Now we will build a simple model, as described in class. Note that this is very small model, so things run fast. 
You should attempt more ambitious models." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "from keras.models import Sequential\n", 123 | "from keras.layers import Dense\n", 124 | "\n", 125 | "model = Sequential()\n", 126 | "model.add(Dense(12, input_dim=X_Train.shape[1], init='uniform', activation='relu'))\n", 127 | "model.add(Dense(8, init='uniform', activation='relu'))\n", 128 | "model.add(Dense(1, init='uniform', activation='sigmoid'))" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "\n", 138 | "import keras\n", 139 | "grapher = keras.utils.dot_utils.Grapher()" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "The model has to be compiled. At this time we set the loss function and the optimizer too:" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 165 | "model.summary()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Now we train. We are running only 10 epochs in this example. Models may need hundreds of epochs before they stop improving." 
173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "history=model.fit(X_Train, y_Train, validation_data=(X_Test,y_Test), nb_epoch=10, batch_size=2048)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "The model history keeps track of the loss and accuracy for each epoch. Note that the training above was set up to run on the validation sample at the end of each epoch:" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "print history.history" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## Exercise 5.1\n", 205 | "\n", 206 | "You will need to create several models and make sure they are properly trained. Write a function that takes this history and plots the values versus epoch. For every model that you train in the remainder of this lab, assess:\n", 207 | "\n", 208 | " * Has your model's performance plateaued? If not, train for more epochs. \n", 209 | "\n", 210 | " * Compare the performance on training versus test sample. Are you over training?" 
211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "## Your Solution Here\n" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "We can evaluate how the trained model does on the test sample as follows:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "scores = model.evaluate(X_Test, y_Test)\n", 238 | "print scores" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "And we can make ROC curves as before:" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "from sklearn.metrics import roc_curve, auc\n", 255 | "fpr, tpr, _ = roc_curve(y_Test, model.predict(X_Test))\n", 256 | " \n", 257 | "roc_auc = auc(fpr, tpr)\n", 258 | "\n", 259 | "plt.plot(fpr,tpr,color='darkorange',label='ROC curve (area = %0.2f)' % roc_auc)\n", 260 | "plt.legend(loc=\"lower right\")\n", 261 | "plt.xlabel('False Positive Rate')\n", 262 | "plt.ylabel('True Positive Rate')\n", 263 | "\n", 264 | "plt.show()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## Exercise 5.2\n", 272 | "\n", 273 | "Following lab 4, make a comparison of the performance between models trained with raw, features, and raw+features data." 
274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "collapsed": true 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "## Your solution here\n" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "## Exercise 5.3\n", 292 | "\n", 293 | "Again, following lab 4, design and implement at least 3 different DNN models. Train them and compare performance. You may try different architectures, loss functions, and optimizers to see if there is an effect." 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": true 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "## Your solution here" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "## Honors Problem- Exercise 5.4\n", 312 | "\n", 313 | "Write a function that evaluates the performance (AUC) as a function of a given input variable. You will need to bin the test data in the variable (i.e. make sub-samples for events which have the particular variable in a range), evaluate the performance in each bin, and plot the results.\n", 314 | "\n", 315 | "Apply your function to each input variable." 
316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": true 323 | }, 324 | "outputs": [], 325 | "source": [] 326 | } 327 | ], 328 | "metadata": { 329 | "kernelspec": { 330 | "display_name": "Python 2", 331 | "language": "python", 332 | "name": "python2" 333 | }, 334 | "language_info": { 335 | "codemirror_mode": { 336 | "name": "ipython", 337 | "version": 2 338 | }, 339 | "file_extension": ".py", 340 | "mimetype": "text/x-python", 341 | "name": "python", 342 | "nbconvert_exporter": "python", 343 | "pygments_lexer": "ipython2", 344 | "version": "2.7.13" 345 | } 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 2 349 | } 350 | -------------------------------------------------------------------------------- /Lab-6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lab 6\n", 8 | "\n", 9 | "This notebook sets up DLKit and CaloDNN so you can run the lab. You should only need to run this one once." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "collapsed": false 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "!git clone https://bitbucket.org/anomalousai/DLKit\n", 21 | "%cd DLKit\n", 22 | "!git clone https://github.com/UTA-HEP-Computing/CaloDNN" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "collapsed": false 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "!git clone https://github.com/UTA-HEP-Computing/CaloDNN" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "%ls" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "# Lab Instructions\n", 52 | "\n", 53 | "After the installation above, copy `CaloDNN/Instructions.ipynb` into your DLKit directory and navigate Jupyter through it. Use this notebook as your reference. \n", 54 | "\n", 55 | "Your task is to create a new model for the `CaloDNN` task and perform a hyperparameter scan of roughly 20 points, with jobs that run about 1 hour each. Then compare the performance of your models using the AnalyzeScan Jupyter example and analyze the performance of your best model using the AnalyzePerformance Jupyter example.\n", 56 | "\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "!cp CaloDNN/Instructions.ipynb ." 
68 | ] 69 | } 70 | ], 71 | "metadata": { 72 | "kernelspec": { 73 | "display_name": "Python 2", 74 | "language": "python", 75 | "name": "python2" 76 | }, 77 | "language_info": { 78 | "codemirror_mode": { 79 | "name": "ipython", 80 | "version": 2 81 | }, 82 | "file_extension": ".py", 83 | "mimetype": "text/x-python", 84 | "name": "python", 85 | "nbconvert_exporter": "python", 86 | "pygments_lexer": "ipython2", 87 | "version": "2.7.12" 88 | } 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 2 92 | } 93 | -------------------------------------------------------------------------------- /NEXT-Data-Generator.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from NEXTDNN.LoadData import *" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "n_threads=10\n", 21 | "Directory=\"/data/NEXT/tracksVL/\"\n", 22 | "GenC=NEXTDataGenerator(Directory,n_threads=n_threads,batchsize=8, bins=(100,100,100), Norm=False,\n", 23 | " max=10000, verbose=False)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "Gen=GenC.Generator()\n", 35 | "\n", 36 | "FirstBatch=Gen.next()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "FirstBatch" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "import numpy as np" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | 
"Image=FirstBatch[0][0]" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "Image[np.where(Image!=0)]" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "Image.shape" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [] 96 | } 97 | ], 98 | "metadata": { 99 | "kernelspec": { 100 | "display_name": "Python 2", 101 | "language": "python", 102 | "name": "python2" 103 | }, 104 | "language_info": { 105 | "codemirror_mode": { 106 | "name": "ipython", 107 | "version": 2 108 | }, 109 | "file_extension": ".py", 110 | "mimetype": "text/x-python", 111 | "name": "python", 112 | "nbconvert_exporter": "python", 113 | "pygments_lexer": "ipython2", 114 | "version": "2.7.12" 115 | } 116 | }, 117 | "nbformat": 4, 118 | "nbformat_minor": 2 119 | } 120 | -------------------------------------------------------------------------------- /NEXTDNN-Experiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "scrolled": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "%run -im NEXTDNN.ClassificationExperiment -- --Test -s 10 --cpu" 13 | ] 14 | } 15 | ], 16 | "metadata": { 17 | "kernelspec": { 18 | "display_name": "Python 2", 19 | "language": "python", 20 | "name": "python2" 21 | }, 22 | "language_info": { 23 | "codemirror_mode": { 24 | "name": "ipython", 25 | "version": 2 26 | }, 27 | "file_extension": ".py", 28 | "mimetype": "text/x-python", 29 | "name": "python", 30 | "nbconvert_exporter": "python", 31 | "pygments_lexer": "ipython2", 32 | "version": "2.7.12" 33 | } 34 | }, 35 | "nbformat": 4, 36 | "nbformat_minor": 2 37 | } 38 | 
-------------------------------------------------------------------------------- /ParticleDetectorsIntro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Imaging Detectors\n", 8 | "\n", 9 | "## Introduction to Particle Detectors\n", 10 | "\n", 11 | "In high energy physics experiments, detectors serve as cameras, taking pictures of particle interaction 'events' resulting from either natural sources (e.g. cosmic rays, solar neutrinos, radioactive decay, or Dark Matter), or in controlled particle collision arranged by crossing high energy particle beams (e.g. Large Hadron Collider (LHC) or the Tevatron) or by shooting a single such beam on a target (e.g. Neutrino beam experiments at Fermilab). In most cases the particles of ultimate interest (e.g. Higgs, Supersymmetric particles, or neutrino) cannot be observed directly because they either decay immediately or are effectively invisible, and must therefore must be inferred from their decay products or secondary particles that result from their rare interactions. The images are taken by advanced technologies exploiting the unique processes by which these secondary particles interact with matter.\n", 12 | "\n", 13 | "Since charged particles ionize matter, their trajectories can be reconstructed by spatially locating ions, for example in silicon pixels like those in digitial cameras or in wires immersed in liquid Argon. If these secondary particles are in a strong magentic field, the resulting curvature of their trajectories provide a measurement of their momenta. Momentum resolution degrades for highly energetic particles since the curvature in their trajectories becomes small and therefore more difficult to measure, necessitating very strong magnetic fields (i.e. the CMS design) or large volumes of magnetic field (i.e. the ATLAS design). 
In addition, precision detectors enable particle trajectories to be traced to common vertexes that spatially locate the orgins of decays or interactions. Detectors that perform tracking of charged particles in this way are knowns as Tracking Detectors. Tracking detectors are not the subject of this specific tutorial.\n", 14 | "\n", 15 | "In dense material, nearly all particles will eventually loose their energy in an avalanche of secondary particles, referred to as showers. For light particles that do not feel the strong force, such as photons and electrons, the showers are due to electromagetic interactions with the electrons in the interacting material. For heavier particles that do feel the strong force, such as neutrons or pions, the showers are due to hadronic interactions with the nuclei in the interaction material. In both cases, the fundamental processes governing these interactions are at relatively low energies compared to the particle initiating the shower. As a result the number of particles in the shower is typically directly proportional to the energy of the primary particle, hence the energy resolution improves with energy, following poisson statistics (i.e. proportional to $\\sqrt{E} \\approx \\sqrt{N}$). Calorimeters, which effectively just count the particles in these showers, usually come in two flavors, Electromagentic and Hadronic, corresponding to the two possible types of showers. Highly granular calorimeters measure energy in cells (analogous to voxels), enabling the particle showers to be 'imaged'. The ATLAS calorimeter has about 200,000 cells.\n", 16 | "\n", 17 | "The tutorial presented here focuses on calorimeters and liquid and gas Time Projection Chambers (TPCs), which produce 2D and 3D images of the interactions of particles that in many cases can be easily identified by the human eye. We will use these images to classify the particle type, measure the particle energies, and generate new images for simulation. 
We ask participants to begin by choosing one type of detector that they find most compelling. In order to assist with this choice, the following sections overview the detectors and the possible Machine Learning tasks.\n", 18 | "\n", 19 | "## Particles and their interactions\n", 20 | "\n", 21 | "The following particles live long enough to traverse particle detectors and therefore be observed:\n", 22 | "\n", 23 | " * Electrons/positrons: These leptons are charged and therefore leave tracks in tracking detectors. They produce dense showers in electromagnetic calorimeter with little energy reaching the hadronic calorimeter.\n", 24 | " \n", 25 | " * Photons: Having no charge, they leave no signature in the tracking detectors, unless they convert to electron/positron pairs by interacting with some detector material (e.g. the tracker). They do leave showers in the Electromagnetic calorimeter that resemble electron showers.\n", 26 | " \n", 27 | " * Neutral Pions: these immediately decay into two photons and are therefore detected as a pair of photons. As their energy goes up, the two photons begin to overlap thus making it difficult to separate from single photon events.\n", 28 | " \n", 29 | " * Muons/anti-muons: These heavier lepton cousins of the electron are similarly charged and therefore leave tracks. But they are too heavy to deaccelerarte in the electromagnetic calorimeters and do not feel the strong force and so also do not shower in the hadronic calorimeters. They are the only particles that typically escape from the calorimeters and are therefore detected and identified by muon tracking systems that surround the other detectors.\n", 30 | "\n", 31 | " * Pions, Kaons, protons, neutrons and other hadrons: if charged, they leave tracks. 
They typically leave minimal energy in the electromagnetic calorimeters and shower in the hadronic calorimeter.\n", 32 | "\n", 33 | "## Calorimeters\n", 34 | "\n", 35 | "The Large Hadron Collider experiments use Electromagnetic Calorimeters (ECAL) to identify and measure energies of photons and electrons, and use Hadronic Calorimeters (HCAL) to group and measure the energy of jets of hadronic particles initiated by energetic quarks or gluons produced in the primary collisions. These calorimeters are designed to be as hermetic as possible, collecting all of the energy of every collision in order to measure inbalances that indicate the presence of non-interacting particles such as neutrinos or dark matter. Since the LHC collides protons, the collisions are dominated by strong interactions that produce quarks and gluons that turn into hadronic jets. These highly copious jets are the primary background to the identification of photons and leptons, which are much rarer. As a result, these Calorimeters are required to only mis-identify 1 in roughly 10000 jets as a photon or lepton. They achieve such performance by exploiting the differences between electromagnetic and hadronic showers, for example by using multi-variate techniques on features that encapsulate the shower profile such as the spatial moments of the energy deposits.\n", 36 | "\n", 37 | "Even small improvements in the classification performance or energy resolution of Calorimeters can have a large effect on physics results, for example by reducing the backgrounds underneath the Higgs Mass peak or sharpening up that peak. A bigger impact on LHC experiments would come from faster faithful simulations of the calorimeter interactions, an extremely CPU intensive task which currently relies on tracking and simulating the micro-physics of the thousands of particles in each event that traverse the calorimeter. 
For example of order, one half of the computing resources of ATLAS experiment is used for just this one task. \n", 38 | "\n", 39 | "### LCD Calorimeter\n", 40 | "\n", 41 | "Since simulated calorimeter data from the LHC experiments are generally not public and are difficult to use due to the complexities of the hadronic colliders, we choose to begin our Deep Learning studies in calorimetry in a simpler calorimeter of the Linear Collider Detector (LCD), an R&D detector concept for a future potential accelerator facility. The LCD design includes the standard components including inner tracking detectors, calorimeters, and outer tracking systems. The LCD calorimeter is highly granular, imaging particle showers at much higher resolution than the current LHC experiments. But we can approximate the LHC calorimeters by down-sampling the LCD calorimeter.\n", 42 | "\n", 43 | "The high granularity of the ECAL detector enables differentiation of electron (positron) and photon particles by properties of their shower developments, and identification of incoming photons that converted into electron-position pairs. The HCAL is highly granular along the longitudinal direction, giving more information about the longitudinal development of hadronic showers. The LCD calorimeters are non-compensating, meaning there are ways in which hadrons lose energy in the interactions that are not captured in the reconstruction of the electrical signal. Hence downstream software corrections must be applied to hadron showers to accurately reconstuct the initial hadron energy. The amount of the correction depends on several factors, including the initial particle energy and type. Hadronic calibration and resolution are often one of the dominating sources of uncertaintly and inaccuracy in physics measurements.\n", 44 | "\n", 45 | "The dataset consists of simulations of single electrons/positrons, charged pions, neutral pions (pi0s), or photons (gamma) shot into the calorimeter. 
 To keep the data size manageable, we only store a 25 by 25 by 25 cell part of the ECAL and 5 by 5 by 60 part of the HCAL around the particle. In addition we compute some typical features used in traditional calorimeter particle classification.\n", 46 | "\n", 47 | "The [LCD Visualization Notebook](LCD-Visualization.ipynb) introduces this dataset. [The LCD Classification Experiment Notebook](CaloDNN-Experiment.ipynb) sets up a Deep Learning classification problem. Other interesting problems that can be easily set up are energy regression and generative models for simulation. \n", 48 | "\n", 49 | "## Neutrino Detectors\n", 50 | "\n", 51 | "Neutrinos are extremely elusive particles. Every second, 100 billion neutrinos go through your thumb nail, but only one may interact in your body in your lifetime. Because of their elusiveness, we know relatively little about neutrinos, yet they may help explain the matter/anti-matter asymmetry in the Universe or give us a hint of physics beyond the Standard Model. Therefore the US has chosen to build an intense neutrino beam which will send neutrinos from Fermilab, through the earth, to an unprecedentedly large detector more than 1 km underground (in order to be shielded from Cosmic Rays). The Long Baseline Neutrino Facility (LBNF) and the Deep Underground Neutrino Experiment (DUNE), US's flagship particle physics experiment, will begin taking neutrino data in the mid-2020s. In the meantime current and up-coming short-baseline neutrino beam experiments are studying neutrinos. \n", 52 | "\n", 53 | "Detecting neutrinos requires instrumenting a large volume of material to observe the secondary particles produced when the neutrinos interact. Neutrinos come in three flavors: electron neutrino, muon neutrino, and tau neutrino. They interact in two ways via the weak force: charged current, which results in a charged lepton of the corresponding flavor, and neutral current, which doesn't reveal the flavor of the incoming neutrino. 
Most neutrino experiments only require identifying the type of neutrino and measuring its energy. Several technologies have been employed for such experiments. One successful approach is to use liquid water or oil as the detector material. The secondary particles, which will typically go faster than speed of light in that material, will therefore emit Cherenkov light in a process analgous to a sonic boom. This light produces rings on the walls of the detector that can be seen by photon detectors.\n", 54 | "\n", 55 | "The technology choosen for DUNE and the majority of recent and planned neutrino experiments is Liquid Argon Time Projection Chambers (LArTPC), which promises twice the detection efficiency of previous techniques. In these detectors, the secondary particles ionize the Argon, which is kept in a strong electric field so that the ions do not recombine. This field causes the electrons to 'drift' to one side of the detector so they can be read out by two or three planes of parallel wires. Since the drift velocity is constant, the time of the arrival of the electrons at the wire corresponds to the distance from the wire. The location of the wire provides a second measurement of the drift electron position. Ploting wire position versus time for collected charge produces a 2D projected image of the neutrino interaction. 3D can be inferred by correlating images from the two or three planes which are at angles wrt each other. \n", 56 | "\n", 57 | "The extremely high resolution images produced by LArTPC detectors can be easily identified by the human eye. But, despite a great deal of effort by the LArTPC community, automatic reconstruction of LArTPC has proven to be difficult and a significant barrier to deriving physics results. In many cases, experiments have had to use people to identify events and help algorithms find patterns in a process known as a \"hand scan\". 
Convolutional Neural Networks have been shown to perform better than traditional algorithms in the classification of neutrino events and particles in LArTPC. However these studies have only demonstrated feasibility on typically highly downsampled data. The goal of the tutorial here is to initiate the process of developing CNNs at full detector resolution and demonstrating classification and energy regression with performances that are on par with the expected capability of LArTPC detectors. It is noteworthy that the success of DUNE and many of these projects rely on excellent performance on such tasks which have yet to be demonstrated, primarily due to software. \n", 58 | "\n", 59 | "### LArIAT Detector\n", 60 | "\n", 61 | "Argonut detector was the first LArTPC to see neutrino beams. It has been refurbished, renamed LArIAT, and is currently taking test beam data at Fermilab. This very small detector has two planes of 240 wires each, oriented at 60 degrees from the horizontal. Note that the DUNE experiment will have 1 million wires. Each LArIAT readout window consists of 4096 time sampling of each wire, producing two 240 by 4096 images. \n", 62 | "\n", 63 | "For the tutorial, we have produced 1 million LArIAT test beam events for each of the following: electrons/positrons, positive/negative/neutral pions, positive/negative kaons, muons/antimuons, electron/muon/tau neutrinos/anti-neutrinos charged/neutral current. In addition to particle type, the data labeled by the type of interaction and the particle in the final state. We also store the true 3D locations of the charge deposits. The total data is rought 20 TBs. Most studies have been performed by downsampling and scanning the data in time, producing 240 by 256 sized images.\n", 64 | "\n", 65 | "The [LArIAT Visualization Notebook](LArTPC-Visualization.ipynb) introduces this dataset. [The LArIAT Classification Experiment Notebook](LArTPCDNN-Experiment.ipynb) sets up a Deep Learning classification problem. 
Achieving the design classification and energy regression within detectors would be a significant achievement. Other interesting problems include noise suppression and reconstructing 3D images from the two 2D images." 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 2", 72 | "language": "python", 73 | "name": "python2" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 2 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython2", 85 | "version": "2.7.12" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 2 90 | } 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DSatHEP-Tutorial 2 | -------------------------------------------------------------------------------- /Tutorial Installation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tutorial Installation\n", 8 | "\n", 9 | "This notebook checks out the packages necessary for the tutorial. You will only need to run this notebook once." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "!mkdir Tutorial\n", 19 | "%cd Tutorial\n", 20 | "!git clone https://bitbucket.org/anomalousai/DLKit\n", 21 | "%cd DLKit\n", 22 | "!git clone https://github.com/UTA-HEP-Computing/CaloDNN\n", 23 | "!git clone https://github.com/UTA-HEP-Computing/NEXTDNN\n", 24 | "!git clone https://github.com/UTA-HEP-Computing/LArTPCDNN\n", 25 | "!git clone https://github.com/UTA-HEP-Computing/DSatHEP-Tutorial\n", 26 | "!cp DSatHEP-Tutorial/* ." 
27 | ] 28 | } 29 | ], 30 | "metadata": { 31 | "kernelspec": { 32 | "display_name": "Python 2", 33 | "language": "python", 34 | "name": "python2" 35 | }, 36 | "language_info": { 37 | "codemirror_mode": { 38 | "name": "ipython", 39 | "version": 2 40 | }, 41 | "file_extension": ".py", 42 | "mimetype": "text/x-python", 43 | "name": "python", 44 | "nbconvert_exporter": "python", 45 | "pygments_lexer": "ipython2", 46 | "version": "2.7.12" 47 | } 48 | }, 49 | "nbformat": 4, 50 | "nbformat_minor": 2 51 | } 52 | --------------------------------------------------------------------------------