├── .DS_Store ├── .gitignore ├── Advanced_Topics_In_Machine_Learning ├── Introduction_to_Unsupervised_Learning │ ├── ML4ES_UnsupervisedLearning.ipynb │ ├── SOM_animation.gif │ ├── autoencoder.png │ ├── dendrogram.gif │ ├── elbow_method.png │ ├── hierarchical_gif.gif │ ├── kmeans_bad.gif │ ├── kmeans_good.gif │ └── nonlinear_PCA.png ├── ML_Model_Interpretation │ ├── breiman_permutation.gif │ ├── cnn_architecture.jpg │ ├── evaluation │ │ ├── __init__.py │ │ ├── attributes_diagrams.py │ │ ├── keras_metrics.py │ │ ├── performance_diagrams.py │ │ └── roc_curves.py │ ├── interpretation │ │ ├── __init__.py │ │ ├── backwards_optimization.py │ │ ├── binarization.py │ │ ├── class_activation.py │ │ ├── cnn.py │ │ ├── normalization.py │ │ ├── novelty_detection.py │ │ ├── permutation.py │ │ ├── plotting.py │ │ ├── saliency.py │ │ └── utils.py │ ├── lak_permutation.gif │ ├── model_components.png │ ├── notebook.ipynb │ ├── notebook.py │ ├── pretrained_cnn │ │ ├── pretrained_cnn.h5 │ │ ├── pretrained_cnn_metadata.json │ │ ├── pretrained_ucn.h5 │ │ └── pretrained_ucn_metadata.json │ └── wind_barb_explainer.png └── README.md ├── Introduction_To_Machine_Learning ├── .DS_Store ├── Data_Science_Fundamentals │ ├── .DS_Store │ ├── README.md │ ├── __pycache__ │ │ ├── attributes_diagrams.cpython-36.pyc │ │ ├── performance_diagrams.cpython-36.pyc │ │ ├── roc_curves.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── attributes_diagram.png │ ├── attributes_diagrams.py │ ├── contingency_table.png │ ├── ct_scores.png │ ├── ml_short_course_module_2_data_science.ipynb │ ├── ml_short_course_module_2_data_science.py │ ├── overfitting.png │ ├── performance_diagram.png │ ├── performance_diagrams.py │ ├── roc.png │ ├── roc_curves.py │ └── utils.py ├── Introduction_to_ML_and_AI │ ├── Images │ │ ├── AI-vs-ML-vs-Deep-Learning.png │ │ ├── PCAexample.png │ │ ├── SVD_example.png │ │ ├── ml_comic.png │ │ └── pca.gif │ ├── Introduction.ipynb │ └── README.md ├── README.md └── Supervised_Learning_Algorithims │ ├── BP.png │ ├── FP.png │ ├── Kernel.png │ ├── LC.png │ ├── LR.png │ ├── Models.png │ ├── NN.png │ ├── README.md │ ├── SK.png │ ├── SML.png │ ├── SVM.png │ ├── SupervisedML.png │ ├── Supervised_ML_Lecture_3.ipynb │ ├── Supervised_ML_Lecture_3.py │ ├── attr_diagrams.py │ ├── contingency_table.png │ ├── download_data.py │ ├── extract_data.py │ ├── performance_diagrams.py │ ├── roc_curves.py │ ├── tree_schematic.jpg │ └── utils.py ├── README.md ├── download_data.py ├── setup.py └── util └── extract_data.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled, optimized, and DLL files. 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution and packaging. 7 | build/ 8 | dist/ 9 | *.egg-info/ 10 | 11 | # IntelliJ things. 
12 | .idea/ 13 | *.iml 14 | 15 | # Other files 16 | Advanced_Topics_In_Machine_Learning/.ipynb_checkpoints/ 17 | data/ 18 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/SOM_animation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/SOM_animation.gif -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/autoencoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/autoencoder.png -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/dendrogram.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/dendrogram.gif -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/elbow_method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/elbow_method.png -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/hierarchical_gif.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/hierarchical_gif.gif -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/kmeans_bad.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/kmeans_bad.gif -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/kmeans_good.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/kmeans_good.gif -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/nonlinear_PCA.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/Introduction_to_Unsupervised_Learning/nonlinear_PCA.png -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/breiman_permutation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/breiman_permutation.gif -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/cnn_architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/cnn_architecture.jpg -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/evaluation/attributes_diagrams.py: -------------------------------------------------------------------------------- 1 | """Methods for plotting attributes diagram.""" 2 | 3 | import numpy 4 | from descartes import PolygonPatch 5 | import shapely.geometry 6 | import matplotlib.colors 7 | import matplotlib.pyplot as pyplot 8 | 9 | DEFAULT_NUM_BINS = 20 10 | RELIABILITY_LINE_COLOUR = numpy.array([228, 26, 28], dtype=float) / 255 11 | RELIABILITY_LINE_WIDTH = 3 12 | PERFECT_LINE_COLOUR = numpy.full(3, 152. / 255) 13 | PERFECT_LINE_WIDTH = 2 14 | 15 | NO_SKILL_LINE_COLOUR = numpy.array([31, 120, 180], dtype=float) / 255 16 | NO_SKILL_LINE_WIDTH = 2 17 | SKILL_AREA_TRANSPARENCY = 0.2 18 | CLIMATOLOGY_LINE_COLOUR = numpy.full(3, 152. / 255) 19 | CLIMATOLOGY_LINE_WIDTH = 2 20 | 21 | HISTOGRAM_FACE_COLOUR = numpy.array([228, 26, 28], dtype=float) / 255 22 | HISTOGRAM_EDGE_COLOUR = numpy.full(3, 0.) 23 | HISTOGRAM_EDGE_WIDTH = 2 24 | 25 | HISTOGRAM_LEFT_EDGE_COORD = 0.575 26 | HISTOGRAM_BOTTOM_EDGE_COORD = 0.175 27 | HISTOGRAM_WIDTH = 0.3 28 | HISTOGRAM_HEIGHT = 0.3 29 | 30 | HISTOGRAM_X_TICK_VALUES = numpy.linspace(0, 1, num=6, dtype=float) 31 | HISTOGRAM_Y_TICK_SPACING = 0.1 32 | 33 | FIGURE_WIDTH_INCHES = 15 34 | FIGURE_HEIGHT_INCHES = 15 35 | 36 | FONT_SIZE = 30 37 | pyplot.rc('font', size=FONT_SIZE) 38 | pyplot.rc('axes', titlesize=FONT_SIZE) 39 | pyplot.rc('axes', labelsize=FONT_SIZE) 40 | pyplot.rc('xtick', labelsize=FONT_SIZE) 41 | pyplot.rc('ytick', labelsize=FONT_SIZE) 42 | pyplot.rc('legend', fontsize=FONT_SIZE) 43 | pyplot.rc('figure', titlesize=FONT_SIZE) 44 | 45 | 46 | def _get_histogram(input_values, num_bins, min_value, max_value): 47 | """Creates histogram with uniform bin-spacing. 48 | 49 | E = number of input values 50 | B = number of bins 51 | 52 | :param input_values: length-E numpy array of values to bin. 53 | :param num_bins: Number of bins (B). 54 | :param min_value: Minimum value. Any input value < `min_value` will be 55 | assigned to the first bin. 
56 | :param max_value: Max value. Any input value > `max_value` will be 57 | assigned to the last bin. 58 | :return: inputs_to_bins: length-E numpy array of bin indices (integers). 59 | """ 60 | 61 | bin_cutoffs = numpy.linspace(min_value, max_value, num=num_bins + 1) 62 | inputs_to_bins = numpy.digitize( 63 | input_values, bin_cutoffs, right=False) - 1 64 | 65 | inputs_to_bins[inputs_to_bins < 0] = 0 66 | inputs_to_bins[inputs_to_bins > num_bins - 1] = num_bins - 1 67 | 68 | return inputs_to_bins 69 | 70 | 71 | def _get_points_in_relia_curve( 72 | observed_labels, forecast_probabilities, num_bins): 73 | """Creates points for reliability curve. 74 | 75 | The reliability curve is the main component of the attributes diagram. 76 | 77 | E = number of examples 78 | B = number of bins 79 | 80 | :param observed_labels: length-E numpy array of class labels (integers in 81 | 0...1). 82 | :param forecast_probabilities: length-E numpy array with forecast 83 | probabilities of label = 1. 84 | :param num_bins: Number of bins for forecast probability. 85 | :return: mean_forecast_probs: length-B numpy array of mean forecast 86 | probabilities. 87 | :return: mean_event_frequencies: length-B numpy array of conditional mean 88 | event frequencies. mean_event_frequencies[j] = frequency of label 1 89 | when forecast probability is in the [j]th bin. 90 | :return: num_examples_by_bin: length-B numpy array with number of examples 91 | in each forecast bin. 92 | """ 93 | 94 | assert numpy.all(numpy.logical_or( 95 | observed_labels == 0, observed_labels == 1 96 | )) 97 | 98 | assert numpy.all(numpy.logical_and( 99 | forecast_probabilities >= 0, forecast_probabilities <= 1 100 | )) 101 | 102 | assert num_bins > 1 103 | 104 | inputs_to_bins = _get_histogram( 105 | input_values=forecast_probabilities, num_bins=num_bins, min_value=0., 106 | max_value=1.) 107 | 108 | mean_forecast_probs = numpy.full(num_bins, numpy.nan) 109 | mean_event_frequencies = numpy.full(num_bins, numpy.nan) 110 | num_examples_by_bin = numpy.full(num_bins, -1, dtype=int) 111 | 112 | for k in range(num_bins): 113 | these_example_indices = numpy.where(inputs_to_bins == k)[0] 114 | num_examples_by_bin[k] = len(these_example_indices) 115 | 116 | mean_forecast_probs[k] = numpy.mean( 117 | forecast_probabilities[these_example_indices]) 118 | 119 | mean_event_frequencies[k] = numpy.mean( 120 | observed_labels[these_example_indices].astype(float) 121 | ) 122 | 123 | return mean_forecast_probs, mean_event_frequencies, num_examples_by_bin 124 | 125 | 126 | def _vertices_to_polygon_object(x_vertices, y_vertices): 127 | """Converts two arrays of vertices to `shapely.geometry.Polygon` object. 128 | 129 | V = number of vertices 130 | 131 | This method allows for simple polygons only (no disjoint polygons, no 132 | holes). 133 | 134 | :param x_vertices: length-V numpy array of x-coordinates. 135 | :param y_vertices: length-V numpy array of y-coordinates. 136 | :return: polygon_object: Instance of `shapely.geometry.Polygon`. 137 | """ 138 | 139 | list_of_vertices = [] 140 | for i in range(len(x_vertices)): 141 | list_of_vertices.append((x_vertices[i], y_vertices[i])) 142 | 143 | return shapely.geometry.Polygon(shell=list_of_vertices) 144 | 145 | 146 | def _plot_background(axes_object, observed_labels): 147 | """Plots background of attributes diagram. 148 | 149 | E = number of examples 150 | 151 | :param axes_object: Instance of `matplotlib.axes._subplots.AxesSubplot`. 152 | Will plot on these axes. 
153 | :param observed_labels: length-E numpy array of class labels (integers in 154 | 0...1). 155 | """ 156 | 157 | # Plot positive-skill area. 158 | climatology = numpy.mean(observed_labels.astype(float)) 159 | skill_area_colour = matplotlib.colors.to_rgba( 160 | NO_SKILL_LINE_COLOUR, SKILL_AREA_TRANSPARENCY) 161 | 162 | x_vertices_left = numpy.array([0, climatology, climatology, 0, 0]) 163 | y_vertices_left = numpy.array([0, 0, climatology, climatology / 2, 0]) 164 | 165 | left_polygon_object = _vertices_to_polygon_object( 166 | x_vertices=x_vertices_left, y_vertices=y_vertices_left) 167 | left_polygon_patch = PolygonPatch( 168 | left_polygon_object, lw=0, ec=skill_area_colour, fc=skill_area_colour) 169 | axes_object.add_patch(left_polygon_patch) 170 | 171 | x_vertices_right = numpy.array( 172 | [climatology, 1, 1, climatology, climatology]) 173 | y_vertices_right = numpy.array( 174 | [climatology, (1 + climatology) / 2, 1, 1, climatology]) 175 | 176 | right_polygon_object = _vertices_to_polygon_object( 177 | x_vertices=x_vertices_right, y_vertices=y_vertices_right) 178 | right_polygon_patch = PolygonPatch( 179 | right_polygon_object, lw=0, ec=skill_area_colour, fc=skill_area_colour) 180 | axes_object.add_patch(right_polygon_patch) 181 | 182 | # Plot no-skill line (at edge of positive-skill area). 183 | no_skill_x_coords = numpy.array([0, 1], dtype=float) 184 | no_skill_y_coords = numpy.array([climatology, 1 + climatology]) / 2 185 | axes_object.plot( 186 | no_skill_x_coords, no_skill_y_coords, color=NO_SKILL_LINE_COLOUR, 187 | linestyle='solid', linewidth=NO_SKILL_LINE_WIDTH) 188 | 189 | # Plot climatology line (vertical). 190 | climo_line_x_coords = numpy.full(2, climatology) 191 | climo_line_y_coords = numpy.array([0, 1], dtype=float) 192 | axes_object.plot( 193 | climo_line_x_coords, climo_line_y_coords, color=CLIMATOLOGY_LINE_COLOUR, 194 | linestyle='dashed', linewidth=CLIMATOLOGY_LINE_WIDTH) 195 | 196 | # Plot no-resolution line (horizontal). 197 | no_resolution_x_coords = climo_line_y_coords + 0. 198 | no_resolution_y_coords = climo_line_x_coords + 0. 199 | axes_object.plot( 200 | no_resolution_x_coords, no_resolution_y_coords, 201 | color=CLIMATOLOGY_LINE_COLOUR, linestyle='dashed', 202 | linewidth=CLIMATOLOGY_LINE_WIDTH) 203 | 204 | 205 | def _floor_to_nearest(input_value_or_array, increment): 206 | """Rounds number(s) down to the nearest multiple of `increment`. 207 | 208 | :param input_value_or_array: Input (either scalar or numpy array). 209 | :param increment: Increment (or rounding base -- whatever you want to call 210 | it). 211 | :return: output_value_or_array: Rounded version of `input_value_or_array`. 212 | """ 213 | 214 | return increment * numpy.floor(input_value_or_array / increment) 215 | 216 | 217 | def _plot_forecast_histogram(figure_object, num_examples_by_bin): 218 | """Plots forecast histogram as inset in the attributes diagram. 219 | 220 | B = number of bins 221 | 222 | :param figure_object: Instance of `matplotlib.figure.Figure`. Will plot in 223 | this figure. 224 | :param num_examples_by_bin: length-B numpy array, where 225 | num_examples_by_bin[j] = number of examples in [j]th forecast bin. 
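Example (illustrative, with made-up counts): num_examples_by_bin = numpy.array([60, 25, 10, 4, 1]) yields bin frequencies [0.60, 0.25, 0.10, 0.04, 0.01], which are drawn as bars in the inset.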
226 | """ 227 | 228 | num_bins = len(num_examples_by_bin) 229 | bin_frequencies = ( 230 | num_examples_by_bin.astype(float) / numpy.sum(num_examples_by_bin) 231 | ) 232 | 233 | forecast_bin_edges = numpy.linspace(0, 1, num=num_bins + 1, dtype=float) 234 | forecast_bin_width = forecast_bin_edges[1] - forecast_bin_edges[0] 235 | forecast_bin_centers = forecast_bin_edges[:-1] + forecast_bin_width / 2 236 | 237 | inset_axes_object = figure_object.add_axes( 238 | [HISTOGRAM_LEFT_EDGE_COORD, HISTOGRAM_BOTTOM_EDGE_COORD, 239 | HISTOGRAM_WIDTH, HISTOGRAM_HEIGHT] 240 | ) 241 | 242 | inset_axes_object.bar( 243 | forecast_bin_centers, bin_frequencies, forecast_bin_width, 244 | color=HISTOGRAM_FACE_COLOUR, edgecolor=HISTOGRAM_EDGE_COLOUR, 245 | linewidth=HISTOGRAM_EDGE_WIDTH) 246 | 247 | max_y_tick_value = _floor_to_nearest( 248 | 1.05 * numpy.max(bin_frequencies), HISTOGRAM_Y_TICK_SPACING) 249 | num_y_ticks = 1 + int(numpy.round( 250 | max_y_tick_value / HISTOGRAM_Y_TICK_SPACING 251 | )) 252 | 253 | y_tick_values = numpy.linspace(0, max_y_tick_value, num=num_y_ticks) 254 | pyplot.yticks(y_tick_values, axes=inset_axes_object) 255 | pyplot.xticks(HISTOGRAM_X_TICK_VALUES, axes=inset_axes_object) 256 | 257 | inset_axes_object.set_xlim(0, 1) 258 | inset_axes_object.set_ylim(0, 1.05 * numpy.max(bin_frequencies)) 259 | 260 | 261 | def plot_reliability_curve( 262 | observed_labels, forecast_probabilities, num_bins=DEFAULT_NUM_BINS, 263 | axes_object=None): 264 | """Plots reliability curve. 265 | 266 | E = number of examples 267 | 268 | :param observed_labels: length-E numpy array of class labels (integers in 269 | 0...1). 270 | :param forecast_probabilities: length-E numpy array with forecast 271 | probabilities of label = 1. 272 | :param num_bins: Number of bins for forecast probability. 273 | :param axes_object: Instance of `matplotlib.axes._subplots.AxesSubplot`. 274 | Will plot on these axes. 275 | :return: mean_forecast_probs: See doc for `_get_points_in_relia_curve`. 276 | :return: mean_event_frequencies: Same. 277 | :return: num_examples_by_bin: Same. 278 | """ 279 | 280 | mean_forecast_probs, mean_event_frequencies, num_examples_by_bin = ( 281 | _get_points_in_relia_curve( 282 | observed_labels=observed_labels, 283 | forecast_probabilities=forecast_probabilities, num_bins=num_bins) 284 | ) 285 | 286 | if axes_object is None: 287 | _, axes_object = pyplot.subplots( 288 | 1, 1, figsize=(FIGURE_WIDTH_INCHES, FIGURE_HEIGHT_INCHES) 289 | ) 290 | 291 | perfect_x_coords = numpy.array([0, 1], dtype=float) 292 | perfect_y_coords = perfect_x_coords + 0. 293 | axes_object.plot( 294 | perfect_x_coords, perfect_y_coords, color=PERFECT_LINE_COLOUR, 295 | linestyle='dashed', linewidth=PERFECT_LINE_WIDTH) 296 | 297 | real_indices = numpy.where(numpy.invert(numpy.logical_or( 298 | numpy.isnan(mean_forecast_probs), numpy.isnan(mean_event_frequencies) 299 | )))[0] 300 | 301 | axes_object.plot( 302 | mean_forecast_probs[real_indices], mean_event_frequencies[real_indices], 303 | color=RELIABILITY_LINE_COLOUR, 304 | linestyle='solid', linewidth=RELIABILITY_LINE_WIDTH) 305 | 306 | axes_object.set_xlabel('Forecast probability') 307 | axes_object.set_ylabel('Conditional event frequency') 308 | axes_object.set_xlim(0., 1.) 309 | axes_object.set_ylim(0., 1.) 310 | 311 | return mean_forecast_probs, mean_event_frequencies, num_examples_by_bin 312 | 313 | 314 | def plot_attributes_diagram( 315 | observed_labels, forecast_probabilities, num_bins=DEFAULT_NUM_BINS): 316 | """Plots attributes diagram. 
317 | 318 | :param observed_labels: See doc for `plot_reliability_curve`. 319 | :param forecast_probabilities: Same. 320 | :param num_bins: Same. 321 | :return: mean_forecast_probs: See doc for `_get_points_in_relia_curve`. 322 | :return: mean_event_frequencies: Same. 323 | :return: num_examples_by_bin: Same. 324 | """ 325 | 326 | mean_forecast_probs, mean_event_frequencies, num_examples_by_bin = ( 327 | _get_points_in_relia_curve( 328 | observed_labels=observed_labels, 329 | forecast_probabilities=forecast_probabilities, num_bins=num_bins) 330 | ) 331 | 332 | figure_object, axes_object = pyplot.subplots( 333 | 1, 1, figsize=(FIGURE_WIDTH_INCHES, FIGURE_HEIGHT_INCHES) 334 | ) 335 | 336 | _plot_background(axes_object=axes_object, observed_labels=observed_labels) 337 | _plot_forecast_histogram(figure_object=figure_object, 338 | num_examples_by_bin=num_examples_by_bin) 339 | 340 | plot_reliability_curve( 341 | observed_labels=observed_labels, 342 | forecast_probabilities=forecast_probabilities, num_bins=num_bins, 343 | axes_object=axes_object) 344 | 345 | return mean_forecast_probs, mean_event_frequencies, num_examples_by_bin 346 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/evaluation/keras_metrics.py: -------------------------------------------------------------------------------- 1 | """Performance metrics used to monitor Keras model while training. 2 | 3 | WARNING: these metrics have the following properties, which some users may find 4 | undesirable. 5 | 6 | [1] Used only for monitoring, not to serve as loss functions. 7 | [2] Binary metrics treat the highest class as the positive class, all others as 8 | the negative class. In other words, binary metrics are for "highest class 9 | vs. all". 10 | [3] Metrics are usually based on a contingency table, which contains 11 | deterministic forecasts. However, metrics in this module are based only on 12 | probabilistic forecasts (it would take too long to compute metrics at 13 | various probability thresholds during training). 14 | 15 | --- NOTATION --- 16 | 17 | Throughout this module, I will use the following letters to denote elements of 18 | the contingency table (even though, as mentioned above, there are no actual 19 | contingency tables). 20 | 21 | a = number of true positives ("hits") 22 | b = number of false positives ("false alarms") 23 | c = number of false negatives ("misses") 24 | d = number of true negatives ("correct nulls") 25 | 26 | E = number of examples 27 | K = number of classes (possible values of target variable) 28 | 29 | --- FORMAT 1: BINARY CLASSIFICATION --- 30 | 31 | target_tensor: length-E tensor of target values (observed classes). If 32 | target_tensor[i] = k, the [i]th example belongs to the [k]th class. 33 | 34 | forecast_probability_tensor: length-E tensor of forecast probabilities. 35 | forecast_probability_tensor[i] = forecast probability that the [i]th example 36 | belongs to class 1 (as opposed to 0). 37 | 38 | --- FORMAT 2: NON-BINARY CLASSIFICATION --- 39 | 40 | target_tensor: E-by-K tensor of target values (observed classes). If 41 | target_tensor[i, k] = 1, the [i]th example belongs to the [k]th class. 42 | 43 | forecast_probability_tensor: E-by-K tensor of forecast probabilities. 44 | forecast_probability_tensor[i, k] = forecast probability that the [i]th 45 | example belongs to the [k]th class. 
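Example of the two formats (illustrative, with E = 3 examples and K = 2 classes): in format 1, target_tensor = [0, 1, 1] and forecast_probability_tensor = [0.2, 0.9, 0.6]; in format 2, the same examples become target_tensor = [[1, 0], [0, 1], [0, 1]] and forecast_probability_tensor = [[0.8, 0.2], [0.1, 0.9], [0.4, 0.6]].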
46 | """ 47 | 48 | import keras.backend as K 49 | 50 | 51 | def _get_num_tensor_dimensions(input_tensor): 52 | """Returns number of dimensions in tensor. 53 | 54 | :param input_tensor: Keras tensor. 55 | :return: num_dimensions: Number of dimensions. 56 | """ 57 | 58 | return len(input_tensor.get_shape().as_list()) 59 | 60 | 61 | def _get_num_true_positives(target_tensor, forecast_probability_tensor): 62 | """Returns number of true positives ("a" in the docstring). 63 | 64 | :param target_tensor: See docstring for the 2 possible formats. 65 | :param forecast_probability_tensor: Same. 66 | :return: num_true_positives: Number of true positives. 67 | """ 68 | 69 | num_dimensions = _get_num_tensor_dimensions(target_tensor) 70 | if num_dimensions == 1: 71 | return K.sum(K.clip( 72 | target_tensor * forecast_probability_tensor, 0., 1.)) 73 | 74 | if num_dimensions == 2: 75 | return K.sum(K.clip( 76 | target_tensor[..., -1] * forecast_probability_tensor[..., -1], 77 | 0., 1.)) 78 | 79 | return None 80 | 81 | 82 | def _get_num_false_positives(target_tensor, forecast_probability_tensor): 83 | """Returns number of false positives ("b" in the docstring). 84 | 85 | :param target_tensor: See docstring for the 2 possible formats. 86 | :param forecast_probability_tensor: Same. 87 | :return: num_false_positives: Number of false positives. 88 | """ 89 | 90 | num_dimensions = _get_num_tensor_dimensions(target_tensor) 91 | if num_dimensions == 1: 92 | return K.sum(K.clip( 93 | (1. - target_tensor) * forecast_probability_tensor, 0., 1.)) 94 | 95 | if num_dimensions == 2: 96 | return K.sum(K.clip( 97 | (1. - target_tensor[..., -1]) * 98 | forecast_probability_tensor[..., -1], 99 | 0., 1.)) 100 | 101 | return None 102 | 103 | 104 | def _get_num_false_negatives(target_tensor, forecast_probability_tensor): 105 | """Returns number of false negatives ("c" in the docstring). 106 | 107 | :param target_tensor: See docstring for the 2 possible formats. 108 | :param forecast_probability_tensor: Same. 109 | :return: num_false_negatives: Number of false negatives. 110 | """ 111 | 112 | num_dimensions = _get_num_tensor_dimensions(target_tensor) 113 | if num_dimensions == 1: 114 | return K.sum(K.clip( 115 | target_tensor * (1. - forecast_probability_tensor), 0., 1.)) 116 | 117 | if num_dimensions == 2: 118 | return K.sum(K.clip( 119 | target_tensor[..., -1] * 120 | (1. - forecast_probability_tensor[..., -1]), 121 | 0., 1.)) 122 | 123 | return None 124 | 125 | 126 | def _get_num_true_negatives(target_tensor, forecast_probability_tensor): 127 | """Returns number of false negatives ("d" in the docstring). 128 | 129 | :param target_tensor: See docstring for the 2 possible formats. 130 | :param forecast_probability_tensor: Same. 131 | :return: num_true_negatives: Number of true negatives. 132 | """ 133 | 134 | num_dimensions = _get_num_tensor_dimensions(target_tensor) 135 | if num_dimensions == 1: 136 | return K.sum(K.clip( 137 | (1. - target_tensor) * (1. - forecast_probability_tensor), 0., 1.)) 138 | 139 | if num_dimensions == 2: 140 | return K.sum(K.clip( 141 | (1. - target_tensor[..., -1]) * 142 | (1. - forecast_probability_tensor[..., -1]), 143 | 0., 1.)) 144 | 145 | return None 146 | 147 | 148 | def accuracy(target_tensor, forecast_probability_tensor): 149 | """Returns accuracy. 150 | 151 | :param target_tensor: See docstring for the 2 possible formats. 152 | :param forecast_probability_tensor: Same. 153 | :return: accuracy: Accuracy. 
154 | """ 155 | 156 | return K.mean(K.clip(target_tensor * forecast_probability_tensor, 0., 1.)) 157 | 158 | 159 | def binary_accuracy(target_tensor, forecast_probability_tensor): 160 | """Returns binary accuracy ([a + d] / [a + b + c + d]). 161 | 162 | :param target_tensor: See docstring for the 2 possible formats. 163 | :param forecast_probability_tensor: Same. 164 | :return: binary_accuracy: Binary accuracy. 165 | """ 166 | 167 | a = _get_num_true_positives(target_tensor, forecast_probability_tensor) 168 | b = _get_num_false_positives(target_tensor, forecast_probability_tensor) 169 | c = _get_num_false_negatives(target_tensor, forecast_probability_tensor) 170 | d = _get_num_true_negatives(target_tensor, forecast_probability_tensor) 171 | 172 | return (a + d) / (a + b + c + d + K.epsilon()) 173 | 174 | 175 | def binary_csi(target_tensor, forecast_probability_tensor): 176 | """Returns binary critical success index (a / [a + b + c]). 177 | 178 | :param target_tensor: See docstring for the 2 possible formats. 179 | :param forecast_probability_tensor: Same. 180 | :return: binary_csi: Binary CSI. 181 | """ 182 | 183 | a = _get_num_true_positives(target_tensor, forecast_probability_tensor) 184 | b = _get_num_false_positives(target_tensor, forecast_probability_tensor) 185 | c = _get_num_false_negatives(target_tensor, forecast_probability_tensor) 186 | 187 | return a / (a + b + c + K.epsilon()) 188 | 189 | 190 | def binary_frequency_bias(target_tensor, forecast_probability_tensor): 191 | """Returns binary frequency bias ([a + b] / [a + c]). 192 | 193 | :param target_tensor: See docstring for the 2 possible formats. 194 | :param forecast_probability_tensor: Same. 195 | :return: binary_frequency_bias: Binary frequency bias. 196 | """ 197 | 198 | a = _get_num_true_positives(target_tensor, forecast_probability_tensor) 199 | b = _get_num_false_positives(target_tensor, forecast_probability_tensor) 200 | c = _get_num_false_negatives(target_tensor, forecast_probability_tensor) 201 | 202 | return (a + b) / (a + c + K.epsilon()) 203 | 204 | 205 | def binary_pod(target_tensor, forecast_probability_tensor): 206 | """Returns binary probability of detection (a / [a + c]). 207 | 208 | :param target_tensor: See docstring for the 2 possible formats. 209 | :param forecast_probability_tensor: Same. 210 | :return: binary_pod: Binary POD. 211 | """ 212 | 213 | a = _get_num_true_positives(target_tensor, forecast_probability_tensor) 214 | c = _get_num_false_negatives(target_tensor, forecast_probability_tensor) 215 | 216 | return a / (a + c + K.epsilon()) 217 | 218 | 219 | def binary_fom(target_tensor, forecast_probability_tensor): 220 | """Returns binary frequency of misses (c / [a + c]). 221 | 222 | :param target_tensor: See docstring for the 2 possible formats. 223 | :param forecast_probability_tensor: Same. 224 | :return: binary_fom: Binary FOM. 225 | """ 226 | 227 | return 1. - binary_pod(target_tensor, forecast_probability_tensor) 228 | 229 | 230 | def binary_pofd(target_tensor, forecast_probability_tensor): 231 | """Returns binary probability of false detection (b / [b + d]). 232 | 233 | :param target_tensor: See docstring for the 2 possible formats. 234 | :param forecast_probability_tensor: Same. 235 | :return: binary_pofd: Binary POFD. 
236 | """ 237 | 238 | b = _get_num_false_positives(target_tensor, forecast_probability_tensor) 239 | d = _get_num_true_negatives(target_tensor, forecast_probability_tensor) 240 | 241 | return b / (b + d + K.epsilon()) 242 | 243 | 244 | def binary_peirce_score(target_tensor, forecast_probability_tensor): 245 | """Returns binary Peirce score. 246 | 247 | :param target_tensor: See docstring for the 2 possible formats. 248 | :param forecast_probability_tensor: Same. 249 | :return: binary_peirce_score: Binary Peirce score. 250 | """ 251 | 252 | return binary_pod(target_tensor, forecast_probability_tensor) - binary_pofd( 253 | target_tensor, forecast_probability_tensor) 254 | 255 | 256 | def binary_npv(target_tensor, forecast_probability_tensor): 257 | """Returns binary negative predictive value (d / [b + d]). 258 | 259 | :param target_tensor: See docstring for the 2 possible formats. 260 | :param forecast_probability_tensor: Same. 261 | :return: binary_npv: Binary NPV. 262 | """ 263 | 264 | return 1. - binary_pofd(target_tensor, forecast_probability_tensor) 265 | 266 | 267 | def binary_success_ratio(target_tensor, forecast_probability_tensor): 268 | """Returns binary success ratio (a / [a + b]). 269 | 270 | :param target_tensor: See docstring for the 2 possible formats. 271 | :param forecast_probability_tensor: Same. 272 | :return: binary_success_ratio: Binary success ratio. 273 | """ 274 | 275 | a = _get_num_true_positives(target_tensor, forecast_probability_tensor) 276 | b = _get_num_false_positives(target_tensor, forecast_probability_tensor) 277 | 278 | return a / (a + b + K.epsilon()) 279 | 280 | 281 | def binary_far(target_tensor, forecast_probability_tensor): 282 | """Returns binary false-alarm rate (b / [a + b]). 283 | 284 | :param target_tensor: See docstring for the 2 possible formats. 285 | :param forecast_probability_tensor: Same. 286 | :return: binary_far: Binary false-alarm rate. 287 | """ 288 | 289 | return 1. - binary_success_ratio(target_tensor, forecast_probability_tensor) 290 | 291 | 292 | def binary_dfr(target_tensor, forecast_probability_tensor): 293 | """Returns binary detection-failure ratio (c / [c + d]). 294 | 295 | :param target_tensor: See docstring for the 2 possible formats. 296 | :param forecast_probability_tensor: Same. 297 | :return: binary_dfr: Binary DFR. 298 | """ 299 | 300 | c = _get_num_false_negatives(target_tensor, forecast_probability_tensor) 301 | d = _get_num_true_negatives(target_tensor, forecast_probability_tensor) 302 | 303 | return c / (c + d + K.epsilon()) 304 | 305 | 306 | def binary_focn(target_tensor, forecast_probability_tensor): 307 | """Returns binary frequency of correct nulls (d / [c + d]). 308 | 309 | :param target_tensor: See docstring for the 2 possible formats. 310 | :param forecast_probability_tensor: Same. 311 | :return: binary_focn: Binary FOCN. 312 | """ 313 | 314 | return 1. - binary_dfr(target_tensor, forecast_probability_tensor) 315 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/evaluation/performance_diagrams.py: -------------------------------------------------------------------------------- 1 | """Methods for plotting performance diagram.""" 2 | 3 | import numpy 4 | import matplotlib.colors 5 | import matplotlib.pyplot as pyplot 6 | 7 | DEFAULT_LINE_COLOUR = numpy.array([228, 26, 28], dtype=float) / 255 8 | DEFAULT_LINE_WIDTH = 3 9 | DEFAULT_BIAS_LINE_COLOUR = numpy.full(3, 152. 
/ 255) 10 | DEFAULT_BIAS_LINE_WIDTH = 2 11 | 12 | LEVELS_FOR_CSI_CONTOURS = numpy.linspace(0, 1, num=11, dtype=float) 13 | LEVELS_FOR_BIAS_CONTOURS = numpy.array( 14 | [0.25, 0.5, 0.75, 1., 1.5, 2., 3., 5.]) 15 | 16 | BIAS_STRING_FORMAT = '%.2f' 17 | BIAS_LABEL_PADDING_PX = 10 18 | 19 | FIGURE_WIDTH_INCHES = 15 20 | FIGURE_HEIGHT_INCHES = 15 21 | 22 | FONT_SIZE = 30 23 | pyplot.rc('font', size=FONT_SIZE) 24 | pyplot.rc('axes', titlesize=FONT_SIZE) 25 | pyplot.rc('axes', labelsize=FONT_SIZE) 26 | pyplot.rc('xtick', labelsize=FONT_SIZE) 27 | pyplot.rc('ytick', labelsize=FONT_SIZE) 28 | pyplot.rc('legend', fontsize=FONT_SIZE) 29 | pyplot.rc('figure', titlesize=FONT_SIZE) 30 | 31 | 32 | def _get_sr_pod_grid(success_ratio_spacing=0.01, pod_spacing=0.01): 33 | """Creates grid in SR-POD (success ratio / probability of detection) space. 34 | 35 | M = number of rows (unique POD values) in grid 36 | N = number of columns (unique success ratios) in grid 37 | 38 | :param success_ratio_spacing: Spacing between grid cells in adjacent 39 | columns. 40 | :param pod_spacing: Spacing between grid cells in adjacent rows. 41 | :return: success_ratio_matrix: M-by-N numpy array of success ratios. 42 | Success ratio increases with column index. 43 | :return: pod_matrix: M-by-N numpy array of POD values. POD decreases with 44 | row index. 45 | """ 46 | 47 | num_success_ratios = 1 + int(numpy.ceil(1. / success_ratio_spacing)) 48 | num_pod_values = 1 + int(numpy.ceil(1. / pod_spacing)) 49 | 50 | unique_success_ratios = numpy.linspace(0., 1., num=num_success_ratios) 51 | unique_pod_values = numpy.linspace(0., 1., num=num_pod_values)[::-1] 52 | return numpy.meshgrid(unique_success_ratios, unique_pod_values) 53 | 54 | 55 | def _csi_from_sr_and_pod(success_ratio_array, pod_array): 56 | """Computes CSI (critical success index) from success ratio and POD. 57 | 58 | POD = probability of detection 59 | 60 | :param success_ratio_array: numpy array (any shape) of success ratios. 61 | :param pod_array: numpy array (same shape) of POD values. 62 | :return: csi_array: numpy array (same shape) of CSI values. 63 | """ 64 | 65 | return (success_ratio_array ** -1 + pod_array ** -1 - 1.) ** -1 66 | 67 | 68 | def _bias_from_sr_and_pod(success_ratio_array, pod_array): 69 | """Computes frequency bias from success ratio and POD. 70 | 71 | POD = probability of detection 72 | 73 | :param success_ratio_array: numpy array (any shape) of success ratios. 74 | :param pod_array: numpy array (same shape) of POD values. 75 | :return: frequency_bias_array: numpy array (same shape) of frequency biases. 76 | """ 77 | 78 | return pod_array / success_ratio_array 79 | 80 | 81 | def _get_csi_colour_scheme(): 82 | """Returns colour scheme for CSI (critical success index). 83 | 84 | :return: colour_map_object: Colour scheme (instance of 85 | `matplotlib.colors.ListedColormap`). 86 | :return: colour_norm_object: Instance of `matplotlib.colors.BoundaryNorm`, 87 | defining the scale of the colour map. 
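The returned pair is meant to be passed as the `cmap` and `norm` arguments of `pyplot.contourf`, as done in `plot_performance_diagram`.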
88 | """ 89 | 90 | this_colour_map_object = pyplot.cm.Blues 91 | this_colour_norm_object = matplotlib.colors.BoundaryNorm( 92 | LEVELS_FOR_CSI_CONTOURS, this_colour_map_object.N) 93 | 94 | rgba_matrix = this_colour_map_object(this_colour_norm_object( 95 | LEVELS_FOR_CSI_CONTOURS)) 96 | colour_list = [ 97 | rgba_matrix[i, ..., :-1] for i in range(rgba_matrix.shape[0]) 98 | ] 99 | 100 | colour_map_object = matplotlib.colors.ListedColormap(colour_list) 101 | colour_map_object.set_under(numpy.array([1, 1, 1])) 102 | colour_norm_object = matplotlib.colors.BoundaryNorm( 103 | LEVELS_FOR_CSI_CONTOURS, colour_map_object.N) 104 | 105 | return colour_map_object, colour_norm_object 106 | 107 | 108 | def _add_colour_bar( 109 | axes_object, colour_map_object, values_to_colour, min_colour_value, 110 | max_colour_value, colour_norm_object=None, 111 | orientation_string='vertical', extend_min=True, extend_max=True, 112 | fraction_of_axis_length=1., font_size=FONT_SIZE): 113 | """Adds colour bar to existing axes. 114 | 115 | :param axes_object: Existing axes (instance of 116 | `matplotlib.axes._subplots.AxesSubplot`). 117 | :param colour_map_object: Colour scheme (instance of 118 | `matplotlib.pyplot.cm`). 119 | :param values_to_colour: numpy array of values to colour. 120 | :param min_colour_value: Minimum value in colour map. 121 | :param max_colour_value: Max value in colour map. 122 | :param colour_norm_object: Instance of `matplotlib.colors.BoundaryNorm`, 123 | defining the scale of the colour map. If `colour_norm_object is None`, 124 | will assume that scale is linear. 125 | :param orientation_string: Orientation of colour bar ("vertical" or 126 | "horizontal"). 127 | :param extend_min: Boolean flag. If True, the bottom of the colour bar will 128 | have an arrow. If False, it will be a flat line, suggesting that lower 129 | values are not possible. 130 | :param extend_max: Same but for top of colour bar. 131 | :param fraction_of_axis_length: Fraction of axis length (y-axis if 132 | orientation is "vertical", x-axis if orientation is "horizontal") 133 | occupied by colour bar. 134 | :param font_size: Font size for labels on colour bar. 135 | :return: colour_bar_object: Colour bar (instance of 136 | `matplotlib.pyplot.colorbar`) created by this method. 137 | """ 138 | 139 | if colour_norm_object is None: 140 | colour_norm_object = matplotlib.colors.Normalize( 141 | vmin=min_colour_value, vmax=max_colour_value, clip=False) 142 | 143 | scalar_mappable_object = pyplot.cm.ScalarMappable( 144 | cmap=colour_map_object, norm=colour_norm_object) 145 | scalar_mappable_object.set_array(values_to_colour) 146 | 147 | if extend_min and extend_max: 148 | extend_string = 'both' 149 | elif extend_min: 150 | extend_string = 'min' 151 | elif extend_max: 152 | extend_string = 'max' 153 | else: 154 | extend_string = 'neither' 155 | 156 | if orientation_string == 'horizontal': 157 | padding = 0.075 158 | else: 159 | padding = 0.05 160 | 161 | colour_bar_object = pyplot.colorbar( 162 | ax=axes_object, mappable=scalar_mappable_object, 163 | orientation=orientation_string, pad=padding, extend=extend_string, 164 | shrink=fraction_of_axis_length) 165 | 166 | colour_bar_object.ax.tick_params(labelsize=font_size) 167 | return colour_bar_object 168 | 169 | 170 | def _get_points_in_perf_diagram(observed_labels, forecast_probabilities): 171 | """Creates points for performance diagram. 
172 | 173 | E = number of examples 174 | T = number of binarization thresholds 175 | 176 | :param observed_labels: length-E numpy array of class labels (integers in 177 | 0...1). 178 | :param forecast_probabilities: length-E numpy array with forecast 179 | probabilities of label = 1. 180 | :return: pod_by_threshold: length-T numpy array of POD (probability of 181 | detection) values. 182 | :return: success_ratio_by_threshold: length-T numpy array of success ratios. 183 | """ 184 | 185 | assert numpy.all(numpy.logical_or( 186 | observed_labels == 0, observed_labels == 1 187 | )) 188 | 189 | assert numpy.all(numpy.logical_and( 190 | forecast_probabilities >= 0, forecast_probabilities <= 1 191 | )) 192 | 193 | observed_labels = observed_labels.astype(int) 194 | binarization_thresholds = numpy.linspace(0, 1, num=1001, dtype=float) 195 | 196 | num_thresholds = len(binarization_thresholds) 197 | pod_by_threshold = numpy.full(num_thresholds, numpy.nan) 198 | success_ratio_by_threshold = numpy.full(num_thresholds, numpy.nan) 199 | 200 | for k in range(num_thresholds): 201 | these_forecast_labels = ( 202 | forecast_probabilities >= binarization_thresholds[k] 203 | ).astype(int) 204 | 205 | this_num_hits = numpy.sum(numpy.logical_and( 206 | these_forecast_labels == 1, observed_labels == 1 207 | )) 208 | 209 | this_num_false_alarms = numpy.sum(numpy.logical_and( 210 | these_forecast_labels == 1, observed_labels == 0 211 | )) 212 | 213 | this_num_misses = numpy.sum(numpy.logical_and( 214 | these_forecast_labels == 0, observed_labels == 1 215 | )) 216 | 217 | try: 218 | pod_by_threshold[k] = ( 219 | float(this_num_hits) / (this_num_hits + this_num_misses) 220 | ) 221 | except ZeroDivisionError: 222 | pass 223 | 224 | try: 225 | success_ratio_by_threshold[k] = ( 226 | float(this_num_hits) / (this_num_hits + this_num_false_alarms) 227 | ) 228 | except ZeroDivisionError: 229 | pass 230 | 231 | pod_by_threshold = numpy.array([1.] + pod_by_threshold.tolist() + [0.]) 232 | success_ratio_by_threshold = numpy.array( 233 | [0.] + success_ratio_by_threshold.tolist() + [1.] 234 | ) 235 | 236 | return pod_by_threshold, success_ratio_by_threshold 237 | 238 | 239 | def plot_performance_diagram( 240 | observed_labels, forecast_probabilities, 241 | line_colour=DEFAULT_LINE_COLOUR, line_width=DEFAULT_LINE_WIDTH, 242 | bias_line_colour=DEFAULT_BIAS_LINE_COLOUR, 243 | bias_line_width=DEFAULT_BIAS_LINE_WIDTH): 244 | """Plots performance diagram. 245 | 246 | E = number of examples 247 | 248 | :param observed_labels: length-E numpy array of class labels (integers in 249 | 0...1). 250 | :param forecast_probabilities: length-E numpy array with forecast 251 | probabilities of label = 1. 252 | :param line_colour: Colour (in any format accepted by `matplotlib.colors`). 253 | :param line_width: Line width (real positive number). 254 | :param bias_line_colour: Colour of contour lines for frequency bias. 255 | :param bias_line_width: Width of contour lines for frequency bias. 256 | :return: pod_by_threshold: See doc for 257 | `_get_points_in_perf_diagram`. 258 | :return: success_ratio_by_threshold: Same.
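Example (illustrative, with made-up inputs):

    observed_labels = numpy.array([0, 0, 1, 1, 0, 1])
    forecast_probabilities = numpy.array([0.1, 0.3, 0.7, 0.9, 0.2, 0.6])
    pod_by_threshold, success_ratio_by_threshold = plot_performance_diagram(
        observed_labels=observed_labels,
        forecast_probabilities=forecast_probabilities)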
259 | """ 260 | 261 | pod_by_threshold, success_ratio_by_threshold = _get_points_in_perf_diagram( 262 | observed_labels=observed_labels, 263 | forecast_probabilities=forecast_probabilities) 264 | 265 | _, axes_object = pyplot.subplots( 266 | 1, 1, figsize=(FIGURE_WIDTH_INCHES, FIGURE_HEIGHT_INCHES) 267 | ) 268 | 269 | success_ratio_matrix, pod_matrix = _get_sr_pod_grid() 270 | csi_matrix = _csi_from_sr_and_pod(success_ratio_matrix, pod_matrix) 271 | frequency_bias_matrix = _bias_from_sr_and_pod( 272 | success_ratio_matrix, pod_matrix) 273 | 274 | this_colour_map_object, this_colour_norm_object = _get_csi_colour_scheme() 275 | 276 | pyplot.contourf( 277 | success_ratio_matrix, pod_matrix, csi_matrix, LEVELS_FOR_CSI_CONTOURS, 278 | cmap=this_colour_map_object, norm=this_colour_norm_object, vmin=0., 279 | vmax=1., axes=axes_object) 280 | 281 | colour_bar_object = _add_colour_bar( 282 | axes_object=axes_object, colour_map_object=this_colour_map_object, 283 | colour_norm_object=this_colour_norm_object, 284 | values_to_colour=csi_matrix, min_colour_value=0., 285 | max_colour_value=1., orientation_string='vertical', 286 | extend_min=False, extend_max=False) 287 | colour_bar_object.set_label('CSI (critical success index)') 288 | 289 | bias_colour_tuple = () 290 | for _ in range(len(LEVELS_FOR_BIAS_CONTOURS)): 291 | bias_colour_tuple += (bias_line_colour,) 292 | 293 | bias_contour_object = pyplot.contour( 294 | success_ratio_matrix, pod_matrix, frequency_bias_matrix, 295 | LEVELS_FOR_BIAS_CONTOURS, colors=bias_colour_tuple, 296 | linewidths=bias_line_width, linestyles='dashed', axes=axes_object) 297 | pyplot.clabel( 298 | bias_contour_object, inline=True, inline_spacing=BIAS_LABEL_PADDING_PX, 299 | fmt=BIAS_STRING_FORMAT, fontsize=FONT_SIZE) 300 | 301 | nan_flags = numpy.logical_or( 302 | numpy.isnan(success_ratio_by_threshold), numpy.isnan(pod_by_threshold) 303 | ) 304 | 305 | if not numpy.all(nan_flags): 306 | real_indices = numpy.where(numpy.invert(nan_flags))[0] 307 | axes_object.plot( 308 | success_ratio_by_threshold[real_indices], 309 | pod_by_threshold[real_indices], color=line_colour, 310 | linestyle='solid', linewidth=line_width) 311 | 312 | axes_object.set_xlabel('Success ratio (1 - FAR)') 313 | axes_object.set_ylabel('POD (probability of detection)') 314 | axes_object.set_xlim(0., 1.) 315 | axes_object.set_ylim(0., 1.) 316 | 317 | return pod_by_threshold, success_ratio_by_threshold 318 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/evaluation/roc_curves.py: -------------------------------------------------------------------------------- 1 | """Methods for plotting ROC (receiver operating characteristic) curve.""" 2 | 3 | import numpy 4 | import matplotlib.pyplot as pyplot 5 | 6 | DEFAULT_LINE_COLOUR = numpy.array([228, 26, 28], dtype=float) / 255 7 | DEFAULT_LINE_WIDTH = 3 8 | DEFAULT_RANDOM_LINE_COLOUR = numpy.full(3, 152. / 255) 9 | DEFAULT_RANDOM_LINE_WIDTH = 2 10 | 11 | FIGURE_WIDTH_INCHES = 15 12 | FIGURE_HEIGHT_INCHES = 15 13 | 14 | FONT_SIZE = 30 15 | pyplot.rc('font', size=FONT_SIZE) 16 | pyplot.rc('axes', titlesize=FONT_SIZE) 17 | pyplot.rc('axes', labelsize=FONT_SIZE) 18 | pyplot.rc('xtick', labelsize=FONT_SIZE) 19 | pyplot.rc('ytick', labelsize=FONT_SIZE) 20 | pyplot.rc('legend', fontsize=FONT_SIZE) 21 | pyplot.rc('figure', titlesize=FONT_SIZE) 22 | 23 | 24 | def _get_points_in_roc_curve(observed_labels, forecast_probabilities): 25 | """Creates points for ROC curve. 
26 | 27 | E = number of examples 28 | T = number of binarization thresholds 29 | 30 | :param observed_labels: length-E numpy array of class labels (integers in 31 | 0...1). 32 | :param forecast_probabilities: length-E numpy array with forecast 33 | probabilities of label = 1. 34 | :return: pofd_by_threshold: length-T numpy array of POFD (probability of 35 | false detection) values. 36 | :return: pod_by_threshold: length-T numpy array of POD (probability of 37 | detection) values. 38 | """ 39 | 40 | assert numpy.all(numpy.logical_or( 41 | observed_labels == 0, observed_labels == 1 42 | )) 43 | 44 | assert numpy.all(numpy.logical_and( 45 | forecast_probabilities >= 0, forecast_probabilities <= 1 46 | )) 47 | 48 | observed_labels = observed_labels.astype(int) 49 | binarization_thresholds = numpy.linspace(0, 1, num=1001, dtype=float) 50 | 51 | num_thresholds = len(binarization_thresholds) 52 | pofd_by_threshold = numpy.full(num_thresholds, numpy.nan) 53 | pod_by_threshold = numpy.full(num_thresholds, numpy.nan) 54 | 55 | for k in range(num_thresholds): 56 | these_forecast_labels = ( 57 | forecast_probabilities >= binarization_thresholds[k] 58 | ).astype(int) 59 | 60 | this_num_hits = numpy.sum(numpy.logical_and( 61 | these_forecast_labels == 1, observed_labels == 1 62 | )) 63 | 64 | this_num_false_alarms = numpy.sum(numpy.logical_and( 65 | these_forecast_labels == 1, observed_labels == 0 66 | )) 67 | 68 | this_num_misses = numpy.sum(numpy.logical_and( 69 | these_forecast_labels == 0, observed_labels == 1 70 | )) 71 | 72 | this_num_correct_nulls = numpy.sum(numpy.logical_and( 73 | these_forecast_labels == 0, observed_labels == 0 74 | )) 75 | 76 | try: 77 | pofd_by_threshold[k] = ( 78 | float(this_num_false_alarms) / 79 | (this_num_false_alarms + this_num_correct_nulls) 80 | ) 81 | except ZeroDivisionError: 82 | pass 83 | 84 | try: 85 | pod_by_threshold[k] = ( 86 | float(this_num_hits) / (this_num_hits + this_num_misses) 87 | ) 88 | except ZeroDivisionError: 89 | pass 90 | 91 | pod_by_threshold = numpy.array([1.] + pod_by_threshold.tolist() + [0.]) 92 | pofd_by_threshold = numpy.array([1.] + pofd_by_threshold.tolist() + [0.]) 93 | 94 | return pofd_by_threshold, pod_by_threshold 95 | 96 | 97 | def plot_roc_curve( 98 | observed_labels, forecast_probabilities, 99 | line_colour=DEFAULT_LINE_COLOUR, line_width=DEFAULT_LINE_WIDTH, 100 | random_line_colour=DEFAULT_RANDOM_LINE_COLOUR, 101 | random_line_width=DEFAULT_RANDOM_LINE_WIDTH): 102 | """Plots ROC curve. 103 | 104 | E = number of examples 105 | 106 | :param observed_labels: length-E numpy array of class labels (integers in 107 | 0...1). 108 | :param forecast_probabilities: length-E numpy array with forecast 109 | probabilities of label = 1. 110 | :param line_colour: Colour (in any format accepted by `matplotlib.colors`). 111 | :param line_width: Line width (real positive number). 112 | :param random_line_colour: Colour of reference line (ROC curve for random 113 | predictor). 114 | :param random_line_width: Width of reference line (ROC curve for random 115 | predictor). 116 | :return: pofd_by_threshold: See doc for `_get_points_in_roc_curve`. 117 | :return: pod_by_threshold: Same. 
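Example (illustrative, with made-up inputs):

    observed_labels = numpy.array([0, 1, 1, 0, 1])
    forecast_probabilities = numpy.array([0.2, 0.8, 0.6, 0.4, 0.9])
    pofd_by_threshold, pod_by_threshold = plot_roc_curve(
        observed_labels=observed_labels,
        forecast_probabilities=forecast_probabilities)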
118 | """ 119 | 120 | pofd_by_threshold, pod_by_threshold = _get_points_in_roc_curve( 121 | observed_labels=observed_labels, 122 | forecast_probabilities=forecast_probabilities) 123 | 124 | _, axes_object = pyplot.subplots( 125 | 1, 1, figsize=(FIGURE_WIDTH_INCHES, FIGURE_HEIGHT_INCHES) 126 | ) 127 | 128 | random_x_coords = numpy.array([0., 1.]) 129 | random_y_coords = numpy.array([0., 1.]) 130 | axes_object.plot( 131 | random_x_coords, random_y_coords, color=random_line_colour, 132 | linestyle='dashed', linewidth=random_line_width) 133 | 134 | nan_flags = numpy.logical_or( 135 | numpy.isnan(pofd_by_threshold), numpy.isnan(pod_by_threshold) 136 | ) 137 | 138 | if not numpy.all(nan_flags): 139 | real_indices = numpy.where(numpy.invert(nan_flags))[0] 140 | axes_object.plot( 141 | pofd_by_threshold[real_indices], pod_by_threshold[real_indices], 142 | color=line_colour, linestyle='solid', linewidth=line_width) 143 | 144 | axes_object.set_xlabel('POFD (probability of false detection)') 145 | axes_object.set_ylabel('POD (probability of detection)') 146 | axes_object.set_xlim(0., 1.) 147 | axes_object.set_ylim(0., 1.) 148 | 149 | return pofd_by_threshold, pod_by_threshold 150 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/interpretation/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/interpretation/backwards_optimization.py: -------------------------------------------------------------------------------- 1 | """Helper methods for backwards optimization.""" 2 | 3 | import numpy 4 | from keras import backend as K 5 | 6 | DEFAULT_LEARNING_RATE = 0.001 7 | DEFAULT_NUM_ITERATIONS = 1000 8 | DEFAULT_L2_WEIGHT = 1. 9 | 10 | 11 | def _optimize_input_one_example( 12 | model_object, input_matrix, activation_tensor, loss_tensor, 13 | num_iterations, learning_rate, l2_weight): 14 | """Optimizes inputs (predictors) for one example. 15 | 16 | :param model_object: See doc for `optimize_example_for_class`. 17 | :param input_matrix: Same. 18 | :param activation_tensor: Keras tensor defining activation of relevant model 19 | component. 20 | :param loss_tensor: Keras tensor defining loss (difference between actual 21 | and desired activation). 22 | :param num_iterations: See doc for `optimize_example_for_class`. 23 | :param learning_rate: Same. 24 | :param l2_weight: Same. 25 | :return: optimized_input_matrix: Same. 26 | :return: initial_activation: Same. 27 | :return: final_activation: Same. 28 | """ 29 | 30 | if isinstance(model_object.input, list): 31 | input_tensor = model_object.input[0] 32 | else: 33 | input_tensor = model_object.input 34 | 35 | optimized_input_matrix = input_matrix + 0. 36 | 37 | if l2_weight is not None: 38 | difference_tensor = ( 39 | input_tensor[0, ...] - optimized_input_matrix[0, ...] 
40 | ) 41 | 42 | loss_tensor += l2_weight * K.sum(difference_tensor ** 2) 43 | 44 | gradient_tensor = K.gradients(loss_tensor, [input_tensor])[0] 45 | gradient_tensor /= K.maximum( 46 | K.sqrt(K.mean(gradient_tensor ** 2)), 47 | K.epsilon() 48 | ) 49 | 50 | grad_descent_function = K.function( 51 | [input_tensor, K.learning_phase()], 52 | [activation_tensor, loss_tensor, gradient_tensor] 53 | ) 54 | 55 | initial_activation = None 56 | current_loss = None 57 | current_activation = None 58 | 59 | for j in range(num_iterations): 60 | vals = grad_descent_function([optimized_input_matrix, 0]) 61 | current_loss = vals[1] 62 | current_activation = vals[0][0] 63 | current_gradient = vals[2] 64 | 65 | if j == 0: 66 | initial_activation = current_activation 67 | 68 | if numpy.mod(j, 100) == 0: 69 | print(( 70 | 'Loss after {0:d} of {1:d} iterations = {2:.2e} ... ' 71 | 'activation = {3:.2e}' 72 | ).format( 73 | j, num_iterations, current_loss, current_activation 74 | )) 75 | 76 | optimized_input_matrix -= current_gradient * learning_rate 77 | 78 | final_activation = current_activation 79 | 80 | print(( 81 | 'Loss after {0:d} iterations = {1:.2e} ... activation = {2:.2e}' 82 | ).format( 83 | num_iterations, current_loss, final_activation 84 | )) 85 | 86 | return optimized_input_matrix, initial_activation, final_activation 87 | 88 | 89 | def optimize_example_for_class( 90 | model_object, input_matrix, target_class, 91 | num_iterations=DEFAULT_NUM_ITERATIONS, 92 | learning_rate=DEFAULT_LEARNING_RATE, 93 | l2_weight=DEFAULT_L2_WEIGHT): 94 | """Optimizes one example to maximize probability of target class. 95 | 96 | :param model_object: Trained model (instance of `keras.models.Model` or 97 | `keras.models.Sequential`). 98 | :param input_matrix: numpy array with inputs (predictors) for one example. 99 | :param target_class: Target class. Must be an integer in 0...(K - 1), where 100 | K = number of classes. 101 | :param num_iterations: Number of iterations for gradient descent. 102 | :param learning_rate: Learning rate for gradient descent. 103 | :param l2_weight: Strength of L_2 penalty (on difference between original 104 | and optimized input matrices). If you do not want an L_2 penalty, make 105 | this None. 106 | :return: optimized_input_matrix: Same as input matrix but with different 107 | values. 108 | :return: initial_activation: Initial activation of relevant model component 109 | (before any backwards optimization). 110 | :return: final_activation: Final activation (after backwards optimization). 111 | """ 112 | 113 | # Check input args. 114 | target_class = int(numpy.round(target_class)) 115 | num_iterations = int(numpy.round(num_iterations)) 116 | 117 | assert not numpy.any(numpy.isnan(input_matrix)) 118 | assert target_class >= 0 119 | assert num_iterations > 0 120 | assert learning_rate > 0. 
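# A None or non-positive L2 weight disables the penalty:
# _optimize_input_one_example only adds the L2 term when l2_weight is not None.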
121 | if l2_weight is None or l2_weight <= 0: 122 | l2_weight = None 123 | 124 | num_output_neurons = ( 125 | model_object.layers[-1].output.get_shape().as_list()[-1] 126 | ) 127 | 128 | if num_output_neurons == 1: 129 | assert target_class <= 1 130 | 131 | activation_tensor = model_object.layers[-1].output[..., 0] 132 | 133 | if target_class == 1: 134 | loss_tensor = K.mean((activation_tensor - 1) ** 2) 135 | else: 136 | loss_tensor = K.mean(activation_tensor ** 2) 137 | else: 138 | assert target_class < num_output_neurons 139 | 140 | activation_tensor = model_object.layers[-1].output[..., target_class] 141 | loss_tensor = K.mean((activation_tensor - 1) ** 2) 142 | 143 | return _optimize_input_one_example( 144 | model_object=model_object, input_matrix=input_matrix, 145 | activation_tensor=activation_tensor, loss_tensor=loss_tensor, 146 | num_iterations=num_iterations, learning_rate=learning_rate, 147 | l2_weight=l2_weight 148 | ) 149 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/interpretation/binarization.py: -------------------------------------------------------------------------------- 1 | """Helper methods for binarization of target variable.""" 2 | 3 | import numpy 4 | from interpretation import utils 5 | 6 | 7 | def get_binarization_threshold(image_file_names, percentile_level): 8 | """Computes binarization threshold for target variable. 9 | 10 | Binarization threshold will be [q]th percentile of all image maxima, where 11 | q = `percentile_level`. 12 | 13 | :param image_file_names: 1-D list of paths to input files. 14 | :param percentile_level: q in the above discussion. 15 | :return: binarization_threshold: Binarization threshold (used to turn each 16 | target image into a yes-or-no label). 17 | """ 18 | 19 | max_target_values = numpy.array([]) 20 | 21 | for this_file_name in image_file_names: 22 | print('Reading data from: "{0:s}"...'.format(this_file_name)) 23 | this_image_dict = utils.read_image_file(this_file_name) 24 | 25 | this_target_matrix = this_image_dict[utils.TARGET_MATRIX_KEY] 26 | this_num_examples = this_target_matrix.shape[0] 27 | these_max_target_values = numpy.full(this_num_examples, numpy.nan) 28 | 29 | for i in range(this_num_examples): 30 | these_max_target_values[i] = numpy.max(this_target_matrix[i, ...]) 31 | 32 | max_target_values = numpy.concatenate(( 33 | max_target_values, these_max_target_values 34 | )) 35 | 36 | binarization_threshold = numpy.percentile( 37 | max_target_values, percentile_level 38 | ) 39 | 40 | print('\nBinarization threshold for "{0:s}" = {1:.4e}'.format( 41 | utils.TARGET_NAME, binarization_threshold 42 | )) 43 | 44 | return binarization_threshold 45 | 46 | 47 | def binarize_target_images(target_matrix, binarization_threshold): 48 | """Binarizes target images. 49 | 50 | Specifically, this method turns each target image into a binary label, 51 | depending on whether or not (max value in image) >= binarization_threshold. 52 | 53 | E = number of examples (storm objects) in file 54 | M = number of rows in each storm-centered grid 55 | N = number of columns in each storm-centered grid 56 | 57 | :param target_matrix: E-by-M-by-N numpy array of floats. 58 | :param binarization_threshold: Binarization threshold. 59 | :return: target_values: length-E numpy array of target values (integers in 60 | 0...1).
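Example (illustrative): with binarization_threshold = 10., an image whose max value is 12.5 gets target value 1 and an image whose max value is 7.3 gets target value 0.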
61 | """ 62 | 63 | num_examples = target_matrix.shape[0] 64 | target_values = numpy.full(num_examples, -1, dtype=int) 65 | 66 | for i in range(num_examples): 67 | target_values[i] = ( 68 | numpy.max(target_matrix[i, ...]) >= binarization_threshold 69 | ) 70 | 71 | return target_values 72 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/interpretation/class_activation.py: -------------------------------------------------------------------------------- 1 | """Helper methods for class-activation maps.""" 2 | 3 | import numpy 4 | from keras import backend as K 5 | import tensorflow 6 | from scipy.interpolate import ( 7 | UnivariateSpline, RectBivariateSpline, RegularGridInterpolator 8 | ) 9 | from interpretation import utils 10 | from interpretation.saliency import _get_grid_points 11 | 12 | DEFAULT_LINE_WIDTH = 2. 13 | 14 | 15 | def _compute_gradients(loss_tensor, list_of_input_tensors): 16 | """Computes gradient of each input tensor with respect to loss tensor. 17 | 18 | T = number of tensors 19 | 20 | :param loss_tensor: Loss tensor. 21 | :param list_of_input_tensors: length-T list of input tensors. 22 | :return: list_of_gradient_tensors: length-T list of gradient tensors. 23 | """ 24 | 25 | list_of_gradient_tensors = tensorflow.gradients( 26 | loss_tensor, list_of_input_tensors 27 | ) 28 | 29 | for i in range(len(list_of_gradient_tensors)): 30 | if list_of_gradient_tensors[i] is not None: 31 | continue 32 | 33 | list_of_gradient_tensors[i] = tensorflow.zeros_like( 34 | list_of_input_tensors[i] 35 | ) 36 | 37 | return list_of_gradient_tensors 38 | 39 | 40 | def _normalize_tensor(input_tensor): 41 | """Normalizes tensor to Euclidean magnitude (or "L_2 norm") of 1.0. 42 | 43 | :param input_tensor: Input tensor. 44 | :return: output_tensor: Same as input but with Euclidean magnitude of 1.0. 45 | """ 46 | 47 | rms_tensor = K.sqrt(K.mean(K.square(input_tensor))) 48 | return input_tensor / (rms_tensor + K.epsilon()) 49 | 50 | 51 | def _upsample_cam(class_activation_matrix, new_dimensions): 52 | """Upsamples class-activation map (CAM). 53 | 54 | The CAM may be 1-, 2-, or 3-dimensional. 55 | 56 | :param class_activation_matrix: numpy array of class activations. 57 | :param new_dimensions: numpy array of new dimensions. If 58 | `class_activation_matrix` is N-dimensional, this array must be length-N. 59 | :return: class_activation_matrix: Upsampled version of input. 
60 | """ 61 | 62 | num_rows_new = new_dimensions[0] 63 | row_indices_new = numpy.linspace( 64 | 1, num_rows_new, num=num_rows_new, dtype=float 65 | ) 66 | row_indices_orig = numpy.linspace( 67 | 1, num_rows_new, num=class_activation_matrix.shape[0], dtype=float 68 | ) 69 | 70 | if len(new_dimensions) == 1: 71 | interp_object = UnivariateSpline( 72 | x=row_indices_orig, y=numpy.ravel(class_activation_matrix), 73 | k=3, s=0 74 | ) 75 | 76 | return interp_object(row_indices_new) 77 | 78 | num_columns_new = new_dimensions[1] 79 | column_indices_new = numpy.linspace( 80 | 1, num_columns_new, num=num_columns_new, dtype=float 81 | ) 82 | column_indices_orig = numpy.linspace( 83 | 1, num_columns_new, num=class_activation_matrix.shape[1], dtype=float 84 | ) 85 | 86 | if len(new_dimensions) == 2: 87 | interp_object = RectBivariateSpline( 88 | x=row_indices_orig, y=column_indices_orig, 89 | z=class_activation_matrix, kx=3, ky=3, s=0 90 | ) 91 | 92 | return interp_object(x=row_indices_new, y=column_indices_new, grid=True) 93 | 94 | num_heights_new = new_dimensions[2] 95 | height_indices_new = numpy.linspace( 96 | 1, num_heights_new, num=num_heights_new, dtype=float 97 | ) 98 | height_indices_orig = numpy.linspace( 99 | 1, num_heights_new, num=class_activation_matrix.shape[2], dtype=float 100 | ) 101 | 102 | interp_object = RegularGridInterpolator( 103 | points=(row_indices_orig, column_indices_orig, height_indices_orig), 104 | values=class_activation_matrix, method='linear' 105 | ) 106 | 107 | column_index_matrix, row_index_matrix, height_index_matrix = ( 108 | numpy.meshgrid(column_indices_new, row_indices_new, height_indices_new) 109 | ) 110 | query_point_matrix = numpy.stack( 111 | (row_index_matrix, column_index_matrix, height_index_matrix), axis=-1 112 | ) 113 | 114 | return interp_object(query_point_matrix) 115 | 116 | 117 | def _plot_cam_one_channel( 118 | class_activation_matrix_2d, axes_object, colour_map_object, 119 | min_contour_value, max_contour_value, contour_interval, 120 | line_width=DEFAULT_LINE_WIDTH): 121 | """Plots 2-D class-activation map with line contours. 122 | 123 | M = number of rows in grid 124 | N = number of columns in grid 125 | 126 | :param class_activation_matrix_2d: M-by-N numpy array of class activations. 127 | :param axes_object: Will plot on these axes (instance of 128 | `matplotlib.axes._subplots.AxesSubplot`). 129 | :param colour_map_object: Colour scheme (instance of `matplotlib.pyplot.cm` 130 | or similar). 131 | :param min_contour_value: Minimum contour value. 132 | :param max_contour_value: Max contour value. 133 | :param contour_interval: Interval between successive contours. 134 | :param line_width: Line width for contours. 135 | """ 136 | 137 | # Check input args. 138 | assert not numpy.any(numpy.isnan(class_activation_matrix_2d)) 139 | assert len(class_activation_matrix_2d.shape) == 2 140 | 141 | max_contour_value = max([ 142 | min_contour_value + 1e-6, max_contour_value 143 | ]) 144 | 145 | contour_interval = max([contour_interval, 1e-7]) 146 | contour_interval = min([ 147 | contour_interval, max_contour_value - min_contour_value 148 | ]) 149 | 150 | num_contours = 1 + int(numpy.round( 151 | (max_contour_value - min_contour_value) / contour_interval 152 | )) 153 | contour_values = numpy.linspace( 154 | min_contour_value, max_contour_value, num=num_contours, dtype=float 155 | ) 156 | 157 | # Find grid coordinates. 
158 | num_grid_rows = class_activation_matrix_2d.shape[0] 159 | num_grid_columns = class_activation_matrix_2d.shape[1] 160 | x_coord_spacing = num_grid_columns ** -1 161 | y_coord_spacing = num_grid_rows ** -1 162 | 163 | # TODO(thunderhoser): Calling private method here is a HACK. 164 | x_coords, y_coords = _get_grid_points( 165 | x_min=x_coord_spacing / 2, y_min=y_coord_spacing / 2, 166 | x_spacing=x_coord_spacing, y_spacing=y_coord_spacing, 167 | num_rows=num_grid_rows, num_columns=num_grid_columns 168 | ) 169 | 170 | x_coord_matrix, y_coord_matrix = numpy.meshgrid(x_coords, y_coords) 171 | 172 | # Plot contours. 173 | axes_object.contour( 174 | x_coord_matrix, y_coord_matrix, class_activation_matrix_2d, 175 | contour_values, cmap=colour_map_object, 176 | vmin=numpy.min(contour_values), vmax=numpy.max(contour_values), 177 | linewidths=line_width, linestyles='solid', zorder=1e6, 178 | transform=axes_object.transAxes 179 | ) 180 | 181 | 182 | def run_gradcam(model_object, input_matrix, target_class, target_layer_name): 183 | """Runs Grad-CAM (gradient-weighted class-activation-mapping). 184 | 185 | :param model_object: Trained model (instance of `keras.models.Model` or 186 | `keras.models.Sequential`). 187 | :param input_matrix: numpy array of inputs (predictors) for one example. 188 | :param target_class: Target class. Class-activation maps will be created 189 | for the [k + 1]th class, where k = `target_class`. 190 | :param target_layer_name: Name of target layer. Neuron-importance weights 191 | will be based on activations in this layer. 192 | :return: class_activation_matrix: numpy array of class activations. This 193 | array will have the same dimensions as `input_matrix` but without the 194 | final axis. For example, if `input_matrix` is 32 x 32 x 4 195 | (32 rows x 32 columns x 4 channels), `class_activation_matrix` will be 196 | 32 x 32. 197 | """ 198 | 199 | # Check input args. 200 | target_class = int(numpy.round(target_class)) 201 | assert target_class >= 0 202 | 203 | assert not numpy.any(numpy.isnan(input_matrix)) 204 | num_spatial_dim = len(input_matrix.shape) - 1 205 | assert 1 <= num_spatial_dim <= 3 206 | 207 | # Create loss tensor. 208 | output_layer_object = model_object.layers[-1].output 209 | num_output_neurons = output_layer_object.get_shape().as_list()[-1] 210 | 211 | if num_output_neurons == 1: 212 | assert target_class <= 1 213 | 214 | if target_class == 1: 215 | loss_tensor = model_object.layers[-1].input[..., 0] 216 | else: 217 | loss_tensor = -1 * model_object.layers[-1].input[..., 0] 218 | else: 219 | assert target_class < num_output_neurons 220 | loss_tensor = model_object.layers[-1].input[..., target_class] 221 | 222 | # Create gradient function. 223 | target_layer_activation_tensor = model_object.get_layer( 224 | name=target_layer_name 225 | ).output 226 | 227 | gradient_tensor = _compute_gradients( 228 | loss_tensor, [target_layer_activation_tensor] 229 | )[0] 230 | gradient_tensor = _normalize_tensor(gradient_tensor) 231 | 232 | if isinstance(model_object.input, list): 233 | input_tensor = model_object.input[0] 234 | else: 235 | input_tensor = model_object.input 236 | 237 | gradient_function = K.function( 238 | [input_tensor], 239 | [target_layer_activation_tensor, gradient_tensor] 240 | ) 241 | 242 | # Evaluate gradient function. 
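    # The model expects an example axis, so add a length-1 axis before calling
    # the gradient function, then strip it from the outputs.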
243 |     input_matrix_with_example_axis = numpy.expand_dims(input_matrix, axis=0)
244 |     target_layer_activation_matrix, gradient_matrix = gradient_function(
245 |         [input_matrix_with_example_axis]
246 |     )
247 | 
248 |     target_layer_activation_matrix = target_layer_activation_matrix[0, ...]
249 |     gradient_matrix = gradient_matrix[0, ...]
250 | 
251 |     # Compute class-activation map.
252 |     these_axes = [i for i in range(num_spatial_dim)]
253 |     mean_weight_by_filter = numpy.mean(gradient_matrix, axis=tuple(these_axes))
254 | 
255 |     class_activation_matrix = numpy.zeros(  # zero-init, per standard Grad-CAM
256 |         target_layer_activation_matrix.shape[:-1]
257 |     )
258 |     num_filters = len(mean_weight_by_filter)
259 | 
260 |     for k in range(num_filters):
261 |         class_activation_matrix += (
262 |             mean_weight_by_filter[k] * target_layer_activation_matrix[..., k]
263 |         )
264 | 
265 |     # Upsample class-activation map to input space.
266 |     input_spatial_dim = numpy.array(input_matrix.shape[:-1], dtype=int)
267 |     class_activation_matrix = _upsample_cam(
268 |         class_activation_matrix=class_activation_matrix,
269 |         new_dimensions=input_spatial_dim
270 |     )
271 | 
272 |     return numpy.maximum(class_activation_matrix, 0.)
273 | 
274 | 
275 | def smooth_cams(class_activation_matrix, smoothing_radius_grid_cells):
276 |     """Smooths class-activation maps for many examples.
277 | 
278 |     E = number of examples
279 |     D = number of spatial dimensions
280 | 
281 |     :param class_activation_matrix: numpy array with class-activation maps for
282 |         one or more examples. Should have D + 1 dimensions, and the first axis
283 |         should have length E.
284 |     :param smoothing_radius_grid_cells: e-folding radius (number of grid cells).
285 |     :return: class_activation_matrix: Smoothed version of input.
286 |     """
287 | 
288 |     num_examples = class_activation_matrix.shape[0]
289 | 
290 |     for i in range(num_examples):
291 |         class_activation_matrix[i, ...] = utils.apply_gaussian_filter(
292 |             input_matrix=class_activation_matrix[i, ...],
293 |             e_folding_radius_grid_cells=smoothing_radius_grid_cells
294 |         )
295 | 
296 |     return class_activation_matrix
297 | 
298 | 
299 | def plot_2d_cam(
300 |         class_activation_matrix_2d, axes_object_matrix, num_channels,
301 |         colour_map_object, min_contour_value, max_contour_value,
302 |         contour_interval, line_width=DEFAULT_LINE_WIDTH):
303 |     """Plots 2-D class-activation map for one example.
304 | 
305 |     :param class_activation_matrix_2d: See doc for `_plot_cam_one_channel`.
306 |     :param axes_object_matrix: 2-D numpy array of axes (each an instance of
307 |         `matplotlib.axes._subplots.AxesSubplot`).
308 |     :param num_channels: Number of channels (the same CAM will be plotted on top
309 |         of each channel).
310 |     :param colour_map_object: See doc for `_plot_cam_one_channel`.
311 |     :param min_contour_value: Same.
312 |     :param max_contour_value: Same.
313 |     :param contour_interval: Same.
314 |     :param line_width: Same.
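
    Channels are assigned to panels in row-major order, so channel k is drawn
    on the panel in row k // num_panel_columns and column k % num_panel_columns.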
315 | """ 316 | 317 | num_panel_rows = axes_object_matrix.shape[0] 318 | num_panel_columns = axes_object_matrix.shape[1] 319 | 320 | for k in range(num_channels): 321 | i, j = numpy.unravel_index(k, (num_panel_rows, num_panel_columns)) 322 | this_axes_object = axes_object_matrix[i, j] 323 | 324 | _plot_cam_one_channel( 325 | class_activation_matrix_2d=class_activation_matrix_2d, 326 | axes_object=this_axes_object, 327 | colour_map_object=colour_map_object, 328 | min_contour_value=min_contour_value, 329 | max_contour_value=max_contour_value, 330 | contour_interval=contour_interval, line_width=line_width 331 | ) 332 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/interpretation/normalization.py: -------------------------------------------------------------------------------- 1 | """Helper methods for normalization of predictors.""" 2 | 3 | import numpy 4 | from interpretation import utils 5 | 6 | NUM_VALUES_KEY = 'num_values' 7 | MEAN_VALUE_KEY = 'mean_value' 8 | MEAN_OF_SQUARES_KEY = 'mean_of_squares' 9 | 10 | 11 | def _update_normalization_params(intermediate_normalization_dict, new_values): 12 | """Updates normalization params for one predictor. 13 | 14 | :param intermediate_normalization_dict: Dictionary with the following keys. 15 | intermediate_normalization_dict['num_values']: Number of values on which 16 | current estimates are based. 17 | intermediate_normalization_dict['mean_value']: Current estimate for mean. 18 | intermediate_normalization_dict['mean_of_squares']: Current mean of squared 19 | values. 20 | 21 | :param new_values: numpy array of new values (will be used to update 22 | `intermediate_normalization_dict`). 23 | :return: intermediate_normalization_dict: Same as input but with updated 24 | values. 25 | """ 26 | 27 | if MEAN_VALUE_KEY not in intermediate_normalization_dict: 28 | intermediate_normalization_dict = { 29 | NUM_VALUES_KEY: 0, 30 | MEAN_VALUE_KEY: 0., 31 | MEAN_OF_SQUARES_KEY: 0. 32 | } 33 | 34 | # Update mean value. 35 | these_means = numpy.array([ 36 | intermediate_normalization_dict[MEAN_VALUE_KEY], numpy.mean(new_values) 37 | ]) 38 | these_weights = numpy.array([ 39 | intermediate_normalization_dict[NUM_VALUES_KEY], new_values.size 40 | ]) 41 | intermediate_normalization_dict[MEAN_VALUE_KEY] = numpy.average( 42 | these_means, weights=these_weights 43 | ) 44 | 45 | # Update mean of squares. 46 | these_means = numpy.array([ 47 | intermediate_normalization_dict[MEAN_OF_SQUARES_KEY], 48 | numpy.mean(new_values ** 2) 49 | ]) 50 | intermediate_normalization_dict[MEAN_OF_SQUARES_KEY] = numpy.average( 51 | these_means, weights=these_weights 52 | ) 53 | 54 | # Update number of values. 55 | intermediate_normalization_dict[NUM_VALUES_KEY] += new_values.size 56 | 57 | return intermediate_normalization_dict 58 | 59 | 60 | def _get_standard_deviation(intermediate_normalization_dict): 61 | """Computes stdev from intermediate normalization params. 62 | 63 | :param intermediate_normalization_dict: See doc for 64 | `_update_normalization_params`. 65 | :return: standard_deviation: Standard deviation. 
66 | """ 67 | 68 | num_values = float(intermediate_normalization_dict[NUM_VALUES_KEY]) 69 | multiplier = num_values / (num_values - 1) 70 | 71 | return numpy.sqrt(multiplier * ( 72 | intermediate_normalization_dict[MEAN_OF_SQUARES_KEY] - 73 | intermediate_normalization_dict[MEAN_VALUE_KEY] ** 2 74 | )) 75 | 76 | 77 | def get_image_normalization_params(image_file_names): 78 | """Computes normalization params (mean and stdev) for each predictor. 79 | 80 | :param image_file_names: 1-D list of paths to input files. 81 | :return: normalization_dict: See input doc for `normalize_images`. 82 | """ 83 | 84 | predictor_names = None 85 | norm_dict_by_predictor = None 86 | 87 | for this_file_name in image_file_names: 88 | print('Reading data from: "{0:s}"...'.format(this_file_name)) 89 | this_image_dict = utils.read_image_file(this_file_name) 90 | 91 | if predictor_names is None: 92 | predictor_names = this_image_dict[utils.PREDICTOR_NAMES_KEY] 93 | norm_dict_by_predictor = [{}] * len(predictor_names) 94 | 95 | for k in range(len(predictor_names)): 96 | norm_dict_by_predictor[k] = _update_normalization_params( 97 | intermediate_normalization_dict=norm_dict_by_predictor[k], 98 | new_values=this_image_dict[utils.PREDICTOR_MATRIX_KEY][..., k] 99 | ) 100 | 101 | print('\n') 102 | normalization_dict = {} 103 | 104 | for k in range(len(predictor_names)): 105 | this_mean = norm_dict_by_predictor[k][MEAN_VALUE_KEY] 106 | this_stdev = _get_standard_deviation(norm_dict_by_predictor[k]) 107 | 108 | normalization_dict[predictor_names[k]] = numpy.array([ 109 | this_mean, this_stdev 110 | ]) 111 | 112 | print(( 113 | 'Mean and standard deviation for "{0:s}" = {1:.4f}, {2:.4f}' 114 | ).format( 115 | predictor_names[k], this_mean, this_stdev 116 | )) 117 | 118 | return normalization_dict 119 | 120 | 121 | def normalize_images( 122 | predictor_matrix, predictor_names, normalization_dict=None): 123 | """Normalizes images to z-scores. 124 | 125 | E = number of examples (storm objects) in file 126 | M = number of rows in each storm-centered grid 127 | N = number of columns in each storm-centered grid 128 | C = number of channels (predictor variables) 129 | 130 | :param predictor_matrix: E-by-M-by-N-by-C numpy array of predictor values. 131 | :param predictor_names: length-C list of predictor names. 132 | :param normalization_dict: Dictionary. Each key is the name of a predictor 133 | value, and the corresponding value is a length-2 numpy array with 134 | [mean, standard deviation]. If `normalization_dict is None`, mean and 135 | standard deviation will be computed for each predictor. 136 | :return: predictor_matrix: Normalized version of input. 137 | :return: normalization_dict: See doc for input variable. If input was None, 138 | this will be a newly created dictionary. Otherwise, this will be the 139 | same dictionary passed as input. 
140 | """ 141 | 142 | num_predictors = len(predictor_names) 143 | 144 | if normalization_dict is None: 145 | normalization_dict = {} 146 | 147 | for k in range(num_predictors): 148 | this_mean = numpy.mean(predictor_matrix[..., k]) 149 | this_stdev = numpy.std(predictor_matrix[..., k], ddof=1) 150 | 151 | normalization_dict[predictor_names[k]] = numpy.array([ 152 | this_mean, this_stdev 153 | ]) 154 | 155 | for k in range(num_predictors): 156 | this_mean = normalization_dict[predictor_names[k]][0] 157 | this_stdev = normalization_dict[predictor_names[k]][1] 158 | 159 | predictor_matrix[..., k] = ( 160 | (predictor_matrix[..., k] - this_mean) / float(this_stdev) 161 | ) 162 | 163 | return predictor_matrix, normalization_dict 164 | 165 | 166 | def denormalize_images(predictor_matrix, predictor_names, normalization_dict): 167 | """Denormalizes images from z-scores back to original scales. 168 | 169 | :param predictor_matrix: See doc for `normalize_images`. 170 | :param predictor_names: Same. 171 | :param normalization_dict: Same. 172 | :return: predictor_matrix: Denormalized version of input. 173 | """ 174 | 175 | num_predictors = len(predictor_names) 176 | 177 | for k in range(num_predictors): 178 | this_mean = normalization_dict[predictor_names[k]][0] 179 | this_stdev = normalization_dict[predictor_names[k]][1] 180 | 181 | predictor_matrix[..., k] = ( 182 | this_mean + this_stdev * predictor_matrix[..., k] 183 | ) 184 | 185 | return predictor_matrix 186 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/interpretation/novelty_detection.py: -------------------------------------------------------------------------------- 1 | """Helper methods for novelty detection.""" 2 | 3 | import numpy 4 | from matplotlib import pyplot 5 | from interpretation import cnn, utils, plotting 6 | 7 | EOF_MATRIX_KEY = 'eof_matrix' 8 | FEATURE_MEANS_KEY = 'feature_means' 9 | FEATURE_STDEVS_KEY = 'feature_standard_deviations' 10 | 11 | NOVEL_MATRIX_KEY = 'novel_predictor_matrix' 12 | NOVEL_MATRIX_UPCONV_KEY = 'novel_matrix_upconv' 13 | NOVEL_MATRIX_UPCONV_SVD_KEY = 'novel_matrix_upconv_svd' 14 | 15 | REFL_COLOUR_MAP_OBJECT = pyplot.get_cmap('PuOr') 16 | TEMPERATURE_COLOUR_MAP_OBJECT = pyplot.get_cmap('bwr') 17 | 18 | 19 | def _normalize_features( 20 | feature_matrix, feature_means=None, feature_standard_deviations=None): 21 | """Normalizes scalar features to z-scores. 22 | 23 | E = number of examples (storm objects) 24 | Z = number of features 25 | 26 | :param feature_matrix: E-by-Z numpy array of features. 27 | :param feature_means: length-Z numpy array of mean values. If 28 | `feature_means is None`, these will be computed on the fly from 29 | `feature_matrix`. 30 | :param feature_standard_deviations: Same but with standard deviations. 31 | :return: feature_matrix: Normalized version of input. 32 | :return: feature_means: See input doc. 33 | :return: feature_standard_deviations: See input doc. 
34 | """ 35 | 36 | if feature_means is None or feature_standard_deviations is None: 37 | feature_means = numpy.mean(feature_matrix, axis=0) 38 | feature_standard_deviations = numpy.std(feature_matrix, axis=0, ddof=1) 39 | 40 | num_examples = feature_matrix.shape[0] 41 | num_features = feature_matrix.shape[1] 42 | 43 | mean_matrix = numpy.reshape(feature_means, (1, num_features)) 44 | mean_matrix = numpy.repeat(mean_matrix, repeats=num_examples, axis=0) 45 | 46 | stdev_matrix = numpy.reshape(feature_standard_deviations, (1, num_features)) 47 | stdev_matrix = numpy.repeat(stdev_matrix, repeats=num_examples, axis=0) 48 | 49 | feature_matrix = (feature_matrix - mean_matrix) / stdev_matrix 50 | return feature_matrix, feature_means, feature_standard_deviations 51 | 52 | 53 | def _fit_svd(baseline_feature_matrix, test_feature_matrix, 54 | percent_variance_to_keep): 55 | """Fits SVD (singular-value decomposition) model. 56 | 57 | B = number of baseline examples (storm objects) 58 | T = number of testing examples (storm objects) 59 | Z = number of scalar features (produced by dense layer of a CNN) 60 | K = number of modes (top eigenvectors) retained 61 | 62 | The SVD model will be fit only to the baseline set, but both the baseline 63 | and testing sets will be used to compute normalization parameters (means and 64 | standard deviations). Before, when only the baseline set was used to 65 | compute normalization params, the testing set had huge standard deviations, 66 | which caused the results of novelty detection to be physically unrealistic. 67 | 68 | :param baseline_feature_matrix: B-by-Z numpy array of features. 69 | :param test_feature_matrix: T-by-Z numpy array of features. 70 | :param percent_variance_to_keep: Percentage of variance to keep. Determines 71 | how many eigenvectors (K in the above discussion) will be used in the 72 | SVD model. 73 | 74 | :return: svd_dictionary: Dictionary with the following keys. 75 | svd_dictionary['eof_matrix']: Z-by-K numpy array, where each column is an 76 | EOF (empirical orthogonal function). 77 | svd_dictionary['feature_means']: length-Z numpy array with mean value of 78 | each feature (before transformation). 79 | svd_dictionary['feature_standard_deviations']: length-Z numpy array with 80 | standard deviation of each feature (before transformation). 81 | """ 82 | 83 | assert percent_variance_to_keep >= 50. 84 | assert percent_variance_to_keep <= 100. 85 | 86 | combined_feature_matrix = numpy.concatenate( 87 | (baseline_feature_matrix, test_feature_matrix), axis=0 88 | ) 89 | combined_feature_matrix, feature_means, feature_standard_deviations = ( 90 | _normalize_features(feature_matrix=combined_feature_matrix) 91 | ) 92 | 93 | num_features = baseline_feature_matrix.shape[1] 94 | num_baseline_examples = baseline_feature_matrix.shape[0] 95 | baseline_feature_matrix = ( 96 | combined_feature_matrix[:num_baseline_examples, ...] 
97 | ) 98 | 99 | eigenvalues, eof_matrix = numpy.linalg.svd(baseline_feature_matrix)[1:] 100 | eigenvalues = eigenvalues ** 2 101 | 102 | explained_variances = eigenvalues / numpy.sum(eigenvalues) 103 | cumulative_explained_variances = numpy.cumsum(explained_variances) 104 | 105 | fraction_of_variance_to_keep = 0.01 * percent_variance_to_keep 106 | these_indices = numpy.where( 107 | cumulative_explained_variances >= fraction_of_variance_to_keep 108 | )[0] 109 | 110 | if len(these_indices) == 0: 111 | these_indices = numpy.array([num_features - 1], dtype=int) 112 | 113 | num_modes_to_keep = 1 + these_indices[0] 114 | 115 | print(( 116 | 'Number of modes required to explain {0:f}% of variance: {1:d}' 117 | ).format( 118 | percent_variance_to_keep, num_modes_to_keep 119 | )) 120 | 121 | return { 122 | EOF_MATRIX_KEY: numpy.transpose(eof_matrix)[..., :num_modes_to_keep], 123 | FEATURE_MEANS_KEY: feature_means, 124 | FEATURE_STDEVS_KEY: feature_standard_deviations 125 | } 126 | 127 | 128 | def _apply_svd(feature_vector, svd_dictionary): 129 | """Applies SVD (singular-value decomposition) model to new example. 130 | 131 | Z = number of features 132 | 133 | :param feature_vector: length-Z numpy array with feature values for one 134 | example (storm object). 135 | :param svd_dictionary: Dictionary created by `_fit_svd`. 136 | :return: reconstructed_feature_vector: Reconstructed version of input. 137 | """ 138 | 139 | this_matrix = numpy.dot( 140 | svd_dictionary[EOF_MATRIX_KEY], 141 | numpy.transpose(svd_dictionary[EOF_MATRIX_KEY]) 142 | ) 143 | feature_vector_norm = ( 144 | (feature_vector - svd_dictionary[FEATURE_MEANS_KEY]) / 145 | svd_dictionary[FEATURE_STDEVS_KEY] 146 | ) 147 | reconstructed_feature_vector_norm = numpy.dot( 148 | this_matrix, feature_vector_norm 149 | ) 150 | 151 | return ( 152 | svd_dictionary[FEATURE_MEANS_KEY] + 153 | reconstructed_feature_vector_norm * svd_dictionary[FEATURE_STDEVS_KEY] 154 | ) 155 | 156 | 157 | def _plot_novelty_maps(novelty_matrix, predictor_names, max_temp_diff_kelvins, 158 | max_reflectivity_diff_dbz): 159 | """Plots novelty maps for one example. 160 | 161 | M = number of rows in grid 162 | N = number of columns in grid 163 | C = number of predictors 164 | 165 | :param novelty_matrix: M-by-N-by-C numpy array of denormalized novelty 166 | values (upconvnet reconstruction minus upconvnet/SVD reconstruction). 167 | :param predictor_names: length-C list of predictor names. 168 | :param max_temp_diff_kelvins: Max temperature difference in colour bar. 169 | :param max_reflectivity_diff_dbz: Max reflectivity difference in colour bar. 170 | :return: figure_object: Figure handle (instance of 171 | `matplotlib.figure.Figure`). 172 | :return: axes_object_matrix: 2-D numpy array of axes handles (instances 173 | of `matplotlib.axes._subplots.AxesSubplot`). 
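
    The u-wind and v-wind differences do not get their own panels; they are
    plotted as wind barbs on top of every non-wind predictor.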
174 | """ 175 | 176 | u_diff_matrix_m_s01 = novelty_matrix[ 177 | ..., predictor_names.index(utils.U_WIND_NAME) 178 | ] 179 | v_diff_matrix_m_s01 = novelty_matrix[ 180 | ..., predictor_names.index(utils.V_WIND_NAME) 181 | ] 182 | 183 | non_wind_predictor_names = [ 184 | p for p in predictor_names 185 | if p not in [utils.U_WIND_NAME, utils.V_WIND_NAME] 186 | ] 187 | 188 | figure_object, axes_object_matrix = plotting._create_paneled_figure( 189 | num_rows=1, num_columns=len(non_wind_predictor_names), 190 | ) 191 | 192 | for k in range(len(non_wind_predictor_names)): 193 | this_predictor_index = predictor_names.index( 194 | non_wind_predictor_names[k] 195 | ) 196 | 197 | if non_wind_predictor_names[k] == utils.REFLECTIVITY_NAME: 198 | this_max_colour_value = max_reflectivity_diff_dbz 199 | this_colour_map_object = REFL_COLOUR_MAP_OBJECT 200 | else: 201 | this_max_colour_value = max_temp_diff_kelvins 202 | this_colour_map_object = TEMPERATURE_COLOUR_MAP_OBJECT 203 | 204 | plotting.plot_scalar_field_2d( 205 | predictor_matrix=novelty_matrix[..., this_predictor_index], 206 | colour_map_object=this_colour_map_object, 207 | min_colour_value=-this_max_colour_value, 208 | max_colour_value=this_max_colour_value, 209 | axes_object=axes_object_matrix[0, k] 210 | ) 211 | 212 | this_colour_bar_object = plotting.plot_linear_colour_bar( 213 | axes_object_or_matrix=axes_object_matrix[0, k], 214 | data_values=novelty_matrix[..., this_predictor_index], 215 | colour_map_object=this_colour_map_object, 216 | min_value=-this_max_colour_value, max_value=this_max_colour_value, 217 | plot_horizontal=True, plot_min_arrow=True, plot_max_arrow=True 218 | ) 219 | 220 | plotting.plot_wind_2d( 221 | u_wind_matrix_m_s01=u_diff_matrix_m_s01, 222 | v_wind_matrix_m_s01=v_diff_matrix_m_s01, 223 | axes_object=axes_object_matrix[0, k] 224 | ) 225 | 226 | this_colour_bar_object.set_label( 227 | non_wind_predictor_names[k], 228 | fontsize=plotting.DEFAULT_CBAR_FONT_SIZE 229 | ) 230 | 231 | return figure_object, axes_object_matrix 232 | 233 | 234 | def run_novelty_detection( 235 | baseline_predictor_matrix_norm, trial_predictor_matrix_norm, 236 | cnn_model_object, cnn_feature_layer_name, upconvnet_model_object, 237 | num_novel_examples, multipass=False, percent_variance_to_keep=97.5): 238 | """Runs novelty detection. 239 | 240 | B = number of baseline examples 241 | T = number of trial examples 242 | Q = number of novel trial examples to find 243 | 244 | :param baseline_predictor_matrix_norm: numpy array with normalized predictor 245 | values for baseline set. The first axis should have length B. 246 | :param trial_predictor_matrix_norm: numpy array with normalized predictor 247 | values for trial set. The first axis should have length T. 248 | :param cnn_model_object: Trained CNN (instance of `keras.models.Model` or 249 | `keras.models.Sequential`). 250 | :param cnn_feature_layer_name: Name of feature layer in CNN. Outputs from 251 | this layer will be inputs to the upconvnet. 252 | :param upconvnet_model_object: Trained upconvnet (instance of 253 | `keras.models.Model` or `keras.models.Sequential`). 254 | :param num_novel_examples: Q in the above discussion. 255 | :param multipass: Boolean flag. If True, will run multi-pass version. If 256 | False, will run single-pass version. In the multi-pass version, 257 | whenever the next-most novel trial example is found, it is used to fit a 258 | new SVD model. 
In other words, after finding the [i]th-most novel trial
259 |         example, a new SVD model is fit on all baseline examples and the i most
260 |         novel trial examples.
261 |     :param percent_variance_to_keep: Percentage of variance to keep in SVD
262 |         (singular-value decomposition) from image space to feature space.
263 |     :return: novelty_dict: Dictionary with the following keys.
264 |     novelty_dict['novel_predictor_matrix']: numpy array with most novel examples
265 |         in trial set. The first axis has length Q.
266 |     novelty_dict['novel_matrix_upconv']: numpy array with upconvnet
267 |         reconstructions of the most novel examples. Same dimensions as
268 |         `novel_predictor_matrix`.
269 |     novelty_dict['novel_matrix_upconv_svd']: numpy array with upconvnet
270 |         reconstructions of SVD reconstructions of the most novel examples.
271 |         Same dimensions as `novel_predictor_matrix`.
272 |     """
273 | 
274 |     multipass = bool(multipass)
275 | 
276 |     num_trial_examples = trial_predictor_matrix_norm.shape[0]
277 |     num_novel_examples = int(numpy.round(num_novel_examples))
278 |     num_novel_examples = min([num_novel_examples, num_trial_examples])
279 | 
280 |     assert num_novel_examples > 1
281 | 
282 |     baseline_feature_matrix = cnn.apply_cnn(
283 |         model_object=cnn_model_object,
284 |         predictor_matrix=baseline_predictor_matrix_norm,
285 |         output_layer_name=cnn_feature_layer_name, verbose=True
286 |     )
287 |     print('\n')
288 | 
289 |     trial_feature_matrix = cnn.apply_cnn(
290 |         model_object=cnn_model_object,
291 |         predictor_matrix=trial_predictor_matrix_norm,
292 |         output_layer_name=cnn_feature_layer_name, verbose=True
293 |     )
294 |     print('\n')
295 | 
296 |     svd_dictionary = None
297 |     novel_indices = numpy.array([], dtype=int)
298 |     novel_matrix_upconv = None
299 |     novel_matrix_upconv_svd = None
300 | 
301 |     for k in range(num_novel_examples):
302 |         print('Finding {0:d}th of {1:d} most-novel trial examples...'.format(
303 |             k + 1, num_novel_examples
304 |         ))
305 | 
306 |         fit_new_svd = multipass or k == 0
307 | 
308 |         if fit_new_svd:
309 |             this_baseline_feature_matrix = numpy.concatenate((
310 |                 baseline_feature_matrix,
311 |                 trial_feature_matrix[novel_indices, ...]
312 |             ), axis=0)
313 | 
314 |             this_trial_feature_matrix = numpy.delete(
315 |                 trial_feature_matrix, obj=novel_indices, axis=0
316 |             )
317 | 
318 |             svd_dictionary = _fit_svd(
319 |                 baseline_feature_matrix=this_baseline_feature_matrix,
320 |                 test_feature_matrix=this_trial_feature_matrix,
321 |                 percent_variance_to_keep=percent_variance_to_keep
322 |             )
323 | 
324 |         trial_svd_errors = numpy.full(num_trial_examples, numpy.nan)
325 |         trial_feature_matrix_svd = numpy.full(
326 |             trial_feature_matrix.shape, numpy.nan
327 |         )
328 | 
329 |         for i in range(num_trial_examples):
330 |             if i in novel_indices:
331 |                 continue
332 | 
333 |             # Reconstruct this example's feature vector with the SVD model.
334 | 
335 |             trial_feature_matrix_svd[i, ...] = _apply_svd(
336 |                 feature_vector=trial_feature_matrix[i, ...],
337 |                 svd_dictionary=svd_dictionary
338 |             )
339 | 
340 |             trial_svd_errors[i] = numpy.linalg.norm(
341 |                 trial_feature_matrix_svd[i, ...] - trial_feature_matrix[i, ...]
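                # Novelty score for this example = Euclidean distance between
                # original and SVD-reconstructed feature vectors.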
342 | ) 343 | 344 | this_novel_index = numpy.nanargmax(trial_svd_errors) 345 | this_novel_index_array = numpy.array([this_novel_index], dtype=int) 346 | novel_indices = numpy.concatenate(( 347 | novel_indices, this_novel_index_array 348 | )) 349 | 350 | this_image_matrix_upconv = upconvnet_model_object.predict( 351 | trial_feature_matrix[this_novel_index_array, ...], batch_size=1 352 | ) 353 | 354 | this_image_matrix_upconv_svd = upconvnet_model_object.predict( 355 | trial_feature_matrix_svd[this_novel_index_array, ...], batch_size=1 356 | ) 357 | 358 | if novel_matrix_upconv is None: 359 | these_dim = ( 360 | (num_novel_examples,) + this_image_matrix_upconv.shape[1:] 361 | ) 362 | novel_matrix_upconv = numpy.full(these_dim, numpy.nan) 363 | novel_matrix_upconv_svd = numpy.full(these_dim, numpy.nan) 364 | 365 | novel_matrix_upconv[k, ...] = this_image_matrix_upconv 366 | novel_matrix_upconv_svd[k, ...] = this_image_matrix_upconv_svd 367 | 368 | return { 369 | NOVEL_MATRIX_KEY: trial_predictor_matrix_norm[novel_indices, ...], 370 | NOVEL_MATRIX_UPCONV_KEY: novel_matrix_upconv, 371 | NOVEL_MATRIX_UPCONV_SVD_KEY: novel_matrix_upconv_svd 372 | } 373 | 374 | 375 | def plot_results(novelty_dict_denorm, plot_index, predictor_names): 376 | """Plots results of novelty detection. 377 | 378 | :param novelty_dict_denorm: Dictionary created by `run_novelty_detection`, 379 | except with denormalized predictor values. 380 | :param plot_index: Will plot the [k]th most novel trial example, where 381 | k = `plot_index`. 382 | :param predictor_names: 1-D list of predictor names. 383 | """ 384 | 385 | temperature_index = predictor_names.index(utils.TEMPERATURE_NAME) 386 | reflectivity_index = predictor_names.index(utils.REFLECTIVITY_NAME) 387 | 388 | actual_predictor_matrix = ( 389 | novelty_dict_denorm[NOVEL_MATRIX_KEY][plot_index, ...] 390 | ) 391 | predictor_matrix_upconv = ( 392 | novelty_dict_denorm[NOVEL_MATRIX_UPCONV_KEY][plot_index, ...] 393 | ) 394 | predictor_matrix_upconv_svd = ( 395 | novelty_dict_denorm[NOVEL_MATRIX_UPCONV_SVD_KEY][plot_index, ...] 
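        # The novelty map (computed just below) is the upconvnet
        # reconstruction minus the upconvnet/SVD reconstruction.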
396 | ) 397 | novelty_matrix = predictor_matrix_upconv - predictor_matrix_upconv_svd 398 | 399 | concat_temp_matrix_kelvins = numpy.concatenate(( 400 | actual_predictor_matrix[..., temperature_index], 401 | predictor_matrix_upconv[..., temperature_index] 402 | ), axis=0) 403 | 404 | min_colour_temp_kelvins = numpy.percentile(concat_temp_matrix_kelvins, 1) 405 | max_colour_temp_kelvins = numpy.percentile(concat_temp_matrix_kelvins, 99) 406 | 407 | _, axes_object_matrix = plotting.plot_many_predictors_with_barbs( 408 | predictor_matrix=actual_predictor_matrix, 409 | predictor_names=predictor_names, 410 | min_colour_temp_kelvins=min_colour_temp_kelvins, 411 | max_colour_temp_kelvins=max_colour_temp_kelvins 412 | ) 413 | 414 | for i in range(axes_object_matrix.shape[0]): 415 | for j in range(axes_object_matrix.shape[1]): 416 | axes_object_matrix[i, j].set_title('Actual example') 417 | 418 | _, axes_object_matrix = plotting.plot_many_predictors_with_barbs( 419 | predictor_matrix=predictor_matrix_upconv, 420 | predictor_names=predictor_names, 421 | min_colour_temp_kelvins=min_colour_temp_kelvins, 422 | max_colour_temp_kelvins=max_colour_temp_kelvins 423 | ) 424 | 425 | for i in range(axes_object_matrix.shape[0]): 426 | for j in range(axes_object_matrix.shape[1]): 427 | axes_object_matrix[i, j].set_title('Upconvnet reconstruction') 428 | 429 | max_temp_diff_kelvins = numpy.percentile( 430 | numpy.absolute(novelty_matrix[..., temperature_index]), 99 431 | ) 432 | max_reflectivity_diff_dbz = numpy.percentile( 433 | numpy.absolute(novelty_matrix[..., reflectivity_index]), 99 434 | ) 435 | 436 | _, axes_object_matrix = _plot_novelty_maps( 437 | novelty_matrix=novelty_matrix, predictor_names=predictor_names, 438 | max_temp_diff_kelvins=max_temp_diff_kelvins, 439 | max_reflectivity_diff_dbz=max_reflectivity_diff_dbz 440 | ) 441 | 442 | title_string = 'Novelty (unexpected part)' 443 | for i in range(axes_object_matrix.shape[0]): 444 | for j in range(axes_object_matrix.shape[1]): 445 | axes_object_matrix[i, j].set_title(title_string) 446 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/interpretation/saliency.py: -------------------------------------------------------------------------------- 1 | """Helper methods for saliency.""" 2 | 3 | import numpy 4 | from keras import backend as K 5 | from interpretation import utils 6 | 7 | DEFAULT_LINE_WIDTH = 2. 8 | 9 | 10 | def _do_saliency_calculations( 11 | model_object, loss_tensor, list_of_input_matrices): 12 | """Does saliency calculations. 13 | 14 | T = number of input tensors to the model 15 | E = number of examples (storm objects) 16 | 17 | :param model_object: Instance of `keras.models.Model`. 18 | :param loss_tensor: Keras tensor defining the loss function. 19 | :param list_of_input_matrices: length-T list of numpy arrays, comprising one 20 | or more examples (storm objects). list_of_input_matrices[i] must have 21 | the same dimensions as the [i]th input tensor to the model. 22 | :return: list_of_saliency_matrices: length-T list of numpy arrays, 23 | comprising the saliency map for each example. 24 | list_of_saliency_matrices[i] has the same dimensions as 25 | list_of_input_matrices[i] and defines the "saliency" of each value x, 26 | which is the gradient of the loss function with respect to x. 
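
    Note that the gradients are multiplied by -1 before being returned, so
    positive saliency means that increasing x decreases the loss.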
27 | """ 28 | 29 | if isinstance(model_object.input, list): 30 | list_of_input_tensors = model_object.input 31 | else: 32 | list_of_input_tensors = [model_object.input] 33 | 34 | list_of_gradient_tensors = K.gradients(loss_tensor, list_of_input_tensors) 35 | num_input_tensors = len(list_of_input_tensors) 36 | 37 | for i in range(num_input_tensors): 38 | list_of_gradient_tensors[i] /= K.maximum( 39 | K.std(list_of_gradient_tensors[i]), 40 | K.epsilon() 41 | ) 42 | 43 | inputs_to_gradients_function = K.function( 44 | list_of_input_tensors + [K.learning_phase()], 45 | list_of_gradient_tensors 46 | ) 47 | 48 | list_of_saliency_matrices = inputs_to_gradients_function( 49 | list_of_input_matrices + [0] 50 | ) 51 | 52 | for i in range(num_input_tensors): 53 | list_of_saliency_matrices[i] *= -1 54 | 55 | return list_of_saliency_matrices 56 | 57 | 58 | def _get_grid_points(x_min, x_spacing, num_columns, y_min, y_spacing, num_rows): 59 | """Returns grid points in regular x-y grid. 60 | 61 | M = number of rows in grid 62 | N = number of columns in grid 63 | 64 | :param x_min: Minimum x-coordinate over all grid points. 65 | :param x_spacing: Spacing between adjacent grid points in x-direction. 66 | :param num_columns: N in the above definition. 67 | :param y_min: Minimum y-coordinate over all grid points. 68 | :param y_spacing: Spacing between adjacent grid points in y-direction. 69 | :param num_rows: M in the above definition. 70 | :return: x_coords: length-N numpy array with x-coordinates at grid points. 71 | :return: y_coords: length-M numpy array with y-coordinates at grid points. 72 | """ 73 | 74 | # TODO(thunderhoser): Put this in utils.py. 75 | 76 | x_max = x_min + (num_columns - 1) * x_spacing 77 | y_max = y_min + (num_rows - 1) * y_spacing 78 | 79 | x_coords = numpy.linspace(x_min, x_max, num=num_columns) 80 | y_coords = numpy.linspace(y_min, y_max, num=num_rows) 81 | 82 | return x_coords, y_coords 83 | 84 | 85 | def _plot_2d_saliency_map( 86 | saliency_matrix_2d, axes_object, colour_map_object, max_contour_value, 87 | contour_interval, line_width=DEFAULT_LINE_WIDTH): 88 | """Plots 2-D saliency map with line contours. 89 | 90 | M = number of rows in grid 91 | N = number of columns in grid 92 | 93 | :param saliency_matrix_2d: M-by-N numpy array of saliency values. 94 | :param axes_object: Will plot on these axes (instance of 95 | `matplotlib.axes._subplots.AxesSubplot`). 96 | :param colour_map_object: Colour scheme (instance of `matplotlib.pyplot.cm` 97 | or similar). 98 | :param max_contour_value: Max contour value. Contour values will range from 99 | -v...v, where v = `max_contour_value`. 100 | :param contour_interval: Interval between successive contours. 101 | :param line_width: Line width for contours. 102 | """ 103 | 104 | # Check input args. 105 | assert max_contour_value >= 0. 106 | max_contour_value = max([max_contour_value, 1e-6]) 107 | 108 | assert contour_interval >= 0. 109 | contour_interval = max([contour_interval, 1e-7]) 110 | 111 | assert not numpy.any(numpy.isnan(saliency_matrix_2d)) 112 | assert len(saliency_matrix_2d.shape) == 2 113 | assert contour_interval < max_contour_value 114 | 115 | half_num_contours = int(numpy.round( 116 | 1 + max_contour_value / contour_interval 117 | )) 118 | 119 | # Find grid coordinates. 
120 | num_grid_rows = saliency_matrix_2d.shape[0] 121 | num_grid_columns = saliency_matrix_2d.shape[1] 122 | x_coord_spacing = num_grid_columns ** -1 123 | y_coord_spacing = num_grid_rows ** -1 124 | 125 | x_coords, y_coords = _get_grid_points( 126 | x_min=x_coord_spacing / 2, y_min=y_coord_spacing / 2, 127 | x_spacing=x_coord_spacing, y_spacing=y_coord_spacing, 128 | num_rows=num_grid_rows, num_columns=num_grid_columns 129 | ) 130 | 131 | x_coord_matrix, y_coord_matrix = numpy.meshgrid(x_coords, y_coords) 132 | 133 | # Plot positive contours. 134 | positive_contour_values = numpy.linspace( 135 | 0., max_contour_value, num=half_num_contours 136 | ) 137 | 138 | axes_object.contour( 139 | x_coord_matrix, y_coord_matrix, saliency_matrix_2d, 140 | positive_contour_values, cmap=colour_map_object, 141 | vmin=numpy.min(positive_contour_values), 142 | vmax=numpy.max(positive_contour_values), 143 | linewidths=line_width, linestyles='solid', zorder=1e6, 144 | transform=axes_object.transAxes 145 | ) 146 | 147 | # Plot negative contours. 148 | negative_contour_values = positive_contour_values[1:] 149 | 150 | axes_object.contour( 151 | x_coord_matrix, y_coord_matrix, -saliency_matrix_2d, 152 | negative_contour_values, cmap=colour_map_object, 153 | vmin=numpy.min(negative_contour_values), 154 | vmax=numpy.max(negative_contour_values), 155 | linewidths=line_width, linestyles='dashed', zorder=1e6, 156 | transform=axes_object.transAxes 157 | ) 158 | 159 | 160 | def get_saliency_maps_for_class( 161 | model_object, target_class, list_of_input_matrices): 162 | """For each input example, creates saliency map for prob of target class. 163 | 164 | :param model_object: Trained model (instance of `keras.models.Model` or 165 | `keras.models.Sequential`). 166 | :param target_class: Saliency maps will be created for this class. Must be 167 | an integer in 0...(K - 1), where K = number of classes. 168 | :param list_of_input_matrices: See doc for `_do_saliency_calculations`. 169 | :return: list_of_saliency_matrices: See doc for `_do_saliency_calculations`. 170 | """ 171 | 172 | # TODO(thunderhoser): Create example axis. 173 | 174 | target_class = int(numpy.round(target_class)) 175 | assert target_class >= 0 176 | 177 | num_output_neurons = ( 178 | model_object.layers[-1].output.get_shape().as_list()[-1] 179 | ) 180 | 181 | if num_output_neurons == 1: 182 | assert target_class <= 1 183 | 184 | if target_class == 1: 185 | loss_tensor = K.mean( 186 | (model_object.layers[-1].output[..., 0] - 1) ** 2 187 | ) 188 | else: 189 | loss_tensor = K.mean(model_object.layers[-1].output[..., 0] ** 2) 190 | else: 191 | assert target_class < num_output_neurons 192 | 193 | loss_tensor = K.mean( 194 | (model_object.layers[-1].output[..., target_class] - 1) ** 2 195 | ) 196 | 197 | return _do_saliency_calculations( 198 | model_object=model_object, loss_tensor=loss_tensor, 199 | list_of_input_matrices=list_of_input_matrices) 200 | 201 | 202 | def smooth_saliency_maps(saliency_matrices, smoothing_radius_grid_cells): 203 | """Smooths saliency maps via Gaussian filter. 204 | 205 | T = number of input tensors to the model 206 | 207 | :param saliency_matrices: length-T list of numpy arrays. 208 | :param smoothing_radius_grid_cells: e-folding radius (number of grid cells). 209 | :return: saliency_matrices: Smoothed version of input. 
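
    Smoothing is applied independently to each example and each channel.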
210 | """ 211 | 212 | num_matrices = len(saliency_matrices) 213 | num_examples = saliency_matrices[0].shape[0] 214 | 215 | for j in range(num_matrices): 216 | this_num_channels = saliency_matrices[j].shape[-1] 217 | 218 | for i in range(num_examples): 219 | for k in range(this_num_channels): 220 | saliency_matrices[j][i, ..., k] = utils.apply_gaussian_filter( 221 | input_matrix=saliency_matrices[j][i, ..., k], 222 | e_folding_radius_grid_cells=smoothing_radius_grid_cells 223 | ) 224 | 225 | return saliency_matrices 226 | 227 | 228 | def plot_saliency_maps( 229 | saliency_matrix_3d, axes_object_matrix, colour_map_object, 230 | max_contour_value, contour_interval, 231 | line_width=DEFAULT_LINE_WIDTH): 232 | """Plots many saliency maps (one for each channel). 233 | 234 | M = number of rows in grid 235 | N = number of columns in grid 236 | C = number of channels 237 | 238 | :param saliency_matrix_3d: M-by-N-by-C numpy array of saliency values. 239 | :param axes_object_matrix: 2-D numpy array of axes (each an instance of 240 | `matplotlib.axes._subplots.AxesSubplot`). 241 | :param colour_map_object: See doc for `_plot_2d_saliency_map`. 242 | :param max_contour_value: Same. 243 | :param contour_interval: Same. 244 | :param line_width: Same. 245 | """ 246 | 247 | assert len(saliency_matrix_3d.shape) == 3 248 | 249 | num_channels = saliency_matrix_3d.shape[-1] 250 | num_panel_rows = axes_object_matrix.shape[0] 251 | num_panel_columns = axes_object_matrix.shape[1] 252 | 253 | for k in range(num_channels): 254 | i, j = numpy.unravel_index(k, (num_panel_rows, num_panel_columns)) 255 | this_axes_object = axes_object_matrix[i, j] 256 | 257 | _plot_2d_saliency_map( 258 | saliency_matrix_2d=saliency_matrix_3d[..., k], 259 | axes_object=this_axes_object, 260 | colour_map_object=colour_map_object, 261 | max_contour_value=max_contour_value, 262 | contour_interval=contour_interval, line_width=line_width 263 | ) 264 | 265 | # colour_bar_object = utils.plot_linear_colour_bar( 266 | # axes_object_or_matrix=axes_object_matrix, 267 | # data_values=saliency_matrix_3d, colour_map_object=colour_map_object, 268 | # min_value=0., max_value=max_contour_value, plot_horizontal=False, 269 | # plot_min_arrow=False, plot_max_arrow=True, fraction_of_axis_length=0.9 270 | # ) 271 | # 272 | # colour_bar_object.set_label('Absolute saliency') 273 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/interpretation/utils.py: -------------------------------------------------------------------------------- 1 | """Helper methods for model interpretation in general.""" 2 | 3 | import copy 4 | import glob 5 | import errno 6 | import time 7 | import calendar 8 | import os.path 9 | import numpy 10 | import netCDF4 11 | from scipy.interpolate import interp1d 12 | from scipy.ndimage.filters import gaussian_filter 13 | 14 | DATE_FORMAT = '%Y%m%d' 15 | DATE_FORMAT_REGEX = '[0-9][0-9][0-9][0-9][0-1][0-9][0-3][0-9]' 16 | 17 | CSV_TARGET_NAME = 'RVORT1_MAX-future_max' 18 | TARGET_NAME = 'max_future_vorticity_s01' 19 | 20 | NETCDF_REFL_NAME = 'REFL_COM_curr' 21 | NETCDF_TEMP_NAME = 'T2_curr' 22 | NETCDF_U_WIND_NAME = 'U10_curr' 23 | NETCDF_V_WIND_NAME = 'V10_curr' 24 | NETCDF_PREDICTOR_NAMES = [ 25 | NETCDF_REFL_NAME, NETCDF_TEMP_NAME, NETCDF_U_WIND_NAME, NETCDF_V_WIND_NAME 26 | ] 27 | 28 | REFLECTIVITY_NAME = 'reflectivity_dbz' 29 | TEMPERATURE_NAME = 'temperature_kelvins' 30 | U_WIND_NAME = 'u_wind_m_s01' 31 | V_WIND_NAME = 'v_wind_m_s01' 32 | 
PREDICTOR_NAMES = [ 33 | REFLECTIVITY_NAME, TEMPERATURE_NAME, U_WIND_NAME, V_WIND_NAME 34 | ] 35 | 36 | NETCDF_TRACK_ID_NAME = 'track_id' 37 | NETCDF_TRACK_STEP_NAME = 'track_step' 38 | NETCDF_TARGET_NAME = 'RVORT1_MAX_future' 39 | 40 | STORM_IDS_KEY = 'storm_ids' 41 | STORM_STEPS_KEY = 'storm_steps' 42 | PREDICTOR_NAMES_KEY = 'predictor_names' 43 | PREDICTOR_MATRIX_KEY = 'predictor_matrix' 44 | TARGET_NAME_KEY = 'target_name' 45 | TARGET_MATRIX_KEY = 'target_matrix' 46 | 47 | HIT_INDICES_KEY = 'hit_indices' 48 | MISS_INDICES_KEY = 'miss_indices' 49 | FALSE_ALARM_INDICES_KEY = 'false_alarm_indices' 50 | CORRECT_NULL_INDICES_KEY = 'correct_null_indices' 51 | 52 | 53 | def _image_file_name_to_date(netcdf_file_name): 54 | """Parses date from name of image (NetCDF) file. 55 | 56 | :param netcdf_file_name: Path to input file. 57 | :return: date_string: Date (format "yyyymmdd"). 58 | """ 59 | 60 | pathless_file_name = os.path.split(netcdf_file_name)[-1] 61 | 62 | date_string = pathless_file_name.replace( 63 | 'NCARSTORM_', '' 64 | ).replace('-0000_d01_model_patches.nc', '') 65 | 66 | # Verify. 67 | time_string_to_unix(time_string=date_string, time_format=DATE_FORMAT) 68 | return date_string 69 | 70 | 71 | def create_directory(directory_name=None, file_name=None): 72 | """Creates directory if necessary (i.e., doesn't already exist). 73 | 74 | This method checks for the argument `directory_name` first. If 75 | `directory_name` is None, this method checks for `file_name` and extracts 76 | the directory. 77 | 78 | :param directory_name: Path to local directory. 79 | :param file_name: Path to local file. 80 | """ 81 | 82 | if directory_name is None: 83 | directory_name = os.path.dirname(file_name) 84 | 85 | if directory_name == '': 86 | return 87 | 88 | try: 89 | os.makedirs(directory_name) 90 | except OSError as this_error: 91 | if this_error.errno == errno.EEXIST and os.path.isdir(directory_name): 92 | pass 93 | else: 94 | raise 95 | 96 | 97 | def apply_gaussian_filter(input_matrix, e_folding_radius_grid_cells): 98 | """Applies Gaussian filter to any-dimensional grid. 99 | 100 | :param input_matrix: numpy array with any dimensions. 101 | :param e_folding_radius_grid_cells: e-folding radius (num grid cells). 102 | :return: output_matrix: numpy array after smoothing (same dimensions as 103 | input). 104 | """ 105 | 106 | assert e_folding_radius_grid_cells >= 0. 107 | return gaussian_filter( 108 | input_matrix, sigma=e_folding_radius_grid_cells, order=0, mode='nearest' 109 | ) 110 | 111 | 112 | def time_string_to_unix(time_string, time_format): 113 | """Converts time from string to Unix format. 114 | 115 | Unix format = seconds since 0000 UTC 1 Jan 1970. 116 | 117 | :param time_string: Time string. 118 | :param time_format: Format of time string (example: "%Y%m%d" or 119 | "%Y-%m-%d-%H%M%S"). 120 | :return: unix_time_sec: Time in Unix format. 121 | """ 122 | 123 | return calendar.timegm(time.strptime(time_string, time_format)) 124 | 125 | 126 | def time_unix_to_string(unix_time_sec, time_format): 127 | """Converts time from Unix format to string. 128 | 129 | Unix format = seconds since 0000 UTC 1 Jan 1970. 130 | 131 | :param unix_time_sec: Time in Unix format. 132 | :param time_format: Desired format of time string (example: "%Y%m%d" or 133 | "%Y-%m-%d-%H%M%S"). 134 | :return: time_string: Time string. 
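
    Example: unix_time_sec = 0 and time_format = "%Y%m%d" yield "19700101".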
135 | """ 136 | 137 | return time.strftime(time_format, time.gmtime(unix_time_sec)) 138 | 139 | 140 | def find_many_image_files(first_date_string, last_date_string, image_dir_name): 141 | """Finds image (NetCDF) files in the given date range. 142 | 143 | :param first_date_string: First date ("yyyymmdd") in range. 144 | :param last_date_string: Last date ("yyyymmdd") in range. 145 | :param image_dir_name: Name of directory with image (NetCDF) files. 146 | :return: netcdf_file_names: 1-D list of paths to image files. 147 | """ 148 | 149 | first_time_unix_sec = time_string_to_unix( 150 | time_string=first_date_string, time_format=DATE_FORMAT 151 | ) 152 | last_time_unix_sec = time_string_to_unix( 153 | time_string=last_date_string, time_format=DATE_FORMAT 154 | ) 155 | 156 | netcdf_file_pattern = ( 157 | '{0:s}/NCARSTORM_{1:s}-0000_d01_model_patches.nc' 158 | ).format(image_dir_name, DATE_FORMAT_REGEX) 159 | 160 | netcdf_file_names = glob.glob(netcdf_file_pattern) 161 | netcdf_file_names.sort() 162 | 163 | file_date_strings = [_image_file_name_to_date(f) for f in netcdf_file_names] 164 | file_times_unix_sec = numpy.array([ 165 | time_string_to_unix(time_string=d, time_format=DATE_FORMAT) 166 | for d in file_date_strings 167 | ], dtype=int) 168 | 169 | good_indices = numpy.where(numpy.logical_and( 170 | file_times_unix_sec >= first_time_unix_sec, 171 | file_times_unix_sec <= last_time_unix_sec 172 | ))[0] 173 | 174 | return [netcdf_file_names[k] for k in good_indices] 175 | 176 | 177 | def read_image_file(netcdf_file_name): 178 | """Reads storm-centered images from NetCDF file. 179 | 180 | E = number of examples (storm objects) in file 181 | M = number of rows in each storm-centered grid 182 | N = number of columns in each storm-centered grid 183 | C = number of channels (predictor variables) 184 | 185 | :param netcdf_file_name: Path to input file. 186 | :return: image_dict: Dictionary with the following keys. 187 | image_dict['storm_ids']: length-E list of storm IDs (integers). 188 | image_dict['storm_steps']: length-E numpy array of storm steps (integers). 189 | image_dict['predictor_names']: length-C list of predictor names. 190 | image_dict['predictor_matrix']: E-by-M-by-N-by-C numpy array of predictor 191 | values. 192 | image_dict['target_name']: Name of target variable. 193 | image_dict['target_matrix']: E-by-M-by-N numpy array of target values. 194 | """ 195 | 196 | dataset_object = netCDF4.Dataset(netcdf_file_name) 197 | 198 | storm_ids = numpy.array( 199 | dataset_object.variables[NETCDF_TRACK_ID_NAME][:], dtype=int 200 | ) 201 | storm_steps = numpy.array( 202 | dataset_object.variables[NETCDF_TRACK_STEP_NAME][:], dtype=int 203 | ) 204 | 205 | predictor_matrix = None 206 | 207 | for this_predictor_name in NETCDF_PREDICTOR_NAMES: 208 | this_predictor_matrix = numpy.array( 209 | dataset_object.variables[this_predictor_name][:], dtype=float 210 | ) 211 | this_predictor_matrix = numpy.expand_dims( 212 | this_predictor_matrix, axis=-1 213 | ) 214 | 215 | if predictor_matrix is None: 216 | predictor_matrix = this_predictor_matrix + 0. 
217 | else: 218 | predictor_matrix = numpy.concatenate( 219 | (predictor_matrix, this_predictor_matrix), axis=-1 220 | ) 221 | 222 | target_matrix = numpy.array( 223 | dataset_object.variables[NETCDF_TARGET_NAME][:], dtype=float 224 | ) 225 | 226 | return { 227 | STORM_IDS_KEY: storm_ids, 228 | STORM_STEPS_KEY: storm_steps, 229 | PREDICTOR_NAMES_KEY: PREDICTOR_NAMES, 230 | PREDICTOR_MATRIX_KEY: predictor_matrix, 231 | TARGET_NAME_KEY: TARGET_NAME, 232 | TARGET_MATRIX_KEY: target_matrix 233 | } 234 | 235 | 236 | def read_many_image_files(netcdf_file_names): 237 | """Reads storm-centered images from many NetCDF files. 238 | 239 | :param netcdf_file_names: 1-D list of paths to input files. 240 | :return: image_dict: See doc for `read_image_file`. 241 | """ 242 | 243 | image_dict = None 244 | keys_to_concat = [ 245 | STORM_IDS_KEY, STORM_STEPS_KEY, PREDICTOR_MATRIX_KEY, TARGET_MATRIX_KEY 246 | ] 247 | 248 | for this_file_name in netcdf_file_names: 249 | print('Reading data from: "{0:s}"...'.format(this_file_name)) 250 | this_image_dict = read_image_file(this_file_name) 251 | 252 | if image_dict is None: 253 | image_dict = copy.deepcopy(this_image_dict) 254 | continue 255 | 256 | for this_key in keys_to_concat: 257 | image_dict[this_key] = numpy.concatenate(( 258 | image_dict[this_key], this_image_dict[this_key] 259 | ), axis=0) 260 | 261 | return image_dict 262 | 263 | 264 | def find_extreme_examples( 265 | class_labels, event_probabilities, num_examples_per_set): 266 | """Finds extreme examples. 267 | 268 | There are four sets of examples: 269 | 270 | - best hits 271 | - worst false alarms 272 | - worst misses 273 | - best correct nulls 274 | 275 | E = total number of examples 276 | e = number of examples per set 277 | 278 | :param class_labels: length-E numpy array of class labels (1 for event, 0 279 | for non-event). 280 | :param event_probabilities: length-E numpy array of event probabilities. 281 | :param num_examples_per_set: Number of examples in each set. 282 | 283 | :return: extreme_dict: Dictionary with the following keys. 284 | extreme_dict['hit_indices']: length-e numpy array with indices of best hits. 285 | extreme_dict['miss_indices']: length-e numpy array with indices of worst 286 | misses. 287 | extreme_dict['false_alarm_indices']: length-e numpy array with indices of 288 | worst false alarms. 289 | extreme_dict['correct_null_indices']: length-e numpy array with indices of 290 | best correct nulls. 291 | """ 292 | 293 | # Check input args. 294 | class_labels = numpy.round(class_labels).astype(int) 295 | assert numpy.all(class_labels >= 0) 296 | assert numpy.all(class_labels <= 1) 297 | assert len(class_labels.shape) == 1 298 | 299 | num_examples_total = len(class_labels) 300 | 301 | assert numpy.all(event_probabilities >= 0.) 302 | assert numpy.all(event_probabilities <= 1.) 
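    # Labels and probabilities must be 1-D arrays of the same length.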
303 | assert len(event_probabilities.shape) == 1 304 | assert len(event_probabilities) == num_examples_total 305 | 306 | num_examples_per_set = int(numpy.round(num_examples_per_set)) 307 | assert num_examples_per_set > 0 308 | 309 | positive_indices = numpy.where(class_labels == 1)[0] 310 | negative_indices = numpy.where(class_labels == 0)[0] 311 | 312 | num_hits = min([ 313 | num_examples_per_set, len(positive_indices) 314 | ]) 315 | num_misses = min([ 316 | num_examples_per_set, len(positive_indices) 317 | ]) 318 | num_false_alarms = min([ 319 | num_examples_per_set, len(negative_indices) 320 | ]) 321 | num_correct_nulls = min([ 322 | num_examples_per_set, len(negative_indices) 323 | ]) 324 | 325 | these_indices = numpy.argsort(-1 * event_probabilities[positive_indices]) 326 | hit_indices = positive_indices[these_indices][:num_hits] 327 | print('Average event probability for {0:d} best hits = {1:.4f}'.format( 328 | num_hits, numpy.mean(event_probabilities[hit_indices]) 329 | )) 330 | 331 | these_indices = numpy.argsort(event_probabilities[positive_indices]) 332 | miss_indices = positive_indices[these_indices][:num_misses] 333 | print('Average event probability for {0:d} worst misses = {1:.4f}'.format( 334 | num_misses, numpy.mean(event_probabilities[miss_indices]) 335 | )) 336 | 337 | these_indices = numpy.argsort(-1 * event_probabilities[negative_indices]) 338 | false_alarm_indices = negative_indices[these_indices][:num_false_alarms] 339 | print(( 340 | 'Average event probability for {0:d} worst false alarms = {1:.4f}' 341 | ).format( 342 | num_false_alarms, numpy.mean(event_probabilities[false_alarm_indices]) 343 | )) 344 | 345 | these_indices = numpy.argsort(event_probabilities[negative_indices]) 346 | correct_null_indices = negative_indices[these_indices][:num_correct_nulls] 347 | print(( 348 | 'Average event probability for {0:d} best correct nulls = {1:.4f}' 349 | ).format( 350 | num_correct_nulls, numpy.mean(event_probabilities[correct_null_indices]) 351 | )) 352 | 353 | return { 354 | HIT_INDICES_KEY: hit_indices, 355 | MISS_INDICES_KEY: miss_indices, 356 | FALSE_ALARM_INDICES_KEY: false_alarm_indices, 357 | CORRECT_NULL_INDICES_KEY: correct_null_indices 358 | } 359 | 360 | 361 | def run_pmm_one_variable(field_matrix, max_percentile_level=99.): 362 | """Applies PMM (probability-matched means) to one variable. 363 | 364 | :param field_matrix: numpy array with data to be averaged. The first axis 365 | should represent examples, and remaining axes should represent spatial 366 | dimensions. 367 | :param max_percentile_level: Maximum percentile. No output value will 368 | exceed the [q]th percentile of `field_matrix`, where q = 369 | `max_percentile_level`. Similarly, no output value will be less than 370 | the [100 - q]th percentile of `field_matrix`. 371 | :return: mean_field_matrix: numpy array with average spatial field. 372 | Dimensions are the same as `field_matrix`, except that the first axis is 373 | gone. For instance, if `field_matrix` is 1000 x 32 x 32 (1000 examples 374 | x 32 rows x 32 columns), `mean_field_matrix` will be 32 x 32. 375 | """ 376 | 377 | assert not numpy.any(numpy.isnan(field_matrix)) 378 | assert len(field_matrix.shape) > 1 379 | assert max_percentile_level >= 90. 380 | assert max_percentile_level < 100. 381 | 382 | # Pool values over all dimensions and remove extremes. 
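    # Trimming both tails at `max_percentile_level` (default 99) keeps a few
    # extreme values from stretching the probability-matched mean: values
    # above the q-th percentile and below the (100 - q)-th percentile of the
    # pooled array are discarded before the percentile mapping below.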
383 | pooled_values = numpy.sort(numpy.ravel(field_matrix)) 384 | max_pooled_value = numpy.percentile(pooled_values, max_percentile_level) 385 | pooled_values = pooled_values[pooled_values <= max_pooled_value] 386 | 387 | min_pooled_value = numpy.percentile( 388 | pooled_values, 100 - max_percentile_level 389 | ) 390 | pooled_values = pooled_values[pooled_values >= min_pooled_value] 391 | 392 | # Find ensemble mean at each location (e.g., grid point). 393 | mean_field_matrix = numpy.mean(field_matrix, axis=0) 394 | mean_field_flattened = numpy.ravel(mean_field_matrix) 395 | 396 | # At each location, replace ensemble mean with the same percentile from the 397 | # pooled array. 398 | pooled_value_percentiles = numpy.linspace( 399 | 0, 100, num=len(pooled_values), dtype=float 400 | ) 401 | mean_value_percentiles = numpy.linspace( 402 | 0, 100, num=len(mean_field_flattened), dtype=float 403 | ) 404 | 405 | sort_indices = numpy.argsort(mean_field_flattened) 406 | unsort_indices = numpy.argsort(sort_indices) 407 | 408 | interp_object = interp1d( 409 | pooled_value_percentiles, pooled_values, kind='linear', 410 | bounds_error=True, assume_sorted=True 411 | ) 412 | 413 | mean_field_flattened = interp_object(mean_value_percentiles) 414 | mean_field_flattened = mean_field_flattened[unsort_indices] 415 | mean_field_matrix = numpy.reshape( 416 | mean_field_flattened, mean_field_matrix.shape 417 | ) 418 | 419 | return mean_field_matrix 420 | 421 | 422 | def run_pmm_many_variables(field_matrix, max_percentile_level=99.): 423 | """Applies PMM (probability-matched means) to each variable. 424 | 425 | :param field_matrix: numpy array with data to be averaged. The first axis 426 | should represent examples; the last axis should represent variables; and 427 | remaining axes should represent spatial dimensions. 428 | :param max_percentile_level: See doc for `run_pmm_one_variable`. 429 | :return: mean_field_matrix: numpy array with average spatial fields. 430 | Dimensions are the same as `field_matrix`, except that the first axis is 431 | gone. For instance, if `field_matrix` is 1000 x 32 x 32 x 4 432 | (1000 examples x 32 rows x 32 columns x 4 variables), 433 | `mean_field_matrix` will be 32 x 32 x 4. 
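
    Example (a minimal sketch with random data; the shapes are illustrative
    only):

        field_matrix = numpy.random.normal(size=(1000, 32, 32, 4))
        mean_field_matrix = run_pmm_many_variables(field_matrix)
        # mean_field_matrix.shape == (32, 32, 4)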
434 | """ 435 | 436 | assert len(field_matrix.shape) > 2 437 | 438 | num_variables = field_matrix.shape[-1] 439 | mean_field_matrix = numpy.full(field_matrix.shape[1:], numpy.nan) 440 | 441 | for k in range(num_variables): 442 | mean_field_matrix[..., k] = run_pmm_one_variable( 443 | field_matrix=field_matrix[..., k], 444 | max_percentile_level=max_percentile_level 445 | ) 446 | 447 | return mean_field_matrix 448 | -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/lak_permutation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/lak_permutation.gif -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/model_components.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/model_components.png -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/pretrained_cnn/pretrained_cnn.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/pretrained_cnn/pretrained_cnn.h5 -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/pretrained_cnn/pretrained_cnn_metadata.json: -------------------------------------------------------------------------------- 1 | {"normalization_dict": {"v_wind_m_s01": [0.8430999384858744, 5.02621590127866], "u_wind_m_s01": [-0.2901194096210985, 4.6688756920528895], "temperature_kelvins": [290.34299022259927, 7.613423606954989], "reflectivity_dbz": [22.68552537091767, 15.761682079862304]}, "validation_file_names": ["/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150331-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150416-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150422-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150505-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150510-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150523-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150528-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150605-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150612-0000_d01_model_patches.nc", 
"/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150620-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150625-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150630-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150706-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150712-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20151031-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20151227-0000_d01_model_patches.nc"], "training_file_names": ["/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20101024-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20101122-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110201-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110308-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110326-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110404-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110414-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110420-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110425-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110509-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110522-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110527-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110605-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110610-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110615-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110620-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110625-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110704-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110712-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20111116-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120218-0000_d01_model_patches.nc", 
"/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120315-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120323-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120401-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120409-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120426-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120503-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120510-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120529-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120606-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120622-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120701-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120706-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120715-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20121225-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130318-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130331-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130411-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130429-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130513-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130519-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130527-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130602-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130613-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130619-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130625-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130701-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130708-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130715-0000_d01_model_patches.nc", 
"/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140220-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140328-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140407-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140425-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140508-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140514-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140526-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140604-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140609-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140617-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140622-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140628-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140705-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140710-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20141123-0000_d01_model_patches.nc"], "num_training_batches_per_epoch": 32, "binarization_threshold": 0.005431033764034511, "num_validation_batches_per_epoch": 16, "num_examples_per_batch": 1024} -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/pretrained_cnn/pretrained_ucn.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/pretrained_cnn/pretrained_ucn.h5 -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/pretrained_cnn/pretrained_ucn_metadata.json: -------------------------------------------------------------------------------- 1 | {"normalization_dict": {"v_wind_m_s01": [0.8430999384858744, 5.02621590127866], "u_wind_m_s01": [-0.2901194096210985, 4.6688756920528895], "temperature_kelvins": [290.34299022259927, 7.613423606954989], "reflectivity_dbz": [22.68552537091767, 15.761682079862304]}, "validation_file_names": ["/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150331-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150416-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150422-0000_d01_model_patches.nc", 
"/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150505-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150510-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150523-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150528-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150605-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150612-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150620-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150625-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150630-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150706-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20150712-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20151031-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20151227-0000_d01_model_patches.nc"], "training_file_names": ["/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20101024-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20101122-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110201-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110308-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110326-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110404-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110414-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110420-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110425-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110509-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110522-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110527-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110605-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110610-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110615-0000_d01_model_patches.nc", 
"/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110620-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110625-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110704-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20110712-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20111116-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120218-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120315-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120323-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120401-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120409-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120426-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120503-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120510-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120529-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120606-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120622-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120701-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120706-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20120715-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20121225-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130318-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130331-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130411-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130429-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130513-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130519-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130527-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130602-0000_d01_model_patches.nc", 
"/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130613-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130619-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130625-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130701-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130708-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20130715-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140220-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140328-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140407-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140425-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140508-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140514-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140526-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140604-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140609-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140617-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140622-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140628-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140705-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20140710-0000_d01_model_patches.nc", "/condo/swatwork/ralager/ams2019_short_course/track_data_ncar_ams_3km_nc_small/NCARSTORM_20141123-0000_d01_model_patches.nc"], "cnn_file_name": "/condo/swatwork/ralager/ams2019_short_course/pretrained_cnn/pretrained_cnn.h5", "num_training_batches_per_epoch": 32, "num_validation_batches_per_epoch": 16, "cnn_feature_layer_name": "flatten_1", "num_examples_per_batch": 1024} -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/wind_barb_explainer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Advanced_Topics_In_Machine_Learning/ML_Model_Interpretation/wind_barb_explainer.png -------------------------------------------------------------------------------- /Advanced_Topics_In_Machine_Learning/README.md: -------------------------------------------------------------------------------- 1 | 
Machine Learning In Python For Environmental Science Problems Short Course: Advanced Topics In Machine Learning 2 | 3 | -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/.DS_Store -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Data_Science_Fundamentals/.DS_Store -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/README.md: -------------------------------------------------------------------------------- 1 | # Data Science Fundamentals 2 | 3 | -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/__pycache__/attributes_diagrams.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Data_Science_Fundamentals/__pycache__/attributes_diagrams.cpython-36.pyc -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/__pycache__/performance_diagrams.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Data_Science_Fundamentals/__pycache__/performance_diagrams.cpython-36.pyc -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/__pycache__/roc_curves.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Data_Science_Fundamentals/__pycache__/roc_curves.cpython-36.pyc -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Data_Science_Fundamentals/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/attributes_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Data_Science_Fundamentals/attributes_diagram.png 
-------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/contingency_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Data_Science_Fundamentals/contingency_table.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/ct_scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Data_Science_Fundamentals/ct_scores.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Data_Science_Fundamentals/overfitting.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/performance_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Data_Science_Fundamentals/performance_diagram.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/performance_diagrams.py: -------------------------------------------------------------------------------- 1 | """Methods for plotting performance diagram.""" 2 | 3 | import numpy 4 | import matplotlib.colors 5 | import matplotlib.pyplot as pyplot 6 | 7 | DEFAULT_LINE_COLOUR = numpy.array([228, 26, 28], dtype=float) / 255 8 | DEFAULT_LINE_WIDTH = 3 9 | DEFAULT_BIAS_LINE_COLOUR = numpy.full(3, 152. / 255) 10 | DEFAULT_BIAS_LINE_WIDTH = 2 11 | 12 | LEVELS_FOR_CSI_CONTOURS = numpy.linspace(0, 1, num=11, dtype=float) 13 | LEVELS_FOR_BIAS_CONTOURS = numpy.array( 14 | [0.25, 0.5, 0.75, 1., 1.5, 2., 3., 5.]) 15 | 16 | BIAS_STRING_FORMAT = '%.2f' 17 | BIAS_LABEL_PADDING_PX = 10 18 | 19 | FIGURE_WIDTH_INCHES = 10 20 | FIGURE_HEIGHT_INCHES = 10 21 | 22 | FONT_SIZE = 20 23 | pyplot.rc('font', size=FONT_SIZE) 24 | pyplot.rc('axes', titlesize=FONT_SIZE) 25 | pyplot.rc('axes', labelsize=FONT_SIZE) 26 | pyplot.rc('xtick', labelsize=FONT_SIZE) 27 | pyplot.rc('ytick', labelsize=FONT_SIZE) 28 | pyplot.rc('legend', fontsize=FONT_SIZE) 29 | pyplot.rc('figure', titlesize=FONT_SIZE) 30 | 31 | 32 | def _get_sr_pod_grid(success_ratio_spacing=0.01, pod_spacing=0.01): 33 | """Creates grid in SR-POD (success ratio / probability of detection) space. 34 | 35 | M = number of rows (unique POD values) in grid 36 | N = number of columns (unique success ratios) in grid 37 | 38 | :param success_ratio_spacing: Spacing between grid cells in adjacent 39 | columns. 40 | :param pod_spacing: Spacing between grid cells in adjacent rows. 41 | :return: success_ratio_matrix: M-by-N numpy array of success ratios. 42 | Success ratio increases with column index. 
43 | :return: pod_matrix: M-by-N numpy array of POD values. POD decreases with 44 | row index. 45 | """ 46 | 47 | num_success_ratios = 1 + int(numpy.ceil(1. / success_ratio_spacing)) 48 | num_pod_values = 1 + int(numpy.ceil(1. / pod_spacing)) 49 | 50 | unique_success_ratios = numpy.linspace(0., 1., num=num_success_ratios) 51 | unique_pod_values = numpy.linspace(0., 1., num=num_pod_values)[::-1] 52 | return numpy.meshgrid(unique_success_ratios, unique_pod_values) 53 | 54 | 55 | def _csi_from_sr_and_pod(success_ratio_array, pod_array): 56 | """Computes CSI (critical success index) from success ratio and POD. 57 | 58 | POD = probability of detection 59 | 60 | :param success_ratio_array: numpy array (any shape) of success ratios. 61 | :param pod_array: numpy array (same shape) of POD values. 62 | :return: csi_array: numpy array (same shape) of CSI values. 63 | """ 64 | 65 | return (success_ratio_array ** -1 + pod_array ** -1 - 1.) ** -1 66 | 67 | 68 | def _bias_from_sr_and_pod(success_ratio_array, pod_array): 69 | """Computes frequency bias from success ratio and POD. 70 | 71 | POD = probability of detection 72 | 73 | :param success_ratio_array: numpy array (any shape) of success ratios. 74 | :param pod_array: numpy array (same shape) of POD values. 75 | :return: frequency_bias_array: numpy array (same shape) of frequency biases. 76 | """ 77 | 78 | return pod_array / success_ratio_array 79 | 80 | 81 | def _get_csi_colour_scheme(): 82 | """Returns colour scheme for CSI (critical success index). 83 | 84 | :return: colour_map_object: Colour scheme (instance of 85 | `matplotlib.colors.ListedColormap`). 86 | :return: colour_norm_object: Instance of `matplotlib.colors.BoundaryNorm`, 87 | defining the scale of the colour map. 88 | """ 89 | 90 | this_colour_map_object = pyplot.cm.Blues 91 | this_colour_norm_object = matplotlib.colors.BoundaryNorm( 92 | LEVELS_FOR_CSI_CONTOURS, this_colour_map_object.N) 93 | 94 | rgba_matrix = this_colour_map_object(this_colour_norm_object( 95 | LEVELS_FOR_CSI_CONTOURS)) 96 | colour_list = [ 97 | rgba_matrix[i, ..., :-1] for i in range(rgba_matrix.shape[0]) 98 | ] 99 | 100 | colour_map_object = matplotlib.colors.ListedColormap(colour_list) 101 | colour_map_object.set_under(numpy.array([1, 1, 1])) 102 | colour_norm_object = matplotlib.colors.BoundaryNorm( 103 | LEVELS_FOR_CSI_CONTOURS, colour_map_object.N) 104 | 105 | return colour_map_object, colour_norm_object 106 | 107 | 108 | def _add_colour_bar( 109 | axes_object, colour_map_object, values_to_colour, min_colour_value, 110 | max_colour_value, colour_norm_object=None, 111 | orientation_string='vertical', extend_min=True, extend_max=True, 112 | fraction_of_axis_length=1., font_size=FONT_SIZE): 113 | """Adds colour bar to existing axes. 114 | 115 | :param axes_object: Existing axes (instance of 116 | `matplotlib.axes._subplots.AxesSubplot`). 117 | :param colour_map_object: Colour scheme (instance of 118 | `matplotlib.pyplot.cm`). 119 | :param values_to_colour: numpy array of values to colour. 120 | :param min_colour_value: Minimum value in colour map. 121 | :param max_colour_value: Max value in colour map. 122 | :param colour_norm_object: Instance of `matplotlib.colors.BoundaryNorm`, 123 | defining the scale of the colour map. If `colour_norm_object is None`, 124 | will assume that scale is linear. 125 | :param orientation_string: Orientation of colour bar ("vertical" or 126 | "horizontal"). 127 | :param extend_min: Boolean flag. If True, the bottom of the colour bar will 128 | have an arrow. 
If False, it will be a flat line, suggesting that lower 129 | values are not possible. 130 | :param extend_max: Same but for top of colour bar. 131 | :param fraction_of_axis_length: Fraction of axis length (y-axis if 132 | orientation is "vertical", x-axis if orientation is "horizontal") 133 | occupied by colour bar. 134 | :param font_size: Font size for labels on colour bar. 135 | :return: colour_bar_object: Colour bar (instance of 136 | `matplotlib.pyplot.colorbar`) created by this method. 137 | """ 138 | 139 | if colour_norm_object is None: 140 | colour_norm_object = matplotlib.colors.Normalize( 141 | vmin=min_colour_value, vmax=max_colour_value, clip=False) 142 | 143 | scalar_mappable_object = pyplot.cm.ScalarMappable( 144 | cmap=colour_map_object, norm=colour_norm_object) 145 | scalar_mappable_object.set_array(values_to_colour) 146 | 147 | if extend_min and extend_max: 148 | extend_string = 'both' 149 | elif extend_min: 150 | extend_string = 'min' 151 | elif extend_max: 152 | extend_string = 'max' 153 | else: 154 | extend_string = 'neither' 155 | 156 | if orientation_string == 'horizontal': 157 | padding = 0.075 158 | else: 159 | padding = 0.05 160 | 161 | colour_bar_object = pyplot.colorbar( 162 | ax=axes_object, mappable=scalar_mappable_object, 163 | orientation=orientation_string, pad=padding, extend=extend_string, 164 | shrink=fraction_of_axis_length) 165 | 166 | colour_bar_object.ax.tick_params(labelsize=font_size) 167 | return colour_bar_object 168 | 169 | 170 | def get_points_in_perf_diagram(observed_labels, forecast_probabilities): 171 | """Creates points for performance diagram. 172 | 173 | E = number of examples 174 | T = number of binarization thresholds 175 | 176 | :param observed_labels: length-E numpy array of class labels (integers in 177 | 0...1). 178 | :param forecast_probabilities: length-E numpy array with forecast 179 | probabilities of label = 1. 180 | :return: pod_by_threshold: length-T numpy array of POD (probability of 181 | detection) values. 182 | :return: success_ratio_by_threshold: length-T numpy array of success ratios. 
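
    Example (a minimal sketch; the labels and probabilities are made up):

        observed_labels = numpy.array([0, 1, 1, 0, 1], dtype=int)
        forecast_probabilities = numpy.array([0.1, 0.9, 0.6, 0.4, 0.2])
        pod_by_threshold, success_ratio_by_threshold = (
            get_points_in_perf_diagram(
                observed_labels=observed_labels,
                forecast_probabilities=forecast_probabilities)
        )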
183 | """ 184 | 185 | assert numpy.all(numpy.logical_or( 186 | observed_labels == 0, observed_labels == 1 187 | )) 188 | 189 | assert numpy.all(numpy.logical_and( 190 | forecast_probabilities >= 0, forecast_probabilities <= 1 191 | )) 192 | 193 | observed_labels = observed_labels.astype(int) 194 | binarization_thresholds = numpy.linspace(0, 1, num=1001, dtype=float) 195 | 196 | num_thresholds = len(binarization_thresholds) 197 | pod_by_threshold = numpy.full(num_thresholds, numpy.nan) 198 | success_ratio_by_threshold = numpy.full(num_thresholds, numpy.nan) 199 | 200 | for k in range(num_thresholds): 201 | these_forecast_labels = ( 202 | forecast_probabilities >= binarization_thresholds[k] 203 | ).astype(int) 204 | 205 | this_num_hits = numpy.sum(numpy.logical_and( 206 | these_forecast_labels == 1, observed_labels == 1 207 | )) 208 | 209 | this_num_false_alarms = numpy.sum(numpy.logical_and( 210 | these_forecast_labels == 1, observed_labels == 0 211 | )) 212 | 213 | this_num_misses = numpy.sum(numpy.logical_and( 214 | these_forecast_labels == 0, observed_labels == 1 215 | )) 216 | 217 | try: 218 | pod_by_threshold[k] = ( 219 | float(this_num_hits) / (this_num_hits + this_num_misses) 220 | ) 221 | except ZeroDivisionError: 222 | pass 223 | 224 | try: 225 | success_ratio_by_threshold[k] = ( 226 | float(this_num_hits) / (this_num_hits + this_num_false_alarms) 227 | ) 228 | except ZeroDivisionError: 229 | pass 230 | 231 | pod_by_threshold = numpy.array([1.] + pod_by_threshold.tolist() + [0.]) 232 | success_ratio_by_threshold = numpy.array( 233 | [0.] + success_ratio_by_threshold.tolist() + [1.] 234 | ) 235 | 236 | return pod_by_threshold, success_ratio_by_threshold 237 | 238 | 239 | def plot_performance_diagram( 240 | observed_labels, forecast_probabilities, 241 | line_colour=DEFAULT_LINE_COLOUR, line_width=DEFAULT_LINE_WIDTH, 242 | bias_line_colour=DEFAULT_BIAS_LINE_COLOUR, 243 | bias_line_width=DEFAULT_BIAS_LINE_WIDTH, axes_object=None): 244 | """Plots performance diagram. 245 | 246 | E = number of examples 247 | 248 | :param observed_labels: length-E numpy array of class labels (integers in 249 | 0...1). 250 | :param forecast_probabilities: length-E numpy array with forecast 251 | probabilities of label = 1. 252 | :param line_colour: Colour (in any format accepted by `matplotlib.colors`). 253 | :param line_width: Line width (real positive number). 254 | :param bias_line_colour: Colour of contour lines for frequency bias. 255 | :param bias_line_width: Width of contour lines for frequency bias. 256 | :param axes_object: Will plot on these axes (instance of 257 | `matplotlib.axes._subplots.AxesSubplot`). If `axes_object is None`, 258 | will create new axes. 259 | :return: pod_by_threshold: See doc for `get_points_in_perf_diagram`. 260 | detection) values. 261 | :return: success_ratio_by_threshold: Same. 
262 | """ 263 | 264 | pod_by_threshold, success_ratio_by_threshold = get_points_in_perf_diagram( 265 | observed_labels=observed_labels, 266 | forecast_probabilities=forecast_probabilities) 267 | 268 | if axes_object is None: 269 | _, axes_object = pyplot.subplots( 270 | 1, 1, figsize=(FIGURE_WIDTH_INCHES, FIGURE_HEIGHT_INCHES) 271 | ) 272 | 273 | success_ratio_matrix, pod_matrix = _get_sr_pod_grid() 274 | csi_matrix = _csi_from_sr_and_pod(success_ratio_matrix, pod_matrix) 275 | frequency_bias_matrix = _bias_from_sr_and_pod( 276 | success_ratio_matrix, pod_matrix) 277 | 278 | this_colour_map_object, this_colour_norm_object = _get_csi_colour_scheme() 279 | 280 | pyplot.contourf( 281 | success_ratio_matrix, pod_matrix, csi_matrix, LEVELS_FOR_CSI_CONTOURS, 282 | cmap=this_colour_map_object, norm=this_colour_norm_object, vmin=0., 283 | vmax=1., axes=axes_object) 284 | 285 | colour_bar_object = _add_colour_bar( 286 | axes_object=axes_object, colour_map_object=this_colour_map_object, 287 | colour_norm_object=this_colour_norm_object, 288 | values_to_colour=csi_matrix, min_colour_value=0., 289 | max_colour_value=1., orientation_string='vertical', 290 | extend_min=False, extend_max=False) 291 | colour_bar_object.set_label('CSI (critical success index)') 292 | 293 | bias_colour_tuple = () 294 | for _ in range(len(LEVELS_FOR_BIAS_CONTOURS)): 295 | bias_colour_tuple += (bias_line_colour,) 296 | 297 | bias_contour_object = pyplot.contour( 298 | success_ratio_matrix, pod_matrix, frequency_bias_matrix, 299 | LEVELS_FOR_BIAS_CONTOURS, colors=bias_colour_tuple, 300 | linewidths=bias_line_width, linestyles='dashed', axes=axes_object) 301 | pyplot.clabel( 302 | bias_contour_object, inline=True, inline_spacing=BIAS_LABEL_PADDING_PX, 303 | fmt=BIAS_STRING_FORMAT, fontsize=FONT_SIZE) 304 | 305 | nan_flags = numpy.logical_or( 306 | numpy.isnan(success_ratio_by_threshold), numpy.isnan(pod_by_threshold) 307 | ) 308 | 309 | if not numpy.all(nan_flags): 310 | real_indices = numpy.where(numpy.invert(nan_flags))[0] 311 | axes_object.plot( 312 | success_ratio_by_threshold[real_indices], 313 | pod_by_threshold[real_indices], color=line_colour, 314 | linestyle='solid', linewidth=line_width) 315 | 316 | axes_object.set_xlabel('Success ratio (1 - FAR)') 317 | axes_object.set_ylabel('POD (probability of detection)') 318 | axes_object.set_xlim(0., 1.) 319 | axes_object.set_ylim(0., 1.) 
320 | 321 | return pod_by_threshold, success_ratio_by_threshold 322 | -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/roc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Data_Science_Fundamentals/roc.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Data_Science_Fundamentals/roc_curves.py: -------------------------------------------------------------------------------- 1 | """Methods for plotting ROC (receiver operating characteristic) curve.""" 2 | 3 | import numpy 4 | import matplotlib.colors 5 | import matplotlib.pyplot as pyplot 6 | import Introduction_To_Machine_Learning.Data_Science_Fundamentals.performance_diagrams as performance_diagrams 7 | 8 | DEFAULT_LINE_COLOUR = numpy.array([228, 26, 28], dtype=float) / 255 9 | DEFAULT_LINE_WIDTH = 3 10 | DEFAULT_RANDOM_LINE_COLOUR = numpy.full(3, 152.0 / 255) 11 | DEFAULT_RANDOM_LINE_WIDTH = 2 12 | 13 | LEVELS_FOR_PEIRCE_CONTOURS = numpy.linspace(0, 1, num=11, dtype=float) 14 | 15 | FIGURE_WIDTH_INCHES = 10 16 | FIGURE_HEIGHT_INCHES = 10 17 | 18 | FONT_SIZE = 20 19 | pyplot.rc("font", size=FONT_SIZE) 20 | pyplot.rc("axes", titlesize=FONT_SIZE) 21 | pyplot.rc("axes", labelsize=FONT_SIZE) 22 | pyplot.rc("xtick", labelsize=FONT_SIZE) 23 | pyplot.rc("ytick", labelsize=FONT_SIZE) 24 | pyplot.rc("legend", fontsize=FONT_SIZE) 25 | pyplot.rc("figure", titlesize=FONT_SIZE) 26 | 27 | 28 | def _get_pofd_pod_grid(pofd_spacing=0.01, pod_spacing=0.01): 29 | """Creates grid in POFD-POD space. 30 | 31 | M = number of rows (unique POD values) in grid 32 | N = number of columns (unique POFD values) in grid 33 | 34 | :param pofd_spacing: Spacing between grid cells in adjacent columns. 35 | :param pod_spacing: Spacing between grid cells in adjacent rows. 36 | :return: pofd_matrix: M-by-N numpy array of POFD values. 37 | :return: pod_matrix: M-by-N numpy array of POD values. 38 | """ 39 | 40 | num_pofd_values = 1 + int(numpy.ceil(1.0 / pofd_spacing)) 41 | num_pod_values = 1 + int(numpy.ceil(1.0 / pod_spacing)) 42 | 43 | unique_pofd_values = numpy.linspace(0.0, 1.0, num=num_pofd_values) 44 | unique_pod_values = numpy.linspace(0.0, 1.0, num=num_pod_values)[::-1] 45 | return numpy.meshgrid(unique_pofd_values, unique_pod_values) 46 | 47 | 48 | def _get_peirce_colour_scheme(): 49 | """Returns colour scheme for Peirce score. 50 | 51 | :return: colour_map_object: Colour scheme (instance of 52 | `matplotlib.colors.ListedColormap`). 53 | :return: colour_norm_object: Instance of `matplotlib.colors.BoundaryNorm`, 54 | defining the scale of the colour map. 
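
    (The Peirce score, also called the true skill statistic, is POD minus
    POFD; values below the lowest contour level are drawn in white via
    `set_under`.)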
55 | """ 56 | 57 | this_colour_map_object = pyplot.cm.Blues 58 | this_colour_norm_object = matplotlib.colors.BoundaryNorm( 59 | LEVELS_FOR_PEIRCE_CONTOURS, this_colour_map_object.N 60 | ) 61 | 62 | rgba_matrix = this_colour_map_object( 63 | this_colour_norm_object(LEVELS_FOR_PEIRCE_CONTOURS) 64 | ) 65 | 66 | colour_list = [rgba_matrix[i, ..., :-1] for i in range(rgba_matrix.shape[0])] 67 | 68 | colour_map_object = matplotlib.colors.ListedColormap(colour_list) 69 | colour_map_object.set_under(numpy.array([1, 1, 1])) 70 | colour_norm_object = matplotlib.colors.BoundaryNorm( 71 | LEVELS_FOR_PEIRCE_CONTOURS, colour_map_object.N 72 | ) 73 | 74 | return colour_map_object, colour_norm_object 75 | 76 | 77 | def get_points_in_roc_curve(observed_labels, forecast_probabilities): 78 | """Creates points for ROC curve. 79 | 80 | E = number of examples 81 | T = number of binarization thresholds 82 | 83 | :param observed_labels: length-E numpy array of class labels (integers in 84 | 0...1). 85 | :param forecast_probabilities: length-E numpy array with forecast 86 | probabilities of label = 1. 87 | :return: pofd_by_threshold: length-T numpy array of POFD (probability of 88 | false detection) values. 89 | :return: pod_by_threshold: length-T numpy array of POD (probability of 90 | detection) values. 91 | """ 92 | 93 | assert numpy.all(numpy.logical_or(observed_labels == 0, observed_labels == 1)) 94 | 95 | assert numpy.all( 96 | numpy.logical_and(forecast_probabilities >= 0, forecast_probabilities <= 1) 97 | ) 98 | 99 | observed_labels = observed_labels.astype(int) 100 | binarization_thresholds = numpy.linspace(0, 1, num=1001, dtype=float) 101 | 102 | num_thresholds = len(binarization_thresholds) 103 | pofd_by_threshold = numpy.full(num_thresholds, numpy.nan) 104 | pod_by_threshold = numpy.full(num_thresholds, numpy.nan) 105 | 106 | for k in range(num_thresholds): 107 | these_forecast_labels = ( 108 | forecast_probabilities >= binarization_thresholds[k] 109 | ).astype(int) 110 | 111 | this_num_hits = numpy.sum( 112 | numpy.logical_and(these_forecast_labels == 1, observed_labels == 1) 113 | ) 114 | 115 | this_num_false_alarms = numpy.sum( 116 | numpy.logical_and(these_forecast_labels == 1, observed_labels == 0) 117 | ) 118 | 119 | this_num_misses = numpy.sum( 120 | numpy.logical_and(these_forecast_labels == 0, observed_labels == 1) 121 | ) 122 | 123 | this_num_correct_nulls = numpy.sum( 124 | numpy.logical_and(these_forecast_labels == 0, observed_labels == 0) 125 | ) 126 | 127 | try: 128 | pofd_by_threshold[k] = float(this_num_false_alarms) / ( 129 | this_num_false_alarms + this_num_correct_nulls 130 | ) 131 | except ZeroDivisionError: 132 | pass 133 | 134 | try: 135 | pod_by_threshold[k] = float(this_num_hits) / ( 136 | this_num_hits + this_num_misses 137 | ) 138 | except ZeroDivisionError: 139 | pass 140 | 141 | pod_by_threshold = numpy.array([1.0] + pod_by_threshold.tolist() + [0.0]) 142 | pofd_by_threshold = numpy.array([1.0] + pofd_by_threshold.tolist() + [0.0]) 143 | 144 | return pofd_by_threshold, pod_by_threshold 145 | 146 | 147 | def plot_roc_curve( 148 | observed_labels, 149 | forecast_probabilities, 150 | line_colour=DEFAULT_LINE_COLOUR, 151 | line_width=DEFAULT_LINE_WIDTH, 152 | random_line_colour=DEFAULT_RANDOM_LINE_COLOUR, 153 | random_line_width=DEFAULT_RANDOM_LINE_WIDTH, 154 | axes_object=None, 155 | ): 156 | """Plots ROC curve. 157 | 158 | E = number of examples 159 | 160 | :param observed_labels: length-E numpy array of class labels (integers in 161 | 0...1). 
162 | :param forecast_probabilities: length-E numpy array with forecast 163 | probabilities of label = 1. 164 | :param line_colour: Colour (in any format accepted by `matplotlib.colors`). 165 | :param line_width: Line width (real positive number). 166 | :param random_line_colour: Colour of reference line (ROC curve for random 167 | predictor). 168 | :param random_line_width: Width of reference line (ROC curve for random 169 | predictor). 170 | :param axes_object: Will plot on these axes (instance of 171 | `matplotlib.axes._subplots.AxesSubplot`). If `axes_object is None`, 172 | will create new axes. 173 | :return: pofd_by_threshold: See doc for `get_points_in_roc_curve`. 174 | :return: pod_by_threshold: Same. 175 | """ 176 | 177 | pofd_by_threshold, pod_by_threshold = get_points_in_roc_curve( 178 | observed_labels=observed_labels, forecast_probabilities=forecast_probabilities 179 | ) 180 | 181 | if axes_object is None: 182 | _, axes_object = pyplot.subplots( 183 | 1, 1, figsize=(FIGURE_WIDTH_INCHES, FIGURE_HEIGHT_INCHES) 184 | ) 185 | 186 | pofd_matrix, pod_matrix = _get_pofd_pod_grid() 187 | peirce_score_matrix = pod_matrix - pofd_matrix 188 | 189 | colour_map_object, colour_norm_object = _get_peirce_colour_scheme() 190 | 191 | pyplot.contourf( 192 | pofd_matrix, 193 | pod_matrix, 194 | peirce_score_matrix, 195 | LEVELS_FOR_PEIRCE_CONTOURS, 196 | cmap=colour_map_object, 197 | norm=colour_norm_object, 198 | vmin=0.0, 199 | vmax=1.0, 200 | axes=axes_object, 201 | ) 202 | 203 | # TODO(thunderhoser): Calling private method is a HACK. 204 | colour_bar_object = performance_diagrams._add_colour_bar( 205 | axes_object=axes_object, 206 | colour_map_object=colour_map_object, 207 | colour_norm_object=colour_norm_object, 208 | values_to_colour=peirce_score_matrix, 209 | min_colour_value=0.0, 210 | max_colour_value=1.0, 211 | orientation_string="vertical", 212 | extend_min=False, 213 | extend_max=False, 214 | ) 215 | 216 | print(colour_bar_object) 217 | colour_bar_object.set_label("Peirce score") 218 | 219 | random_x_coords = numpy.array([0.0, 1.0]) 220 | random_y_coords = numpy.array([0.0, 1.0]) 221 | axes_object.plot( 222 | random_x_coords, 223 | random_y_coords, 224 | color=random_line_colour, 225 | linestyle="dashed", 226 | linewidth=random_line_width, 227 | ) 228 | 229 | nan_flags = numpy.logical_or( 230 | numpy.isnan(pofd_by_threshold), numpy.isnan(pod_by_threshold) 231 | ) 232 | 233 | if not numpy.all(nan_flags): 234 | real_indices = numpy.where(numpy.invert(nan_flags))[0] 235 | axes_object.plot( 236 | pofd_by_threshold[real_indices], 237 | pod_by_threshold[real_indices], 238 | color=line_colour, 239 | linestyle="solid", 240 | linewidth=line_width, 241 | ) 242 | 243 | axes_object.set_xlabel("POFD (probability of false detection)") 244 | axes_object.set_ylabel("POD (probability of detection)") 245 | axes_object.set_xlim(0.0, 1.0) 246 | axes_object.set_ylim(0.0, 1.0) 247 | 248 | return pofd_by_threshold, pod_by_threshold 249 | -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Introduction_to_ML_and_AI/Images/AI-vs-ML-vs-Deep-Learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Introduction_to_ML_and_AI/Images/AI-vs-ML-vs-Deep-Learning.png -------------------------------------------------------------------------------- 
/Introduction_To_Machine_Learning/Introduction_to_ML_and_AI/Images/PCAexample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Introduction_to_ML_and_AI/Images/PCAexample.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Introduction_to_ML_and_AI/Images/SVD_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Introduction_to_ML_and_AI/Images/SVD_example.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Introduction_to_ML_and_AI/Images/ml_comic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Introduction_to_ML_and_AI/Images/ml_comic.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Introduction_to_ML_and_AI/Images/pca.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Introduction_to_ML_and_AI/Images/pca.gif -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Introduction_to_ML_and_AI/README.md: -------------------------------------------------------------------------------- 1 | Introduction To Machine Learning And Artificial Intelligence 2 | 3 | In this lecture, we will explore different pre-processing techniques applied to the input data for machine learning models, including scaling, imputation, and others. We will also introduce the weather dataset to be used throughout the course.
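As a taste of the first step, here is a minimal sketch of the two named techniques using scikit-learn; the toy feature matrix below is a placeholder, not the course dataset:

```python
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Toy feature matrix (e.g., temperature and wind speed) with one missing value.
features = np.array([
    [290.0, 5.2],
    [np.nan, 4.8],
    [288.5, 6.1],
])

# Imputation: fill missing entries with the column mean.
features = SimpleImputer(strategy="mean").fit_transform(features)

# Scaling: standardize each column to zero mean and unit variance.
features = StandardScaler().fit_transform(features)
print(features)
```

Fitting the imputer and scaler on training data only, then reusing them on validation and test data, avoids leaking information across splits.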
4 | -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/README.md: -------------------------------------------------------------------------------- 1 | Machine Learning In Python For Environmental Science Problems Short Course: Introduction to Machine Learning 2 | -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/BP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/BP.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/FP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/FP.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/Kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/Kernel.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/LC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/LC.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/LR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/LR.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/Models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/Models.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/NN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/NN.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/README.md: -------------------------------------------------------------------------------- 1 | Machine Learning In Python For Environmental 
Science Problems Short Course: Introduction to Machine Learning 2 | -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/SK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/SK.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/SML.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/SML.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/SVM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/SVM.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/SupervisedML.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/SupervisedML.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/contingency_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/contingency_table.png -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/download_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Developed by David John Gagne II 3 | AMS 2019 Short Course 4 | """ 5 | 6 | from urllib.request import urlretrieve 7 | import os 8 | from os.path import exists, join 9 | import tarfile 10 | 11 | if not exists("data"): 12 | os.mkdir("data") 13 | csv_tar_file = "https://storage.googleapis.com/track_data_ncar_ams_3km_csv_small/track_data_ncar_ams_3km_csv_small.tar.gz" 14 | nc_tar_file = "https://storage.googleapis.com/track_data_ncar_ams_3km_nc_small/track_data_ncar_ams_3km_nc_small.tar.gz" 15 | print("Get csv files") 16 | urlretrieve(csv_tar_file, join("data", csv_tar_file.split("/")[-1])) 17 | print("Get nc files") 18 | urlretrieve(nc_tar_file, join("data", nc_tar_file.split("/")[-1])) 19 | print("Extract csv tar file") 20 | csv_tar = tarfile.open(join("data", csv_tar_file.split("/")[-1])) 21 | csv_tar.extractall("data/") 22 | csv_tar.close() 23 | print("Extract nc tar file") 24 | nc_tar = tarfile.open(join("data", nc_tar_file.split("/")[-1])) 25 | nc_tar.extractall("data/") 26 | nc_tar.close() 27 | 28 | 29 | 30 | 
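# A quick sanity check after the script above finishes might count the
# extracted files. The directory names below are assumptions based on the
# archive names, not verified against the tarballs:
#
#     from glob import glob
#     print(len(glob("data/track_data_ncar_ams_3km_csv_small/*.csv")))
#     print(len(glob("data/track_data_ncar_ams_3km_nc_small/*.nc")))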
-------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/extract_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Developed by Amanda Burke 3 | Based off methods by Sheri Mickelson, AMS 2019 4 | AMS 2020 Short Course 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import xarray as xr 10 | from glob import glob 11 | 12 | # Input variables for the extract_csv_data() function 13 | csv_input_variables = ['REFL_COM_mean', 'REFL_COM_max', 'REFL_COM_min', 'REFL_COM_std', 'REFL_COM_percentile_10', 14 | 'REFL_COM_percentile_25', 'REFL_COM_percentile_50', 'REFL_COM_percentile_75', 'REFL_COM_percentile_90', 15 | 'U10_mean', 'U10_max', 'U10_min', 'U10_std', 'U10_percentile_10', 'U10_percentile_25', 'U10_percentile_50', 16 | 'U10_percentile_75', 'U10_percentile_90', 'V10_mean', 'V10_max', 'V10_min', 'V10_std', 'V10_percentile_10', 17 | 'V10_percentile_25', 'V10_percentile_50', 'V10_percentile_75', 'V10_percentile_90', 'T2_mean', 'T2_max', 18 | 'T2_min', 'T2_std', 'T2_percentile_10', 'T2_percentile_25', 'T2_percentile_50', 'T2_percentile_75', 19 | 'T2_percentile_90', 'area', 'eccentricity', 'major_axis_length', 'minor_axis_length', 'orientation'] 20 | # Label variable for the extract_csv_data() function 21 | csv_label_variable = ['RVORT1_MAX-future_max'] 22 | 23 | # Input variables for the extract_nc_data() function 24 | nc_input_variables = ["REFL_COM_curr", "U10_curr", "V10_curr"] 25 | # Label variable for the extract_nc_data() function 26 | nc_label_variable = ["RVORT1_MAX_future"] 27 | 28 | 29 | def extract_csv_data(input_data_path): 30 | """ 31 | Extracts csv data from a given set of files. Returns datasets 32 | containing the predictor and label variables. 33 | 34 | Args: 35 | input_data_path (str): path to dataset directory 36 | returns: Predictor, label, and valid date data (# of datafiles,). 37 | 38 | """ 39 | # Find all csv files from given directory 40 | data_files = sorted(glob(input_data_path + "*.csv")) 41 | 42 | in_data = [] 43 | out_data = [] 44 | valid_times = [] 45 | 46 | for files in data_files: 47 | # Read in csv data 48 | data = pd.read_csv(files) 49 | #Append the predictor and label variables 50 | in_data.append(data.loc[:,csv_input_variables].values) 51 | out_data.append(data.loc[:,csv_label_variable].values) 52 | #Append daily timestamps 53 | valid_24_hour_date = data.loc[:,"Valid_Date"].values 54 | valid_times.append(pd.Timestamp(valid_24_hour_date[0][:10])) 55 | 56 | return in_data, out_data, valid_times 57 | 58 | 59 | def extract_nc_data(input_data_path): 60 | """ 61 | Extracts netcdf data from a given set of files. Returns datasets 62 | containing the input variables and output variables. 63 | 64 | Args: 65 | input_data_path (str): path to dataset directory 66 | returns: Predictor and label data (examples, 32, 32, number of variables), 67 | valid dates (examples,). 
68 | """ 69 | # Find all netcdf files from given directory 70 | data_files = sorted(glob(input_data_path + "*.nc")) 71 | 72 | in_data = [] 73 | out_data = [] 74 | valid_times = [] 75 | 76 | for files in data_files: 77 | # Read in netcdf data 78 | data = xr.open_dataset(files) 79 | #Append the daily predictor and label variables 80 | in_data.append(np.stack([data[v].values for v in nc_input_variables], axis=-1)) 81 | out_data.append(np.stack([data[v].values for v in nc_label_variable], axis=-1)) 82 | #Append daily timestamps 83 | date = pd.Timestamp(files.split("/")[-1].split("_")[1]) 84 | valid_times.append([date] * in_data[-1].shape[0]) 85 | data.close() 86 | 87 | # Concatenate/stack data from lists of arrays to a single array 88 | all_in_data = np.vstack(in_data) 89 | all_out_data = np.vstack(out_data) 90 | all_valid_times = np.concatenate(valid_times) 91 | 92 | # Delete lists to save memory 93 | del in_data[:], out_data[:],valid_times[:] 94 | del in_data, out_data, valid_times 95 | 96 | return all_in_data, all_out_data, all_valid_times -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/performance_diagrams.py: -------------------------------------------------------------------------------- 1 | """Methods for plotting performance diagram.""" 2 | 3 | import numpy 4 | import matplotlib.colors 5 | import matplotlib.pyplot as pyplot 6 | 7 | DEFAULT_LINE_COLOUR = numpy.array([228, 26, 28], dtype=float) / 255 8 | DEFAULT_LINE_WIDTH = 3 9 | DEFAULT_BIAS_LINE_COLOUR = numpy.full(3, 152. / 255) 10 | DEFAULT_BIAS_LINE_WIDTH = 2 11 | 12 | LEVELS_FOR_CSI_CONTOURS = numpy.linspace(0, 1, num=11, dtype=float) 13 | LEVELS_FOR_BIAS_CONTOURS = numpy.array( 14 | [0.25, 0.5, 0.75, 1., 1.5, 2., 3., 5.]) 15 | 16 | BIAS_STRING_FORMAT = '%.2f' 17 | BIAS_LABEL_PADDING_PX = 10 18 | 19 | FIGURE_WIDTH_INCHES = 10 20 | FIGURE_HEIGHT_INCHES = 10 21 | 22 | FONT_SIZE = 20 23 | pyplot.rc('font', size=FONT_SIZE) 24 | pyplot.rc('axes', titlesize=FONT_SIZE) 25 | pyplot.rc('axes', labelsize=FONT_SIZE) 26 | pyplot.rc('xtick', labelsize=FONT_SIZE) 27 | pyplot.rc('ytick', labelsize=FONT_SIZE) 28 | pyplot.rc('legend', fontsize=FONT_SIZE) 29 | pyplot.rc('figure', titlesize=FONT_SIZE) 30 | 31 | 32 | def _get_sr_pod_grid(success_ratio_spacing=0.01, pod_spacing=0.01): 33 | """Creates grid in SR-POD (success ratio / probability of detection) space. 34 | 35 | M = number of rows (unique POD values) in grid 36 | N = number of columns (unique success ratios) in grid 37 | 38 | :param success_ratio_spacing: Spacing between grid cells in adjacent 39 | columns. 40 | :param pod_spacing: Spacing between grid cells in adjacent rows. 41 | :return: success_ratio_matrix: M-by-N numpy array of success ratios. 42 | Success ratio increases with column index. 43 | :return: pod_matrix: M-by-N numpy array of POD values. POD decreases with 44 | row index. 45 | """ 46 | 47 | num_success_ratios = 1 + int(numpy.ceil(1. / success_ratio_spacing)) 48 | num_pod_values = 1 + int(numpy.ceil(1. / pod_spacing)) 49 | 50 | unique_success_ratios = numpy.linspace(0., 1., num=num_success_ratios) 51 | unique_pod_values = numpy.linspace(0., 1., num=num_pod_values)[::-1] 52 | return numpy.meshgrid(unique_success_ratios, unique_pod_values) 53 | 54 | 55 | def _csi_from_sr_and_pod(success_ratio_array, pod_array): 56 | """Computes CSI (critical success index) from success ratio and POD. 
57 | 58 | POD = probability of detection 59 | 60 | :param success_ratio_array: numpy array (any shape) of success ratios. 61 | :param pod_array: numpy array (same shape) of POD values. 62 | :return: csi_array: numpy array (same shape) of CSI values. 63 | """ 64 | 65 | return (success_ratio_array ** -1 + pod_array ** -1 - 1.) ** -1 66 | 67 | 68 | def _bias_from_sr_and_pod(success_ratio_array, pod_array): 69 | """Computes frequency bias from success ratio and POD. 70 | 71 | POD = probability of detection 72 | 73 | :param success_ratio_array: numpy array (any shape) of success ratios. 74 | :param pod_array: numpy array (same shape) of POD values. 75 | :return: frequency_bias_array: numpy array (same shape) of frequency biases. 76 | """ 77 | 78 | return pod_array / success_ratio_array 79 | 80 | 81 | def _get_csi_colour_scheme(): 82 | """Returns colour scheme for CSI (critical success index). 83 | 84 | :return: colour_map_object: Colour scheme (instance of 85 | `matplotlib.colors.ListedColormap`). 86 | :return: colour_norm_object: Instance of `matplotlib.colors.BoundaryNorm`, 87 | defining the scale of the colour map. 88 | """ 89 | 90 | this_colour_map_object = pyplot.cm.Blues 91 | this_colour_norm_object = matplotlib.colors.BoundaryNorm( 92 | LEVELS_FOR_CSI_CONTOURS, this_colour_map_object.N) 93 | 94 | rgba_matrix = this_colour_map_object(this_colour_norm_object( 95 | LEVELS_FOR_CSI_CONTOURS)) 96 | colour_list = [ 97 | rgba_matrix[i, ..., :-1] for i in range(rgba_matrix.shape[0]) 98 | ] 99 | 100 | colour_map_object = matplotlib.colors.ListedColormap(colour_list) 101 | colour_map_object.set_under(numpy.array([1, 1, 1])) 102 | colour_norm_object = matplotlib.colors.BoundaryNorm( 103 | LEVELS_FOR_CSI_CONTOURS, colour_map_object.N) 104 | 105 | return colour_map_object, colour_norm_object 106 | 107 | 108 | def _add_colour_bar( 109 | axes_object, colour_map_object, values_to_colour, min_colour_value, 110 | max_colour_value, colour_norm_object=None, 111 | orientation_string='vertical', extend_min=True, extend_max=True, 112 | fraction_of_axis_length=1., font_size=FONT_SIZE): 113 | """Adds colour bar to existing axes. 114 | 115 | :param axes_object: Existing axes (instance of 116 | `matplotlib.axes._subplots.AxesSubplot`). 117 | :param colour_map_object: Colour scheme (instance of 118 | `matplotlib.pyplot.cm`). 119 | :param values_to_colour: numpy array of values to colour. 120 | :param min_colour_value: Minimum value in colour map. 121 | :param max_colour_value: Max value in colour map. 122 | :param colour_norm_object: Instance of `matplotlib.colors.BoundaryNorm`, 123 | defining the scale of the colour map. If `colour_norm_object is None`, 124 | will assume that scale is linear. 125 | :param orientation_string: Orientation of colour bar ("vertical" or 126 | "horizontal"). 127 | :param extend_min: Boolean flag. If True, the bottom of the colour bar will 128 | have an arrow. If False, it will be a flat line, suggesting that lower 129 | values are not possible. 130 | :param extend_max: Same but for top of colour bar. 131 | :param fraction_of_axis_length: Fraction of axis length (y-axis if 132 | orientation is "vertical", x-axis if orientation is "horizontal") 133 | occupied by colour bar. 134 | :param font_size: Font size for labels on colour bar. 135 | :return: colour_bar_object: Colour bar (instance of 136 | `matplotlib.pyplot.colorbar`) created by this method. 
137 | """ 138 | 139 | if colour_norm_object is None: 140 | colour_norm_object = matplotlib.colors.Normalize( 141 | vmin=min_colour_value, vmax=max_colour_value, clip=False) 142 | 143 | scalar_mappable_object = pyplot.cm.ScalarMappable( 144 | cmap=colour_map_object, norm=colour_norm_object) 145 | scalar_mappable_object.set_array(values_to_colour) 146 | 147 | if extend_min and extend_max: 148 | extend_string = 'both' 149 | elif extend_min: 150 | extend_string = 'min' 151 | elif extend_max: 152 | extend_string = 'max' 153 | else: 154 | extend_string = 'neither' 155 | 156 | if orientation_string == 'horizontal': 157 | padding = 0.075 158 | else: 159 | padding = 0.05 160 | 161 | colour_bar_object = pyplot.colorbar( 162 | ax=axes_object, mappable=scalar_mappable_object, 163 | orientation=orientation_string, pad=padding, extend=extend_string, 164 | shrink=fraction_of_axis_length) 165 | 166 | colour_bar_object.ax.tick_params(labelsize=font_size) 167 | return colour_bar_object 168 | 169 | 170 | def get_points_in_perf_diagram(observed_labels, forecast_probabilities): 171 | """Creates points for performance diagram. 172 | 173 | E = number of examples 174 | T = number of binarization thresholds 175 | 176 | :param observed_labels: length-E numpy array of class labels (integers in 177 | 0...1). 178 | :param forecast_probabilities: length-E numpy array with forecast 179 | probabilities of label = 1. 180 | :return: pod_by_threshold: length-T numpy array of POD (probability of 181 | detection) values. 182 | :return: success_ratio_by_threshold: length-T numpy array of success ratios. 183 | """ 184 | 185 | assert numpy.all(numpy.logical_or( 186 | observed_labels == 0, observed_labels == 1 187 | )) 188 | 189 | assert numpy.all(numpy.logical_and( 190 | forecast_probabilities >= 0, forecast_probabilities <= 1 191 | )) 192 | 193 | observed_labels = observed_labels.astype(int) 194 | binarization_thresholds = numpy.linspace(0, 1, num=1001, dtype=float) 195 | 196 | num_thresholds = len(binarization_thresholds) 197 | pod_by_threshold = numpy.full(num_thresholds, numpy.nan) 198 | success_ratio_by_threshold = numpy.full(num_thresholds, numpy.nan) 199 | 200 | for k in range(num_thresholds): 201 | these_forecast_labels = ( 202 | forecast_probabilities >= binarization_thresholds[k] 203 | ).astype(int) 204 | 205 | this_num_hits = numpy.sum(numpy.logical_and( 206 | these_forecast_labels == 1, observed_labels == 1 207 | )) 208 | 209 | this_num_false_alarms = numpy.sum(numpy.logical_and( 210 | these_forecast_labels == 1, observed_labels == 0 211 | )) 212 | 213 | this_num_misses = numpy.sum(numpy.logical_and( 214 | these_forecast_labels == 0, observed_labels == 1 215 | )) 216 | 217 | try: 218 | pod_by_threshold[k] = ( 219 | float(this_num_hits) / (this_num_hits + this_num_misses) 220 | ) 221 | except ZeroDivisionError: 222 | pass 223 | 224 | try: 225 | success_ratio_by_threshold[k] = ( 226 | float(this_num_hits) / (this_num_hits + this_num_false_alarms) 227 | ) 228 | except ZeroDivisionError: 229 | pass 230 | 231 | pod_by_threshold = numpy.array([1.] + pod_by_threshold.tolist() + [0.]) 232 | success_ratio_by_threshold = numpy.array( 233 | [0.] + success_ratio_by_threshold.tolist() + [1.] 
234 | ) 235 | 236 | return pod_by_threshold, success_ratio_by_threshold 237 | 238 | 239 | def plot_performance_diagram( 240 | observed_labels, forecast_probabilities, 241 | line_colour=DEFAULT_LINE_COLOUR, line_width=DEFAULT_LINE_WIDTH, 242 | bias_line_colour=DEFAULT_BIAS_LINE_COLOUR, 243 | bias_line_width=DEFAULT_BIAS_LINE_WIDTH, axes_object=None): 244 | """Plots performance diagram. 245 | 246 | E = number of examples 247 | 248 | :param observed_labels: length-E numpy array of class labels (integers in 249 | 0...1). 250 | :param forecast_probabilities: length-E numpy array with forecast 251 | probabilities of label = 1. 252 | :param line_colour: Colour (in any format accepted by `matplotlib.colors`). 253 | :param line_width: Line width (real positive number). 254 | :param bias_line_colour: Colour of contour lines for frequency bias. 255 | :param bias_line_width: Width of contour lines for frequency bias. 256 | :param axes_object: Will plot on these axes (instance of 257 | `matplotlib.axes._subplots.AxesSubplot`). If `axes_object is None`, 258 | will create new axes. 259 | :return: pod_by_threshold: POD (probability of detection) values. See doc 260 | for `get_points_in_perf_diagram`. 261 | :return: success_ratio_by_threshold: Same. 262 | """ 263 | 264 | pod_by_threshold, success_ratio_by_threshold = get_points_in_perf_diagram( 265 | observed_labels=observed_labels, 266 | forecast_probabilities=forecast_probabilities) 267 | 268 | if axes_object is None: 269 | _, axes_object = pyplot.subplots( 270 | 1, 1, figsize=(FIGURE_WIDTH_INCHES, FIGURE_HEIGHT_INCHES) 271 | ) 272 | 273 | success_ratio_matrix, pod_matrix = _get_sr_pod_grid() 274 | csi_matrix = _csi_from_sr_and_pod(success_ratio_matrix, pod_matrix) 275 | frequency_bias_matrix = _bias_from_sr_and_pod( 276 | success_ratio_matrix, pod_matrix) 277 | 278 | this_colour_map_object, this_colour_norm_object = _get_csi_colour_scheme() 279 | 280 | pyplot.contourf( 281 | success_ratio_matrix, pod_matrix, csi_matrix, LEVELS_FOR_CSI_CONTOURS, 282 | cmap=this_colour_map_object, norm=this_colour_norm_object, vmin=0., 283 | vmax=1., axes=axes_object) 284 | 285 | colour_bar_object = _add_colour_bar( 286 | axes_object=axes_object, colour_map_object=this_colour_map_object, 287 | colour_norm_object=this_colour_norm_object, 288 | values_to_colour=csi_matrix, min_colour_value=0., 289 | max_colour_value=1., orientation_string='vertical', 290 | extend_min=False, extend_max=False) 291 | colour_bar_object.set_label('CSI (critical success index)') 292 | 293 | bias_colour_tuple = () 294 | for _ in range(len(LEVELS_FOR_BIAS_CONTOURS)): 295 | bias_colour_tuple += (bias_line_colour,) 296 | 297 | bias_contour_object = pyplot.contour( 298 | success_ratio_matrix, pod_matrix, frequency_bias_matrix, 299 | LEVELS_FOR_BIAS_CONTOURS, colors=bias_colour_tuple, 300 | linewidths=bias_line_width, linestyles='dashed', axes=axes_object) 301 | pyplot.clabel( 302 | bias_contour_object, inline=True, inline_spacing=BIAS_LABEL_PADDING_PX, 303 | fmt=BIAS_STRING_FORMAT, fontsize=FONT_SIZE) 304 | 305 | nan_flags = numpy.logical_or( 306 | numpy.isnan(success_ratio_by_threshold), numpy.isnan(pod_by_threshold) 307 | ) 308 | 309 | if not numpy.all(nan_flags): 310 | real_indices = numpy.where(numpy.invert(nan_flags))[0] 311 | axes_object.plot( 312 | success_ratio_by_threshold[real_indices], 313 | pod_by_threshold[real_indices], color=line_colour, 314 | linestyle='solid', linewidth=line_width) 315 | 316 | axes_object.set_xlabel('Success ratio (1 - FAR)') 317 | axes_object.set_ylabel('POD (probability of detection)')
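# Both axes are probabilities, so the performance diagram lives on the unit square.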
318 | axes_object.set_xlim(0., 1.) 319 | axes_object.set_ylim(0., 1.) 320 | 321 | return pod_by_threshold, success_ratio_by_threshold 322 | -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/roc_curves.py: -------------------------------------------------------------------------------- 1 | """Methods for plotting ROC (receiver operating characteristic) curve.""" 2 | 3 | import numpy 4 | import matplotlib.colors 5 | import matplotlib.pyplot as pyplot 6 | import performance_diagrams 7 | 8 | DEFAULT_LINE_COLOUR = numpy.array([228, 26, 28], dtype=float) / 255 9 | DEFAULT_LINE_WIDTH = 3 10 | DEFAULT_RANDOM_LINE_COLOUR = numpy.full(3, 152. / 255) 11 | DEFAULT_RANDOM_LINE_WIDTH = 2 12 | 13 | LEVELS_FOR_PEIRCE_CONTOURS = numpy.linspace(0, 1, num=11, dtype=float) 14 | 15 | FIGURE_WIDTH_INCHES = 10 16 | FIGURE_HEIGHT_INCHES = 10 17 | 18 | FONT_SIZE = 20 19 | pyplot.rc('font', size=FONT_SIZE) 20 | pyplot.rc('axes', titlesize=FONT_SIZE) 21 | pyplot.rc('axes', labelsize=FONT_SIZE) 22 | pyplot.rc('xtick', labelsize=FONT_SIZE) 23 | pyplot.rc('ytick', labelsize=FONT_SIZE) 24 | pyplot.rc('legend', fontsize=FONT_SIZE) 25 | pyplot.rc('figure', titlesize=FONT_SIZE) 26 | 27 | 28 | def _get_pofd_pod_grid(pofd_spacing=0.01, pod_spacing=0.01): 29 | """Creates grid in POFD-POD space. 30 | 31 | M = number of rows (unique POD values) in grid 32 | N = number of columns (unique POFD values) in grid 33 | 34 | :param pofd_spacing: Spacing between grid cells in adjacent columns. 35 | :param pod_spacing: Spacing between grid cells in adjacent rows. 36 | :return: pofd_matrix: M-by-N numpy array of POFD values. 37 | :return: pod_matrix: M-by-N numpy array of POD values. 38 | """ 39 | 40 | num_pofd_values = 1 + int(numpy.ceil(1. / pofd_spacing)) 41 | num_pod_values = 1 + int(numpy.ceil(1. / pod_spacing)) 42 | 43 | unique_pofd_values = numpy.linspace(0., 1., num=num_pofd_values) 44 | unique_pod_values = numpy.linspace(0., 1., num=num_pod_values)[::-1] 45 | return numpy.meshgrid(unique_pofd_values, unique_pod_values) 46 | 47 | 48 | def _get_peirce_colour_scheme(): 49 | """Returns colour scheme for Peirce score. 50 | 51 | :return: colour_map_object: Colour scheme (instance of 52 | `matplotlib.colors.ListedColormap`). 53 | :return: colour_norm_object: Instance of `matplotlib.colors.BoundaryNorm`, 54 | defining the scale of the colour map. 55 | """ 56 | 57 | this_colour_map_object = pyplot.cm.Blues 58 | this_colour_norm_object = matplotlib.colors.BoundaryNorm( 59 | LEVELS_FOR_PEIRCE_CONTOURS, this_colour_map_object.N) 60 | 61 | rgba_matrix = this_colour_map_object(this_colour_norm_object( 62 | LEVELS_FOR_PEIRCE_CONTOURS 63 | )) 64 | 65 | colour_list = [ 66 | rgba_matrix[i, ..., :-1] for i in range(rgba_matrix.shape[0]) 67 | ] 68 | 69 | colour_map_object = matplotlib.colors.ListedColormap(colour_list) 70 | colour_map_object.set_under(numpy.array([1, 1, 1])) 71 | colour_norm_object = matplotlib.colors.BoundaryNorm( 72 | LEVELS_FOR_PEIRCE_CONTOURS, colour_map_object.N) 73 | 74 | return colour_map_object, colour_norm_object 75 | 76 | 77 | def get_points_in_roc_curve(observed_labels, forecast_probabilities): 78 | """Creates points for ROC curve. 79 | 80 | E = number of examples 81 | T = number of binarization thresholds 82 | 83 | :param observed_labels: length-E numpy array of class labels (integers in 84 | 0...1). 85 | :param forecast_probabilities: length-E numpy array with forecast 86 | probabilities of label = 1. 
87 | :return: pofd_by_threshold: length-T numpy array of POFD (probability of 88 | false detection) values. 89 | :return: pod_by_threshold: length-T numpy array of POD (probability of 90 | detection) values. 91 | """ 92 | 93 | assert numpy.all(numpy.logical_or( 94 | observed_labels == 0, observed_labels == 1 95 | )) 96 | 97 | assert numpy.all(numpy.logical_and( 98 | forecast_probabilities >= 0, forecast_probabilities <= 1 99 | )) 100 | 101 | observed_labels = observed_labels.astype(int) 102 | binarization_thresholds = numpy.linspace(0, 1, num=1001, dtype=float) 103 | 104 | num_thresholds = len(binarization_thresholds) 105 | pofd_by_threshold = numpy.full(num_thresholds, numpy.nan) 106 | pod_by_threshold = numpy.full(num_thresholds, numpy.nan) 107 | 108 | for k in range(num_thresholds): 109 | these_forecast_labels = ( 110 | forecast_probabilities >= binarization_thresholds[k] 111 | ).astype(int) 112 | 113 | this_num_hits = numpy.sum(numpy.logical_and( 114 | these_forecast_labels == 1, observed_labels == 1 115 | )) 116 | 117 | this_num_false_alarms = numpy.sum(numpy.logical_and( 118 | these_forecast_labels == 1, observed_labels == 0 119 | )) 120 | 121 | this_num_misses = numpy.sum(numpy.logical_and( 122 | these_forecast_labels == 0, observed_labels == 1 123 | )) 124 | 125 | this_num_correct_nulls = numpy.sum(numpy.logical_and( 126 | these_forecast_labels == 0, observed_labels == 0 127 | )) 128 | 129 | try: 130 | pofd_by_threshold[k] = ( 131 | float(this_num_false_alarms) / 132 | (this_num_false_alarms + this_num_correct_nulls) 133 | ) 134 | except ZeroDivisionError: 135 | pass 136 | 137 | try: 138 | pod_by_threshold[k] = ( 139 | float(this_num_hits) / (this_num_hits + this_num_misses) 140 | ) 141 | except ZeroDivisionError: 142 | pass 143 | 144 | pod_by_threshold = numpy.array([1.] + pod_by_threshold.tolist() + [0.]) 145 | pofd_by_threshold = numpy.array([1.] + pofd_by_threshold.tolist() + [0.]) 146 | 147 | return pofd_by_threshold, pod_by_threshold 148 | 149 | 150 | def plot_roc_curve( 151 | observed_labels, forecast_probabilities, 152 | line_colour=DEFAULT_LINE_COLOUR, line_width=DEFAULT_LINE_WIDTH, 153 | random_line_colour=DEFAULT_RANDOM_LINE_COLOUR, 154 | random_line_width=DEFAULT_RANDOM_LINE_WIDTH, axes_object=None): 155 | """Plots ROC curve. 156 | 157 | E = number of examples 158 | 159 | :param observed_labels: length-E numpy array of class labels (integers in 160 | 0...1). 161 | :param forecast_probabilities: length-E numpy array with forecast 162 | probabilities of label = 1. 163 | :param line_colour: Colour (in any format accepted by `matplotlib.colors`). 164 | :param line_width: Line width (real positive number). 165 | :param random_line_colour: Colour of reference line (ROC curve for random 166 | predictor). 167 | :param random_line_width: Width of reference line (ROC curve for random 168 | predictor). 169 | :param axes_object: Will plot on these axes (instance of 170 | `matplotlib.axes._subplots.AxesSubplot`). If `axes_object is None`, 171 | will create new axes. 172 | :return: pofd_by_threshold: See doc for `get_points_in_roc_curve`. 173 | :return: pod_by_threshold: Same. 
174 | """ 175 | 176 | pofd_by_threshold, pod_by_threshold = get_points_in_roc_curve( 177 | observed_labels=observed_labels, 178 | forecast_probabilities=forecast_probabilities) 179 | 180 | if axes_object is None: 181 | _, axes_object = pyplot.subplots( 182 | 1, 1, figsize=(FIGURE_WIDTH_INCHES, FIGURE_HEIGHT_INCHES) 183 | ) 184 | 185 | pofd_matrix, pod_matrix = _get_pofd_pod_grid() 186 | peirce_score_matrix = pod_matrix - pofd_matrix 187 | 188 | colour_map_object, colour_norm_object = _get_peirce_colour_scheme() 189 | 190 | pyplot.contourf( 191 | pofd_matrix, pod_matrix, peirce_score_matrix, 192 | LEVELS_FOR_PEIRCE_CONTOURS, cmap=colour_map_object, 193 | norm=colour_norm_object, vmin=0., vmax=1., axes=axes_object) 194 | 195 | # TODO(thunderhoser): Calling private method is a HACK. 196 | colour_bar_object = performance_diagrams._add_colour_bar( 197 | axes_object=axes_object, colour_map_object=colour_map_object, 198 | colour_norm_object=colour_norm_object, 199 | values_to_colour=peirce_score_matrix, min_colour_value=0., 200 | max_colour_value=1., orientation_string='vertical', 201 | extend_min=False, extend_max=False) 202 | 203 | print(colour_bar_object) 204 | colour_bar_object.set_label('Peirce score') 205 | 206 | random_x_coords = numpy.array([0., 1.]) 207 | random_y_coords = numpy.array([0., 1.]) 208 | axes_object.plot( 209 | random_x_coords, random_y_coords, color=random_line_colour, 210 | linestyle='dashed', linewidth=random_line_width) 211 | 212 | nan_flags = numpy.logical_or( 213 | numpy.isnan(pofd_by_threshold), numpy.isnan(pod_by_threshold) 214 | ) 215 | 216 | if not numpy.all(nan_flags): 217 | real_indices = numpy.where(numpy.invert(nan_flags))[0] 218 | axes_object.plot( 219 | pofd_by_threshold[real_indices], pod_by_threshold[real_indices], 220 | color=line_colour, linestyle='solid', linewidth=line_width) 221 | 222 | axes_object.set_xlabel('POFD (probability of false detection)') 223 | axes_object.set_ylabel('POD (probability of detection)') 224 | axes_object.set_xlim(0., 1.) 225 | axes_object.set_ylim(0., 1.) 226 | 227 | return pofd_by_threshold, pod_by_threshold 228 | -------------------------------------------------------------------------------- /Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/tree_schematic.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alburke/ams-2020-ml-python-course/997f93303f01956685c49ecdfc010ebb8dfc9b4e/Introduction_To_Machine_Learning/Supervised_Learning_Algorithims/tree_schematic.jpg -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ams-2020-ml-python-course 2 | 3 | Machine Learning in Python for Environmental Science Problems AMS 2020 Short Course 4 | 5 | ## Authors 6 | * Amanda Burke, University of Oklahoma (aburke1@ou.edu) 7 | * Benjamin Toms, Colorado State University (benatoms@rams.colostate.edu) 8 | * Katherine Avery, University of Oklahoma (katherine.avery@ou.edu) 9 | * Hamid Kamangir, Texas A&M Corpus Christi (hkamangir@islander.tamucc.edu) 10 | * Karthik Kashinath, Lawrence Berkeley National Laboratory (kkashinath@lbl.gov) 11 | * Ryan Lagerquist, University of Oklahoma (ryan.lagerquist@ou.edu) 12 | 13 | ## Modules 14 | ### Introduction to Machine Learning 15 | 1. Introduction to Machine Learning and AI 16 | 2. Data Science Fundamentals 17 | 3. Supervised Learning Algorithms 18 | 4. 
Introduction to Deep Learning 19 | 20 | ### Advanced Topics in Machine Learning 21 | 1. Unsupervised Learning Overview 22 | 2. Machine Learning Model Interpretation 23 | 24 | ## Requirements 25 | The modules for this short course require Python 3.6 and the following Python libraries: 26 | * numpy 27 | * scipy 28 | * matplotlib 29 | * xarray 30 | * netcdf4 31 | * pandas 32 | * scikit-learn 33 | * tensorflow-gpu or tensorflow 34 | * keras 35 | * jupyter 36 | * ipython 37 | * jupyterlab 38 | * ipywidgets 39 | 40 | ## Data Access 41 | The data for the course are stored online. The `download_data.py` script will download the data to the appropriate location and extract all files. The netCDF data is contained in a 2GB tar file, so make sure you have at least 4GB of storage available and a fast internet connection. 42 | 43 | ## Course Website 44 | To run the notebooks in the cloud rather than a local installation, see the short course website 45 | [Machine Learning in Python for Environmental Science](https://sites.google.com/rams.colostate.edu/ams-ml4es/agenda-and-code). 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /download_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Developed by David John Gagne II 3 | AMS 2019 Short Course 4 | """ 5 | 6 | from urllib.request import urlretrieve 7 | import os 8 | from os.path import exists, join 9 | import tarfile 10 | 11 | if not exists("data"): 12 | os.mkdir("data") 13 | csv_tar_file = "https://storage.googleapis.com/track_data_ncar_ams_3km_csv_small/track_data_ncar_ams_3km_csv_small.tar.gz" 14 | nc_tar_file = "https://storage.googleapis.com/track_data_ncar_ams_3km_nc_small/track_data_ncar_ams_3km_nc_small.tar.gz" 15 | print("Get csv files") 16 | urlretrieve(csv_tar_file, join("data", csv_tar_file.split("/")[-1])) 17 | print("Get nc files") 18 | urlretrieve(nc_tar_file, join("data", nc_tar_file.split("/")[-1])) 19 | print("Extract csv tar file") 20 | csv_tar = tarfile.open(join("data", csv_tar_file.split("/")[-1])) 21 | csv_tar.extractall("data/") 22 | csv_tar.close() 23 | print("Extract nc tar file") 24 | nc_tar = tarfile.open(join("data", nc_tar_file.split("/")[-1])) 25 | nc_tar.extractall("data/") 26 | nc_tar.close() 27 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Setup file for ams-2020-ml-python-course.""" 2 | 3 | from setuptools import setup 4 | 5 | PACKAGE_NAMES = ['interpretation', 'evaluation'] 6 | 7 | KEYWORDS = [ 8 | 'machine learning', 'deep learning', 'artificial intelligence', 9 | 'data mining', 'weather', 'meteorology', 'atmospheric science', 10 | 'thunderstorm', 'tornado' 11 | ] 12 | 13 | SHORT_DESCRIPTION = ( 14 | 'Python library for machine-learning short course at AMS 2020.' 15 | ) 16 | 17 | LONG_DESCRIPTION = ( 18 | 'Python library for short course on machine learning at AMS (American ' 19 | 'Meteorological Society) 2020 Annual Meeting.'
20 | ) 21 | 22 | CLASSIFIERS = [ 23 | 'Development Status :: 2 - Pre-Alpha', 24 | 'Intended Audience :: Science/Research', 25 | 'License :: OSI Approved :: MIT License', 26 | 'Programming Language :: Python :: 3.6' 27 | ] 28 | 29 | if __name__ == '__main__': 30 | setup( 31 | name='ams-2020-ml-python-course', 32 | version='0.1', 33 | description=SHORT_DESCRIPTION, 34 | long_description=LONG_DESCRIPTION, 35 | license='MIT', 36 | author='Amanda Burke', 37 | author_email='aburke1@ou.edu', 38 | url='https://github.com/alburke/ams-2020-ml-python-course', 39 | packages=PACKAGE_NAMES, 40 | scripts=[], 41 | keywords=KEYWORDS, 42 | classifiers=CLASSIFIERS, 43 | include_package_data=True, 44 | zip_safe=False 45 | ) 46 | -------------------------------------------------------------------------------- /util/extract_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Developed by Amanda Burke 3 | Based off methods by Sheri Mickelson, AMS 2019 4 | 5 | AMS 2020 Short Course 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | import xarray as xr 11 | from glob import glob 12 | 13 | # Input variables for the extract_csv_data() function 14 | csv_input_variables = ['REFL_COM_mean', 'REFL_COM_max', 'REFL_COM_min', 'REFL_COM_std', 'REFL_COM_percentile_10', 15 | 'REFL_COM_percentile_25', 'REFL_COM_percentile_50', 'REFL_COM_percentile_75', 'REFL_COM_percentile_90', 16 | 'U10_mean', 'U10_max', 'U10_min', 'U10_std', 'U10_percentile_10', 'U10_percentile_25', 'U10_percentile_50', 17 | 'U10_percentile_75', 'U10_percentile_90', 'V10_mean', 'V10_max', 'V10_min', 'V10_std', 'V10_percentile_10', 18 | 'V10_percentile_25', 'V10_percentile_50', 'V10_percentile_75', 'V10_percentile_90', 'T2_mean', 'T2_max', 19 | 'T2_min', 'T2_std', 'T2_percentile_10', 'T2_percentile_25', 'T2_percentile_50', 'T2_percentile_75', 20 | 'T2_percentile_90', 'area', 'eccentricity', 'major_axis_length', 'minor_axis_length', 'orientation'] 21 | # Label variable for the extract_csv_data() function 22 | csv_label_variable = ['RVORT1_MAX-future_max'] 23 | 24 | # Input variables for the extract_nc_data() function 25 | nc_input_variables = ["REFL_COM_curr", "U10_curr", "V10_curr"] 26 | # Label variable for the extract_nc_data() function 27 | nc_label_variable = ["RVORT1_MAX_future"] 28 | 29 | 30 | def extract_csv_data(input_data_path): 31 | """ 32 | Extracts csv data from a given set of files. Returns datasets 33 | containing the predictor and label variables. 34 | 35 | Args: 36 | input_data_path (str): path to dataset directory 37 | 38 | returns: Predictor, label, and valid date data (# of datafiles,). 39 | 40 | """ 41 | # Find all csv files from given directory 42 | data_files = sorted(glob(input_data_path + "*.csv")) 43 | 44 | in_data = [] 45 | out_data = [] 46 | valid_times = [] 47 | 48 | for files in data_files: 49 | # Read in csv data 50 | data = pd.read_csv(files) 51 | #Append the predictor and label variables 52 | in_data.append(data.loc[:,csv_input_variables].values) 53 | out_data.append(data.loc[:,csv_label_variable].values) 54 | #Append daily timestamps 55 | valid_24_hour_date = data.loc[:,"Valid_Date"].values 56 | valid_times.append(pd.Timestamp(valid_24_hour_date[0][:10])) 57 | 58 | return in_data, out_data, valid_times 59 | 60 | 61 | def extract_nc_data(input_data_path): 62 | """ 63 | Extracts netcdf data from a given set of files. Returns datasets 64 | containing the input variables and output variables.
65 | 66 | Args: 67 | input_data_path (str): path to dataset directory 68 | 69 | returns: Predictor and label data (examples, 32, 32, number of variables), 70 | valid dates (examples,). 71 | """ 72 | # Find all netcdf files from given directory 73 | data_files = sorted(glob(input_data_path + "*.nc")) 74 | 75 | in_data = [] 76 | out_data = [] 77 | valid_times = [] 78 | 79 | for files in data_files: 80 | # Read in netcdf data 81 | data = xr.open_dataset(files) 82 | #Append the daily predictor and label variables 83 | in_data.append(np.stack([data[v].values for v in nc_input_variables], axis=-1)) 84 | out_data.append(np.stack([data[v].values for v in nc_label_variable], axis=-1)) 85 | #Append daily timestamps 86 | date = pd.Timestamp(files.split("/")[-1].split("_")[1]) 87 | valid_times.append([date] * in_data[-1].shape[0]) 88 | data.close() 89 | 90 | # Concatenate/stack data from lists of arrays to a single array 91 | all_in_data = np.vstack(in_data) 92 | all_out_data = np.vstack(out_data) 93 | all_valid_times = np.concatenate(valid_times) 94 | 95 | # Delete lists to save memory 96 | del in_data[:], out_data[:],valid_times[:] 97 | del in_data, out_data, valid_times 98 | 99 | return all_in_data, all_out_data, all_valid_times 100 | --------------------------------------------------------------------------------
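Taken together, the utilities above form a small end-to-end pipeline: `extract_csv_data` loads per-storm predictors and labels, a scikit-learn model turns them into forecast probabilities, and the `roc_curves` / `performance_diagrams` modules evaluate the result. The sketch below is illustrative, not part of the repository: the data path, the vorticity threshold used to binarize the labels, and the choice of random forest are all assumptions.

```python
import numpy as np
import matplotlib.pyplot as pyplot
from sklearn.ensemble import RandomForestClassifier

import roc_curves
import performance_diagrams
from extract_data import extract_csv_data  # assumes these modules are on the path

# Assumed location of the extracted CSV files; extract_csv_data appends the
# glob pattern directly, so the trailing slash matters.
in_data, out_data, _ = extract_csv_data("data/track_data_ncar_ams_3km_csv_small/")

# Stack the per-file arrays and binarize the labels at an illustrative
# future-vorticity threshold (0.02 s^-1 is a placeholder, not course-defined).
features = np.vstack(in_data)
labels = (np.concatenate(out_data).ravel() >= 0.02).astype(int)

# Files are read in sorted order, so a leading split is roughly chronological.
split = int(0.8 * features.shape[0])
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(features[:split], labels[:split])
probabilities = model.predict_proba(features[split:])[:, 1]

# Evaluate with the plotting utilities defined in this repository.
roc_curves.plot_roc_curve(
    observed_labels=labels[split:], forecast_probabilities=probabilities
)
performance_diagrams.plot_performance_diagram(
    observed_labels=labels[split:], forecast_probabilities=probabilities
)
pyplot.show()
```

A chronological split is preferable to a random one here, because storms from the same day share environments and would otherwise leak between training and test sets.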