├── .gitignore ├── .gitmodules ├── LICENSE.txt ├── README.md ├── bin ├── evaluate.sh ├── predict.sh ├── randomsearch.sh ├── train.sh └── tunehp.sh ├── config ├── autogenerated │ └── README.md ├── cnn2d_default.sh ├── cnn2d_example.sh ├── cnn2d_modular.sh ├── cnn2d_modular_hptuning.yaml ├── cnn2d_tutorial.sh ├── datapath.sh ├── get_datapath.py ├── tfrecord_baseline_eval.yaml ├── tfrecord_baseline_test.yaml └── tfrecord_baseline_train.yaml ├── docs ├── convert_tfrecords.md ├── hptuning.md └── ml_framework.md ├── log └── README.md ├── preprocessing ├── parameters.py └── process.py ├── requirements.txt └── tfrecords └── convert_tfrecords.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__/ 3 | .ipynb_checkpoints/ 4 | .vscode/ 5 | log/ 6 | config/autogenerated/ 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ml_framework"] 2 | path = ml_framework 3 | url = git@github.com:fantine/ml-framework.git 4 | [submodule "hptuning"] 5 | path = hptuning 6 | url = git@github.com:fantine/hptuning.git 7 | [submodule "containers"] 8 | path = containers 9 | url = git@github.com:fantine/containers.git 10 | [submodule "processing_utils"] 11 | path = processing_utils 12 | url = git@github.com:fantine/seismic-processing-utils.git 13 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 7 | * Neither the name of Stanford University, nor the name of Stanford Exploration Project, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 8 | * If the software is used to develop scientific or technical material that is published in any peer-reviewed papers, conference abstracts or similar publications, the recipient agrees to acknowledge the Stanford Exploration Project in a manner consistent with industry practice. 9 | * The author(s) would appreciate being notified of any errors found in the supplied code by emailing:seplib-support@sep.stanford.edu 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Microseismic event detection on fiber-optic data using machine learning 2 | 3 | - **Author:** Fantine Huot 4 | 5 | Microseismic analysis is the primary tool available for fracture 6 | characterization in unconventional reservoirs. As distributed acoustic sensing (DAS) 7 | fibers are installed in the target reservoir and are thus close to the microseismic events, they hold vast potential for high-resolution 8 | analysis of these events. 9 | 10 | However, accurately detecting microseismic signals in continuous data is 11 | challenging and time-consuming. DAS acquisitions generate substantial data 12 | volumes, and microseismic events have a low signal-to-noise ratio in individual 13 | DAS channels. 14 | 15 | In this project, we design, train, and deploy a machine learning model to automatically detect thousands of microseismic events in DAS data acquired 16 | inside a shale reservoir. The stimulation of two offset wells generates the microseismic activity. 17 | 18 | The deep learning model achieves an accuracy of over 98% on our benchmark 19 | dataset of manually picked events and even detects low-amplitude events missed 20 | during manual picking. 21 | 22 | 23 | ## Getting started 24 | 25 | ### Update the submodules 26 | After cloning the repository, run the following commands to initialize and 27 | update the submodules. 28 | 29 | ```bash 30 | git submodule init 31 | git submodule update 32 | ``` 33 | 34 | ### Requirements 35 | 36 | You can run the project from an interactive bash session within the provided 37 | [Docker](https://www.docker.com) container: 38 | ```bash 39 | docker run --gpus all -it fantine/ml_framework:latest bash 40 | ``` 41 | If you do not have root permissions to run Docker, [Singularity](https://singularity.lbl.gov) might be a good alternative for you. Refer to 42 | `containers/README.md` for more details. 43 | 44 | 45 | ## Folder structure 46 | 47 | - **bin:** Scripts to run machine learning jobs. 48 | - **config:** Configuration files. 49 | - **containers:** Details on how to use containers for this project. 50 | - **docs:** Documentation. 51 | - **log:** Directory for log files. 52 | - **ml_framework:** Machine learning framework. 53 | - **tfrecords:** Utility functions for converting files to TFRecords. 54 | 55 | ## Set the datapath for the project 56 | 57 | Set the `DATAPATH` variable inside `config/datapath.sh` to the data or scratch directory 58 | to which you want to write data files. 59 | 60 | ## Create and run a machine learning model 61 | 62 | This repository provides a parameterized, modular framework for creating and 63 | running ML models.
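A typical end-to-end run chains the conversion, training, evaluation, and prediction steps together. For example (the dataset identifiers and the job ID below are placeholders, not prescribed values):

```bash
# 1. Convert the preprocessed examples into TFRecords
#    (repeat with the eval and test configuration files).
python -m tfrecords.convert_tfrecords -c config/tfrecord_baseline_train.yaml

# 2. Train a model; take note of the job_id printed in the logs.
bin/train.sh cnn2d_tutorial baseline

# 3. Evaluate the trained model on the test dataset.
bin/evaluate.sh cnn2d_tutorial baseline <job_id>

# 4. Run inference on continuous data.
bin/predict.sh cnn2d_tutorial <continuous_dataset> <job_id>
```

The guides below describe each of these steps in detail: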
64 | 65 | - [Convert input data to TensorFlow records](docs/convert_tfrecords.md) 66 | - [Machine learning training and inference](docs/ml_framework.md) 67 | - [Hyperparameter tuning](docs/hptuning.md) 68 | -------------------------------------------------------------------------------- /bin/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run ML model evaluation 4 | # 5 | # e.g. bin/evaluate.sh model_config dataset job_id label 6 | # 7 | # @param {model_config} Name of ML model configuration to use. 8 | #   This should correspond to a configuration file named as follows: 9 | #   config/${model_config}.sh. 10 | # @param {dataset} Dataset identifier. 11 | #   Check the variable `eval_file` to make sure that this maps to the 12 | #   correct data. 13 | # @param {job_id} Job ID of the ML model to evaluate. 14 | #   Check the variable `ckpt` to make sure that this maps to the 15 | #   correct ML model checkpoint. 16 | # @param {label} Optional label to add to the job name. 17 | 18 | # Get arguments 19 | model_config=$1 20 | dataset=$2 21 | job_id=$3 22 | label=$4 23 | 24 | # Check the datapath config file 25 | datapath_file=config/datapath.sh 26 | if [ ! -f "$datapath_file" ]; then 27 | echo "Datapath config file not found: $datapath_file"; 28 | exit 1; 29 | fi 30 | 31 | # Set datapaths 32 | . "config/datapath.sh" 33 | eval_file="${DATAPATH}/tfrecords/${dataset}/test-*.tfrecord.gz" 34 | ckpt="${DATAPATH}/models/${job_id}/ckpt" 35 | 36 | # Check the ML model config file 37 | config_file=config/$model_config.sh 38 | if [ ! -f "$config_file" ]; then 39 | echo "ML model config file not found: $config_file"; 40 | exit 1; 41 | fi 42 | 43 | # Read the ML model config file 44 | . "$config_file" 45 | 46 | # Define the job name 47 | now=$(date +%Y%m%d_%H%M%S) 48 | job_name=evaluate_${now}_${model_config}_${dataset}_${label} 49 | log_file="log/${job_name}.log" 50 | 51 | # Set package and module name 52 | package_path=ml_framework/ 53 | module_name=ml_framework.evaluate 54 | 55 | # Run the job 56 | echo 'Running ML evaluation.' 57 | echo "Logging to file: $log_file" 58 | python -m $module_name \ 59 | --job_dir=$ckpt \ 60 | $MODULE_ARGS \ 61 | --eval_file=$eval_file 2>&1 | tee $log_file -------------------------------------------------------------------------------- /bin/predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run ML model prediction 4 | # 5 | # e.g. bin/predict.sh model_config dataset job_id label 6 | # 7 | # @param {model_config} Name of ML model configuration to use. 8 | #   This should correspond to a configuration file named as follows: 9 | #   config/${model_config}.sh. 10 | # @param {dataset} Dataset identifier. 11 | #   Check the variable `test_file` to make sure that this maps to the 12 | #   correct data. 13 | # @param {job_id} Job ID of the trained ML model to use for prediction. 14 | #   Check the variable `ckpt` to make sure that this maps to the 15 | #   correct ML model checkpoint. 16 | # @param {label} Optional label to add to the job name. 17 | 18 | # Get arguments 19 | model_config=$1 20 | dataset=$2 21 | job_id=$3 22 | label=$4 23 | 24 | # Check the datapath config file 25 | datapath_file=config/datapath.sh 26 | if [ ! -f "$datapath_file" ]; then 27 | echo "Datapath config file not found: $datapath_file"; 28 | exit 1; 29 | fi 30 | 31 | # Set datapaths 32 | .
"config/datapath.sh" 33 | test_file="${DATAPATH}/continuous_data/${dataset}*" 34 | ckpt="${DATAPATH}/models/${job_id}/ckpt" 35 | 36 | # Check the ML model config file 37 | config_file=config/$model_config.sh 38 | if [ ! -f "$config_file" ]; then 39 | echo "ML model config file not found: $config_file"; 40 | exit 1; 41 | fi 42 | 43 | # Read the ML model config file 44 | . "$config_file" 45 | 46 | # Define the job name 47 | now=$(date +%Y%m%d_%H%M%S) 48 | job_name=predict_${now}_${model_config}_${dataset}_${label} 49 | log_file="log/${job_name}.log" 50 | 51 | # Set package and module name 52 | package_path=ml_framework/ 53 | module_name=ml_framework.predict 54 | 55 | # Run the job 56 | echo 'Running ML prediction.' 57 | echo "Logging to file: $log_file" 58 | python -m $module_name \ 59 | --job_dir=$ckpt \ 60 | $MODULE_ARGS \ 61 | --test_file=$test_file 2>&1 | tee $log_file 62 | -------------------------------------------------------------------------------- /bin/randomsearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run ML model random search 4 | # 5 | # e.g. bin/randomsearch.sh model_config dataset label 6 | # 7 | # @param {model_config} Name of ML model configuration to use. 8 | # This should correspond to a configuration file named as follows: 9 | # config/${model_config}.sh. 10 | # @param {dataset} Dataset identifier. 11 | # Check the variables `train_file`, and `eval_file` in `bin/train.sh` 12 | # to make sure that this maps to the correct data. 13 | 14 | # Get arguments 15 | model_config=$1 16 | dataset=$2 17 | 18 | # Check the ML model config file 19 | config_file=config/$model_config.sh 20 | if [ ! -f "$config_file" ]; then 21 | echo "ML model config file not found: $config_file"; 22 | exit 1; 23 | fi 24 | 25 | hptuning_config=config/${model_config}_hptuning.yaml 26 | if [ ! -f "$hptuning_config" ]; then 27 | echo "Hyperparameter tuning config file not found: $hptuning_config"; 28 | exit 1; 29 | fi 30 | 31 | # Set job name 32 | now=$(date +%Y%m%d_%H%M%S) 33 | job_name=randomsearch_${now}_${model_config}_${dataset} 34 | log_file="log/${job_name}.log" 35 | 36 | # Set package and module name 37 | package_path=hptuning/ 38 | module_name=hptuning.random_search 39 | 40 | echo 'Running random search job.' 41 | echo "Logging to file: $log_file" 42 | python -m $module_name \ 43 | --model_config=$model_config \ 44 | --hptuning_config=$hptuning_config \ 45 | --dataset=$dataset \ 46 | --label=$now 2>&1 | tee $log_file 47 | -------------------------------------------------------------------------------- /bin/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run ML model training 4 | # 5 | # e.g. bin/train.sh model_config dataset label 6 | # 7 | # @param {model_config} Name of ML model configuration to use. 8 | # This should correspond to a configuration file named as follows: 9 | # config/${model_config}.sh. 10 | # @param {dataset} Dataset identifier. 11 | # Check the variables `train_file` and `eval_file` to make sure that 12 | # this maps to the correct data. 13 | # @param {label} Optional label to add to the job name. 14 | 15 | # Get arguments 16 | model_config=$1 17 | dataset=$2 18 | label=$3 19 | 20 | # Check the datapath config file 21 | datapath_file=config/datapath.sh 22 | if [ ! -f "$datapath_file" ]; then 23 | echo "Datapath config file not found: $datapath_file"; 24 | exit 1; 25 | fi 26 | 27 | # Set datapaths 28 | . 
"config/datapath.sh" 29 | train_file="${DATAPATH}/tfrecords/${dataset}/train-*.tfrecord.gz" 30 | eval_file="${DATAPATH}/tfrecords/${dataset}/eval-*.tfrecord.gz" 31 | 32 | # Check the ML model config file 33 | if [ "$label" != "hptuning" ]; then 34 | config_file=config/$model_config.sh 35 | else 36 | config_file=config/autogenerated/$model_config.sh 37 | # Stripping the time stamp from the model name 38 | model_config=${model_config:16} 39 | fi 40 | if [ ! -f "$config_file" ]; then 41 | echo "ML model config file not found: $config_file"; 42 | exit 1; 43 | fi 44 | 45 | # Read the ML model config file 46 | . "$config_file" 47 | 48 | # Define the job name 49 | now=$(date +%Y%m%d_%H%M%S) 50 | job_name=train_${now}_${model_config}_${dataset}_${label} 51 | job_dir="${DATAPATH}/models/${job_name}" 52 | log_file="log/${job_name}.log" 53 | 54 | # Set package and module name 55 | package_path=ml_framework/ 56 | module_name=ml_framework.train 57 | 58 | # Run the job 59 | if [ "$label" != "hptuning" ]; then 60 | echo 'Running ML job.' 61 | echo "Logging to file: $log_file" 62 | python -m $module_name \ 63 | --job_dir=$job_dir \ 64 | $MODULE_ARGS \ 65 | --train_file=$train_file \ 66 | --eval_file=$eval_file 2>&1 | tee $log_file 67 | else # if this is a hyperparameter tuning job, run it in the foreground 68 | echo "Logging to file: $log_file" 69 | python -m $module_name \ 70 | --job_dir=$job_dir \ 71 | $MODULE_ARGS \ 72 | --train_file=$train_file \ 73 | --eval_file=$eval_file \ 74 | > $log_file 2>&1 75 | fi -------------------------------------------------------------------------------- /bin/tunehp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run ML model hyperparameter tuning 4 | # 5 | # e.g. bin/tunehp.sh model_config dataset label 6 | # 7 | # @param {model_config} Name of ML model configuration to use. 8 | # This should correspond to a configuration file named as follows: 9 | # config/${model_config}.sh. 10 | # @param {dataset} Dataset identifier. 11 | # Check the variables `train_file`, and `eval_file` in `bin/train.sh` 12 | # to make sure that this maps to the correct data. 13 | 14 | # Get arguments 15 | model_config=$1 16 | dataset=$2 17 | 18 | # Check the ML model config file 19 | config_file=config/$model_config.sh 20 | if [ ! -f "$config_file" ]; then 21 | echo "ML model config file not found: $config_file"; 22 | exit 1; 23 | fi 24 | 25 | hptuning_config=config/${model_config}_hptuning.yaml 26 | if [ ! -f "$hptuning_config" ]; then 27 | echo "Hyperparameter tuning config file not found: $hptuning_config"; 28 | exit 1; 29 | fi 30 | 31 | # Set job name 32 | now=$(date +%Y%m%d_%H%M%S) 33 | job_name=hptuning_job_${now}_${model_config}_${dataset} 34 | log_file="log/${job_name}.log" 35 | 36 | # Set package and module name 37 | package_path=hptuning/ 38 | module_name=hptuning.bayes_opt 39 | 40 | echo 'Running hyperparameter tuning job.' 
41 | echo "Logging to file: $log_file" 42 | python -m $module_name \ 43 | --model_config=$model_config \ 44 | --hptuning_config=$hptuning_config \ 45 | --dataset=$dataset \ 46 | --label=$now 2>&1 | tee $log_file 47 | -------------------------------------------------------------------------------- /config/autogenerated/README.md: -------------------------------------------------------------------------------- 1 | # Directory for autogenerated files -------------------------------------------------------------------------------- /config/cnn2d_default.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | export MODULE_ARGS=" \ 4 | --model=CNN2DModular \ 5 | --height=512 \ 6 | --width=128 \ 7 | --channels=1 \ 8 | --tfrecord_height=712 \ 9 | --tfrecord_width=196 \ 10 | --num_epochs=100 \ 11 | --learning_rate=0.001 \ 12 | --batch_size=32 \ 13 | --network_depth=5 \ 14 | --num_filters=16 \ 15 | --filter_increase_mode=2 \ 16 | --filter_multiplier=8 \ 17 | --activation=0 \ 18 | --downsampling=1 \ 19 | --batchnorm=1 \ 20 | --conv_dropout=0.2 \ 21 | --dense_dropout=0.4 \ 22 | --regularizer=0 \ 23 | " 24 | -------------------------------------------------------------------------------- /config/cnn2d_example.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | export MODULE_ARGS=" \ 4 | --model=CNN2DExample \ 5 | --height=512 \ 6 | --width=128 \ 7 | --tfrecord_height=712 \ 8 | --tfrecord_width=196 \ 9 | --num_epochs=10 \ 10 | " -------------------------------------------------------------------------------- /config/cnn2d_modular.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | export MODULE_ARGS=" \ 4 | --model=CNN2DModular \ 5 | --height=512 \ 6 | --width=128 \ 7 | --tfrecord_height=712 \ 8 | --tfrecord_width=196 \ 9 | --num_epochs=100 \ 10 | --overlap=0.9375 \ 11 | " 12 | -------------------------------------------------------------------------------- /config/cnn2d_modular_hptuning.yaml: -------------------------------------------------------------------------------- 1 | max_trials: 100 2 | hyperparameters: 3 | - name: learning_rate 4 | min_value: 1e-4 5 | max_value: 1e-2 6 | scale: LOG10_SCALE 7 | - name: batch_size 8 | min_value: 32 9 | max_value: 128 10 | scale: LOG2_SCALE 11 | - name: network_depth 12 | min_value: 2 13 | max_value: 5 14 | scale: LINEAR_SCALE 15 | - name: num_filters 16 | min_value: 16 17 | max_value: 32 18 | scale: LOG2_SCALE 19 | - name: filter_increase_mode 20 | min_value: 1 21 | max_value: 3 22 | scale: LINEAR_SCALE 23 | - name: filter_multiplier 24 | min_value: 2 25 | max_value: 8 26 | scale: LOG2_SCALE 27 | - name: activation 28 | min_value: 0 29 | max_value: 1 30 | scale: LINEAR_SCALE 31 | - name: downsampling 32 | min_value: 0 33 | max_value: 1 34 | scale: LINEAR_SCALE 35 | - name: batchnorm 36 | min_value: 0 37 | max_value: 1 38 | scale: LINEAR_SCALE 39 | - name: conv_dropout 40 | min_value: 0.0 41 | max_value: 0.2 42 | scale: DECIMAL_SCALE 43 | - name: dense_dropout 44 | min_value: 0.0 45 | max_value: 0.6 46 | scale: DECIMAL_SCALE 47 | - name: regularizer 48 | min_value: 0 49 | max_value: 2 50 | scale: LINEAR_SCALE 51 | - name: regularizer_weight 52 | min_value: 1e-4 53 | max_value: 1e-2 54 | scale: LOG10_SCALE 55 | -------------------------------------------------------------------------------- /config/cnn2d_tutorial.sh: 
-------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | export MODULE_ARGS=" \ 3 | --model=CNN2DModular \ 4 | --height=512 \ 5 | --width=128 \ 6 | --tfrecord_height=712 \ 7 | --tfrecord_width=196 \ 8 | --num_epochs=100 \ 9 | --learning_rate=0.001 \ 10 | --batch_size=32 \ 11 | --network_depth=5 \ 12 | --num_filters=16 \ 13 | --filter_increase_mode=2 \ 14 | --filter_multiplier=8 \ 15 | --activation=0 \ 16 | --downsampling=1 \ 17 | --batchnorm=1 \ 18 | --conv_dropout=0.2 \ 19 | --dense_dropout=0.4 \ 20 | --regularizer=0 \ 21 | " -------------------------------------------------------------------------------- /config/datapath.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | export DATAPATH="/scr1/fantine/microseismic-detection-ml/" -------------------------------------------------------------------------------- /config/get_datapath.py: -------------------------------------------------------------------------------- 1 | """Retrieves the project datapath.""" 2 | 3 | import re 4 | 5 | 6 | _DATAPATH_FILE = 'config/datapath.sh' 7 | 8 | 9 | def get_datapath(): 10 | """Gets the project datapath.""" 11 | regex_pattern = r'DATAPATH="(\S+)"' 12 | with open(_DATAPATH_FILE, 'r') as f: 13 | datapath_text = f.read() 14 | regex_match = re.search(regex_pattern, datapath_text) 15 | if regex_match: 16 | return regex_match.group(1) 17 | raise ValueError( 18 | 'Please set a correct datapath in {}'.format(_DATAPATH_FILE)) 19 | -------------------------------------------------------------------------------- /config/tfrecord_baseline_eval.yaml: -------------------------------------------------------------------------------- 1 | manifest_file: tfrecords/manifests/baseline_eval_manifest.txt 2 | output_file_prefix: tfrecords/baseline/eval 3 | num_shards: 4 4 | min_val: -15.353061 5 | max_val: 15.353061 -------------------------------------------------------------------------------- /config/tfrecord_baseline_test.yaml: -------------------------------------------------------------------------------- 1 | manifest_file: tfrecords/manifests/baseline_test_manifest.txt 2 | output_file_prefix: tfrecords/baseline/test 3 | num_shards: 4 4 | min_val: -15.353061 5 | max_val: 15.353061 -------------------------------------------------------------------------------- /config/tfrecord_baseline_train.yaml: -------------------------------------------------------------------------------- 1 | manifest_file: tfrecords/manifests/baseline_train_manifest.txt 2 | output_file_prefix: tfrecords/baseline/train 3 | num_shards: 23 4 | min_val: -15.353061 5 | max_val: 15.353061 -------------------------------------------------------------------------------- /docs/convert_tfrecords.md: -------------------------------------------------------------------------------- 1 | # Convert input data to TensorFlow records (TFRecords) 2 | 3 | Once all the machine learning examples are preprocessed, we convert them into 4 | TensorFlow records to optimize the input pipeline for machine learning 5 | training and inference. 6 | 7 | Reading large numbers of small files significantly impacts I/O performance. 8 | For large datasets, we preprocess the input data into larger (~100MB) TFRecord 9 | files to get maximum I/O throughput. 10 | 11 | ## Create TFRecords manifest files 12 | To generate the TFRecords, we create a manifest file for the dataset. 13 | The manifest file serves as a recipe for generating the TFRecords. 
14 | It is a text file that contains all the filenames that constitute the dataset. 15 | 16 | We generate a manifest file for each of the three datasets: training, 17 | evaluation, and testing. We take the full list of data files, separate them 18 | into these three datasets, and shuffle them before writing out each manifest 19 | file. 20 | 21 | ## Create TensorFlow records 22 | Convert the data files into TFRecord files with the following command: 23 | ```bash 24 | python -m tfrecords.convert_tfrecords -c config/tfrecord_baseline_train.yaml 25 | ``` 26 | 27 | The `-c` flag specifies a configuration file. 28 | You can create your own TFRecord configuration file inside the `config/` folder 29 | and replace `config/tfrecord_baseline_train.yaml` with the name of your 30 | TFRecord configuration file. Look at other TFRecord configuration files for 31 | examples. 32 | 33 | In the TFRecord configuration file, the `DATAPATH` specified in 34 | `config/datapath.sh` will automatically be prefixed to all the paths, so all 35 | the paths should be specified as relative paths. 36 | 37 | ## TFRecord configuration files 38 | Variables in the TFRecord configuration file: 39 | 40 | - `manifest_file`: Manifest file that contains all the filenames that 41 | constitute the dataset to convert into TFRecords. If no manifest file is found, 42 | the script will use `input_file_pattern` to create the list of filenames, and 43 | write them to the file specified by `manifest_file`. 44 | 45 | - `input_file_pattern`: A Unix glob file pattern. When a manifest file is 46 | provided, this variable is ignored. If no manifest file is found, the script 47 | uses this file pattern to create the list of files for the manifest file. 48 | 49 | - `output_file_prefix`: Filename prefix to write the TFRecords. 50 | 51 | - `num_shards`: Number of TFRecord shards to generate. Adjust this number to 52 | generate files of about 100MB. 53 | 54 | - `min_val` and `max_val` (optional): When specified, the data are clipped and 55 | rescaled using these values, scaling the dataset to the [0, 1] range. -------------------------------------------------------------------------------- /docs/hptuning.md: -------------------------------------------------------------------------------- 1 | # Hyperparameter tuning 2 | 3 | The hyperparameters are tuned using Bayesian optimization. 4 | 5 | ## Run a hyperparameter tuning task 6 | To tune the hyperparameters for an ML model: 7 | ```bash 8 | bin/tunehp.sh model_config dataset 9 | ``` 10 | 11 | - `model_config`: Name of the ML model configuration to use. This should 12 | correspond to a configuration file named `config/model_config.sh`. 13 | - `dataset`: Dataset identifier. Check the variables `train_file` and 14 | `eval_file` in `bin/train.sh` to ensure that this maps to the correct data. 15 | 16 | ## Define the domain for hyperparameter tuning 17 | 18 | You can define the domain to explore for hyperparameter tuning by creating a 19 | corresponding configuration file: `config/your_model_config_hptuning.yaml`. 20 | Look at other hyperparameter tuning configuration files in `config/` for 21 | examples.
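For instance, a pared-down domain that only explores the learning rate, batch size, and network depth might look like the following sketch (the values are illustrative; the field names mirror `config/cnn2d_modular_hptuning.yaml`):

```yaml
max_trials: 20
hyperparameters:
  - name: learning_rate
    min_value: 1e-4
    max_value: 1e-2
    scale: LOG10_SCALE
  - name: batch_size
    min_value: 32
    max_value: 128
    scale: LOG2_SCALE
  - name: network_depth
    min_value: 2
    max_value: 5
    scale: LINEAR_SCALE
```

Save it as `config/your_model_config_hptuning.yaml` so that `bin/tunehp.sh your_model_config dataset` picks it up.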
-------------------------------------------------------------------------------- /docs/ml_framework.md: -------------------------------------------------------------------------------- 1 | # Machine learning training and inference 2 | 3 | ## Train an ML model 4 | To train an ML model, use the following command: 5 | ```bash 6 | bin/train.sh model_config dataset 7 | ``` 8 | 9 | - `model_config`: Name of ML model configuration to use. This should correspond 10 | to a configuration file named `config/model_config.sh`. 11 | - `dataset`: Dataset identifier. Check the variables `train_file` and 12 | `eval_file` in `bin/train.sh` to make sure that this maps to the correct data. 13 | 14 | For example, to run the tutorial machine learning model on the baseline dataset: 15 | ```bash 16 | bin/train.sh cnn2d_tutorial baseline 17 | ``` 18 | 19 | The training logs are written to a file of the form `log/job_id.log`. Take note 20 | of the `job_id`, as it will be required at inference time: the trained network 21 | is saved at `DATAPATH/models/job_id/`. 22 | 23 | ### Configure ML model parameters 24 | The parameters for an ML task can be configured by creating a corresponding 25 | configuration file: `config/your_model_config.sh`. Look at other ML model 26 | configuration files in `config/` for examples. 27 | 28 | ### Create a new ML model architecture 29 | - Create a new `your_model.py` file inside the `ml_framework/model` folder. 30 | Look at other models inside the folder for examples. 31 | - Reference your new model in `ml_framework/model/__init__.py`. 32 | - Set the `model` argument to your new model's name in your model configuration 33 | file `config/your_model_config.sh`. 34 | 35 | ## Evaluate an ML model 36 | 37 | Once the model is trained, evaluate the model's performance: 38 | ```bash 39 | bin/evaluate.sh model_config dataset job_id 40 | ``` 41 | 42 | - `model_config`: Name of ML model configuration to use. This should correspond 43 | to a configuration file named `config/model_config.sh`. 44 | - `dataset`: Dataset identifier. Check the variable `eval_file` in 45 | `bin/evaluate.sh` to make sure that this maps to the correct data. 46 | - `job_id`: The job identifier of the machine learning training from which to 47 | load the trained network. 48 | 49 | This evaluates the model's performance on the test dataset and saves the 50 | corresponding logits to the following file: 51 | `DATAPATH/models/job_id/eval_logits.npy`. 52 | The logits are saved in the same order as provided in the input data pipeline, 53 | which means that they correspond to each line of the TFRecord manifest file. 54 | 55 | ## Run inference on continuous data 56 | 57 | Once the model is trained, run inference on continuous data: 58 | ```bash 59 | bin/predict.sh model_config dataset job_id 60 | ``` 61 | 62 | - `model_config`: Name of ML model configuration to use. This should correspond 63 | to a configuration file named `config/model_config.sh`. 64 | - `dataset`: Dataset identifier. Check the variable `test_file` in 65 | `bin/predict.sh` to make sure that this maps to the correct data. 66 | - `job_id`: The job identifier of the machine learning training from which to 67 | load the trained network. 68 | 69 | `test_file` should be a Unix glob pattern that matches the continuous data 70 | NumPy files on which to run inference. For each continuous data file 71 | `path_to_continuous_data/filename.npy`, a sliding window runs through the data 72 | and the corresponding logits are saved to 73 | `path_to_continuous_data/filename_logits.npy`.
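The saved logits can then be post-processed with standard NumPy tools. As a rough sketch (the sigmoid, the 0.5 threshold, and the assumption of one logit per sliding-window position are illustrative choices, not part of the framework):

```python
# Sketch: flag sliding-window positions that likely contain a microseismic event.
import numpy as np

logits = np.load('path_to_continuous_data/filename_logits.npy')
probabilities = 1.0 / (1.0 + np.exp(-logits.reshape(-1)))  # sigmoid of each logit

detections = np.flatnonzero(probabilities > 0.5)  # indices of candidate event windows
print(f'{detections.size} candidate windows out of {probabilities.size}')
```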
The amount of overlap between 74 | sliding windows can be set by adding an `overlap` argument to the model 75 | configuration. 76 | 77 | 78 | -------------------------------------------------------------------------------- /log/README.md: -------------------------------------------------------------------------------- 1 | # Directory for log files -------------------------------------------------------------------------------- /preprocessing/parameters.py: -------------------------------------------------------------------------------- 1 | """Parameters for data preprocessing.""" 2 | 3 | import os 4 | from config import get_datapath 5 | 6 | # pylint: disable=invalid-name 7 | 8 | datapath = get_datapath.get_datapath() 9 | 10 | unprocessed_datapath = os.path.join(datapath, 'unprocessed_data') 11 | processed_datapath = os.path.join(datapath, 'processed_data') 12 | std_channels_file = 'std_channels.npy' 13 | clip_value = 11.21 # 99.5th percentile of absolute amplitudes 14 | -------------------------------------------------------------------------------- /preprocessing/process.py: -------------------------------------------------------------------------------- 1 | """Data processing.""" 2 | 3 | import logging 4 | import os 5 | 6 | import numpy as np 7 | 8 | from preprocessing import parameters 9 | from processing_utils import processing_utils as processing 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | 13 | 14 | def _process(data, clip_value, std_channels): 15 | data = np.clip(data, -clip_value, clip_value) 16 | data = data / np.expand_dims(std_channels, axis=1) 17 | return data 18 | 19 | 20 | def get_start_channel(filename): 21 | basename = os.path.basename(filename) 22 | return int(basename[-16:-12]) 23 | 24 | 25 | def process(file_pattern, in_dir, out_dir, clip_value, std_channels): 26 | filenames = processing.get_filenames(file_pattern) 27 | 28 | for i, filename in enumerate(filenames): 29 | if i % 1000 == 0: 30 | logging.info('Processed %s files.', i) 31 | data = np.load(filename) 32 | start_ch = get_start_channel(filename) 33 | n_channels = data.shape[0] 34 | data = _process(data, clip_value, 35 | std_channels[start_ch:start_ch + n_channels]) 36 | out_file = filename.replace(in_dir, out_dir) 37 | os.makedirs(os.path.dirname(out_file), exist_ok=True) 38 | np.save(out_file, data) 39 | 40 | 41 | def main(): 42 | file_pattern = os.path.join(parameters.unprocessed_datapath, '*/*') 43 | std_channels = np.load(parameters.std_channels_file) 44 | process( 45 | file_pattern, 46 | in_dir=parameters.unprocessed_datapath, 47 | out_dir=parameters.processed_datapath, 48 | clip_value=parameters.clip_value, 49 | std_channels=std_channels, 50 | ) 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-gpu>=2.4.1 2 | GPyOpt==1.2.6 3 | h5py==2.10.0 4 | matplotlib==3.3.2 5 | numpy==1.19.2 6 | PyYAML==5.3.1 7 | scipy==1.4.1 -------------------------------------------------------------------------------- /tfrecords/convert_tfrecords.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import enum 3 | import logging 4 | import os 5 | import random 6 | import re 7 | import sys 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | import yaml 12 | 13 | from config import get_datapath 14 | 15 | 16 | random.seed(42) 17 | 18 | 19 | class 
CompressionType(enum.Enum): 20 | GZIP = 'GZIP' 21 | NONE = '' 22 | 23 | 24 | _FILE_EXTENSION = { 25 | CompressionType.GZIP: '.gz', 26 | CompressionType.NONE: '', 27 | } 28 | 29 | 30 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO) 31 | 32 | 33 | # def _float_feature(data): 34 | # return tf.train.Feature(float_list=tf.train.FloatList(value=data.reshape(-1))) 35 | 36 | def _bytes_feature(data): 37 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[data])) 38 | 39 | 40 | def create_tf_example(inputs, labels): 41 | feature_dict = { 42 | 'inputs': _bytes_feature(inputs.tobytes()), 43 | 'labels': _bytes_feature(labels.tobytes()), 44 | } 45 | return tf.train.Example(features=tf.train.Features(feature=feature_dict)) 46 | 47 | 48 | class DataLoader(): 49 | def __init__(self, min_val, max_val): 50 | self.min_val = min_val 51 | self.max_val = max_val 52 | 53 | def _clip_and_rescale(self, data): 54 | data = np.clip(data, self.min_val, self.max_val) 55 | return np.divide((data - self.min_val), (self.max_val - self.min_val)) 56 | 57 | @ staticmethod 58 | def _get_label(filename): 59 | if 'noise' in filename: 60 | return np.zeros((1,), dtype=np.float32) 61 | return np.ones((1,), dtype=np.float32) 62 | 63 | def read(self, filename): 64 | inputs = np.float32(np.load(filename)) 65 | if self.min_val != 0.0 or self.max_val != 1.0: 66 | inputs = self._clip_and_rescale(inputs) 67 | labels = self._get_label(filename) 68 | return inputs, labels 69 | 70 | 71 | def _get_file_suffix(compression_type): 72 | return '.tfrecord{}'.format(_FILE_EXTENSION[compression_type]) 73 | 74 | 75 | def read_manifest(manifest_file): 76 | with open(manifest_file, 'r') as f: 77 | file_list = [line.rstrip() for line in f] 78 | logging.info('Converting %s files into TFRecords.', len(file_list)) 79 | return file_list 80 | 81 | 82 | def _glob(file_pattern): 83 | return sorted(tf.io.gfile.glob(file_pattern)) 84 | 85 | 86 | def create_manifest(manifest_file, file_pattern, shuffle=True): 87 | file_list = _glob(file_pattern) 88 | if shuffle: 89 | random.shuffle(file_list) 90 | os.makedirs(os.path.dirname(manifest_file), exist_ok=True) 91 | with open(manifest_file, 'w') as f: 92 | for filename in file_list: 93 | f.write(filename + '\n') 94 | 95 | 96 | def convert_to_tfrecords(params): 97 | datapath = get_datapath.get_datapath() 98 | manifest_file = os.path.join(datapath, params.manifest_file) 99 | if not os.path.exists(manifest_file): 100 | logging.info('Creating manifest file: %s', manifest_file) 101 | create_manifest(manifest_file, os.path.join( 102 | datapath, params.input_file_pattern)) 103 | else: 104 | logging.info('Using the existing manifest file: %s', manifest_file) 105 | 106 | file_list = read_manifest(manifest_file) 107 | file_shards = np.array_split(file_list, params.num_shards) 108 | file_suffix = _get_file_suffix(params.compression_type) 109 | options = tf.io.TFRecordOptions( 110 | compression_type=params.compression_type.value) 111 | data_loader = DataLoader(params.min_val, params.max_val) 112 | output_file_prefix = os.path.join(datapath, params.output_file_prefix) 113 | 114 | os.makedirs(os.path.dirname(output_file_prefix), exist_ok=True) 115 | for i, file_shard in enumerate(file_shards): 116 | tfrecord_file = '{}-{:04d}-of-{:04d}{}'.format( 117 | output_file_prefix, i, params.num_shards, file_suffix) 118 | logging.info('Writing %s', tfrecord_file) 119 | with tf.io.TFRecordWriter(tfrecord_file, options=options) as writer: 120 | for filename in file_shard: 121 | inputs, outputs = 
data_loader.read(filename) 122 | tf_example = create_tf_example(inputs, outputs) 123 | writer.write(tf_example.SerializeToString()) 124 | 125 | 126 | class ArgumentParser(): 127 | 128 | def __init__(self): 129 | config_parser = argparse.ArgumentParser( 130 | formatter_class=argparse.RawDescriptionHelpFormatter, 131 | add_help=False) 132 | 133 | config_parser.add_argument( 134 | '-c', '--config-file', 135 | help='Parse script arguments from config file.', 136 | default=None, 137 | metavar='FILE') 138 | 139 | self._config_parser = config_parser 140 | 141 | self._parser = argparse.ArgumentParser(parents=[config_parser]) 142 | 143 | @ staticmethod 144 | def _parse_config(items): 145 | argv = [] 146 | for k, v in items: 147 | argv.append('--{}'.format(k)) 148 | argv.append(v) 149 | return argv 150 | 151 | def _add_arguments(self, defaults=None): 152 | parser = self._parser 153 | 154 | parser.add_argument( 155 | '--input_file_pattern', 156 | help='Input data files.', 157 | default='', 158 | ) 159 | parser.add_argument( 160 | '--output_file_prefix', 161 | help='Output file prefix.', 162 | default='tfrecords/', 163 | ) 164 | parser.add_argument( 165 | '--input_height', 166 | help='Input data height.', 167 | type=int, 168 | ) 169 | parser.add_argument( 170 | '--input_width', 171 | help='Input data width.', 172 | type=int, 173 | default=1, 174 | ) 175 | parser.add_argument( 176 | '--input_depth', 177 | help='Input data depth.', 178 | type=int, 179 | default=1, 180 | ) 181 | parser.add_argument( 182 | '--input_channels', 183 | help='Input data channels.', 184 | type=int, 185 | default=1, 186 | ) 187 | parser.add_argument( 188 | '--num_shards', 189 | help='Number of shards to generate.', 190 | type=int, 191 | default=0, 192 | ) 193 | parser.add_argument( 194 | '--compression_type', 195 | help='File compression type.', 196 | type=CompressionType, 197 | choices=list(CompressionType), 198 | default=CompressionType.GZIP, 199 | ) 200 | parser.add_argument( 201 | '--manifest_file', 202 | help='Manifest file.', 203 | default='tfrecords/manifests/manifest.txt', 204 | ) 205 | parser.add_argument( 206 | '--min_val', 207 | help='Minimum value.', 208 | type=float, 209 | default=0.0, 210 | ) 211 | parser.add_argument( 212 | '--max_val', 213 | help='Maximum value.', 214 | type=float, 215 | default=1.0, 216 | ) 217 | 218 | def parse_known_args(self, argv): 219 | args, remaining_argv = self._config_parser.parse_known_args(argv) 220 | if args.config_file: 221 | with open(args.config_file, 'r') as config: 222 | defaults = yaml.safe_load(config) 223 | defaults['config_file'] = args.config_file 224 | else: 225 | defaults = dict() 226 | self._add_arguments(defaults=defaults) 227 | self._parser.set_defaults(**defaults) 228 | 229 | return self._parser.parse_known_args(remaining_argv) 230 | 231 | 232 | def main(): 233 | params, _ = ArgumentParser().parse_known_args(sys.argv[1:]) 234 | convert_to_tfrecords(params) 235 | 236 | 237 | if __name__ == '__main__': 238 | main() 239 | --------------------------------------------------------------------------------