├── .gitignore ├── .gitmodules ├── LICENSE.txt ├── README.md ├── bin ├── evaluate.sh ├── predict.sh ├── randomsearch.sh ├── train.sh └── tunehp.sh ├── config ├── autogenerated │ └── README.md ├── cnn2d_default.sh ├── cnn2d_example.sh ├── cnn2d_modular.sh ├── cnn2d_modular_hptuning.yaml ├── cnn2d_tutorial.sh ├── datapath.sh ├── get_datapath.py ├── tfrecord_baseline_eval.yaml ├── tfrecord_baseline_test.yaml └── tfrecord_baseline_train.yaml ├── docs ├── convert_tfrecords.md ├── hptuning.md └── ml_framework.md ├── log └── README.md ├── preprocessing ├── parameters.py └── process.py ├── requirements.txt └── tfrecords └── convert_tfrecords.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__/ 3 | .ipynb_checkpoints/ 4 | .vscode/ 5 | log/ 6 | config/autogenerated/ 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ml_framework"] 2 | path = ml_framework 3 | url = git@github.com:fantine/ml-framework.git 4 | [submodule "hptuning"] 5 | path = hptuning 6 | url = git@github.com:fantine/hptuning.git 7 | [submodule "containers"] 8 | path = containers 9 | url = git@github.com:fantine/containers.git 10 | [submodule "processing_utils"] 11 | path = processing_utils 12 | url = git@github.com:fantine/seismic-processing-utils.git 13 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 7 | * Neither the name of Stanford University, nor the name of Stanford Exploration Project, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 8 | * If the software is used to develop scientific or technical material that is published in any peer-reviewed papers, conference abstracts or similar publications, the recipient agrees to acknowledge the Stanford Exploration Project in a manner consistent with industry practice. 9 | * The author(s) would appreciate being notified of any errors found in the supplied code by emailing:seplib-support@sep.stanford.edu 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Microseismic event detection on fiber-optic data using machine learning 2 | 3 | - **Author:** Fantine Huot 4 | 5 | Microseismic analysis is the primary tool available for fracture 6 | characterization in unconventional reservoirs. As distributed acoustic sensing (DAS) 7 | fibers are installed in the target reservoir and are thus close to the microseismic events, they hold vast potential for high-resolution 8 | analysis of these events. 9 | 10 | However, accurately detecting microseismic signals in continuous data is 11 | challenging and time-consuming. DAS acquisitions generate substantial data 12 | volumes, and microseismic events have a low signal-to-noise ratio in individual 13 | DAS channels. 14 | 15 | In this project, we design, train, and deploy a machine learning model to automatically detect thousands of microseismic events in DAS data acquired 16 | inside a shale reservoir. The stimulation of two offset wells generates the microseismic activity. 17 | 18 | The deep learning model achieves an accuracy of over 98% on our benchmark 19 | dataset of manually picked events and even detects low-amplitude events missed 20 | during manual picking. 21 | 22 | 23 | ## Getting started 24 | 25 | ### Update the submodules 26 | After cloning the repository, run the following commands to initialize and 27 | update the submodules. 28 | 29 | ```bash 30 | git submodule init 31 | git submodule update 32 | ``` 33 | 34 | ### Requirements 35 | 36 | You can run the project from an interactive bash session within the provided 37 | [Docker](https://www.docker.com) container: 38 | ```bash 39 | docker run --gpus all -it fantine/ml_framework:latest bash 40 | ``` 41 | If you do not have root permissions to run Docker, [Singularity](https://singularity.lbl.gov) might be a good alternative for you. Refer to 42 | `containers/README.md` for more details. 43 | 44 | 45 | ## Folder structure 46 | 47 | - **bin:** Scripts to run machine learning jobs. 48 | - **config:** Configuration files. 49 | - **containers:** Details on how to use containers for this project. 50 | - **docs:** Documentation. 51 | - **log:** Directory for log files. 52 | - **ml_framework:** Machine learning framework. 53 | - **tfrecords:** Utility functions for converting files to TFRecords. 54 | 55 | ## Set the datapath for the project 56 | 57 | Set the `DATAPATH` variable inside `config/datapath.sh` to the data or scratch directory 58 | to which you want to write data files. 59 | 60 | ## Create and run a machine learning model 61 | 62 | This repository provides a parameterized, modular framework for creating and 63 | running ML models.
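A typical end-to-end run chains the conversion, training, evaluation, and prediction steps together. For example (the dataset identifiers and the job ID below are placeholders, not prescribed values):

```bash
# 1. Convert the preprocessed examples into TFRecords
#    (repeat with the eval and test configuration files).
python -m tfrecords.convert_tfrecords -c config/tfrecord_baseline_train.yaml

# 2. Train a model; take note of the job_id printed in the logs.
bin/train.sh cnn2d_tutorial baseline

# 3. Evaluate the trained model on the test dataset.
bin/evaluate.sh cnn2d_tutorial baseline <job_id>

# 4. Run inference on continuous data.
bin/predict.sh cnn2d_tutorial <continuous_dataset> <job_id>
```

The guides below describe each of these steps in detail: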
64 | 65 | - [Convert input data to TensorFlow records](docs/convert_tfrecords.md) 66 | - [Machine learning training and inference](docs/ml_framework.md) 67 | - [Hyperparameter tuning](docs/hptuning.md) 68 | -------------------------------------------------------------------------------- /bin/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run ML model evaluation 4 | # 5 | # e.g. bin/evaluate.sh model_config dataset job_id label 6 | # 7 | # @param {model_config} Name of ML model configuration to use. 8 | #   This should correspond to a configuration file named as follows: 9 | #   config/${model_config}.sh. 10 | # @param {dataset} Dataset identifier. 11 | #   Check the variable `eval_file` to make sure that this maps to the 12 | #   correct data. 13 | # @param {job_id} Job ID of the ML model to evaluate. 14 | #   Check the variable `ckpt` to make sure that this maps to the 15 | #   correct ML model checkpoint. 16 | # @param {label} Optional label to add to the job name. 17 | 18 | # Get arguments 19 | model_config=$1 20 | dataset=$2 21 | job_id=$3 22 | label=$4 23 | 24 | # Check the datapath config file 25 | datapath_file=config/datapath.sh 26 | if [ ! -f "$datapath_file" ]; then 27 | echo "Datapath config file not found: $datapath_file"; 28 | exit 1; 29 | fi 30 | 31 | # Set datapaths 32 | . "config/datapath.sh" 33 | eval_file="${DATAPATH}/tfrecords/${dataset}/test-*.tfrecord.gz" 34 | ckpt="${DATAPATH}/models/${job_id}/ckpt" 35 | 36 | # Check the ML model config file 37 | config_file=config/$model_config.sh 38 | if [ ! -f "$config_file" ]; then 39 | echo "ML model config file not found: $config_file"; 40 | exit 1; 41 | fi 42 | 43 | # Read the ML model config file 44 | . "$config_file" 45 | 46 | # Define the job name 47 | now=$(date +%Y%m%d_%H%M%S) 48 | job_name=evaluate_${now}_${model_config}_${dataset}_${label} 49 | log_file="log/${job_name}.log" 50 | 51 | # Set package and module name 52 | package_path=ml_framework/ 53 | module_name=ml_framework.evaluate 54 | 55 | # Run the job 56 | echo 'Running ML evaluation.' 57 | echo "Logging to file: $log_file" 58 | python -m $module_name \ 59 | --job_dir=$ckpt \ 60 | $MODULE_ARGS \ 61 | --eval_file=$eval_file 2>&1 | tee $log_file -------------------------------------------------------------------------------- /bin/predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run ML model prediction 4 | # 5 | # e.g. bin/predict.sh model_config dataset job_id label 6 | # 7 | # @param {model_config} Name of ML model configuration to use. 8 | #   This should correspond to a configuration file named as follows: 9 | #   config/${model_config}.sh. 10 | # @param {dataset} Dataset identifier. 11 | #   Check the variable `test_file` to make sure that this maps to the 12 | #   correct data. 13 | # @param {job_id} Job ID of the trained ML model to use for prediction. 14 | #   Check the variable `ckpt` to make sure that this maps to the 15 | #   correct ML model checkpoint. 16 | # @param {label} Optional label to add to the job name. 17 | 18 | # Get arguments 19 | model_config=$1 20 | dataset=$2 21 | job_id=$3 22 | label=$4 23 | 24 | # Check the datapath config file 25 | datapath_file=config/datapath.sh 26 | if [ ! -f "$datapath_file" ]; then 27 | echo "Datapath config file not found: $datapath_file"; 28 | exit 1; 29 | fi 30 | 31 | # Set datapaths 32 | .
"config/datapath.sh" 33 | test_file="${DATAPATH}/continuous_data/${dataset}*" 34 | ckpt="${DATAPATH}/models/${job_id}/ckpt" 35 | 36 | # Check the ML model config file 37 | config_file=config/$model_config.sh 38 | if [ ! -f "$config_file" ]; then 39 | echo "ML model config file not found: $config_file"; 40 | exit 1; 41 | fi 42 | 43 | # Read the ML model config file 44 | . "$config_file" 45 | 46 | # Define the job name 47 | now=$(date +%Y%m%d_%H%M%S) 48 | job_name=predict_${now}_${model_config}_${dataset}_${label} 49 | log_file="log/${job_name}.log" 50 | 51 | # Set package and module name 52 | package_path=ml_framework/ 53 | module_name=ml_framework.predict 54 | 55 | # Run the job 56 | echo 'Running ML prediction.' 57 | echo "Logging to file: $log_file" 58 | python -m $module_name \ 59 | --job_dir=$ckpt \ 60 | $MODULE_ARGS \ 61 | --test_file=$test_file 2>&1 | tee $log_file 62 | -------------------------------------------------------------------------------- /bin/randomsearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run ML model random search 4 | # 5 | # e.g. bin/randomsearch.sh model_config dataset label 6 | # 7 | # @param {model_config} Name of ML model configuration to use. 8 | # This should correspond to a configuration file named as follows: 9 | # config/${model_config}.sh. 10 | # @param {dataset} Dataset identifier. 11 | # Check the variables `train_file`, and `eval_file` in `bin/train.sh` 12 | # to make sure that this maps to the correct data. 13 | 14 | # Get arguments 15 | model_config=$1 16 | dataset=$2 17 | 18 | # Check the ML model config file 19 | config_file=config/$model_config.sh 20 | if [ ! -f "$config_file" ]; then 21 | echo "ML model config file not found: $config_file"; 22 | exit 1; 23 | fi 24 | 25 | hptuning_config=config/${model_config}_hptuning.yaml 26 | if [ ! -f "$hptuning_config" ]; then 27 | echo "Hyperparameter tuning config file not found: $hptuning_config"; 28 | exit 1; 29 | fi 30 | 31 | # Set job name 32 | now=$(date +%Y%m%d_%H%M%S) 33 | job_name=randomsearch_${now}_${model_config}_${dataset} 34 | log_file="log/${job_name}.log" 35 | 36 | # Set package and module name 37 | package_path=hptuning/ 38 | module_name=hptuning.random_search 39 | 40 | echo 'Running random search job.' 41 | echo "Logging to file: $log_file" 42 | python -m $module_name \ 43 | --model_config=$model_config \ 44 | --hptuning_config=$hptuning_config \ 45 | --dataset=$dataset \ 46 | --label=$now 2>&1 | tee $log_file 47 | -------------------------------------------------------------------------------- /bin/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run ML model training 4 | # 5 | # e.g. bin/train.sh model_config dataset label 6 | # 7 | # @param {model_config} Name of ML model configuration to use. 8 | # This should correspond to a configuration file named as follows: 9 | # config/${model_config}.sh. 10 | # @param {dataset} Dataset identifier. 11 | # Check the variables `train_file` and `eval_file` to make sure that 12 | # this maps to the correct data. 13 | # @param {label} Optional label to add to the job name. 14 | 15 | # Get arguments 16 | model_config=$1 17 | dataset=$2 18 | label=$3 19 | 20 | # Check the datapath config file 21 | datapath_file=config/datapath.sh 22 | if [ ! -f "$datapath_file" ]; then 23 | echo "Datapath config file not found: $datapath_file"; 24 | exit 1; 25 | fi 26 | 27 | # Set datapaths 28 | . 
"config/datapath.sh" 29 | train_file="${DATAPATH}/tfrecords/${dataset}/train-*.tfrecord.gz" 30 | eval_file="${DATAPATH}/tfrecords/${dataset}/eval-*.tfrecord.gz" 31 | 32 | # Check the ML model config file 33 | if [ "$label" != "hptuning" ]; then 34 | config_file=config/$model_config.sh 35 | else 36 | config_file=config/autogenerated/$model_config.sh 37 | # Stripping the time stamp from the model name 38 | model_config=${model_config:16} 39 | fi 40 | if [ ! -f "$config_file" ]; then 41 | echo "ML model config file not found: $config_file"; 42 | exit 1; 43 | fi 44 | 45 | # Read the ML model config file 46 | . "$config_file" 47 | 48 | # Define the job name 49 | now=$(date +%Y%m%d_%H%M%S) 50 | job_name=train_${now}_${model_config}_${dataset}_${label} 51 | job_dir="${DATAPATH}/models/${job_name}" 52 | log_file="log/${job_name}.log" 53 | 54 | # Set package and module name 55 | package_path=ml_framework/ 56 | module_name=ml_framework.train 57 | 58 | # Run the job 59 | if [ "$label" != "hptuning" ]; then 60 | echo 'Running ML job.' 61 | echo "Logging to file: $log_file" 62 | python -m $module_name \ 63 | --job_dir=$job_dir \ 64 | $MODULE_ARGS \ 65 | --train_file=$train_file \ 66 | --eval_file=$eval_file 2>&1 | tee $log_file 67 | else # if this is a hyperparameter tuning job, run it in the foreground 68 | echo "Logging to file: $log_file" 69 | python -m $module_name \ 70 | --job_dir=$job_dir \ 71 | $MODULE_ARGS \ 72 | --train_file=$train_file \ 73 | --eval_file=$eval_file \ 74 | > $log_file 2>&1 75 | fi -------------------------------------------------------------------------------- /bin/tunehp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run ML model hyperparameter tuning 4 | # 5 | # e.g. bin/tunehp.sh model_config dataset label 6 | # 7 | # @param {model_config} Name of ML model configuration to use. 8 | # This should correspond to a configuration file named as follows: 9 | # config/${model_config}.sh. 10 | # @param {dataset} Dataset identifier. 11 | # Check the variables `train_file`, and `eval_file` in `bin/train.sh` 12 | # to make sure that this maps to the correct data. 13 | 14 | # Get arguments 15 | model_config=$1 16 | dataset=$2 17 | 18 | # Check the ML model config file 19 | config_file=config/$model_config.sh 20 | if [ ! -f "$config_file" ]; then 21 | echo "ML model config file not found: $config_file"; 22 | exit 1; 23 | fi 24 | 25 | hptuning_config=config/${model_config}_hptuning.yaml 26 | if [ ! -f "$hptuning_config" ]; then 27 | echo "Hyperparameter tuning config file not found: $hptuning_config"; 28 | exit 1; 29 | fi 30 | 31 | # Set job name 32 | now=$(date +%Y%m%d_%H%M%S) 33 | job_name=hptuning_job_${now}_${model_config}_${dataset} 34 | log_file="log/${job_name}.log" 35 | 36 | # Set package and module name 37 | package_path=hptuning/ 38 | module_name=hptuning.bayes_opt 39 | 40 | echo 'Running hyperparameter tuning job.' 
41 | echo "Logging to file: $log_file" 42 | python -m $module_name \ 43 | --model_config=$model_config \ 44 | --hptuning_config=$hptuning_config \ 45 | --dataset=$dataset \ 46 | --label=$now 2>&1 | tee $log_file 47 | -------------------------------------------------------------------------------- /config/autogenerated/README.md: -------------------------------------------------------------------------------- 1 | # Directory for autogenerated files -------------------------------------------------------------------------------- /config/cnn2d_default.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | export MODULE_ARGS=" \ 4 | --model=CNN2DModular \ 5 | --height=512 \ 6 | --width=128 \ 7 | --channels=1 \ 8 | --tfrecord_height=712 \ 9 | --tfrecord_width=196 \ 10 | --num_epochs=100 \ 11 | --learning_rate=0.001 \ 12 | --batch_size=32 \ 13 | --network_depth=5 \ 14 | --num_filters=16 \ 15 | --filter_increase_mode=2 \ 16 | --filter_multiplier=8 \ 17 | --activation=0 \ 18 | --downsampling=1 \ 19 | --batchnorm=1 \ 20 | --conv_dropout=0.2 \ 21 | --dense_dropout=0.4 \ 22 | --regularizer=0 \ 23 | " 24 | -------------------------------------------------------------------------------- /config/cnn2d_example.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | export MODULE_ARGS=" \ 4 | --model=CNN2DExample \ 5 | --height=512 \ 6 | --width=128 \ 7 | --tfrecord_height=712 \ 8 | --tfrecord_width=196 \ 9 | --num_epochs=10 \ 10 | " -------------------------------------------------------------------------------- /config/cnn2d_modular.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | export MODULE_ARGS=" \ 4 | --model=CNN2DModular \ 5 | --height=512 \ 6 | --width=128 \ 7 | --tfrecord_height=712 \ 8 | --tfrecord_width=196 \ 9 | --num_epochs=100 \ 10 | --overlap=0.9375 \ 11 | " 12 | -------------------------------------------------------------------------------- /config/cnn2d_modular_hptuning.yaml: -------------------------------------------------------------------------------- 1 | max_trials: 100 2 | hyperparameters: 3 | - name: learning_rate 4 | min_value: 1e-4 5 | max_value: 1e-2 6 | scale: LOG10_SCALE 7 | - name: batch_size 8 | min_value: 32 9 | max_value: 128 10 | scale: LOG2_SCALE 11 | - name: network_depth 12 | min_value: 2 13 | max_value: 5 14 | scale: LINEAR_SCALE 15 | - name: num_filters 16 | min_value: 16 17 | max_value: 32 18 | scale: LOG2_SCALE 19 | - name: filter_increase_mode 20 | min_value: 1 21 | max_value: 3 22 | scale: LINEAR_SCALE 23 | - name: filter_multiplier 24 | min_value: 2 25 | max_value: 8 26 | scale: LOG2_SCALE 27 | - name: activation 28 | min_value: 0 29 | max_value: 1 30 | scale: LINEAR_SCALE 31 | - name: downsampling 32 | min_value: 0 33 | max_value: 1 34 | scale: LINEAR_SCALE 35 | - name: batchnorm 36 | min_value: 0 37 | max_value: 1 38 | scale: LINEAR_SCALE 39 | - name: conv_dropout 40 | min_value: 0.0 41 | max_value: 0.2 42 | scale: DECIMAL_SCALE 43 | - name: dense_dropout 44 | min_value: 0.0 45 | max_value: 0.6 46 | scale: DECIMAL_SCALE 47 | - name: regularizer 48 | min_value: 0 49 | max_value: 2 50 | scale: LINEAR_SCALE 51 | - name: regularizer_weight 52 | min_value: 1e-4 53 | max_value: 1e-2 54 | scale: LOG10_SCALE 55 | -------------------------------------------------------------------------------- /config/cnn2d_tutorial.sh: 
-------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | export MODULE_ARGS=" \ 3 | --model=CNN2DModular \ 4 | --height=512 \ 5 | --width=128 \ 6 | --tfrecord_height=712 \ 7 | --tfrecord_width=196 \ 8 | --num_epochs=100 \ 9 | --learning_rate=0.001 \ 10 | --batch_size=32 \ 11 | --network_depth=5 \ 12 | --num_filters=16 \ 13 | --filter_increase_mode=2 \ 14 | --filter_multiplier=8 \ 15 | --activation=0 \ 16 | --downsampling=1 \ 17 | --batchnorm=1 \ 18 | --conv_dropout=0.2 \ 19 | --dense_dropout=0.4 \ 20 | --regularizer=0 \ 21 | " -------------------------------------------------------------------------------- /config/datapath.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | export DATAPATH="/scr1/fantine/microseismic-detection-ml/" -------------------------------------------------------------------------------- /config/get_datapath.py: -------------------------------------------------------------------------------- 1 | """Retrieves the project datapath.""" 2 | 3 | import re 4 | 5 | 6 | _DATAPATH_FILE = 'config/datapath.sh' 7 | 8 | 9 | def get_datapath(): 10 | """Gets the project datapath.""" 11 | regex_pattern = r'DATAPATH="(\S+)"' 12 | with open(_DATAPATH_FILE, 'r') as f: 13 | datapath_text = f.read() 14 | regex_match = re.search(regex_pattern, datapath_text) 15 | if regex_match: 16 | return regex_match.group(1) 17 | raise ValueError( 18 | 'Please set a correct datapath in {}'.format(_DATAPATH_FILE)) 19 | -------------------------------------------------------------------------------- /config/tfrecord_baseline_eval.yaml: -------------------------------------------------------------------------------- 1 | manifest_file: tfrecords/manifests/baseline_eval_manifest.txt 2 | output_file_prefix: tfrecords/baseline/eval 3 | num_shards: 4 4 | min_val: -15.353061 5 | max_val: 15.353061 -------------------------------------------------------------------------------- /config/tfrecord_baseline_test.yaml: -------------------------------------------------------------------------------- 1 | manifest_file: tfrecords/manifests/baseline_test_manifest.txt 2 | output_file_prefix: tfrecords/baseline/test 3 | num_shards: 4 4 | min_val: -15.353061 5 | max_val: 15.353061 -------------------------------------------------------------------------------- /config/tfrecord_baseline_train.yaml: -------------------------------------------------------------------------------- 1 | manifest_file: tfrecords/manifests/baseline_train_manifest.txt 2 | output_file_prefix: tfrecords/baseline/train 3 | num_shards: 23 4 | min_val: -15.353061 5 | max_val: 15.353061 -------------------------------------------------------------------------------- /docs/convert_tfrecords.md: -------------------------------------------------------------------------------- 1 | # Convert input data to TensorFlow records (TFRecords) 2 | 3 | Once all the machine learning examples are preprocessed, we convert them into 4 | TensorFlow records to optimize the input pipeline for machine learning 5 | training and inference. 6 | 7 | Reading large numbers of small files significantly impacts I/O performance. 8 | For large datasets, we preprocess the input data into larger (~100MB) TFRecord 9 | files to get maximum I/O throughput. 10 | 11 | ## Create TFRecords manifest files 12 | To generate the TFRecords, we create a manifest file for the dataset. 13 | The manifest file serves as a recipe for generating the TFRecords. 
14 | It is a text file that contains all the filenames that constitute the dataset. 15 | 16 | We generate a manifest file for each of the three datasets: training, 17 | evaluation, and testing. We take the full list of data files, separate them 18 | into these three datasets, and shuffle them before writing out each manifest 19 | file. 20 | 21 | ## Create TensorFlow records 22 | Convert the data files into TFRecord files with the following command: 23 | ```bash 24 | python -m tfrecords.convert_tfrecords -c config/tfrecord_baseline_train.yaml 25 | ``` 26 | 27 | The `-c` flag specifies a configuration file. 28 | You can create your own TFRecord configuration file inside the `config/` folder 29 | and replace `config/tfrecord_baseline_train.yaml` with the name of your 30 | TFRecord configuration file. Look at other TFRecord configuration files for 31 | examples. 32 | 33 | In the TFRecord configuration file, the `DATAPATH` specified in 34 | `config/datapath.sh` will automatically be prefixed to all the paths, so all 35 | the paths should be specified as relative paths. 36 | 37 | ## TFRecord configuration files 38 | Variables in the TFRecord configuration file: 39 | 40 | - `manifest_file`: Manifest file that contains all the filenames that 41 | constitute the dataset to convert into TFRecords. If no manifest file is found, 42 | the script will use `input_file_pattern` to create the list of filenames, and 43 | write them to the file specified by `manifest_file`. 44 | 45 | - `input_file_pattern`: A Unix glob file pattern. When a manifest file is 46 | provided, this variable is ignored. If no manifest file is found, the script 47 | uses this file pattern to create the list of files for the manifest file. 48 | 49 | - `output_file_prefix`: Filename prefix to write the TFRecords. 50 | 51 | - `num_shards`: Number of TFRecord shards to generate. Adjust this number to 52 | generate files of about 100MB. 53 | 54 | - `min_val` and `max_val` (optional): When specified, the data are clipped and 55 | rescaled using these values, scaling the dataset to the [0, 1] range. -------------------------------------------------------------------------------- /docs/hptuning.md: -------------------------------------------------------------------------------- 1 | # Hyperparameter tuning 2 | 3 | The hyperparameters are tuned using Bayesian optimization. 4 | 5 | ## Run a hyperparameter tuning task 6 | To tune the hyperparameters for an ML model: 7 | ```bash 8 | bin/tunehp.sh model_config dataset 9 | ``` 10 | 11 | - `model_config`: Name of the ML model configuration to use. This should 12 | correspond to a configuration file named `config/model_config.sh`. 13 | - `dataset`: Dataset identifier. Check the variables `train_file` and 14 | `eval_file` in `bin/train.sh` to ensure that this maps to the correct data. 15 | 16 | ## Define the domain for hyperparameter tuning 17 | 18 | You can define the domain to explore for hyperparameter tuning by creating a 19 | corresponding configuration file: `config/your_model_config_hptuning.yaml`. 20 | Look at other hyperparameter tuning configuration files in `config/` for 21 | examples.
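For instance, a pared-down domain that only explores the learning rate, batch size, and network depth might look like the following sketch (the values are illustrative; the field names mirror `config/cnn2d_modular_hptuning.yaml`):

```yaml
max_trials: 20
hyperparameters:
  - name: learning_rate
    min_value: 1e-4
    max_value: 1e-2
    scale: LOG10_SCALE
  - name: batch_size
    min_value: 32
    max_value: 128
    scale: LOG2_SCALE
  - name: network_depth
    min_value: 2
    max_value: 5
    scale: LINEAR_SCALE
```

Save it as `config/your_model_config_hptuning.yaml` so that `bin/tunehp.sh your_model_config dataset` picks it up.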
-------------------------------------------------------------------------------- /docs/ml_framework.md: -------------------------------------------------------------------------------- 1 | # Machine learning training and inference 2 | 3 | ## Train an ML model 4 | To train an ML model, use the following command: 5 | ```bash 6 | bin/train.sh model_config dataset 7 | ``` 8 | 9 | - `model_config`: Name of ML model configuration to use. This should correspond 10 | to a configuration file named `config/model_config.sh`. 11 | - `dataset`: Dataset identifier. Check the variables `train_file` and 12 | `eval_file` in `bin/train.sh` to make sure that this maps to the correct data. 13 | 14 | For example, to run the tutorial machine learning model on the baseline dataset: 15 | ```bash 16 | bin/train.sh cnn2d_tutorial baseline 17 | ``` 18 | 19 | The training logs are written to a file of the form `log/job_id.log`. Take note 20 | of the `job_id`, as it will be required at inference time: the trained network 21 | is saved at `DATAPATH/models/job_id/`. 22 | 23 | ### Configure ML model parameters 24 | The parameters for an ML task can be configured by creating a corresponding 25 | configuration file: `config/your_model_config.sh`. Look at other ML model 26 | configuration files in `config/` for examples. 27 | 28 | ### Create a new ML model architecture 29 | - Create a new `your_model.py` file inside the `ml_framework/model` folder. 30 | Look at other models inside the folder for examples. 31 | - Reference your new model in `ml_framework/model/__init__.py`. 32 | - Set the `model` argument to your new model's name in your model configuration 33 | file `config/your_model_config.sh`. 34 | 35 | ## Evaluate an ML model 36 | 37 | Once the model is trained, evaluate the model's performance: 38 | ```bash 39 | bin/evaluate.sh model_config dataset job_id 40 | ``` 41 | 42 | - `model_config`: Name of ML model configuration to use. This should correspond 43 | to a configuration file named `config/model_config.sh`. 44 | - `dataset`: Dataset identifier. Check the variable `eval_file` in 45 | `bin/evaluate.sh` to make sure that this maps to the correct data. 46 | - `job_id`: The job identifier of the machine learning training from which to 47 | load the trained network. 48 | 49 | This evaluates the model's performance on the test dataset and saves the 50 | corresponding logits to the following file: 51 | `DATAPATH/models/job_id/eval_logits.npy`. 52 | The logits are saved in the same order as provided in the input data pipeline, 53 | which means that they correspond to each line of the TFRecord manifest file. 54 | 55 | ## Run inference on continuous data 56 | 57 | Once the model is trained, run inference on continuous data: 58 | ```bash 59 | bin/predict.sh model_config dataset job_id 60 | ``` 61 | 62 | - `model_config`: Name of ML model configuration to use. This should correspond 63 | to a configuration file named `config/model_config.sh`. 64 | - `dataset`: Dataset identifier. Check the variable `test_file` in 65 | `bin/predict.sh` to make sure that this maps to the correct data. 66 | - `job_id`: The job identifier of the machine learning training from which to 67 | load the trained network. 68 | 69 | `test_file` should be a Unix glob pattern that matches the continuous data 70 | NumPy files on which to run inference. For each continuous data file 71 | `path_to_continuous_data/filename.npy`, a sliding window runs through the data 72 | and the corresponding logits are saved to 73 | `path_to_continuous_data/filename_logits.npy`.
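The saved logits can then be post-processed with standard NumPy tools. As a rough sketch (the sigmoid, the 0.5 threshold, and the assumption of one logit per sliding-window position are illustrative choices, not part of the framework):

```python
# Sketch: flag sliding-window positions that likely contain a microseismic event.
import numpy as np

logits = np.load('path_to_continuous_data/filename_logits.npy')
probabilities = 1.0 / (1.0 + np.exp(-logits.reshape(-1)))  # sigmoid of each logit

detections = np.flatnonzero(probabilities > 0.5)  # indices of candidate event windows
print(f'{detections.size} candidate windows out of {probabilities.size}')
```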
The amount of overlap between 74 | sliding windows can be set by adding an `overlap` argument to the model 75 | configuration. 76 | 77 | 78 | -------------------------------------------------------------------------------- /log/README.md: -------------------------------------------------------------------------------- 1 | # Directory for log files -------------------------------------------------------------------------------- /preprocessing/parameters.py: -------------------------------------------------------------------------------- 1 | """Parameters for data preprocessing.""" 2 | 3 | import os 4 | from config import get_datapath 5 | 6 | # pylint: disable=invalid-name 7 | 8 | datapath = get_datapath.get_datapath() 9 | 10 | unprocessed_datapath = os.path.join(datapath, 'unprocessed_data') 11 | processed_datapath = os.path.join(datapath, 'processed_data') 12 | std_channels_file = 'std_channels.npy' 13 | clip_value = 11.21 # 99.5th percentile of absolute amplitudes 14 | -------------------------------------------------------------------------------- /preprocessing/process.py: -------------------------------------------------------------------------------- 1 | """Data processing.""" 2 | 3 | import logging 4 | import os 5 | 6 | import numpy as np 7 | 8 | from preprocessing import parameters 9 | from processing_utils import processing_utils as processing 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | 13 | 14 | def _process(data, clip_value, std_channels): 15 | data = np.clip(data, -clip_value, clip_value) 16 | data = data / np.expand_dims(std_channels, axis=1) 17 | return data 18 | 19 | 20 | def get_start_channel(filename): 21 | basename = os.path.basename(filename) 22 | return int(basename[-16:-12]) 23 | 24 | 25 | def process(file_pattern, in_dir, out_dir, clip_value, std_channels): 26 | filenames = processing.get_filenames(file_pattern) 27 | 28 | for i, filename in enumerate(filenames): 29 | if i % 1000 == 0: 30 | logging.info('Processed %s files.', i) 31 | data = np.load(filename) 32 | start_ch = get_start_channel(filename) 33 | n_channels = data.shape[0] 34 | data = _process(data, clip_value, 35 | std_channels[start_ch:start_ch + n_channels]) 36 | out_file = filename.replace(in_dir, out_dir) 37 | os.makedirs(os.path.dirname(out_file), exist_ok=True) 38 | np.save(out_file, data) 39 | 40 | 41 | def main(): 42 | file_pattern = os.path.join(parameters.unprocessed_datapath, '*/*') 43 | std_channels = np.load(parameters.std_channels_file) 44 | process( 45 | file_pattern, 46 | in_dir=parameters.unprocessed_datapath, 47 | out_dir=parameters.processed_datapath, 48 | clip_value=parameters.clip_value, 49 | std_channels=std_channels, 50 | ) 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-gpu>=2.4.1 2 | GPyOpt==1.2.6 3 | h5py==2.10.0 4 | matplotlib==3.3.2 5 | numpy==1.19.2 6 | PyYAML==5.3.1 7 | scipy==1.4.1 -------------------------------------------------------------------------------- /tfrecords/convert_tfrecords.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import enum 3 | import logging 4 | import os 5 | import random 6 | import re 7 | import sys 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | import yaml 12 | 13 | from config import get_datapath 14 | 15 | 16 | random.seed(42) 17 | 18 | 19 | class 
CompressionType(enum.Enum): 20 | GZIP = 'GZIP' 21 | NONE = '' 22 | 23 | 24 | _FILE_EXTENSION = { 25 | CompressionType.GZIP: '.gz', 26 | CompressionType.NONE: '', 27 | } 28 | 29 | 30 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO) 31 | 32 | 33 | # def _float_feature(data): 34 | # return tf.train.Feature(float_list=tf.train.FloatList(value=data.reshape(-1))) 35 | 36 | def _bytes_feature(data): 37 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[data])) 38 | 39 | 40 | def create_tf_example(inputs, labels): 41 | feature_dict = { 42 | 'inputs': _bytes_feature(inputs.tobytes()), 43 | 'labels': _bytes_feature(labels.tobytes()), 44 | } 45 | return tf.train.Example(features=tf.train.Features(feature=feature_dict)) 46 | 47 | 48 | class DataLoader(): 49 | def __init__(self, min_val, max_val): 50 | self.min_val = min_val 51 | self.max_val = max_val 52 | 53 | def _clip_and_rescale(self, data): 54 | data = np.clip(data, self.min_val, self.max_val) 55 | return np.divide((data - self.min_val), (self.max_val - self.min_val)) 56 | 57 | @ staticmethod 58 | def _get_label(filename): 59 | if 'noise' in filename: 60 | return np.zeros((1,), dtype=np.float32) 61 | return np.ones((1,), dtype=np.float32) 62 | 63 | def read(self, filename): 64 | inputs = np.float32(np.load(filename)) 65 | if self.min_val != 0.0 or self.max_val != 1.0: 66 | inputs = self._clip_and_rescale(inputs) 67 | labels = self._get_label(filename) 68 | return inputs, labels 69 | 70 | 71 | def _get_file_suffix(compression_type): 72 | return '.tfrecord{}'.format(_FILE_EXTENSION[compression_type]) 73 | 74 | 75 | def read_manifest(manifest_file): 76 | with open(manifest_file, 'r') as f: 77 | file_list = [line.rstrip() for line in f] 78 | logging.info('Converting %s files into TFRecords.', len(file_list)) 79 | return file_list 80 | 81 | 82 | def _glob(file_pattern): 83 | return sorted(tf.io.gfile.glob(file_pattern)) 84 | 85 | 86 | def create_manifest(manifest_file, file_pattern, shuffle=True): 87 | file_list = _glob(file_pattern) 88 | if shuffle: 89 | random.shuffle(file_list) 90 | os.makedirs(os.path.dirname(manifest_file), exist_ok=True) 91 | with open(manifest_file, 'w') as f: 92 | for filename in file_list: 93 | f.write(filename + '\n') 94 | 95 | 96 | def convert_to_tfrecords(params): 97 | datapath = get_datapath.get_datapath() 98 | manifest_file = os.path.join(datapath, params.manifest_file) 99 | if not os.path.exists(manifest_file): 100 | logging.info('Creating manifest file: %s', manifest_file) 101 | create_manifest(manifest_file, os.path.join( 102 | datapath, params.input_file_pattern)) 103 | else: 104 | logging.info('Using the existing manifest file: %s', manifest_file) 105 | 106 | file_list = read_manifest(manifest_file) 107 | file_shards = np.array_split(file_list, params.num_shards) 108 | file_suffix = _get_file_suffix(params.compression_type) 109 | options = tf.io.TFRecordOptions( 110 | compression_type=params.compression_type.value) 111 | data_loader = DataLoader(params.min_val, params.max_val) 112 | output_file_prefix = os.path.join(datapath, params.output_file_prefix) 113 | 114 | os.makedirs(os.path.dirname(output_file_prefix), exist_ok=True) 115 | for i, file_shard in enumerate(file_shards): 116 | tfrecord_file = '{}-{:04d}-of-{:04d}{}'.format( 117 | output_file_prefix, i, params.num_shards, file_suffix) 118 | logging.info('Writing %s', tfrecord_file) 119 | with tf.io.TFRecordWriter(tfrecord_file, options=options) as writer: 120 | for filename in file_shard: 121 | inputs, outputs = 
data_loader.read(filename) 122 | tf_example = create_tf_example(inputs, outputs) 123 | writer.write(tf_example.SerializeToString()) 124 | 125 | 126 | class ArgumentParser(): 127 | 128 | def __init__(self): 129 | config_parser = argparse.ArgumentParser( 130 | formatter_class=argparse.RawDescriptionHelpFormatter, 131 | add_help=False) 132 | 133 | config_parser.add_argument( 134 | '-c', '--config-file', 135 | help='Parse script arguments from config file.', 136 | default=None, 137 | metavar='FILE') 138 | 139 | self._config_parser = config_parser 140 | 141 | self._parser = argparse.ArgumentParser(parents=[config_parser]) 142 | 143 | @ staticmethod 144 | def _parse_config(items): 145 | argv = [] 146 | for k, v in items: 147 | argv.append('--{}'.format(k)) 148 | argv.append(v) 149 | return argv 150 | 151 | def _add_arguments(self, defaults=None): 152 | parser = self._parser 153 | 154 | parser.add_argument( 155 | '--input_file_pattern', 156 | help='Input data files.', 157 | default='', 158 | ) 159 | parser.add_argument( 160 | '--output_file_prefix', 161 | help='Output file prefix.', 162 | default='tfrecords/', 163 | ) 164 | parser.add_argument( 165 | '--input_height', 166 | help='Input data height.', 167 | type=int, 168 | ) 169 | parser.add_argument( 170 | '--input_width', 171 | help='Input data width.', 172 | type=int, 173 | default=1, 174 | ) 175 | parser.add_argument( 176 | '--input_depth', 177 | help='Input data depth.', 178 | type=int, 179 | default=1, 180 | ) 181 | parser.add_argument( 182 | '--input_channels', 183 | help='Input data channels.', 184 | type=int, 185 | default=1, 186 | ) 187 | parser.add_argument( 188 | '--num_shards', 189 | help='Number of shards to generate.', 190 | type=int, 191 | default=0, 192 | ) 193 | parser.add_argument( 194 | '--compression_type', 195 | help='File compression type.', 196 | type=CompressionType, 197 | choices=list(CompressionType), 198 | default=CompressionType.GZIP, 199 | ) 200 | parser.add_argument( 201 | '--manifest_file', 202 | help='Manifest file.', 203 | default='tfrecords/manifests/manifest.txt', 204 | ) 205 | parser.add_argument( 206 | '--min_val', 207 | help='Minimum value.', 208 | type=float, 209 | default=0.0, 210 | ) 211 | parser.add_argument( 212 | '--max_val', 213 | help='Maximum value.', 214 | type=float, 215 | default=1.0, 216 | ) 217 | 218 | def parse_known_args(self, argv): 219 | args, remaining_argv = self._config_parser.parse_known_args(argv) 220 | if args.config_file: 221 | with open(args.config_file, 'r') as config: 222 | defaults = yaml.safe_load(config) 223 | defaults['config_file'] = args.config_file 224 | else: 225 | defaults = dict() 226 | self._add_arguments(defaults=defaults) 227 | self._parser.set_defaults(**defaults) 228 | 229 | return self._parser.parse_known_args(remaining_argv) 230 | 231 | 232 | def main(): 233 | params, _ = ArgumentParser().parse_known_args(sys.argv[1:]) 234 | convert_to_tfrecords(params) 235 | 236 | 237 | if __name__ == '__main__': 238 | main() 239 | --------------------------------------------------------------------------------