├── .gitignore ├── .gitmodules ├── Dockerfile ├── LICENSE ├── Makefile ├── Poster.pdf ├── README.md ├── build_docker_container.sh ├── convert_coco_yolo.py ├── preprocess_flir_dataset.sh ├── run_all_iters.sh ├── run_docker_container.sh ├── start_map_calc.sh ├── start_training.sh ├── thermal.data ├── thermal.names ├── yolov3-spp-custom.cfg ├── yolov3-spp.cfg ├── yolov3-thermal.cfg └── yolov3_5l.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | *.ipynb 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "darknet"] 2 | path = darknet 3 | url = https://github.com/AlexeyAB/darknet.git 4 | ignore = dirty 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-devel-ubuntu18.04 2 | LABEL maintainer "NVIDIA CORPORATION " 3 | 4 | ENV CUDNN_VERSION 7.5.0.56 5 | LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" 6 | 7 | RUN apt-get update && apt-get install -y --no-install-recommends \ 8 | build-essential \ 9 | cmake \ 10 | git \ 11 | wget \ 12 | sudo \ 13 | python \ 14 | libcudnn7=$CUDNN_VERSION-1+cuda10.0 \ 15 | 
libcudnn7-dev=$CUDNN_VERSION-1+cuda10.0 && \ 16 | apt-mark hold libcudnn7 && \ 17 | rm -rf /var/lib/apt/lists/* 18 | ENV HOME /home 19 | ENV REPOSITORY_PATH $HOME/object-detection 20 | ENV DARKNET_PATH $REPOSITORY_PATH/darknet 21 | RUN cd $HOME && git clone https://github.com/enesozi/object-detection $REPOSITORY_PATH && \ 22 | cd $REPOSITORY_PATH && git submodule update --init && cp Makefile $DARKNET_PATH && \ 23 | cd $DARKNET_PATH && make -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Enes Özipek 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GPU=1 2 | CUDNN=1 3 | CUDNN_HALF=1 4 | OPENCV=0 5 | AVX=1 6 | OPENMP=1 7 | LIBSO=1 8 | ZED_CAMERA=0 9 | 10 | # set GPU=1 and CUDNN=1 to speedup on GPU 11 | # set CUDNN_HALF=1 to further speedup 3 x times (Mixed-precision on Tensor Cores) GPU: Volta, Xavier, Turing and higher 12 | # set AVX=1 and OPENMP=1 to speedup on CPU (if error occurs then set AVX=0) 13 | 14 | DEBUG=0 15 | 16 | ARCH= -gencode arch=compute_30,code=sm_30 \ 17 | -gencode arch=compute_35,code=sm_35 \ 18 | -gencode arch=compute_50,code=[sm_50,compute_50] \ 19 | -gencode arch=compute_52,code=[sm_52,compute_52] \ 20 | -gencode arch=compute_61,code=[sm_61,compute_61] \ 21 | -gencode arch=compute_75,code=[sm_75,compute_75] 22 | 23 | OS := $(shell uname) 24 | 25 | # Tesla V100 26 | # ARCH= -gencode arch=compute_70,code=[sm_70,compute_70] 27 | 28 | # GeForce RTX 2080 Ti, RTX 2080, RTX 2070, Quadro RTX 8000, Quadro RTX 6000, Quadro RTX 5000, Tesla T4, XNOR Tensor Cores 29 | # ARCH= -gencode arch=compute_75,code=[sm_75,compute_75] 30 | 31 | # Jetson XAVIER 32 | # ARCH= -gencode arch=compute_72,code=[sm_72,compute_72] 33 | 34 | # GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4 35 | # ARCH= -gencode arch=compute_61,code=sm_61 -gencode arch=compute_61,code=compute_61 36 | 37 | # GP100/Tesla P100 - DGX-1 38 | # ARCH= -gencode arch=compute_60,code=sm_60 39 | 40 | # For Jetson TX1, Tegra X1, DRIVE CX, DRIVE PX - uncomment: 41 | # ARCH= -gencode arch=compute_53,code=[sm_53,compute_53] 42 | 43 | # For Jetson Tx2 or Drive-PX2 uncomment: 44 | # ARCH= -gencode arch=compute_62,code=[sm_62,compute_62] 45 | 46 | 47 | VPATH=./src/ 48 | EXEC=darknet 49 | OBJDIR=./obj/ 50 | 51 | ifeq ($(LIBSO), 1) 52 | LIBNAMESO=libdarknet.so 53 | APPNAMESO=uselib 54 | endif 55 | 56 | CC=gcc 57 | CPP=g++ 58 | NVCC=nvcc 59 | 
OPTS=-Ofast 60 | LDFLAGS= -lm -pthread 61 | COMMON= -Iinclude/ -I3rdparty/stb/include 62 | CFLAGS=-Wall -Wfatal-errors -Wno-unused-result -Wno-unknown-pragmas -fPIC 63 | 64 | ifeq ($(DEBUG), 1) 65 | #OPTS= -O0 -g 66 | #OPTS= -Og -g 67 | COMMON+= -DDEBUG 68 | CFLAGS+= -DDEBUG 69 | else 70 | ifeq ($(AVX), 1) 71 | CFLAGS+= -ffp-contract=fast -mavx -mavx2 -msse3 -msse4.1 -msse4.2 -msse4a 72 | endif 73 | endif 74 | 75 | CFLAGS+=$(OPTS) 76 | 77 | ifeq ($(OPENCV), 1) 78 | COMMON+= -DOPENCV 79 | CFLAGS+= -DOPENCV 80 | LDFLAGS+= `pkg-config --libs opencv` 81 | COMMON+= `pkg-config --cflags opencv` 82 | endif 83 | 84 | ifeq ($(OPENMP), 1) 85 | CFLAGS+= -fopenmp 86 | LDFLAGS+= -lgomp 87 | endif 88 | 89 | ifeq ($(GPU), 1) 90 | COMMON+= -DGPU -I/usr/local/cuda/include/ 91 | CFLAGS+= -DGPU 92 | ifeq ($(OS),Darwin) #MAC 93 | LDFLAGS+= -L/usr/local/cuda/lib -lcuda -lcudart -lcublas -lcurand 94 | else 95 | LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand 96 | endif 97 | endif 98 | 99 | ifeq ($(CUDNN), 1) 100 | COMMON+= -DCUDNN 101 | ifeq ($(OS),Darwin) #MAC 102 | CFLAGS+= -DCUDNN -I/usr/local/cuda/include 103 | LDFLAGS+= -L/usr/local/cuda/lib -lcudnn 104 | else 105 | CFLAGS+= -DCUDNN -I/usr/local/cudnn/include 106 | LDFLAGS+= -L/usr/local/cudnn/lib64 -lcudnn 107 | endif 108 | endif 109 | 110 | ifeq ($(CUDNN_HALF), 1) 111 | COMMON+= -DCUDNN_HALF 112 | CFLAGS+= -DCUDNN_HALF 113 | ARCH+= -gencode arch=compute_70,code=[sm_70,compute_70] 114 | endif 115 | 116 | ifeq ($(ZED_CAMERA), 1) 117 | CFLAGS+= -DZED_STEREO -I/usr/local/zed/include 118 | LDFLAGS+= -L/usr/local/zed/lib -lsl_core -lsl_input -lsl_zed 119 | #-lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0 120 | endif 121 | 122 | OBJ=image_opencv.o http_stream.o gemm.o utils.o dark_cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o 
captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o voxel.o tree.o yolo_layer.o upsample_layer.o lstm_layer.o 123 | ifeq ($(GPU), 1) 124 | LDFLAGS+= -lstdc++ 125 | OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o 126 | endif 127 | 128 | OBJS = $(addprefix $(OBJDIR), $(OBJ)) 129 | DEPS = $(wildcard src/*.h) Makefile include/darknet.h 130 | 131 | all: obj backup results setchmod $(EXEC) $(LIBNAMESO) $(APPNAMESO) 132 | 133 | ifeq ($(LIBSO), 1) 134 | CFLAGS+= -fPIC 135 | 136 | $(LIBNAMESO): $(OBJS) include/yolo_v2_class.hpp src/yolo_v2_class.cpp 137 | $(CPP) -shared -std=c++11 -fvisibility=hidden -DLIB_EXPORTS $(COMMON) $(CFLAGS) $(OBJS) src/yolo_v2_class.cpp -o $@ $(LDFLAGS) 138 | 139 | $(APPNAMESO): $(LIBNAMESO) include/yolo_v2_class.hpp src/yolo_console_dll.cpp 140 | $(CPP) -std=c++11 $(COMMON) $(CFLAGS) -o $@ src/yolo_console_dll.cpp $(LDFLAGS) -L ./ -l:$(LIBNAMESO) 141 | endif 142 | 143 | $(EXEC): $(OBJS) 144 | $(CPP) -std=c++11 $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) 145 | 146 | $(OBJDIR)%.o: %.c $(DEPS) 147 | $(CC) $(COMMON) $(CFLAGS) -c $< -o $@ 148 | 149 | $(OBJDIR)%.o: %.cpp $(DEPS) 150 | $(CPP) -std=c++11 $(COMMON) $(CFLAGS) -c $< -o $@ 151 | 152 | $(OBJDIR)%.o: %.cu $(DEPS) 153 | $(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@ 154 | 155 | obj: 156 | mkdir -p obj 157 | backup: 158 | mkdir -p backup 159 | results: 160 | mkdir -p results 161 | setchmod: 162 | chmod +x *.sh 163 | 164 | .PHONY: clean 165 | 166 | clean: 167 | rm -rf $(OBJS) $(EXEC) $(LIBNAMESO) $(APPNAMESO) 
168 | -------------------------------------------------------------------------------- /Poster.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/enesozi/object-detection/fc60f9a5a8ce261f6beace0dc387bb6feee859f2/Poster.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Object Detection 2 | Object detection on thermal images 3 | 4 | ### Steps to follow: 5 | * **./build_docker_container.sh** (builds the nvidia-docker image) 6 | * **./run_docker_container.sh** (runs the built image as a container named "darknet_thermal" with the dataset directory mounted) 7 | * Make sure that your GPU architecture is included in the [Makefile](https://github.com/enesozi/object-detection/blob/master/Makefile#L16) 8 | * If it is not, add your GPU architecture and run the **make clean** and **make** commands in the darknet directory. 9 | * **./preprocess_flir_dataset.sh** (Make sure that the image directories in the script match yours.) 10 | * Exit the container by using "**Ctrl+P and Q**". This leaves the container running. 11 | * Start training in detached mode by using the following command: 12 | * **nvidia-docker exec -d darknet_thermal bash -c "cd /home/object-detection/ ; ./preprocess_flir_dataset.sh ; ./start_training.sh"** 13 | * The **start_training.sh** script uses GPU id 3 by default. You might need to adjust this to match your machine. 
14 | 15 | #### PyCoco Results for IoU=0.50, area=all, maxDets=100 16 | Average Precision (AP) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = **0.714** 17 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = -1.000 18 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = -1.000 19 | Average Precision (AP) @[ IoU=0.50:0.50 | area= small | maxDets=100 ] = 0.576 20 | Average Precision (AP) @[ IoU=0.50:0.50 | area=medium | maxDets=100 ] = 0.819 21 | Average Precision (AP) @[ IoU=0.50:0.50 | area= large | maxDets=100 ] = 0.906 22 | Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets= 1 ] = 0.348 23 | Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets= 10 ] = 0.781 24 | Average Recall (AR) @[ IoU=0.50:0.50 | area= all | maxDets=100 ] = **0.787** 25 | Average Recall (AR) @[ IoU=0.50:0.50 | area= small | maxDets=100 ] = 0.719 26 | Average Recall (AR) @[ IoU=0.50:0.50 | area=medium | maxDets=100 ] = 0.834 27 | Average Recall (AR) @[ IoU=0.50:0.50 | area= large | maxDets=100 ] = 0.918 28 | 29 | Baseline result: mAP IoU(0.5) of 0.587 30 | 31 | You can download the dataset from [here](https://mega.nz/#!j9l32aAJ!wB4pk6H_12AaCRZT5flmNKcBcpCDdfleTaMi4WA8_-0) 32 | 33 | You can find the [blog post](https://medium.com/swlh/object-detection-on-thermal-images-4f3410a89db4) published on Medium. 34 | 35 | Pretrained weights: [thermal](https://mega.nz/#!vk9HDICC!qK13x8bjF1zY2aIJalR6BIZ1yfQye_r1NLcTxUJGNEs) 36 | -------------------------------------------------------------------------------- /build_docker_container.sh: -------------------------------------------------------------------------------- 1 | nvidia-docker build --no-cache -t thermal:darknet . 
-------------------------------------------------------------------------------- /convert_coco_yolo.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | # Convert per-image JSON annotations (left/top/width/height boxes) into darknet label files; with --debug, only print class counts and append bicycle image paths to bikes.txt. 3 | import argparse 4 | import glob 5 | import os 6 | import sys 7 | import json 8 | 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument( 12 | "path", help='Directory of json files containing annotations') 13 | parser.add_argument( 14 | "output_path", help='Output directory for image.txt files') 15 | parser.add_argument("--debug", action="store_true") 16 | args = parser.parse_args() 17 | json_files = sorted(glob.glob(os.path.join(args.path, '*.json'))) 18 | if args.debug: 19 | total_count = 0 20 | cats = {0: 0, 1: 0, 2: 0} 21 | bike_images = set() 22 | for json_file in json_files: 23 | with open(json_file) as f: 24 | data = json.load(f) 25 | image = data['image'] 26 | annotations = data['annotation'] 27 | file_name = image['file_name'] 28 | width = float(image['width']) 29 | height = float(image['height']) 30 | converted_results = [] 31 | for ann in annotations: 32 | cat_id = int(ann['category_id']) 33 | if 1 <= cat_id <= 3: # only ids 1..3 map to the zero-based classes 0..2; ids below 1 would yield a negative class index 34 | left, top, bbox_width, bbox_height = map( 35 | float, ann['bbox']) 36 | 37 | # Yolo classes are starting from zero index 38 | cat_id -= 1 39 | if args.debug: 40 | cats[cat_id] += 1 41 | total_count += 1 42 | if cat_id == 1: # zero-based class 1 is the one collected as "bike" images 43 | bike_images.add(file_name) 44 | x_center, y_center = ( 45 | left + bbox_width / 2, top + bbox_height / 2) 46 | # darknet expects relative values wrt image width&height 47 | x_rel, y_rel = (x_center / width, y_center / height) 48 | w_rel, h_rel = (bbox_width / width, bbox_height / height) 49 | converted_results.append( 50 | (cat_id, x_rel, y_rel, w_rel, h_rel)) 51 | if not args.debug: 52 | with open(os.path.join(args.output_path, file_name + '.txt'), 'w+') as fp: 53 | fp.write('\n'.join('%d %.6f %.6f %.6f %.6f' % 54 | res for 
res in converted_results)) 55 | if args.debug: 56 | print({cat: cats[cat] for cat in cats}) 57 | print(total_count) 58 | with open('bikes.txt', 'a+') as f: 59 | # one path per line; nothing is written when no bicycle image was found, so bikes.txt never gains a blank entry 60 | f.write(''.join("data/thermal/%s.jpeg\n" % b_img for b_img in bike_images)) 61 | -------------------------------------------------------------------------------- /preprocess_flir_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | train_images="$HOME/Downloads/FLIR_ADAS/training/PreviewData" 3 | validation_images="$HOME/Downloads/FLIR_ADAS/validation/PreviewData" 4 | video_images="$HOME/Downloads/FLIR_ADAS/video/PreviewData" 5 | train_anns="$HOME/Downloads/FLIR_ADAS/training/Annotations" 6 | validation_anns="$HOME/Downloads/FLIR_ADAS/validation/Annotations" 7 | video_anns="$HOME/Downloads/FLIR_ADAS/video/Annotations" 8 | train_file="thermal_train.txt" 9 | valid_file="thermal_validation.txt" 10 | cfg_file="yolov3-spp-custom.cfg" 11 | data_file="thermal.data" 12 | name_file="thermal.names" 13 | image_dir="$PWD/darknet/build/darknet/x64/data/thermal" 14 | # Remove image lists left over from a previous run; the loops below append to them 15 | [ -f "$train_file" ] && rm "$train_file" 16 | [ -f "$valid_file" ] && rm "$valid_file" 17 | 18 | # Image lists are generated from fixed frame ranges: frames 1-8862 and video frames 1-4224 for training, frames 8863-10228 for validation 19 | for value in {1..8862} 20 | do 21 | printf "data/thermal/FLIR_%05d.jpeg\n" $value >> "$train_file" 22 | done 23 | for value in {1..4224} 24 | do 25 | printf "data/thermal/FLIR_video_%05d.jpeg\n" $value >> "$train_file" 26 | done 27 | for value in {8863..10228} 28 | do 29 | printf "data/thermal/FLIR_%05d.jpeg\n" $value >> "$valid_file" 30 | done 31 | 32 | # Copy images to the correct directory (cp errors are silenced, so a wrong dataset path goes unnoticed here) 33 | rm -rf "$image_dir" 34 | mkdir -p "$image_dir" # -p: also works when the parent data directory does not exist yet 35 | cp "$train_images/"* "$image_dir" 2>/dev/null 36 | cp "$validation_images/"* "$image_dir" 2>/dev/null 37 | cp "$video_images/"* "$image_dir" 2>/dev/null 38 | 39 | # Convert anns from standard COCO format to darknet format 40 | python convert_coco_yolo.py "$train_anns" "$image_dir" 41 | python convert_coco_yolo.py "$validation_anns" 
"$image_dir" 42 | python convert_coco_yolo.py "$video_anns" "$image_dir" 43 | 44 | # Quick fix for imbalanced dataset: oversample training images that contain bicycles 45 | python convert_coco_yolo.py "$validation_anns" . --debug # statistics only; the bikes.txt it appends to is discarded on the next line 46 | rm -f "bikes.txt" "bikes_new.txt" # start clean, including leftovers of an interrupted run 47 | python convert_coco_yolo.py "$train_anns" . --debug 48 | # Validation bicycle images are deliberately not collected: appending them to $train_file would leak validation data into training 49 | python convert_coco_yolo.py "$video_anns" . --debug 50 | # sed p prints every line twice, so the five appends below give each bicycle image 10 extra entries 51 | for iter in {1..5} 52 | do 53 | sed p "bikes.txt" >> "bikes_new.txt" 54 | done 55 | 56 | mv "bikes_new.txt" "bikes.txt" 57 | cat "bikes.txt" >> "$train_file" 58 | rm "bikes.txt" 59 | 60 | # Shuffle train dataset 61 | shuf "$train_file" > "train_file_shuffled.txt" 62 | mv "train_file_shuffled.txt" "$train_file" 63 | 64 | # Copy necessary files to the correct directories 65 | cp "$cfg_file" "$PWD/darknet/build/darknet/x64/" 66 | cp "$train_file" "$PWD/darknet/build/darknet/x64/data/" 67 | cp "$valid_file" "$PWD/darknet/build/darknet/x64/data/" 68 | cp "$data_file" "$PWD/darknet/build/darknet/x64/data/" 69 | cp "$name_file" "$PWD/darknet/build/darknet/x64/data/" 70 | cp "run_all_iters.sh" "$PWD/darknet/build/darknet/x64/" 71 | 72 | # Download pretrained weight 73 | wget https://pjreddie.com/media/files/darknet53.conv.74 -O "$PWD/darknet/build/darknet/x64/darknet53.conv.74" -------------------------------------------------------------------------------- /run_all_iters.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run "darknet detector map" on every saved checkpoint from iteration 17000 down to 1000 and append the output to val_res.txt. The stepped brace range needs bash. GPU id defaults to 3; override with GPU_ID. 3 | for iter in {17000..1000..1000} 4 | do 5 | ../../../darknet detector map data/thermal.data yolov3-spp-custom.cfg backup/yolov3-spp-custom_${iter}.weights -gpus "${GPU_ID:-3}" >> val_res.txt 6 | done 7 | -------------------------------------------------------------------------------- /run_docker_container.sh: -------------------------------------------------------------------------------- 1 | docker run --runtime=nvidia -it --name darknet_thermal -v ~/Downloads/FLIR_ADAS_12_11_18:/home/Downloads thermal:darknet 
-------------------------------------------------------------------------------- /start_map_calc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run the checkpoint evaluation script from the darknet x64 directory. Must be started from the repository root because the path is built from $PWD. 3 | cd "$PWD/darknet/build/darknet/x64/" || exit 1 # stop instead of running from the wrong directory 4 | ./run_all_iters.sh -------------------------------------------------------------------------------- /start_training.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Start darknet training from the x64 directory and append its output to /home/tra_results.txt. Must be started from the repository root. GPU id defaults to 3; override with GPU_ID. 3 | cd "$PWD/darknet/build/darknet/x64/" || exit 1 # stop instead of training from the wrong directory 4 | ../../../darknet detector train data/thermal.data yolov3-spp-custom.cfg darknet53.conv.74 -gpus "${GPU_ID:-3}" -dont_show -map >> /home/tra_results.txt 5 | -------------------------------------------------------------------------------- /thermal.data: -------------------------------------------------------------------------------- 1 | classes = 3 2 | train = data/thermal_train.txt 3 | valid = data/thermal_validation.txt 4 | names = data/thermal.names 5 | backup = backup/ 6 | -------------------------------------------------------------------------------- /thermal.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | -------------------------------------------------------------------------------- /yolov3-spp-custom.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=32 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 
42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 
169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | 
activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | 
activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | 
batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=24 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 8,20, 17,24, 14,52, 25,82, 34,37, 53,145, 61,60, 110,103, 165,220 643 | classes=3 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | 
stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=16 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 4,5 728 | anchors = 8,20, 17,24, 14,52, 25,82, 34,37, 53,145, 61,60, 110,103, 165,220 729 | classes=3 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | 
activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=32 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2,3 815 | anchors = 8,20, 17,24, 14,52, 25,82, 34,37, 53,145, 61,60, 110,103, 165,220 816 | classes=3 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | 823 | -------------------------------------------------------------------------------- /yolov3-spp.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | 
size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 
| activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | 
batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | 
batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | 
filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=18 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 7,8 642 | anchors = 8,20, 17,22, 13,47, 33,35, 24,80, 58,56, 51,152, 102,93, 158,195 643 | classes=4 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | 
size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=27 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,5,6 728 | anchors = 8,20, 17,22, 13,47, 33,35, 24,80, 58,56, 51,152, 102,93, 158,195 729 | classes=4 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=36 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2,4 815 | anchors = 8,20, 17,22, 13,47, 33,35, 24,80, 58,56, 51,152, 102,93, 158,195 816 | 
classes=4 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | 823 | -------------------------------------------------------------------------------- /yolov3-thermal.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | try_fix_nan=1 7 | batch=64 8 | subdivisions=64 9 | width=608 10 | height=608 11 | channels=1 12 | momentum=0.9 13 | decay=0.0005 14 | angle=0 15 | saturation = 1.5 16 | exposure = 1.5 17 | hue=.1 18 | adam=0 19 | learning_rate=0.0003 20 | burn_in=1000 21 | max_batches = 10000 22 | policy=steps 23 | steps=8000,9000 24 | scales=.1,.1 25 | 26 | [convolutional] 27 | batch_normalize=1 28 | filters=32 29 | size=3 30 | stride=1 31 | pad=1 32 | activation=leaky 33 | 34 | # Downsample 35 | 36 | [convolutional] 37 | batch_normalize=1 38 | filters=64 39 | size=3 40 | stride=2 41 | pad=1 42 | activation=leaky 43 | 44 | [convolutional] 45 | batch_normalize=1 46 | filters=32 47 | size=1 48 | stride=1 49 | pad=1 50 | activation=leaky 51 | 52 | [convolutional] 53 | batch_normalize=1 54 | filters=64 55 | size=3 56 | stride=1 57 | pad=1 58 | activation=leaky 59 | 60 | [shortcut] 61 | from=-3 62 | activation=linear 63 | 64 | # Downsample 65 | 66 | [convolutional] 67 | batch_normalize=1 68 | filters=128 69 | size=3 70 | stride=2 71 | pad=1 72 | activation=leaky 73 | 74 | [convolutional] 75 | batch_normalize=1 76 | filters=64 77 | size=1 78 | stride=1 79 | pad=1 80 | activation=leaky 81 | 82 | [convolutional] 83 | batch_normalize=1 84 | filters=128 85 | size=3 86 | stride=1 87 | pad=1 88 | activation=leaky 89 | 90 | [shortcut] 91 | from=-3 92 | activation=linear 93 | 94 | [convolutional] 95 | batch_normalize=1 96 | filters=64 97 | size=1 98 | stride=1 99 | pad=1 100 | activation=leaky 101 | 102 | [convolutional] 103 | batch_normalize=1 104 | filters=128 105 | size=3 106 | stride=1 107 | pad=1 108 | activation=leaky 109 | 
110 | [shortcut] 111 | from=-3 112 | activation=linear 113 | 114 | # Downsample 115 | 116 | [convolutional] 117 | batch_normalize=1 118 | filters=256 119 | size=3 120 | stride=2 121 | pad=1 122 | activation=leaky 123 | 124 | [convolutional] 125 | batch_normalize=1 126 | filters=128 127 | size=1 128 | stride=1 129 | pad=1 130 | activation=leaky 131 | 132 | [convolutional] 133 | batch_normalize=1 134 | filters=256 135 | size=3 136 | stride=1 137 | pad=1 138 | activation=leaky 139 | 140 | [shortcut] 141 | from=-3 142 | activation=linear 143 | 144 | [convolutional] 145 | batch_normalize=1 146 | filters=128 147 | size=1 148 | stride=1 149 | pad=1 150 | activation=leaky 151 | 152 | [convolutional] 153 | batch_normalize=1 154 | filters=256 155 | size=3 156 | stride=1 157 | pad=1 158 | activation=leaky 159 | 160 | [shortcut] 161 | from=-3 162 | activation=linear 163 | 164 | [convolutional] 165 | batch_normalize=1 166 | filters=128 167 | size=1 168 | stride=1 169 | pad=1 170 | activation=leaky 171 | 172 | [convolutional] 173 | batch_normalize=1 174 | filters=256 175 | size=3 176 | stride=1 177 | pad=1 178 | activation=leaky 179 | 180 | [shortcut] 181 | from=-3 182 | activation=linear 183 | 184 | [convolutional] 185 | batch_normalize=1 186 | filters=128 187 | size=1 188 | stride=1 189 | pad=1 190 | activation=leaky 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | filters=256 195 | size=3 196 | stride=1 197 | pad=1 198 | activation=leaky 199 | 200 | [shortcut] 201 | from=-3 202 | activation=linear 203 | 204 | 205 | [convolutional] 206 | batch_normalize=1 207 | filters=128 208 | size=1 209 | stride=1 210 | pad=1 211 | activation=leaky 212 | 213 | [convolutional] 214 | batch_normalize=1 215 | filters=256 216 | size=3 217 | stride=1 218 | pad=1 219 | activation=leaky 220 | 221 | [shortcut] 222 | from=-3 223 | activation=linear 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | filters=128 228 | size=1 229 | stride=1 230 | pad=1 231 | activation=leaky 232 | 233 | 
[convolutional] 234 | batch_normalize=1 235 | filters=256 236 | size=3 237 | stride=1 238 | pad=1 239 | activation=leaky 240 | 241 | [shortcut] 242 | from=-3 243 | activation=linear 244 | 245 | [convolutional] 246 | batch_normalize=1 247 | filters=128 248 | size=1 249 | stride=1 250 | pad=1 251 | activation=leaky 252 | 253 | [convolutional] 254 | batch_normalize=1 255 | filters=256 256 | size=3 257 | stride=1 258 | pad=1 259 | activation=leaky 260 | 261 | [shortcut] 262 | from=-3 263 | activation=linear 264 | 265 | [convolutional] 266 | batch_normalize=1 267 | filters=128 268 | size=1 269 | stride=1 270 | pad=1 271 | activation=leaky 272 | 273 | [convolutional] 274 | batch_normalize=1 275 | filters=256 276 | size=3 277 | stride=1 278 | pad=1 279 | activation=leaky 280 | 281 | [shortcut] 282 | from=-3 283 | activation=linear 284 | 285 | # Downsample 286 | 287 | [convolutional] 288 | batch_normalize=1 289 | filters=512 290 | size=3 291 | stride=2 292 | pad=1 293 | activation=leaky 294 | 295 | [convolutional] 296 | batch_normalize=1 297 | filters=256 298 | size=1 299 | stride=1 300 | pad=1 301 | activation=leaky 302 | 303 | [convolutional] 304 | batch_normalize=1 305 | filters=512 306 | size=3 307 | stride=1 308 | pad=1 309 | activation=leaky 310 | 311 | [shortcut] 312 | from=-3 313 | activation=linear 314 | 315 | 316 | [convolutional] 317 | batch_normalize=1 318 | filters=256 319 | size=1 320 | stride=1 321 | pad=1 322 | activation=leaky 323 | 324 | [convolutional] 325 | batch_normalize=1 326 | filters=512 327 | size=3 328 | stride=1 329 | pad=1 330 | activation=leaky 331 | 332 | [shortcut] 333 | from=-3 334 | activation=linear 335 | 336 | 337 | [convolutional] 338 | batch_normalize=1 339 | filters=256 340 | size=1 341 | stride=1 342 | pad=1 343 | activation=leaky 344 | 345 | [convolutional] 346 | batch_normalize=1 347 | filters=512 348 | size=3 349 | stride=1 350 | pad=1 351 | activation=leaky 352 | 353 | [shortcut] 354 | from=-3 355 | activation=linear 356 | 357 | 
358 | [convolutional] 359 | batch_normalize=1 360 | filters=256 361 | size=1 362 | stride=1 363 | pad=1 364 | activation=leaky 365 | 366 | [convolutional] 367 | batch_normalize=1 368 | filters=512 369 | size=3 370 | stride=1 371 | pad=1 372 | activation=leaky 373 | 374 | [shortcut] 375 | from=-3 376 | activation=linear 377 | 378 | [convolutional] 379 | batch_normalize=1 380 | filters=256 381 | size=1 382 | stride=1 383 | pad=1 384 | activation=leaky 385 | 386 | [convolutional] 387 | batch_normalize=1 388 | filters=512 389 | size=3 390 | stride=1 391 | pad=1 392 | activation=leaky 393 | 394 | [shortcut] 395 | from=-3 396 | activation=linear 397 | 398 | 399 | [convolutional] 400 | batch_normalize=1 401 | filters=256 402 | size=1 403 | stride=1 404 | pad=1 405 | activation=leaky 406 | 407 | [convolutional] 408 | batch_normalize=1 409 | filters=512 410 | size=3 411 | stride=1 412 | pad=1 413 | activation=leaky 414 | 415 | [shortcut] 416 | from=-3 417 | activation=linear 418 | 419 | 420 | [convolutional] 421 | batch_normalize=1 422 | filters=256 423 | size=1 424 | stride=1 425 | pad=1 426 | activation=leaky 427 | 428 | [convolutional] 429 | batch_normalize=1 430 | filters=512 431 | size=3 432 | stride=1 433 | pad=1 434 | activation=leaky 435 | 436 | [shortcut] 437 | from=-3 438 | activation=linear 439 | 440 | [convolutional] 441 | batch_normalize=1 442 | filters=256 443 | size=1 444 | stride=1 445 | pad=1 446 | activation=leaky 447 | 448 | [convolutional] 449 | batch_normalize=1 450 | filters=512 451 | size=3 452 | stride=1 453 | pad=1 454 | activation=leaky 455 | 456 | [shortcut] 457 | from=-3 458 | activation=linear 459 | 460 | # Downsample 461 | 462 | [convolutional] 463 | batch_normalize=1 464 | filters=1024 465 | size=3 466 | stride=2 467 | pad=1 468 | activation=leaky 469 | 470 | [convolutional] 471 | batch_normalize=1 472 | filters=512 473 | size=1 474 | stride=1 475 | pad=1 476 | activation=leaky 477 | 478 | [convolutional] 479 | batch_normalize=1 480 | 
filters=1024 481 | size=3 482 | stride=1 483 | pad=1 484 | activation=leaky 485 | 486 | [shortcut] 487 | from=-3 488 | activation=linear 489 | 490 | [convolutional] 491 | batch_normalize=1 492 | filters=512 493 | size=1 494 | stride=1 495 | pad=1 496 | activation=leaky 497 | 498 | [convolutional] 499 | batch_normalize=1 500 | filters=1024 501 | size=3 502 | stride=1 503 | pad=1 504 | activation=leaky 505 | 506 | [shortcut] 507 | from=-3 508 | activation=linear 509 | 510 | [convolutional] 511 | batch_normalize=1 512 | filters=512 513 | size=1 514 | stride=1 515 | pad=1 516 | activation=leaky 517 | 518 | [convolutional] 519 | batch_normalize=1 520 | filters=1024 521 | size=3 522 | stride=1 523 | pad=1 524 | activation=leaky 525 | 526 | [shortcut] 527 | from=-3 528 | activation=linear 529 | 530 | [convolutional] 531 | batch_normalize=1 532 | filters=512 533 | size=1 534 | stride=1 535 | pad=1 536 | activation=leaky 537 | 538 | [convolutional] 539 | batch_normalize=1 540 | filters=1024 541 | size=3 542 | stride=1 543 | pad=1 544 | activation=leaky 545 | 546 | [shortcut] 547 | from=-3 548 | activation=linear 549 | 550 | ###################### 551 | 552 | [convolutional] 553 | batch_normalize=1 554 | filters=512 555 | size=1 556 | stride=1 557 | pad=1 558 | activation=leaky 559 | 560 | [convolutional] 561 | batch_normalize=1 562 | size=3 563 | stride=1 564 | pad=1 565 | filters=1024 566 | activation=leaky 567 | 568 | [convolutional] 569 | batch_normalize=1 570 | filters=512 571 | size=1 572 | stride=1 573 | pad=1 574 | activation=leaky 575 | 576 | [convolutional] 577 | batch_normalize=1 578 | size=3 579 | stride=1 580 | pad=1 581 | filters=1024 582 | activation=leaky 583 | 584 | [convolutional] 585 | batch_normalize=1 586 | filters=512 587 | size=1 588 | stride=1 589 | pad=1 590 | activation=leaky 591 | 592 | [convolutional] 593 | batch_normalize=1 594 | size=3 595 | stride=1 596 | pad=1 597 | filters=1024 598 | activation=leaky 599 | 600 | [convolutional] 601 | size=1 
602 | stride=1 603 | pad=1 604 | filters=27 605 | activation=linear 606 | 607 | 608 | [yolo] 609 | mask = 5,7,8 610 | anchors = 13,21, 16,47, 28,29, 47,45, 27,84, 73,71, 54,161, 117,107, 165,200 611 | classes=4 612 | num=9 613 | jitter=.3 614 | ignore_thresh = .5 615 | truth_thresh = 1 616 | random=1 617 | 618 | 619 | [route] 620 | layers = -4 621 | 622 | [convolutional] 623 | batch_normalize=1 624 | filters=256 625 | size=1 626 | stride=1 627 | pad=1 628 | activation=leaky 629 | 630 | [upsample] 631 | stride=2 632 | 633 | [route] 634 | layers = -1, 61 635 | 636 | 637 | 638 | [convolutional] 639 | batch_normalize=1 640 | filters=256 641 | size=1 642 | stride=1 643 | pad=1 644 | activation=leaky 645 | 646 | [convolutional] 647 | batch_normalize=1 648 | size=3 649 | stride=1 650 | pad=1 651 | filters=512 652 | activation=leaky 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [convolutional] 663 | batch_normalize=1 664 | size=3 665 | stride=1 666 | pad=1 667 | filters=512 668 | activation=leaky 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | size=1 688 | stride=1 689 | pad=1 690 | filters=18 691 | activation=linear 692 | 693 | 694 | [yolo] 695 | mask = 3,6 696 | anchors = 13,21, 16,47, 28,29, 47,45, 27,84, 73,71, 54,161, 117,107, 165,200 697 | classes=4 698 | num=9 699 | jitter=.3 700 | ignore_thresh = .5 701 | truth_thresh = 1 702 | random=1 703 | 704 | 705 | 706 | [route] 707 | layers = -4 708 | 709 | [convolutional] 710 | batch_normalize=1 711 | filters=128 712 | size=1 713 | stride=1 714 | pad=1 715 | activation=leaky 716 | 717 | [upsample] 718 | stride=2 719 | 720 | [route] 721 | layers = -1, 36 722 | 723 | 724 | 725 | 
[convolutional] 726 | batch_normalize=1 727 | filters=128 728 | size=1 729 | stride=1 730 | pad=1 731 | activation=leaky 732 | 733 | [convolutional] 734 | batch_normalize=1 735 | size=3 736 | stride=1 737 | pad=1 738 | filters=256 739 | activation=leaky 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [convolutional] 750 | batch_normalize=1 751 | size=3 752 | stride=1 753 | pad=1 754 | filters=256 755 | activation=leaky 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | size=1 775 | stride=1 776 | pad=1 777 | filters=36 778 | activation=linear 779 | 780 | 781 | [yolo] 782 | mask = 0,1,2,4 783 | anchors = 13,21, 16,47, 28,29, 47,45, 27,84, 73,71, 54,161, 117,107, 165,200 784 | classes=4 785 | num=9 786 | jitter=.3 787 | ignore_thresh = .5 788 | truth_thresh = 1 789 | random=1 790 | max=200 791 | -------------------------------------------------------------------------------- /yolov3_5l.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=32 8 | width=608 9 | height=608 10 | channels=1 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | 
[convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | 
activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | 
activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | 
activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | 
batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=48 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 9,10,11,12,13,14 609 | anchors = 7,18, 11,36, 13,68, 15,20, 19,45, 27,28, 24,77, 41,43, 38,107, 63,61, 98,91, 63,162, 154,141, 107,277, 273,253 610 | classes=3 611 | num=15 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | 
filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=16 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 7,8 695 | anchors = 7,18, 11,36, 13,68, 15,20, 19,45, 27,28, 24,77, 41,43, 38,107, 63,61, 98,91, 63,162, 154,141, 107,277, 273,253 696 | classes=3 697 | num=15 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | 
pad=1 776 | filters=24 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 4,5,6 782 | anchors = 7,18, 11,36, 13,68, 15,20, 19,45, 27,28, 24,77, 41,43, 38,107, 63,61, 98,91, 63,162, 154,141, 107,277, 273,253 783 | classes=3 784 | num=15 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | 790 | 791 | 792 | ############### 793 | 794 | 795 | [route] 796 | layers = -4 797 | 798 | [convolutional] 799 | batch_normalize=1 800 | filters=128 801 | size=1 802 | stride=1 803 | pad=1 804 | activation=leaky 805 | 806 | [upsample] 807 | stride=2 808 | 809 | [route] 810 | layers = -1, 11 811 | 812 | 813 | 814 | [convolutional] 815 | batch_normalize=1 816 | filters=64 817 | size=1 818 | stride=1 819 | pad=1 820 | activation=leaky 821 | 822 | [convolutional] 823 | batch_normalize=1 824 | size=3 825 | stride=1 826 | pad=1 827 | filters=128 828 | activation=leaky 829 | 830 | [convolutional] 831 | batch_normalize=1 832 | filters=64 833 | size=1 834 | stride=1 835 | pad=1 836 | activation=leaky 837 | 838 | [convolutional] 839 | batch_normalize=1 840 | size=3 841 | stride=1 842 | pad=1 843 | filters=128 844 | activation=leaky 845 | 846 | [convolutional] 847 | batch_normalize=1 848 | filters=64 849 | size=1 850 | stride=1 851 | pad=1 852 | activation=leaky 853 | 854 | [convolutional] 855 | batch_normalize=1 856 | size=3 857 | stride=1 858 | pad=1 859 | filters=128 860 | activation=leaky 861 | 862 | [convolutional] 863 | size=1 864 | stride=1 865 | pad=1 866 | filters=16 867 | activation=linear 868 | 869 | 870 | [yolo] 871 | mask = 2,3 872 | anchors = 7,18, 11,36, 13,68, 15,20, 19,45, 27,28, 24,77, 41,43, 38,107, 63,61, 98,91, 63,162, 154,141, 107,277, 273,253 873 | classes=3 874 | num=15 875 | jitter=.3 876 | ignore_thresh = .7 877 | truth_thresh = 1 878 | random=1 879 | 880 | 881 | 882 | 883 | 884 | [route] 885 | layers = -4 886 | 887 | [convolutional] 888 | batch_normalize=1 889 | filters=128 890 | size=1 891 | stride=1 892 | pad=1 893 | 
activation=leaky 894 | 895 | [upsample] 896 | stride=2 897 | 898 | [route] 899 | layers = -1, 4 900 | 901 | 902 | 903 | [convolutional] 904 | batch_normalize=1 905 | filters=32 906 | size=1 907 | stride=1 908 | pad=1 909 | activation=leaky 910 | 911 | [convolutional] 912 | batch_normalize=1 913 | size=3 914 | stride=1 915 | pad=1 916 | filters=64 917 | activation=leaky 918 | 919 | [convolutional] 920 | batch_normalize=1 921 | filters=32 922 | size=1 923 | stride=1 924 | pad=1 925 | activation=leaky 926 | 927 | [convolutional] 928 | batch_normalize=1 929 | size=3 930 | stride=1 931 | pad=1 932 | filters=64 933 | activation=leaky 934 | 935 | [convolutional] 936 | batch_normalize=1 937 | filters=32 938 | size=1 939 | stride=1 940 | pad=1 941 | activation=leaky 942 | 943 | [convolutional] 944 | batch_normalize=1 945 | size=3 946 | stride=1 947 | pad=1 948 | filters=64 949 | activation=leaky 950 | 951 | [convolutional] 952 | size=1 953 | stride=1 954 | pad=1 955 | filters=16 956 | activation=linear 957 | 958 | 959 | [yolo] 960 | mask = 0,1 961 | anchors = 7,18, 11,36, 13,68, 15,20, 19,45, 27,28, 24,77, 41,43, 38,107, 63,61, 98,91, 63,162, 154,141, 107,277, 273,253 962 | classes=3 963 | num=15 964 | jitter=.3 965 | ignore_thresh = .7 966 | truth_thresh = 1 967 | random=1 --------------------------------------------------------------------------------