├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── config.yaml ├── convert_tf_to_pt.sh ├── copy_weights.py ├── datasets.py ├── detect_faces_on_videos.py ├── dsfacedetector ├── __init__.py ├── data │ ├── __init__.py │ └── config.py ├── face_ssd_infer.py ├── layers │ ├── __init__.py │ ├── detection.py │ ├── modules.py │ └── prior_box.py └── utils.py ├── external_data ├── convert_tf_to_pt.py └── original_tf │ ├── __init__.py │ ├── efficientnet_builder.py │ ├── efficientnet_model.py │ ├── eval_ckpt_main.py │ ├── preprocessing.py │ └── utils.py ├── extract_tracks_from_videos.py ├── generate_aligned_tracks.py ├── generate_track_pairs.py ├── generate_tracks.py ├── images ├── augmented_mixup.jpg ├── clip_example.jpg ├── first_and_second_model_inputs.jpg ├── mixup_example.jpg ├── pred_transform.jpg └── third_model_input.jpg ├── models └── .gitkeep ├── predict.py ├── tracker ├── __init__.py ├── iou_tracker.py └── utils.py ├── train_b7_ns_aa_original_large_crop_100k.py ├── train_b7_ns_aa_original_re_100k.py └── train_b7_ns_seq_aa_original_100k.py /.gitignore: -------------------------------------------------------------------------------- 1 | # PyCharm 2 | .idea 3 | 4 | # Jupyter Notebook 5 | .ipynb_checkpoints 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 2 | 3 | SHELL ["/bin/bash", "-c"] 4 | 5 | RUN rm /etc/apt/sources.list.d/cuda.list \ 6 | /etc/apt/sources.list.d/nvidia-ml.list && \ 7 | apt-get update && \ 8 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 9 | software-properties-common \ 10 | wget \ 11 | git && \ 12 | add-apt-repository -y ppa:deadsnakes/ppa && \ 13 | apt-get update && \ 14 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 15 | python3.6 \ 16 | python3.6-dev && \ 17 | wget -O ~/get-pip.py \ 18 | https://bootstrap.pypa.io/get-pip.py && \ 19 | python3.6 ~/get-pip.py && \ 20 | pip3 --no-cache-dir install \ 21 | numpy==1.17.4 \ 22 | PyYAML==5.1.2 \ 23 | mkl==2019.0 \ 24 | mkl-include==2019.0 \ 25 | cmake==3.15.3 \ 26 | cffi==1.13.2 \ 27 | typing==3.7.4.1 \ 28 | six==1.13.0 \ 29 | Pillow==6.2.1 \ 30 | scipy==1.4.1 && \ 31 | cd /tmp && \ 32 | git clone https://github.com/pytorch/pytorch.git && \ 33 | cd pytorch && \ 34 | git checkout v1.3.0 && \ 35 | git submodule update --init --recursive && \ 36 | python3.6 setup.py install && \ 37 | cd /tmp && \ 38 | git clone https://github.com/pytorch/vision.git && \ 39 | cd vision && \ 40 | git checkout v0.4.1 && \ 41 | python3.6 setup.py install && \ 42 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 43 | ffmpeg && \ 44 | pip3 --no-cache-dir install \ 45 | opencv-python==4.1.2.30 \ 46 | albumentations==0.4.3 \ 47 | tqdm==4.39.0 \ 48 | timm==0.1.18 \ 49 | efficientnet-pytorch==0.6.3 \ 50 | ffmpeg-python==0.2.0 \ 51 | tensorflow==1.15.2 && \ 52 | cd / && \ 53 | apt-get clean && \ 54 | apt-get autoremove && \ 55 | rm -rf /var/lib/apt/lists/* /tmp/* 56 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 
6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 N-TECH.LAB LTD 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deepfake Detection Challenge 2 | Solution for the [Deepfake Detection Challenge](https://www.kaggle.com/c/deepfake-detection-challenge). 3 | Private LB score: **0.43452** 4 | ## Solution description 5 | ### Summary 6 | Our solution consists of three EfficientNet-B7 models (we used the Noisy Student pre-trained weights). We did not use 7 | external data, except for pre-trained weights. One model runs on frame sequences (a 3D convolution has been added to 8 | each EfficientNet-B7 block). The other two models work frame-by-frame and differ in the size of the face crop and 9 | augmentations during training. To tackle the overfitting problem, we used the mixup technique on aligned real-fake pairs. In 10 | addition, we used the following augmentations: AutoAugment, Random Erasing, Random Crops, Random Flips, and various 11 | video compression parameters. Video compression augmentation was done on-the-fly. To do this, short cropped tracks (50 12 | frames each) were saved in PNG format, and at each training iteration they were loaded and re-encoded with random 13 | parameters using ffmpeg. Due to the mixup, model predictions were “uncertain”, so at the inference stage, model 14 | confidence was strengthened by a simple transformation. The final prediction was obtained by averaging the predictions 15 | of models with weights proportional to confidence. The total training and preprocessing time is approximately 5 days on 16 | DGX-1. 17 | ### Key ingredients 18 | #### Mixup on aligned real-fake pairs 19 | One of the main difficulties of this competition is severe overfitting. Initially, all models overfitted in 2-3 epochs 20 | (the validation loss started to increase). The idea that helped a lot with overfitting is to train the model on 21 | a mix of real and fake faces: for each fake face, we take the corresponding real face from the original video (with the 22 | same box coordinates and the same frame number) and do a linear combination of them. In terms of tensors, it is 23 | ```python 24 | input_tensor = (1.0 - target) * real_input_tensor + target * fake_input_tensor 25 | ``` 26 | where target is drawn from a Beta distribution with parameters alpha=beta=0.5.
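A minimal sketch of this mixing step (an illustration only, not the repository's exact training code; the helper `mix_aligned_pair` and the use of `torch.distributions.Beta` are assumptions, while the mixing formula itself is the one shown above):
```python
import torch

def mix_aligned_pair(real_input_tensor, fake_input_tensor, alpha=0.5, beta=0.5):
    # Sample the mixing coefficient from Beta(alpha, beta); with alpha=beta=0.5
    # the sampled values concentrate near 0 and 1.
    target = torch.distributions.Beta(alpha, beta).sample()
    # Linear combination of the aligned real and fake crops
    # (same box coordinates, same frame number).
    input_tensor = (1.0 - target) * real_input_tensor + target * fake_input_tensor
    # The same coefficient serves as the soft "fake" label for the loss.
    return input_tensor, target
```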
With these parameters, there is a very 27 | high probability of picking values close to 0 or 1 (pure real or pure fake face). You can see the examples below: 28 | ![mixup example](images/mixup_example.jpg "Mixup example") 29 | Because real and fake samples are aligned, the background remains almost unchanged in interpolated samples, 30 | which reduces overfitting and makes the model pay more attention to the face. 31 | #### Video compression augmentation 32 | In the paper \[1\] it was pointed out that augmentations close to degradations seen in real-life video distributions 33 | were applied to the test data. Specifically, these augmentations were (1) reducing the FPS of the video to 15; (2) reducing 34 | the resolution of the video to 1/4 of its original size; and (3) reducing the overall encoding quality. To make 35 | the model robust to various video compression settings, we added augmentations with random video 36 | encoding parameters to training. It would be infeasible to apply such augmentations to the original videos on-the-fly during 37 | training, so instead of the original videos, cropped (1.5x areas around the face) short (50 frames) clips were used. 38 | Each clip was saved as separate frames in PNG format. An example of a clip is given below: 39 | ![clip example](images/clip_example.jpg "Clip example") 40 | For on-the-fly augmentation, ffmpeg-python was used. At each iteration, the following parameters were randomly sampled 41 | (see \[2\]): 42 | - FPS (15 to 30) 43 | - scale (0.25 to 1.0) 44 | - CRF (17 to 40) 45 | - random tuning option 46 | #### Model architecture 47 | In our experiments, we found that EfficientNet models work better than the others we checked (ResNet, 48 | ResNeXt, SE-ResNeXt). The best model was EfficientNet-B7 with Noisy Student pre-trained weights \[3\]. The size of the 49 | input image is 224x192 (most of the faces in the training dataset are smaller). The final ensemble consists of three 50 | models, two of which are frame-by-frame, and the third works on frame sequences. 51 | ##### Frame-by-frame models 52 | Frame-by-frame models work quite well. They differ in the size of the area around the face and the augmentations used during 53 | training. Below are examples of input images for each of the models: 54 | ![first and second model inputs](images/first_and_second_model_inputs.jpg "First and second model input examples") 55 | ##### Sequence-based model 56 | Temporal dependencies can probably be useful for detecting fakes. Therefore, we added a 3D convolution to each block of the 57 | EfficientNet model. This model worked slightly better than a similar frame-by-frame model. The length of the input 58 | sequence is 7 frames. The step between frames is 1/15 of a second. An example of an input sequence is given below: 59 | ![third model input](images/third_model_input.jpg "Third model input example") 60 | #### Image augmentations 61 | To improve model generalization, we used the following augmentations: AutoAugment \[4\], Random Erasing, Random Crops, 62 | Random Horizontal Flips. Since we used mixup, it was important to augment real-fake pairs in the same way (see the example below). 63 | For the sequence-based model, it was also important to augment frames that belong to the same clip in the same way. 64 | ![augmented mixup](images/augmented_mixup.jpg "Augmented mixup example") 65 | #### Inference post-processing 66 | Due to mixup, the predictions of the models were uncertain, which was not optimal for the log loss.
To increase 67 | confidence, we applied the following transformation: 68 | ![prediction transform](images/pred_transform.jpg "Prediction transformation") 69 | Due to computational limitations, predictions are made on a subsample of frames. Half of the frames were horizontally 70 | flipped. The prediction for the video is obtained by averaging all the predictions with weights proportional to the 71 | confidence (the closer a prediction is to 0.5, the lower its weight). Such averaging works like attention, because the 72 | model gives predictions close to 0.5 on poor-quality frames (profile faces, blur, etc.). 73 | #### References 74 | \[1\] Brian Dolhansky, Russ Howes, Ben Pflaum, Nicole Baram, Cristian Canton Ferrer, “The Deepfake Detection Challenge 75 | (DFDC) Preview Dataset” 76 | \[2\] [https://trac.ffmpeg.org/wiki/Encode/H.264](https://trac.ffmpeg.org/wiki/Encode/H.264) 77 | \[3\] Qizhe Xie, Minh-Thang Luong, Eduard Hovy, Quoc V. Le, “Self-training with Noisy Student improves ImageNet classification” 78 | \[4\] Ekin D. Cubuk, Barret Zoph, Dandelion Mane, Vijay Vasudevan, Quoc V. Le, “AutoAugment: Learning Augmentation Policies from Data” 79 | ## The hardware we used 80 | - CPU: Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz 81 | - GPU: 8x NVIDIA Tesla V100 SXM2 32 GB 82 | - RAM: 512 GB 83 | - SSD: 6 TB 84 | ## Prerequisites 85 | ### Environment 86 | Use Docker to get an environment close to the one used for training. Run the following command to build the Docker image: 87 | ```bash 88 | cd path/to/solution 89 | sudo docker build -t dfdc . 90 | ``` 91 | ### Data 92 | Download the [deepfake-detection-challenge-data](https://www.kaggle.com/c/deepfake-detection-challenge/data) and extract all files to `/path/to/dfdc-data`. This directory must have the following structure: 93 | ``` 94 | dfdc-data 95 | ├── dfdc_train_part_0 96 | ├── dfdc_train_part_1 97 | ├── dfdc_train_part_10 98 | ├── dfdc_train_part_11 99 | ├── dfdc_train_part_12 100 | ├── dfdc_train_part_13 101 | ├── dfdc_train_part_14 102 | ├── dfdc_train_part_15 103 | ├── dfdc_train_part_16 104 | ├── dfdc_train_part_17 105 | ├── dfdc_train_part_18 106 | ├── dfdc_train_part_19 107 | ├── dfdc_train_part_2 108 | ├── dfdc_train_part_20 109 | ├── dfdc_train_part_21 110 | ├── dfdc_train_part_22 111 | ├── dfdc_train_part_23 112 | ├── dfdc_train_part_24 113 | ├── dfdc_train_part_25 114 | ├── dfdc_train_part_26 115 | ├── dfdc_train_part_27 116 | ├── dfdc_train_part_28 117 | ├── dfdc_train_part_29 118 | ├── dfdc_train_part_3 119 | ├── dfdc_train_part_30 120 | ├── dfdc_train_part_31 121 | ├── dfdc_train_part_32 122 | ├── dfdc_train_part_33 123 | ├── dfdc_train_part_34 124 | ├── dfdc_train_part_35 125 | ├── dfdc_train_part_36 126 | ├── dfdc_train_part_37 127 | ├── dfdc_train_part_38 128 | ├── dfdc_train_part_39 129 | ├── dfdc_train_part_4 130 | ├── dfdc_train_part_40 131 | ├── dfdc_train_part_41 132 | ├── dfdc_train_part_42 133 | ├── dfdc_train_part_43 134 | ├── dfdc_train_part_44 135 | ├── dfdc_train_part_45 136 | ├── dfdc_train_part_46 137 | ├── dfdc_train_part_47 138 | ├── dfdc_train_part_48 139 | ├── dfdc_train_part_49 140 | ├── dfdc_train_part_5 141 | ├── dfdc_train_part_6 142 | ├── dfdc_train_part_7 143 | ├── dfdc_train_part_8 144 | ├── dfdc_train_part_9 145 | └── test_videos 146 | ``` 147 | 148 | ### External data 149 | According to the rules of the competition, external data is allowed. The solution does not use any other external data except for pre-trained models. Below is a table with information about these models.
150 | 151 | | File Name | Source | Direct Link | Forum Post | 152 | | --------- | ------ | ----------- | ---------- | 153 | | WIDERFace_DSFD_RES152.pth | [github](https://github.com/Tencent/FaceDetection-DSFD/tree/31aa8bdeaf01a0c408adaf2709754a16b17aec79) | [google drive](https://drive.google.com/file/d/1WeXlNYsM6dMP3xQQELI-4gxhwKUQxc3-/view) | [link](https://www.kaggle.com/c/deepfake-detection-challenge/discussion/121203#761391) | 154 | | noisy_student_efficientnet-b7.tar.gz | [github](https://github.com/tensorflow/tpu/tree/4719695c9128622fb26dedb19ea19bd9d1ee3177/models/official/efficientnet) | [link](https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/noisystudent/noisy_student_efficientnet-b7.tar.gz) | [link](https://www.kaggle.com/c/deepfake-detection-challenge/discussion/121203#748358) | 155 | 156 | Download these files and copy them to the `external_data` folder. 157 | 158 | ## How to train the model 159 | Run the docker container with the paths correctly mounted: 160 | ```bash 161 | sudo docker run --runtime=nvidia -i -t -d --rm --ipc=host -v /path/to/dfdc-data:/kaggle/input/deepfake-detection-challenge:ro -v /path/to/solution:/kaggle/solution --name dfdc dfdc 162 | sudo docker exec -it dfdc /bin/bash 163 | cd /kaggle/solution 164 | ``` 165 | Convert pre-trained model from tensorflow to pytorch: 166 | ```bash 167 | bash convert_tf_to_pt.sh 168 | ``` 169 | Detect faces on videos: 170 | ```bash 171 | python3.6 detect_faces_on_videos.py 172 | ``` 173 | _Note: You can parallelize this operation using the `--part` and `--num_parts` arguments_ 174 | Generate tracks: 175 | ```bash 176 | python3.6 generate_tracks.py 177 | ``` 178 | Generate aligned tracks: 179 | ```bash 180 | python3.6 generate_aligned_tracks.py 181 | ``` 182 | Extract tracks from videos: 183 | ```bash 184 | python3.6 extract_tracks_from_videos.py 185 | ``` 186 | _Note: You can parallelize this operation using the `--part` and `--num_parts` arguments_ 187 | Generate track pairs: 188 | ```bash 189 | python3.6 generate_track_pairs.py 190 | ``` 191 | Train models: 192 | ```bash 193 | python3.6 train_b7_ns_aa_original_large_crop_100k.py 194 | python3.6 train_b7_ns_aa_original_re_100k.py 195 | python3.6 train_b7_ns_seq_aa_original_100k.py 196 | ``` 197 | Copy the final weights and convert them to FP16: 198 | ```bash 199 | python3.6 copy_weights.py 200 | ``` 201 | ## Serialized copy of the trained model 202 | You can download the final weights that were used in the competition (the result of the `copy_weights.py` script): [GoogleDrive](https://drive.google.com/file/d/1S-HeppZcbXDF0F-BO96zhqZyrRWOaan6/view?usp=sharing) 203 | ## How to generate submission 204 | Run the following command 205 | ```bash 206 | python3.6 predict.py 207 | ``` -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | DFDC_DATA_PATH: "/kaggle/input/deepfake-detection-challenge" 2 | ARTIFACTS_PATH: "/kaggle/solution/artifacts" 3 | MODELS_PATH: "/kaggle/solution/models" 4 | SUBMISSION_PATH: "/kaggle/solution/output/submission.csv" -------------------------------------------------------------------------------- /convert_tf_to_pt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd external_data && \ 3 | tar -xzf noisy_student_efficientnet-b7.tar.gz && \ 4 | python3.6 convert_tf_to_pt.py --model_name efficientnet-b7 --tf_checkpoint noisy-student-efficientnet-b7 
--output_file noisy_student_efficientnet-b7.pth && \ 5 | rm -rf noisy-student-efficientnet-b7 tmp && \ 6 | cd .. -------------------------------------------------------------------------------- /copy_weights.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | import torch 5 | 6 | WEIGHTS_MAPPING = { 7 | 'snapshots/efficientnet-b7_ns_aa-original-mstd0.5_large_crop_100k/snapshot_100000.pth': 'efficientnet-b7_ns_aa-original-mstd0.5_large_crop_100k_v4_cad79a/snapshot_100000.fp16.pth', 8 | 'snapshots/efficientnet-b7_ns_aa-original-mstd0.5_re_100k/snapshot_100000.pth': 'efficientnet-b7_ns_aa-original-mstd0.5_re_100k_v4_cad79a/snapshot_100000.fp16.pth', 9 | 'snapshots/efficientnet-b7_ns_seq_aa-original-mstd0.5_100k/snapshot_100000.pth': 'efficientnet-b7_ns_seq_aa-original-mstd0.5_100k_v4_cad79a/snapshot_100000.fp16.pth' 10 | } 11 | 12 | SRC_DETECTOR_WEIGHTS = 'external_data/WIDERFace_DSFD_RES152.pth' 13 | DST_DETECTOR_WEIGHTS = 'WIDERFace_DSFD_RES152.fp16.pth' 14 | 15 | 16 | def copy_weights(src_path, dst_path): 17 | state = torch.load(src_path, map_location=lambda storage, loc: storage) 18 | state = {key: value.half() for key, value in state.items()} 19 | os.makedirs(os.path.dirname(dst_path), exist_ok=True) 20 | torch.save(state, dst_path) 21 | 22 | 23 | def main(): 24 | with open('config.yaml', 'r') as f: 25 | config = yaml.load(f) 26 | 27 | for src_rel_path, dst_rel_path in WEIGHTS_MAPPING.items(): 28 | src_path = os.path.join(config['ARTIFACTS_PATH'], src_rel_path) 29 | dst_path = os.path.join(config['MODELS_PATH'], dst_rel_path) 30 | copy_weights(src_path, dst_path) 31 | 32 | copy_weights(SRC_DETECTOR_WEIGHTS, os.path.join(config['MODELS_PATH'], DST_DETECTOR_WEIGHTS)) 33 | 34 | 35 | if __name__ == '__main__': 36 | main() -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import glob 4 | 5 | import cv2 6 | import numpy as np 7 | 8 | from torch.utils.data import Dataset 9 | 10 | 11 | class UnlabeledVideoDataset(Dataset): 12 | def __init__(self, root_dir, content=None, transform=None): 13 | self.root_dir = os.path.normpath(root_dir) 14 | self.transform = transform 15 | 16 | if content is not None: 17 | self.content = content 18 | else: 19 | self.content = [] 20 | for path in glob.iglob(os.path.join(self.root_dir, '**', '*.mp4'), recursive=True): 21 | rel_path = path[len(self.root_dir) + 1:] 22 | self.content.append(rel_path) 23 | self.content = sorted(self.content) 24 | 25 | def __len__(self): 26 | return len(self.content) 27 | 28 | def __getitem__(self, idx): 29 | rel_path = self.content[idx] 30 | path = os.path.join(self.root_dir, rel_path) 31 | 32 | capture = cv2.VideoCapture(path) 33 | 34 | frames = [] 35 | if capture.isOpened(): 36 | while True: 37 | ret, frame = capture.read() 38 | if not ret: 39 | break 40 | 41 | if self.transform is not None: 42 | frame = self.transform(frame) 43 | 44 | frames.append(frame) 45 | 46 | sample = { 47 | 'frames': frames, 48 | 'index': idx 49 | } 50 | 51 | return sample 52 | 53 | 54 | class FaceDataset(Dataset): 55 | def __init__(self, root_dir, content, labels=None, transform=None): 56 | self.root_dir = os.path.normpath(root_dir) 57 | self.content = content 58 | self.labels = labels 59 | self.transform = transform 60 | 61 | def __len__(self): 62 | return len(self.content) 63 | 64 | def __getitem__(self, idx): 65 | rel_path = 
self.content[idx] 66 | path = os.path.join(self.root_dir, rel_path) 67 | 68 | face = cv2.imread(path, cv2.IMREAD_COLOR) 69 | face = cv2.cvtColor(face, cv2.COLOR_BGR2RGB) 70 | 71 | if self.transform is not None: 72 | face = self.transform(image=face)['image'] 73 | 74 | sample = { 75 | 'face': face, 76 | 'index': idx 77 | } 78 | 79 | if self.labels is not None: 80 | sample['label'] = self.labels[idx] 81 | 82 | return sample 83 | 84 | 85 | class TrackPairDataset(Dataset): 86 | FPS = 30 87 | 88 | def __init__(self, tracks_root, pairs_path, indices, track_length, track_transform=None, image_transform=None, 89 | sequence_mode=True): 90 | self.tracks_root = os.path.normpath(tracks_root) 91 | self.track_transform = track_transform 92 | self.image_transform = image_transform 93 | self.indices = np.asarray(indices, dtype=np.int32) 94 | self.track_length = track_length 95 | self.sequence_mode = sequence_mode 96 | 97 | self.pairs = [] 98 | with open(pairs_path, 'r') as f: 99 | for line in f: 100 | real_track, fake_track = line.strip().split(',') 101 | self.pairs.append((real_track, fake_track)) 102 | 103 | def __len__(self): 104 | return len(self.pairs) 105 | 106 | def __getitem__(self, idx): 107 | real_track_path, fake_track_path = self.pairs[idx] 108 | 109 | real_track_path = os.path.join(self.tracks_root, real_track_path) 110 | fake_track_path = os.path.join(self.tracks_root, fake_track_path) 111 | 112 | if self.track_transform is not None: 113 | img = self.load_img(real_track_path, 0) 114 | src_height, src_width = img.shape[:2] 115 | track_transform_params = self.track_transform.get_params(self.FPS, src_height, src_width) 116 | else: 117 | track_transform_params = None 118 | 119 | real_track = self.load_track(real_track_path, self.indices, track_transform_params) 120 | fake_track = self.load_track(fake_track_path, self.indices, track_transform_params) 121 | 122 | if self.image_transform is not None: 123 | prev_state = random.getstate() 124 | transformed_real_track = [] 125 | for img in real_track: 126 | if self.sequence_mode: 127 | random.setstate(prev_state) 128 | transformed_real_track.append(self.image_transform(image=img)['image']) 129 | 130 | real_track = transformed_real_track 131 | 132 | random.setstate(prev_state) 133 | transformed_fake_track = [] 134 | for img in fake_track: 135 | if self.sequence_mode: 136 | random.setstate(prev_state) 137 | transformed_fake_track.append(self.image_transform(image=img)['image']) 138 | fake_track = transformed_fake_track 139 | 140 | sample = { 141 | 'real': real_track, 142 | 'fake': fake_track 143 | } 144 | 145 | return sample 146 | 147 | def load_img(self, track_path, idx): 148 | img = cv2.imread(os.path.join(track_path, '{}.png'.format(idx))) 149 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 150 | 151 | return img 152 | 153 | def load_track(self, track_path, indices, transform_params): 154 | if transform_params is None: 155 | track = np.stack([self.load_img(track_path, idx) for idx in indices]) 156 | else: 157 | track = self.track_transform(track_path, self.FPS, *transform_params) 158 | indices = (indices.astype(np.float32) / self.track_length) * len(track) 159 | indices = np.round(indices).astype(np.int32).clip(0, len(track) - 1) 160 | track = track[indices] 161 | 162 | return track 163 | -------------------------------------------------------------------------------- /detect_faces_on_videos.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import glob 4 | import yaml 5 | import 
pickle 6 | import tqdm 7 | 8 | import torch 9 | from torch.utils.data import DataLoader 10 | 11 | from dsfacedetector.face_ssd_infer import SSD 12 | from datasets import UnlabeledVideoDataset 13 | 14 | DETECTOR_WEIGHTS_PATH = 'external_data/WIDERFace_DSFD_RES152.pth' 15 | DETECTOR_THRESHOLD = 0.3 16 | DETECTOR_STEP = 6 17 | DETECTOR_TARGET_SIZE = (512, 512) 18 | 19 | BATCH_SIZE = 1 20 | NUM_WORKERS = 0 21 | 22 | DETECTIONS_ROOT = 'detections' 23 | DETECTIONS_FILE_NAME = 'detections.pkl' 24 | 25 | 26 | def main(): 27 | parser = argparse.ArgumentParser(description='Detects faces on videos') 28 | parser.add_argument('--num_parts', type=int, default=1, help='Number of parts') 29 | parser.add_argument('--part', type=int, default=0, help='Part index') 30 | 31 | args = parser.parse_args() 32 | 33 | with open('config.yaml', 'r') as f: 34 | config = yaml.load(f) 35 | 36 | content = [] 37 | for path in glob.iglob(os.path.join(config['DFDC_DATA_PATH'], 'dfdc_train_part_*', '*.mp4')): 38 | parts = path.split('/') 39 | content.append('/'.join(parts[-2:])) 40 | content = sorted(content) 41 | 42 | print('Total number of videos: {}'.format(len(content))) 43 | 44 | part_size = len(content) // args.num_parts + 1 45 | assert part_size * args.num_parts >= len(content) 46 | part_start = part_size * args.part 47 | part_end = min(part_start + part_size, len(content)) 48 | print('Part {} ({}, {})'.format(args.part, part_start, part_end)) 49 | 50 | dataset = UnlabeledVideoDataset(config['DFDC_DATA_PATH'], content[part_start:part_end]) 51 | 52 | detector = SSD('test') 53 | state = torch.load(DETECTOR_WEIGHTS_PATH, map_location=lambda storage, loc: storage) 54 | detector.load_state_dict(state) 55 | device = torch.device('cuda') 56 | detector = detector.eval().to(device) 57 | 58 | loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, collate_fn=lambda X: X, 59 | drop_last=False) 60 | 61 | dst_root = os.path.join(config['ARTIFACTS_PATH'], DETECTIONS_ROOT) 62 | os.makedirs(dst_root, exist_ok=True) 63 | 64 | for video_sample in tqdm.tqdm(loader): 65 | frames = video_sample[0]['frames'] 66 | video_idx = video_sample[0]['index'] 67 | video_rel_path = dataset.content[video_idx] 68 | 69 | detections = [] 70 | for frame in frames[::DETECTOR_STEP]: 71 | with torch.no_grad(): 72 | detections_per_frame = detector.detect_on_image(frame, DETECTOR_TARGET_SIZE, device, is_pad=False, 73 | keep_thresh=DETECTOR_THRESHOLD) 74 | detections.append({'boxes': detections_per_frame[:, :4], 'scores': detections_per_frame[:, 4]}) 75 | 76 | os.makedirs(os.path.join(dst_root, video_rel_path), exist_ok=True) 77 | with open(os.path.join(dst_root, video_rel_path, DETECTIONS_FILE_NAME), 'wb') as f: 78 | pickle.dump(detections, f) 79 | 80 | 81 | if __name__ == '__main__': 82 | main() 83 | -------------------------------------------------------------------------------- /dsfacedetector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NTech-Lab/deepfake-detection-challenge/52095ce4a49f298faf075a5eb28391722b9e4103/dsfacedetector/__init__.py -------------------------------------------------------------------------------- /dsfacedetector/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NTech-Lab/deepfake-detection-challenge/52095ce4a49f298faf075a5eb28391722b9e4103/dsfacedetector/data/__init__.py 
-------------------------------------------------------------------------------- /dsfacedetector/data/config.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def test_base_transform(image, mean): 5 | x = image.astype(np.float32) 6 | x -= mean 7 | x = x.astype(np.float32) 8 | return x 9 | 10 | 11 | class TestBaseTransform: 12 | def __init__(self, mean): 13 | self.mean = np.array(mean, dtype=np.float32) 14 | 15 | def __call__(self, image): 16 | return test_base_transform(image, self.mean) 17 | 18 | 19 | widerface_640 = { 20 | 'num_classes': 2, 21 | 22 | 'feature_maps': [160, 80, 40, 20, 10, 5], 23 | 'min_dim': 640, 24 | 25 | 'steps': [4, 8, 16, 32, 64, 128], # stride 26 | 27 | 'variance': [0.1, 0.2], 28 | 'clip': True, # make default box in [0,1] 29 | 'name': 'WIDERFace', 30 | 'l2norm_scale': [10, 8, 5], 31 | 'base': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 512, 512, 512], 32 | 'extras': [256, 'S', 512, 128, 'S', 256], 33 | 34 | 'mbox': [1, 1, 1, 1, 1, 1], 35 | 'min_sizes': [16, 32, 64, 128, 256, 512], 36 | 'max_sizes': [], 37 | 'aspect_ratios': [[1.5], [1.5], [1.5], [1.5], [1.5], [1.5]], # [1,2] default 1 38 | 39 | 'backbone': 'resnet152', 40 | 'feature_pyramid_network': True, 41 | 'bottom_up_path': False, 42 | 'feature_enhance_module': True, 43 | 'max_in_out': True, 44 | 'focal_loss': False, 45 | 'progressive_anchor': True, 46 | 'refinedet': False, 47 | 'max_out': False, 48 | 'anchor_compensation': False, 49 | 'data_anchor_sampling': False, 50 | 51 | 'overlap_thresh': [0.4], 52 | 'negpos_ratio': 3, 53 | # test 54 | 'nms_thresh': 0.3, 55 | 'conf_thresh': 0.01, 56 | 'num_thresh': 5000, 57 | } 58 | -------------------------------------------------------------------------------- /dsfacedetector/face_ssd_infer.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/vlad3996/FaceDetection-DSFD 2 | 3 | import torch 4 | import torchvision 5 | import torch.nn as nn 6 | 7 | from .data.config import TestBaseTransform, widerface_640 as cfg 8 | from .layers import Detect, get_prior_boxes, FEM, pa_multibox, mio_module, upsample_product 9 | from .utils import resize_image 10 | 11 | 12 | class SSD(nn.Module): 13 | 14 | def __init__(self, phase, nms_thresh=0.3, nms_conf_thresh=0.01): 15 | super(SSD, self).__init__() 16 | self.phase = phase 17 | self.num_classes = 2 18 | self.cfg = cfg 19 | 20 | resnet = torchvision.models.resnet152(pretrained=False) 21 | 22 | self.layer1 = nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1) 23 | self.layer2 = nn.Sequential(resnet.layer2) 24 | self.layer3 = nn.Sequential(resnet.layer3) 25 | self.layer4 = nn.Sequential(resnet.layer4) 26 | self.layer5 = nn.Sequential( 27 | *[nn.Conv2d(2048, 512, kernel_size=1), 28 | nn.BatchNorm2d(512), 29 | nn.ReLU(inplace=True), 30 | nn.Conv2d(512, 512, kernel_size=3, padding=1, stride=2), 31 | nn.BatchNorm2d(512), 32 | nn.ReLU(inplace=True)] 33 | ) 34 | self.layer6 = nn.Sequential( 35 | *[nn.Conv2d(512, 128, kernel_size=1, ), 36 | nn.BatchNorm2d(128), 37 | nn.ReLU(inplace=True), 38 | nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2), 39 | nn.BatchNorm2d(256), 40 | nn.ReLU(inplace=True)] 41 | ) 42 | 43 | output_channels = [256, 512, 1024, 2048, 512, 256] 44 | 45 | # FPN 46 | fpn_in = output_channels 47 | 48 | self.latlayer3 = nn.Conv2d(fpn_in[3], fpn_in[2], kernel_size=1, stride=1, padding=0) 49 | self.latlayer2 = nn.Conv2d(fpn_in[2], fpn_in[1], 
kernel_size=1, stride=1, padding=0) 50 | self.latlayer1 = nn.Conv2d(fpn_in[1], fpn_in[0], kernel_size=1, stride=1, padding=0) 51 | 52 | self.smooth3 = nn.Conv2d(fpn_in[2], fpn_in[2], kernel_size=1, stride=1, padding=0) 53 | self.smooth2 = nn.Conv2d(fpn_in[1], fpn_in[1], kernel_size=1, stride=1, padding=0) 54 | self.smooth1 = nn.Conv2d(fpn_in[0], fpn_in[0], kernel_size=1, stride=1, padding=0) 55 | 56 | # FEM 57 | cpm_in = output_channels 58 | 59 | self.cpm3_3 = FEM(cpm_in[0]) 60 | self.cpm4_3 = FEM(cpm_in[1]) 61 | self.cpm5_3 = FEM(cpm_in[2]) 62 | self.cpm7 = FEM(cpm_in[3]) 63 | self.cpm6_2 = FEM(cpm_in[4]) 64 | self.cpm7_2 = FEM(cpm_in[5]) 65 | 66 | # head 67 | head = pa_multibox(output_channels) 68 | self.loc = nn.ModuleList(head[0]) 69 | self.conf = nn.ModuleList(head[1]) 70 | 71 | self.softmax = nn.Softmax(dim=-1) 72 | 73 | if self.phase != 'onnx_export': 74 | self.detect = Detect(self.num_classes, 0, cfg['num_thresh'], nms_conf_thresh, nms_thresh, 75 | cfg['variance']) 76 | self.last_image_size = None 77 | self.last_feature_maps = None 78 | 79 | if self.phase == 'test': 80 | self.test_transform = TestBaseTransform((104, 117, 123)) 81 | 82 | def forward(self, x): 83 | 84 | image_size = [x.shape[2], x.shape[3]] 85 | loc = list() 86 | conf = list() 87 | 88 | conv3_3_x = self.layer1(x) 89 | conv4_3_x = self.layer2(conv3_3_x) 90 | conv5_3_x = self.layer3(conv4_3_x) 91 | fc7_x = self.layer4(conv5_3_x) 92 | conv6_2_x = self.layer5(fc7_x) 93 | conv7_2_x = self.layer6(conv6_2_x) 94 | 95 | lfpn3 = upsample_product(self.latlayer3(fc7_x), self.smooth3(conv5_3_x)) 96 | lfpn2 = upsample_product(self.latlayer2(lfpn3), self.smooth2(conv4_3_x)) 97 | lfpn1 = upsample_product(self.latlayer1(lfpn2), self.smooth1(conv3_3_x)) 98 | 99 | conv5_3_x = lfpn3 100 | conv4_3_x = lfpn2 101 | conv3_3_x = lfpn1 102 | 103 | sources = [conv3_3_x, conv4_3_x, conv5_3_x, fc7_x, conv6_2_x, conv7_2_x] 104 | 105 | sources[0] = self.cpm3_3(sources[0]) 106 | sources[1] = self.cpm4_3(sources[1]) 107 | sources[2] = self.cpm5_3(sources[2]) 108 | sources[3] = self.cpm7(sources[3]) 109 | sources[4] = self.cpm6_2(sources[4]) 110 | sources[5] = self.cpm7_2(sources[5]) 111 | 112 | # apply multibox head to source layers 113 | featuremap_size = [] 114 | for (x, l, c) in zip(sources, self.loc, self.conf): 115 | featuremap_size.append([x.shape[2], x.shape[3]]) 116 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 117 | len_conf = len(conf) 118 | cls = mio_module(c(x), len_conf) 119 | conf.append(cls.permute(0, 2, 3, 1).contiguous()) 120 | 121 | face_loc = torch.cat([o[:, :, :, :4].contiguous().view(o.size(0), -1) for o in loc], 1) 122 | face_loc = face_loc.view(face_loc.size(0), -1, 4) 123 | face_conf = torch.cat([o[:, :, :, :2].contiguous().view(o.size(0), -1) for o in conf], 1) 124 | face_conf = self.softmax(face_conf.view(face_conf.size(0), -1, self.num_classes)) 125 | 126 | if self.phase != 'onnx_export': 127 | 128 | if self.last_image_size is None or self.last_image_size != image_size or self.last_feature_maps != featuremap_size: 129 | self.priors = get_prior_boxes(self.cfg, featuremap_size, image_size).to(face_loc.device) 130 | self.last_image_size = image_size 131 | self.last_feature_maps = featuremap_size 132 | with torch.no_grad(): 133 | output = self.detect(face_loc, face_conf, self.priors) 134 | else: 135 | output = torch.cat((face_loc, face_conf), 2) 136 | return output 137 | 138 | def detect_on_image(self, source_image, target_size, device, is_pad=False, keep_thresh=0.3): 139 | 140 | image, shift_h_scaled, shift_w_scaled, 
scale = resize_image(source_image, target_size, is_pad=is_pad) 141 | 142 | x = torch.from_numpy(self.test_transform(image)).permute(2, 0, 1).to(device) 143 | x.unsqueeze_(0) 144 | 145 | detections = self.forward(x).cpu().numpy() 146 | 147 | scores = detections[0, 1, :, 0] 148 | keep_idxs = scores > keep_thresh # find keeping indexes 149 | detections = detections[0, 1, keep_idxs, :] # select detections over threshold 150 | detections = detections[:, [1, 2, 3, 4, 0]] # reorder 151 | 152 | detections[:, [0, 2]] -= shift_w_scaled # 0 or pad percent from left corner 153 | detections[:, [1, 3]] -= shift_h_scaled # 0 or pad percent from top 154 | detections[:, :4] *= scale 155 | 156 | return detections 157 | -------------------------------------------------------------------------------- /dsfacedetector/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox, get_prior_boxes 3 | from .modules import FEM, pa_multibox, mio_module, upsample_product 4 | -------------------------------------------------------------------------------- /dsfacedetector/layers/detection.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class Detect(nn.Module): 7 | """At test time, Detect is the final layer of SSD. Decode location preds, 8 | apply non-maximum suppression to location predictions based on conf 9 | scores and threshold to a top_k number of output predictions for both 10 | confidence score and locations. 11 | """ 12 | 13 | def __init__(self, num_classes, bkg_label, top_k, conf_thresh, nms_thresh, variance=(0.1, 0.2)): 14 | super(Detect, self).__init__() 15 | self.num_classes = num_classes 16 | self.background_label = bkg_label 17 | self.top_k = top_k 18 | # Parameters used in nms. 19 | self.nms_thresh = nms_thresh 20 | if nms_thresh <= 0: 21 | raise ValueError('nms_threshold must be non negative.') 22 | self.conf_thresh = conf_thresh 23 | self.variance = variance 24 | 25 | def forward(self, loc_data, conf_data, prior_data): 26 | """ 27 | Args: 28 | loc_data: (tensor) Loc preds from loc layers 29 | Shape: [batch,num_priors*4] 30 | conf_data: (tensor) Shape: Conf preds from conf layers 31 | Shape: [batch*num_priors,num_classes] 32 | prior_data: (tensor) Prior boxes and variances from priorbox layers 33 | Shape: [1,num_priors,4] 34 | """ 35 | num = loc_data.size(0) # batch size 36 | num_priors = prior_data.size(0) 37 | 38 | output = torch.zeros(num, self.num_classes, self.top_k, 5) 39 | conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1) 40 | 41 | # Decode predictions into bboxes. 
42 | for i in range(num): 43 | default = prior_data 44 | decoded_boxes = decode(loc_data[i], default, self.variance) 45 | # For each class, perform nms 46 | conf_scores = conf_preds[i].clone() 47 | 48 | for cl in range(1, self.num_classes): 49 | c_mask = conf_scores[cl].gt(self.conf_thresh) 50 | scores = conf_scores[cl][c_mask] 51 | if scores.dim() == 0 or scores.size(0) == 0: 52 | continue 53 | l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes) 54 | boxes = decoded_boxes[l_mask].view(-1, 4) 55 | # idx of highest scoring and non-overlapping boxes per class 56 | ids, count = nms(boxes, scores, self.nms_thresh, self.top_k) 57 | output[i, cl, :count] = \ 58 | torch.cat((scores[ids[:count]].unsqueeze(1), 59 | boxes[ids[:count]]), 1) 60 | flt = output.contiguous().view(num, -1, 5) 61 | _, idx = flt[:, :, 0].sort(1, descending=True) 62 | _, rank = idx.sort(1) 63 | flt[(rank < self.top_k).unsqueeze(-1).expand_as(flt)].fill_(0) 64 | return output 65 | 66 | 67 | # Adapted from https://github.com/Hakuyume/chainer-ssd 68 | def decode(loc, priors, variances): 69 | """Decode locations from predictions using priors to undo 70 | the encoding we did for offset regression at train time. 71 | Args: 72 | loc (tensor): location predictions for loc layers, 73 | Shape: [num_priors,4] 74 | priors (tensor): Prior boxes in center-offset form. 75 | Shape: [num_priors,4]. 76 | variances: (list[float]) Variances of priorboxes 77 | Return: 78 | decoded bounding box predictions 79 | """ 80 | 81 | boxes = torch.cat(( 82 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 83 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 84 | boxes[:, :2] -= boxes[:, 2:] / 2 85 | boxes[:, 2:] += boxes[:, :2] 86 | # (cx,cy,w,h)->(x0,y0,x1,y1) 87 | return boxes 88 | 89 | 90 | # Original author: Francisco Massa: 91 | # https://github.com/fmassa/object-detection.torch 92 | # Ported to PyTorch by Max deGroot (02/01/2017) 93 | def nms(boxes, scores, overlap=0.5, top_k=200): 94 | """Apply non-maximum suppression at test time to avoid detecting too many 95 | overlapping bounding boxes for a given object. 96 | Args: 97 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 98 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 99 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 100 | top_k: (int) The Maximum number of box preds to consider. 101 | Return: 102 | The indices of the kept boxes with respect to num_priors. 
103 | """ 104 | 105 | keep = scores.new(scores.size(0)).zero_().long() 106 | if boxes.numel() == 0: 107 | return keep 108 | x1 = boxes[:, 0] 109 | y1 = boxes[:, 1] 110 | x2 = boxes[:, 2] 111 | y2 = boxes[:, 3] 112 | area = torch.mul(x2 - x1, y2 - y1) 113 | v, idx = scores.sort(0) # sort in ascending order 114 | # I = I[v >= 0.01] 115 | idx = idx[-top_k:] # indices of the top-k largest vals 116 | xx1 = boxes.new() 117 | yy1 = boxes.new() 118 | xx2 = boxes.new() 119 | yy2 = boxes.new() 120 | w = boxes.new() 121 | h = boxes.new() 122 | 123 | # keep = torch.Tensor() 124 | count = 0 125 | while idx.numel() > 0: 126 | i = idx[-1] # index of current largest val 127 | # keep.append(i) 128 | keep[count] = i 129 | count += 1 130 | if idx.size(0) == 1: 131 | break 132 | idx = idx[:-1] # remove kept element from view 133 | # load bboxes of next highest vals 134 | torch.index_select(x1, 0, idx, out=xx1) 135 | torch.index_select(y1, 0, idx, out=yy1) 136 | torch.index_select(x2, 0, idx, out=xx2) 137 | torch.index_select(y2, 0, idx, out=yy2) 138 | # store element-wise max with next highest score 139 | xx1 = torch.clamp(xx1, min=x1[i]) 140 | yy1 = torch.clamp(yy1, min=y1[i]) 141 | xx2 = torch.clamp(xx2, max=x2[i]) 142 | yy2 = torch.clamp(yy2, max=y2[i]) 143 | w.resize_as_(xx2) 144 | h.resize_as_(yy2) 145 | w = xx2 - xx1 146 | h = yy2 - yy1 147 | # check sizes of xx1 and xx2.. after each iteration 148 | w = torch.clamp(w, min=0.0) 149 | h = torch.clamp(h, min=0.0) 150 | inter = w * h 151 | # IoU = i / (area(a) + area(b) - i) 152 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 153 | union = (rem_areas - inter) + area[i] 154 | IoU = inter / union # store result in iou 155 | # keep only elements with an IoU <= overlap 156 | idx = idx[IoU.le(overlap)] 157 | return keep, count 158 | -------------------------------------------------------------------------------- /dsfacedetector/layers/modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class DeepHeadModule(nn.Module): 7 | def __init__(self, input_channels, output_channels): 8 | super(DeepHeadModule, self).__init__() 9 | self._input_channels = input_channels 10 | self._output_channels = output_channels 11 | self._mid_channels = min(self._input_channels, 256) 12 | 13 | self.conv1 = nn.Conv2d(self._input_channels, self._mid_channels, kernel_size=3, dilation=1, stride=1, padding=1) 14 | self.conv2 = nn.Conv2d(self._mid_channels, self._mid_channels, kernel_size=3, dilation=1, stride=1, padding=1) 15 | self.conv3 = nn.Conv2d(self._mid_channels, self._mid_channels, kernel_size=3, dilation=1, stride=1, padding=1) 16 | self.conv4 = nn.Conv2d(self._mid_channels, self._output_channels, kernel_size=1, dilation=1, stride=1, 17 | padding=0) 18 | 19 | def forward(self, x): 20 | return self.conv4( 21 | F.relu(self.conv3(F.relu(self.conv2(F.relu(self.conv1(x), inplace=True)), inplace=True)), inplace=True)) 22 | 23 | 24 | class FEM(nn.Module): 25 | def __init__(self, channel_size): 26 | super(FEM, self).__init__() 27 | self.cs = channel_size 28 | self.cpm1 = nn.Conv2d(self.cs, 256, kernel_size=3, dilation=1, stride=1, padding=1) 29 | self.cpm2 = nn.Conv2d(self.cs, 256, kernel_size=3, dilation=2, stride=1, padding=2) 30 | self.cpm3 = nn.Conv2d(256, 128, kernel_size=3, dilation=1, stride=1, padding=1) 31 | self.cpm4 = nn.Conv2d(256, 128, kernel_size=3, dilation=2, stride=1, padding=2) 32 | self.cpm5 = nn.Conv2d(128, 128, 
kernel_size=3, dilation=1, stride=1, padding=1) 33 | 34 | def forward(self, x): 35 | x1_1 = F.relu(self.cpm1(x), inplace=True) 36 | x1_2 = F.relu(self.cpm2(x), inplace=True) 37 | x2_1 = F.relu(self.cpm3(x1_2), inplace=True) 38 | x2_2 = F.relu(self.cpm4(x1_2), inplace=True) 39 | x3_1 = F.relu(self.cpm5(x2_2), inplace=True) 40 | return torch.cat((x1_1, x2_1, x3_1), 1) 41 | 42 | 43 | def upsample_product(x, y): 44 | '''Upsample and add two feature maps. 45 | Args: 46 | x: (Variable) top feature map to be upsampled. 47 | y: (Variable) lateral feature map. 48 | Returns: 49 | (Variable) added feature map. 50 | Note in PyTorch, when input size is odd, the upsampled feature map 51 | with `F.upsample(..., scale_factor=2, mode='nearest')` 52 | maybe not equal to the lateral feature map size. 53 | e.g. 54 | original input size: [N,_,15,15] -> 55 | conv2d feature map size: [N,_,8,8] -> 56 | upsampled feature map size: [N,_,16,16] 57 | So we choose bilinear upsample which supports arbitrary output sizes. 58 | ''' 59 | _, _, H, W = y.size() 60 | 61 | # FOR ONNX CONVERSION 62 | # return F.interpolate(x, scale_factor=2, mode='nearest') * y 63 | return F.interpolate(x, size=(int(H), int(W)), mode='bilinear', align_corners=False) * y 64 | 65 | 66 | def pa_multibox(output_channels): 67 | loc_layers = [] 68 | conf_layers = [] 69 | for k, v in enumerate(output_channels): 70 | if k == 0: 71 | loc_output = 4 72 | conf_output = 2 73 | elif k == 1: 74 | loc_output = 8 75 | conf_output = 4 76 | else: 77 | loc_output = 12 78 | conf_output = 6 79 | loc_layers += [DeepHeadModule(512, loc_output)] 80 | conf_layers += [DeepHeadModule(512, (2 + conf_output))] 81 | return (loc_layers, conf_layers) 82 | 83 | 84 | def mio_module(each_mmbox, len_conf, your_mind_state='peasant'): 85 | # chunk = torch.split(each_mmbox, 1, 1) - !!!!! failed to export on PyTorch v1.0.1 (ONNX version 1.3) 86 | chunk = torch.chunk(each_mmbox, int(each_mmbox.shape[1]), 1) 87 | 88 | # some hacks for ONNX and Inference Engine export 89 | if your_mind_state == 'peasant': 90 | bmax = torch.max(torch.max(chunk[0], chunk[1]), chunk[2]) 91 | elif your_mind_state == 'advanced': 92 | bmax = torch.max(each_mmbox[:, :3], 1)[0].unsqueeze(0) 93 | else: # supermind 94 | bmax = torch.nn.functional.max_pool3d(each_mmbox[:, :3], kernel_size=(3, 1, 1)) 95 | 96 | cls = (torch.cat((bmax, chunk[3]), dim=1) if len_conf == 0 else torch.cat((chunk[3], bmax), dim=1)) 97 | cls = torch.cat((cls, *list(chunk[4:])), dim=1) 98 | return cls -------------------------------------------------------------------------------- /dsfacedetector/layers/prior_box.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from math import sqrt as sqrt 3 | import torch 4 | 5 | 6 | class PriorBox(object): 7 | """Compute priorbox coordinates in center-offset form for each source 8 | feature map. 
9 | """ 10 | 11 | def __init__(self, cfg, min_size, max_size): 12 | super(PriorBox, self).__init__() 13 | self.image_size = cfg['min_dim'] 14 | self.feature_maps = cfg['feature_maps'] 15 | 16 | self.variance = cfg['variance'] or [0.1] 17 | self.min_sizes = min_size 18 | self.max_sizes = max_size 19 | self.steps = cfg['steps'] 20 | self.aspect_ratios = cfg['aspect_ratios'] 21 | self.clip = cfg['clip'] 22 | 23 | for v in self.variance: 24 | if v <= 0: 25 | raise ValueError('Variances must be greater than 0') 26 | 27 | def forward(self): 28 | 29 | mean = [] 30 | 31 | if len(self.min_sizes) == 5: 32 | self.feature_maps = self.feature_maps[1:] 33 | self.steps = self.steps[1:] 34 | if len(self.min_sizes) == 4: 35 | self.feature_maps = self.feature_maps[2:] 36 | self.steps = self.steps[2:] 37 | 38 | for k, f in enumerate(self.feature_maps): 39 | # for i, j in product(range(f), repeat=2): 40 | for i in range(f[0]): 41 | for j in range(f[1]): 42 | # f_k = self.image_size / self.steps[k] 43 | f_k_i = self.image_size[0] / self.steps[k] 44 | f_k_j = self.image_size[1] / self.steps[k] 45 | # unit center x,y 46 | cx = (j + 0.5) / f_k_j 47 | cy = (i + 0.5) / f_k_i 48 | # aspect_ratio: 1 49 | # rel size: min_size 50 | s_k_i = self.min_sizes[k] / self.image_size[1] 51 | s_k_j = self.min_sizes[k] / self.image_size[0] 52 | # swordli@tencent 53 | if len(self.aspect_ratios[0]) == 0: 54 | mean += [cx, cy, s_k_i, s_k_j] 55 | 56 | # aspect_ratio: 1 57 | # rel size: sqrt(s_k * s_(k+1)) 58 | # s_k_prime = sqrt(s_k * (self.max_sizes[k]/self.image_size)) 59 | if len(self.max_sizes) == len(self.min_sizes): 60 | s_k_prime_i = sqrt(s_k_i * (self.max_sizes[k] / self.image_size[1])) 61 | s_k_prime_j = sqrt(s_k_j * (self.max_sizes[k] / self.image_size[0])) 62 | mean += [cx, cy, s_k_prime_i, s_k_prime_j] 63 | # rest of aspect ratios 64 | for ar in self.aspect_ratios[k]: 65 | if len(self.max_sizes) == len(self.min_sizes): 66 | mean += [cx, cy, s_k_prime_i / sqrt(ar), s_k_prime_j * sqrt(ar)] 67 | mean += [cx, cy, s_k_i / sqrt(ar), s_k_j * sqrt(ar)] 68 | 69 | # back to torch land 70 | output = torch.Tensor(mean).view(-1, 4) 71 | if self.clip: 72 | output.clamp_(max=1, min=0) 73 | return output 74 | 75 | 76 | def get_prior_boxes(cfg, feature_maps, image_size): 77 | 78 | # number of priors for feature map location (either 4 or 6) 79 | variance = cfg['variance'] or [0.1] 80 | min_sizes = cfg['min_sizes'] 81 | max_sizes = cfg['max_sizes'] 82 | steps = cfg['steps'] 83 | aspect_ratios = cfg['aspect_ratios'] 84 | clip = cfg['clip'] 85 | for v in variance: 86 | if v <= 0: 87 | raise ValueError('Variances must be greater than 0') 88 | 89 | mean = [] 90 | 91 | if len(min_sizes) == 5: 92 | feature_maps = feature_maps[1:] 93 | steps = steps[1:] 94 | if len(min_sizes) == 4: 95 | feature_maps = feature_maps[2:] 96 | steps = steps[2:] 97 | 98 | for k, f in enumerate(feature_maps): 99 | # for i, j in product(range(f), repeat=2): 100 | for i in range(f[0]): 101 | for j in range(f[1]): 102 | # f_k = image_size / steps[k] 103 | f_k_i = image_size[0] / steps[k] 104 | f_k_j = image_size[1] / steps[k] 105 | # unit center x,y 106 | cx = (j + 0.5) / f_k_j 107 | cy = (i + 0.5) / f_k_i 108 | # aspect_ratio: 1 109 | # rel size: min_size 110 | s_k_i = min_sizes[k] / image_size[1] 111 | s_k_j = min_sizes[k] / image_size[0] 112 | # swordli@tencent 113 | if len(aspect_ratios[0]) == 0: 114 | mean += [cx, cy, s_k_i, s_k_j] 115 | 116 | # aspect_ratio: 1 117 | # rel size: sqrt(s_k * s_(k+1)) 118 | # s_k_prime = sqrt(s_k * (max_sizes[k]/image_size)) 119 | if 
len(max_sizes) == len(min_sizes): 120 | s_k_prime_i = sqrt(s_k_i * (max_sizes[k] / image_size[1])) 121 | s_k_prime_j = sqrt(s_k_j * (max_sizes[k] / image_size[0])) 122 | mean += [cx, cy, s_k_prime_i, s_k_prime_j] 123 | # rest of aspect ratios 124 | for ar in aspect_ratios[k]: 125 | if len(max_sizes) == len(min_sizes): 126 | mean += [cx, cy, s_k_prime_i / sqrt(ar), s_k_prime_j * sqrt(ar)] 127 | mean += [cx, cy, s_k_i / sqrt(ar), s_k_j * sqrt(ar)] 128 | 129 | # back to torch land 130 | output = torch.Tensor(mean).view(-1, 4) 131 | if clip: 132 | output.clamp_(max=1, min=0) 133 | return output 134 | -------------------------------------------------------------------------------- /dsfacedetector/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def vis_detections(im, dets, thresh=0.5, show_text=True): 7 | """Draw detected bounding boxes.""" 8 | class_name = 'face' 9 | inds = np.where(dets[:, -1] >= thresh)[0] if dets is not None else [] 10 | if len(inds) == 0: 11 | return 12 | im = im[:, :, (2, 1, 0)] 13 | fig, ax = plt.subplots(figsize=(12, 12)) 14 | ax.imshow(im, aspect='equal') 15 | for i in inds: 16 | bbox = dets[i, :4] 17 | score = dets[i, -1] 18 | ax.add_patch( 19 | plt.Rectangle((bbox[0], bbox[1]), 20 | bbox[2] - bbox[0], 21 | bbox[3] - bbox[1], fill=False, 22 | edgecolor='red', linewidth=2.5) 23 | ) 24 | if show_text: 25 | ax.text(bbox[0], bbox[1] - 5, 26 | '{:s} {:.3f}'.format(class_name, score), 27 | bbox=dict(facecolor='blue', alpha=0.5), 28 | fontsize=10, color='white') 29 | ax.set_title(('{} detections with ' 30 | 'p({} | box) >= {:.1f}').format(class_name, class_name, 31 | thresh), 32 | fontsize=10) 33 | plt.axis('off') 34 | plt.tight_layout() 35 | plt.savefig('out.png') 36 | plt.show() 37 | 38 | 39 | def bbox_vote(det): 40 | order = det[:, 4].ravel().argsort()[::-1] 41 | det = det[order, :] 42 | dets = None 43 | while det.shape[0] > 0: 44 | # IOU 45 | area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1) 46 | xx1 = np.maximum(det[0, 0], det[:, 0]) 47 | yy1 = np.maximum(det[0, 1], det[:, 1]) 48 | xx2 = np.minimum(det[0, 2], det[:, 2]) 49 | yy2 = np.minimum(det[0, 3], det[:, 3]) 50 | w = np.maximum(0.0, xx2 - xx1 + 1) 51 | h = np.maximum(0.0, yy2 - yy1 + 1) 52 | inter = w * h 53 | o = inter / (area[0] + area[:] - inter) 54 | # get needed merge det and delete these det 55 | merge_index = np.where(o >= 0.3)[0] 56 | det_accu = det[merge_index, :] 57 | det = np.delete(det, merge_index, 0) 58 | if merge_index.shape[0] <= 1: 59 | continue 60 | det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4)) 61 | max_score = np.max(det_accu[:, 4]) 62 | det_accu_sum = np.zeros((1, 5)) 63 | det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4], axis=0) / np.sum(det_accu[:, -1:]) 64 | det_accu_sum[:, 4] = max_score 65 | try: 66 | dets = np.row_stack((dets, det_accu_sum)) 67 | except: 68 | dets = det_accu_sum 69 | if dets is not None: 70 | dets = dets[0:750, :] 71 | return dets 72 | 73 | 74 | def add_borders(curr_img, target_shape=(224, 224), fill_type=0): 75 | curr_h, curr_w = curr_img.shape[0:2] 76 | shift_h = max(target_shape[0] - curr_h, 0) 77 | shift_w = max(target_shape[1] - curr_w, 0) 78 | 79 | image = cv2.copyMakeBorder(curr_img, shift_h // 2, (shift_h + 1) // 2, shift_w // 2, (shift_w + 1) // 2, fill_type) 80 | return image, shift_h, shift_w 81 | 82 | 83 | def resize_image(image, target_size, resize_factor=None, is_pad=True, interpolation=3): 84 | 
curr_image_size = image.shape[0:2] 85 | 86 | if resize_factor is None and is_pad: 87 | resize_factor = min(target_size[0] / curr_image_size[0], target_size[1] / curr_image_size[1]) 88 | elif resize_factor is None and not is_pad: 89 | resize_factor = np.sqrt((target_size[0] * target_size[1]) / (curr_image_size[0] * curr_image_size[1])) 90 | 91 | image = cv2.resize(image, None, None, fx=resize_factor, fy=resize_factor, interpolation=interpolation) 92 | 93 | if is_pad: 94 | image, shift_h, shift_w = add_borders(image, target_size) 95 | else: 96 | shift_h = shift_w = 0 97 | 98 | scale = np.array([image.shape[1]/resize_factor, image.shape[0]/resize_factor, 99 | image.shape[1]/resize_factor, image.shape[0]/resize_factor]) 100 | 101 | return image, shift_h/image.shape[0]/2, shift_w/image.shape[1]/2, scale -------------------------------------------------------------------------------- /external_data/convert_tf_to_pt.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/lukemelas/EfficientNet-PyTorch 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | import torch 6 | 7 | def load_param(checkpoint_file, conversion_table, model_name): 8 | """ 9 | Load parameters according to conversion_table. 10 | 11 | Args: 12 | checkpoint_file (string): pretrained checkpoint model file in tensorflow 13 | conversion_table (dict): { pytorch tensor in a model : checkpoint variable name } 14 | """ 15 | for pyt_param, tf_param_name in conversion_table.items(): 16 | tf_param_name = str(model_name) + '/' + tf_param_name 17 | tf_param = tf.train.load_variable(checkpoint_file, tf_param_name) 18 | if 'conv' in tf_param_name and 'kernel' in tf_param_name: 19 | tf_param = np.transpose(tf_param, (3, 2, 0, 1)) 20 | if 'depthwise' in tf_param_name: 21 | tf_param = np.transpose(tf_param, (1, 0, 2, 3)) 22 | elif tf_param_name.endswith('kernel'): # for weight(kernel), we should do transpose 23 | tf_param = np.transpose(tf_param) 24 | assert pyt_param.size() == tf_param.shape, \ 25 | 'Dim Mismatch: %s vs %s ; %s' % (tuple(pyt_param.size()), tf_param.shape, tf_param_name) 26 | pyt_param.data = torch.from_numpy(tf_param) 27 | 28 | 29 | def load_efficientnet(model, checkpoint_file, model_name): 30 | """ 31 | Load PyTorch EfficientNet from TensorFlow checkpoint file 32 | """ 33 | 34 | # This will store the enire conversion table 35 | conversion_table = {} 36 | merge = lambda dict1, dict2: {**dict1, **dict2} 37 | 38 | # All the weights not in the conv blocks 39 | conversion_table_for_weights_outside_blocks = { 40 | model._conv_stem.weight: 'stem/conv2d/kernel', # [3, 3, 3, 32]), 41 | model._bn0.bias: 'stem/tpu_batch_normalization/beta', # [32]), 42 | model._bn0.weight: 'stem/tpu_batch_normalization/gamma', # [32]), 43 | model._bn0.running_mean: 'stem/tpu_batch_normalization/moving_mean', # [32]), 44 | model._bn0.running_var: 'stem/tpu_batch_normalization/moving_variance', # [32]), 45 | model._conv_head.weight: 'head/conv2d/kernel', # [1, 1, 320, 1280]), 46 | model._bn1.bias: 'head/tpu_batch_normalization/beta', # [1280]), 47 | model._bn1.weight: 'head/tpu_batch_normalization/gamma', # [1280]), 48 | model._bn1.running_mean: 'head/tpu_batch_normalization/moving_mean', # [32]), 49 | model._bn1.running_var: 'head/tpu_batch_normalization/moving_variance', # [32]), 50 | model._fc.bias: 'head/dense/bias', # [1000]), 51 | model._fc.weight: 'head/dense/kernel', # [1280, 1000]), 52 | } 53 | conversion_table = merge(conversion_table, conversion_table_for_weights_outside_blocks) 
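    # Illustrative layout check (hedged sketch, kept commented out; shapes taken
    # from the comments above): load_param() assumes TF conv kernels are stored as
    # [kh, kw, in, out] (HWIO) versus PyTorch's [out, in, kh, kw] (OIHW), depthwise
    # kernels as [kh, kw, in, multiplier], and dense kernels as [in, out] versus
    # PyTorch's [out, in].
    #
    #   import numpy as np
    #   stem = np.zeros((3, 3, 3, 32))                        # TF HWIO
    #   assert np.transpose(stem, (3, 2, 0, 1)).shape == (32, 3, 3, 3)
    #   dw = np.zeros((3, 3, 32, 1))                          # TF depthwise kernel
    #   dw = np.transpose(np.transpose(dw, (3, 2, 0, 1)), (1, 0, 2, 3))
    #   assert dw.shape == (32, 1, 3, 3)                      # PyTorch, groups=32
    #   fc = np.zeros((1280, 1000))                           # TF dense [in, out]
    #   assert np.transpose(fc).shape == (1000, 1280)         # PyTorch [out, in]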
54 | 55 | # The first conv block is special because it does not have _expand_conv 56 | conversion_table_for_first_block = { 57 | model._blocks[0]._project_conv.weight: 'blocks_0/conv2d/kernel', # 1, 1, 32, 16]), 58 | model._blocks[0]._depthwise_conv.weight: 'blocks_0/depthwise_conv2d/depthwise_kernel', # [3, 3, 32, 1]), 59 | model._blocks[0]._se_reduce.bias: 'blocks_0/se/conv2d/bias', # , [8]), 60 | model._blocks[0]._se_reduce.weight: 'blocks_0/se/conv2d/kernel', # , [1, 1, 32, 8]), 61 | model._blocks[0]._se_expand.bias: 'blocks_0/se/conv2d_1/bias', # , [32]), 62 | model._blocks[0]._se_expand.weight: 'blocks_0/se/conv2d_1/kernel', # , [1, 1, 8, 32]), 63 | model._blocks[0]._bn1.bias: 'blocks_0/tpu_batch_normalization/beta', # [32]), 64 | model._blocks[0]._bn1.weight: 'blocks_0/tpu_batch_normalization/gamma', # [32]), 65 | model._blocks[0]._bn1.running_mean: 'blocks_0/tpu_batch_normalization/moving_mean', 66 | model._blocks[0]._bn1.running_var: 'blocks_0/tpu_batch_normalization/moving_variance', 67 | model._blocks[0]._bn2.bias: 'blocks_0/tpu_batch_normalization_1/beta', # [16]), 68 | model._blocks[0]._bn2.weight: 'blocks_0/tpu_batch_normalization_1/gamma', # [16]), 69 | model._blocks[0]._bn2.running_mean: 'blocks_0/tpu_batch_normalization_1/moving_mean', 70 | model._blocks[0]._bn2.running_var: 'blocks_0/tpu_batch_normalization_1/moving_variance', 71 | } 72 | conversion_table = merge(conversion_table, conversion_table_for_first_block) 73 | 74 | # Conv blocks 75 | for i in range(len(model._blocks)): 76 | 77 | is_first_block = '_expand_conv.weight' not in [n for n, p in model._blocks[i].named_parameters()] 78 | 79 | if is_first_block: 80 | conversion_table_block = { 81 | model._blocks[i]._project_conv.weight: 'blocks_' + str(i) + '/conv2d/kernel', # 1, 1, 32, 16]), 82 | model._blocks[i]._depthwise_conv.weight: 'blocks_' + str(i) + '/depthwise_conv2d/depthwise_kernel', 83 | # [3, 3, 32, 1]), 84 | model._blocks[i]._se_reduce.bias: 'blocks_' + str(i) + '/se/conv2d/bias', # , [8]), 85 | model._blocks[i]._se_reduce.weight: 'blocks_' + str(i) + '/se/conv2d/kernel', # , [1, 1, 32, 8]), 86 | model._blocks[i]._se_expand.bias: 'blocks_' + str(i) + '/se/conv2d_1/bias', # , [32]), 87 | model._blocks[i]._se_expand.weight: 'blocks_' + str(i) + '/se/conv2d_1/kernel', # , [1, 1, 8, 32]), 88 | model._blocks[i]._bn1.bias: 'blocks_' + str(i) + '/tpu_batch_normalization/beta', # [32]), 89 | model._blocks[i]._bn1.weight: 'blocks_' + str(i) + '/tpu_batch_normalization/gamma', # [32]), 90 | model._blocks[i]._bn1.running_mean: 'blocks_' + str(i) + '/tpu_batch_normalization/moving_mean', 91 | model._blocks[i]._bn1.running_var: 'blocks_' + str(i) + '/tpu_batch_normalization/moving_variance', 92 | model._blocks[i]._bn2.bias: 'blocks_' + str(i) + '/tpu_batch_normalization_1/beta', # [16]), 93 | model._blocks[i]._bn2.weight: 'blocks_' + str(i) + '/tpu_batch_normalization_1/gamma', # [16]), 94 | model._blocks[i]._bn2.running_mean: 'blocks_' + str(i) + '/tpu_batch_normalization_1/moving_mean', 95 | model._blocks[i]._bn2.running_var: 'blocks_' + str(i) + '/tpu_batch_normalization_1/moving_variance', 96 | } 97 | 98 | else: 99 | conversion_table_block = { 100 | model._blocks[i]._expand_conv.weight: 'blocks_' + str(i) + '/conv2d/kernel', 101 | model._blocks[i]._project_conv.weight: 'blocks_' + str(i) + '/conv2d_1/kernel', 102 | model._blocks[i]._depthwise_conv.weight: 'blocks_' + str(i) + '/depthwise_conv2d/depthwise_kernel', 103 | model._blocks[i]._se_reduce.bias: 'blocks_' + str(i) + '/se/conv2d/bias', 104 | 
model._blocks[i]._se_reduce.weight: 'blocks_' + str(i) + '/se/conv2d/kernel', 105 | model._blocks[i]._se_expand.bias: 'blocks_' + str(i) + '/se/conv2d_1/bias', 106 | model._blocks[i]._se_expand.weight: 'blocks_' + str(i) + '/se/conv2d_1/kernel', 107 | model._blocks[i]._bn0.bias: 'blocks_' + str(i) + '/tpu_batch_normalization/beta', 108 | model._blocks[i]._bn0.weight: 'blocks_' + str(i) + '/tpu_batch_normalization/gamma', 109 | model._blocks[i]._bn0.running_mean: 'blocks_' + str(i) + '/tpu_batch_normalization/moving_mean', 110 | model._blocks[i]._bn0.running_var: 'blocks_' + str(i) + '/tpu_batch_normalization/moving_variance', 111 | model._blocks[i]._bn1.bias: 'blocks_' + str(i) + '/tpu_batch_normalization_1/beta', 112 | model._blocks[i]._bn1.weight: 'blocks_' + str(i) + '/tpu_batch_normalization_1/gamma', 113 | model._blocks[i]._bn1.running_mean: 'blocks_' + str(i) + '/tpu_batch_normalization_1/moving_mean', 114 | model._blocks[i]._bn1.running_var: 'blocks_' + str(i) + '/tpu_batch_normalization_1/moving_variance', 115 | model._blocks[i]._bn2.bias: 'blocks_' + str(i) + '/tpu_batch_normalization_2/beta', 116 | model._blocks[i]._bn2.weight: 'blocks_' + str(i) + '/tpu_batch_normalization_2/gamma', 117 | model._blocks[i]._bn2.running_mean: 'blocks_' + str(i) + '/tpu_batch_normalization_2/moving_mean', 118 | model._blocks[i]._bn2.running_var: 'blocks_' + str(i) + '/tpu_batch_normalization_2/moving_variance', 119 | } 120 | 121 | conversion_table = merge(conversion_table, conversion_table_block) 122 | 123 | # Load TensorFlow parameters into PyTorch model 124 | load_param(checkpoint_file, conversion_table, model_name) 125 | return conversion_table 126 | 127 | 128 | def load_and_save_temporary_tensorflow_model(model_name, model_ckpt, example_img= '../../example/img.jpg'): 129 | """ Loads and saves a TensorFlow model. 
""" 130 | image_files = [example_img] 131 | eval_ckpt_driver = eval_ckpt_main.EvalCkptDriver(model_name) 132 | with tf.Graph().as_default(), tf.Session() as sess: 133 | images, labels = eval_ckpt_driver.build_dataset(image_files, [0] * len(image_files), False) 134 | probs = eval_ckpt_driver.build_model(images, is_training=False) 135 | sess.run(tf.global_variables_initializer()) 136 | print(model_ckpt) 137 | eval_ckpt_driver.restore_model(sess, model_ckpt) 138 | tf.train.Saver().save(sess, 'tmp/model.ckpt') 139 | 140 | 141 | if __name__ == '__main__': 142 | 143 | import sys 144 | import argparse 145 | 146 | sys.path.append('original_tf') 147 | import eval_ckpt_main 148 | 149 | from efficientnet_pytorch import EfficientNet 150 | 151 | parser = argparse.ArgumentParser( 152 | description='Convert TF model to PyTorch model and save for easier future loading') 153 | parser.add_argument('--model_name', type=str, default='efficientnet-b0', 154 | help='efficientnet-b{N}, where N is an integer 0 <= N <= 8') 155 | parser.add_argument('--tf_checkpoint', type=str, default='pretrained_tensorflow/efficientnet-b0/', 156 | help='checkpoint file path') 157 | parser.add_argument('--output_file', type=str, default='pretrained_pytorch/efficientnet-b0.pth', 158 | help='output PyTorch model file name') 159 | args = parser.parse_args() 160 | 161 | # Build model 162 | model = EfficientNet.from_name(args.model_name) 163 | 164 | # Load and save temporary TensorFlow file due to TF nuances 165 | print(args.tf_checkpoint) 166 | load_and_save_temporary_tensorflow_model(args.model_name, args.tf_checkpoint) 167 | 168 | # Load weights 169 | load_efficientnet(model, 'tmp/model.ckpt', model_name=args.model_name) 170 | print('Loaded TF checkpoint weights') 171 | 172 | # Save PyTorch file 173 | torch.save(model.state_dict(), args.output_file) 174 | print('Saved model to', args.output_file) 175 | -------------------------------------------------------------------------------- /external_data/original_tf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NTech-Lab/deepfake-detection-challenge/52095ce4a49f298faf075a5eb28391722b9e4103/external_data/original_tf/__init__.py -------------------------------------------------------------------------------- /external_data/original_tf/efficientnet_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Model Builder for EfficientNet.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import functools 22 | import os 23 | import re 24 | from absl import logging 25 | import numpy as np 26 | import six 27 | import tensorflow.compat.v1 as tf 28 | 29 | import efficientnet_model 30 | import utils 31 | MEAN_RGB = [0.485 * 255, 0.456 * 255, 0.406 * 255] 32 | STDDEV_RGB = [0.229 * 255, 0.224 * 255, 0.225 * 255] 33 | 34 | 35 | def efficientnet_params(model_name): 36 | """Get efficientnet params based on model name.""" 37 | params_dict = { 38 | # (width_coefficient, depth_coefficient, resolution, dropout_rate) 39 | 'efficientnet-b0': (1.0, 1.0, 224, 0.2), 40 | 'efficientnet-b1': (1.0, 1.1, 240, 0.2), 41 | 'efficientnet-b2': (1.1, 1.2, 260, 0.3), 42 | 'efficientnet-b3': (1.2, 1.4, 300, 0.3), 43 | 'efficientnet-b4': (1.4, 1.8, 380, 0.4), 44 | 'efficientnet-b5': (1.6, 2.2, 456, 0.4), 45 | 'efficientnet-b6': (1.8, 2.6, 528, 0.5), 46 | 'efficientnet-b7': (2.0, 3.1, 600, 0.5), 47 | 'efficientnet-b8': (2.2, 3.6, 672, 0.5), 48 | 'efficientnet-l2': (4.3, 5.3, 800, 0.5), 49 | } 50 | return params_dict[model_name] 51 | 52 | 53 | class BlockDecoder(object): 54 | """Block Decoder for readability.""" 55 | 56 | def _decode_block_string(self, block_string): 57 | """Gets a block through a string notation of arguments.""" 58 | if six.PY2: 59 | assert isinstance(block_string, (str, unicode)) 60 | else: 61 | assert isinstance(block_string, str) 62 | ops = block_string.split('_') 63 | options = {} 64 | for op in ops: 65 | splits = re.split(r'(\d.*)', op) 66 | if len(splits) >= 2: 67 | key, value = splits[:2] 68 | options[key] = value 69 | 70 | if 's' not in options or len(options['s']) != 2: 71 | raise ValueError('Strides options should be a pair of integers.') 72 | 73 | return efficientnet_model.BlockArgs( 74 | kernel_size=int(options['k']), 75 | num_repeat=int(options['r']), 76 | input_filters=int(options['i']), 77 | output_filters=int(options['o']), 78 | expand_ratio=int(options['e']), 79 | id_skip=('noskip' not in block_string), 80 | se_ratio=float(options['se']) if 'se' in options else None, 81 | strides=[int(options['s'][0]), 82 | int(options['s'][1])], 83 | conv_type=int(options['c']) if 'c' in options else 0, 84 | fused_conv=int(options['f']) if 'f' in options else 0, 85 | super_pixel=int(options['p']) if 'p' in options else 0, 86 | condconv=('cc' in block_string)) 87 | 88 | def _encode_block_string(self, block): 89 | """Encodes a block to a string.""" 90 | args = [ 91 | 'r%d' % block.num_repeat, 92 | 'k%d' % block.kernel_size, 93 | 's%d%d' % (block.strides[0], block.strides[1]), 94 | 'e%s' % block.expand_ratio, 95 | 'i%d' % block.input_filters, 96 | 'o%d' % block.output_filters, 97 | 'c%d' % block.conv_type, 98 | 'f%d' % block.fused_conv, 99 | 'p%d' % block.super_pixel, 100 | ] 101 | if block.se_ratio > 0 and block.se_ratio <= 1: 102 | args.append('se%s' % block.se_ratio) 103 | if block.id_skip is False: # pylint: disable=g-bool-id-comparison 104 | args.append('noskip') 105 | if block.condconv: 106 | args.append('cc') 107 | return '_'.join(args) 108 | 109 | def decode(self, string_list): 110 | """Decodes a list of string notations to specify blocks inside the network. 111 | 112 | Args: 113 | string_list: a list of strings, each string is a notation of block. 
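        For example, the default block string 'r1_k3_s11_e1_i32_o16_se0.25'
        decodes to num_repeat=1, kernel_size=3, strides=[1, 1], expand_ratio=1,
        input_filters=32, output_filters=16 and se_ratio=0.25.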
114 | 115 | Returns: 116 | A list of namedtuples to represent blocks arguments. 117 | """ 118 | assert isinstance(string_list, list) 119 | blocks_args = [] 120 | for block_string in string_list: 121 | blocks_args.append(self._decode_block_string(block_string)) 122 | return blocks_args 123 | 124 | def encode(self, blocks_args): 125 | """Encodes a list of Blocks to a list of strings. 126 | 127 | Args: 128 | blocks_args: A list of namedtuples to represent blocks arguments. 129 | Returns: 130 | a list of strings, each string is a notation of block. 131 | """ 132 | block_strings = [] 133 | for block in blocks_args: 134 | block_strings.append(self._encode_block_string(block)) 135 | return block_strings 136 | 137 | 138 | def swish(features, use_native=True, use_hard=False): 139 | """Computes the Swish activation function. 140 | 141 | We provide three alternnatives: 142 | - Native tf.nn.swish, use less memory during training than composable swish. 143 | - Quantization friendly hard swish. 144 | - A composable swish, equivalant to tf.nn.swish, but more general for 145 | finetuning and TF-Hub. 146 | 147 | Args: 148 | features: A `Tensor` representing preactivation values. 149 | use_native: Whether to use the native swish from tf.nn that uses a custom 150 | gradient to reduce memory usage, or to use customized swish that uses 151 | default TensorFlow gradient computation. 152 | use_hard: Whether to use quantization-friendly hard swish. 153 | 154 | Returns: 155 | The activation value. 156 | """ 157 | if use_native and use_hard: 158 | raise ValueError('Cannot specify both use_native and use_hard.') 159 | 160 | if use_native: 161 | return tf.nn.swish(features) 162 | 163 | if use_hard: 164 | return features * tf.nn.relu6(features + np.float32(3)) * (1. / 6.) 165 | 166 | features = tf.convert_to_tensor(features, name='features') 167 | return features * tf.nn.sigmoid(features) 168 | 169 | 170 | _DEFAULT_BLOCKS_ARGS = [ 171 | 'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25', 172 | 'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25', 173 | 'r3_k5_s11_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25', 174 | 'r1_k3_s11_e6_i192_o320_se0.25', 175 | ] 176 | 177 | 178 | def efficientnet(width_coefficient=None, 179 | depth_coefficient=None, 180 | dropout_rate=0.2, 181 | survival_prob=0.8): 182 | """Creates a efficientnet model.""" 183 | global_params = efficientnet_model.GlobalParams( 184 | blocks_args=_DEFAULT_BLOCKS_ARGS, 185 | batch_norm_momentum=0.99, 186 | batch_norm_epsilon=1e-3, 187 | dropout_rate=dropout_rate, 188 | survival_prob=survival_prob, 189 | data_format='channels_last', 190 | num_classes=1000, 191 | width_coefficient=width_coefficient, 192 | depth_coefficient=depth_coefficient, 193 | depth_divisor=8, 194 | min_depth=None, 195 | relu_fn=tf.nn.swish, 196 | # The default is TPU-specific batch norm. 197 | # The alternative is tf.layers.BatchNormalization. 198 | batch_norm=utils.TpuBatchNormalization, # TPU-specific requirement. 
199 | use_se=True, 200 | clip_projection_output=False) 201 | return global_params 202 | 203 | 204 | def get_model_params(model_name, override_params): 205 | """Get the block args and global params for a given model.""" 206 | if model_name.startswith('efficientnet'): 207 | width_coefficient, depth_coefficient, _, dropout_rate = ( 208 | efficientnet_params(model_name)) 209 | global_params = efficientnet( 210 | width_coefficient, depth_coefficient, dropout_rate) 211 | else: 212 | raise NotImplementedError('model name is not pre-defined: %s' % model_name) 213 | 214 | if override_params: 215 | # ValueError will be raised here if override_params has fields not included 216 | # in global_params. 217 | global_params = global_params._replace(**override_params) 218 | 219 | decoder = BlockDecoder() 220 | blocks_args = decoder.decode(global_params.blocks_args) 221 | 222 | logging.info('global_params= %s', global_params) 223 | return blocks_args, global_params 224 | 225 | 226 | def build_model(images, 227 | model_name, 228 | training, 229 | override_params=None, 230 | model_dir=None, 231 | fine_tuning=False, 232 | features_only=False, 233 | pooled_features_only=False): 234 | """A helper functiion to creates a model and returns predicted logits. 235 | 236 | Args: 237 | images: input images tensor. 238 | model_name: string, the predefined model name. 239 | training: boolean, whether the model is constructed for training. 240 | override_params: A dictionary of params for overriding. Fields must exist in 241 | efficientnet_model.GlobalParams. 242 | model_dir: string, optional model dir for saving configs. 243 | fine_tuning: boolean, whether the model is used for finetuning. 244 | features_only: build the base feature network only (excluding final 245 | 1x1 conv layer, global pooling, dropout and fc head). 246 | pooled_features_only: build the base network for features extraction (after 247 | 1x1 conv layer and global pooling, but before dropout and fc head). 248 | 249 | Returns: 250 | logits: the logits tensor of classes. 251 | endpoints: the endpoints for each layer. 252 | 253 | Raises: 254 | When model_name specified an undefined model, raises NotImplementedError. 255 | When override_params has invalid fields, raises ValueError. 256 | """ 257 | assert isinstance(images, tf.Tensor) 258 | assert not (features_only and pooled_features_only) 259 | 260 | # For backward compatibility. 
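  # survival_prob = 1 - drop_connect_rate, so a legacy
  # override_params={'drop_connect_rate': 0.2} is equivalent to
  # override_params={'survival_prob': 0.8}.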
261 | if override_params and override_params.get('drop_connect_rate', None): 262 | override_params['survival_prob'] = 1 - override_params['drop_connect_rate'] 263 | 264 | if not training or fine_tuning: 265 | if not override_params: 266 | override_params = {} 267 | override_params['batch_norm'] = utils.BatchNormalization 268 | if fine_tuning: 269 | override_params['relu_fn'] = functools.partial(swish, use_native=False) 270 | blocks_args, global_params = get_model_params(model_name, override_params) 271 | 272 | if model_dir: 273 | param_file = os.path.join(model_dir, 'model_params.txt') 274 | if not tf.gfile.Exists(param_file): 275 | if not tf.gfile.Exists(model_dir): 276 | tf.gfile.MakeDirs(model_dir) 277 | with tf.gfile.GFile(param_file, 'w') as f: 278 | logging.info('writing to %s', param_file) 279 | f.write('model_name= %s\n\n' % model_name) 280 | f.write('global_params= %s\n\n' % str(global_params)) 281 | f.write('blocks_args= %s\n\n' % str(blocks_args)) 282 | 283 | with tf.variable_scope(model_name): 284 | model = efficientnet_model.Model(blocks_args, global_params) 285 | outputs = model( 286 | images, 287 | training=training, 288 | features_only=features_only, 289 | pooled_features_only=pooled_features_only) 290 | if features_only: 291 | outputs = tf.identity(outputs, 'features') 292 | elif pooled_features_only: 293 | outputs = tf.identity(outputs, 'pooled_features') 294 | else: 295 | outputs = tf.identity(outputs, 'logits') 296 | return outputs, model.endpoints 297 | 298 | 299 | def build_model_base(images, model_name, training, override_params=None): 300 | """A helper functiion to create a base model and return global_pool. 301 | 302 | Args: 303 | images: input images tensor. 304 | model_name: string, the predefined model name. 305 | training: boolean, whether the model is constructed for training. 306 | override_params: A dictionary of params for overriding. Fields must exist in 307 | efficientnet_model.GlobalParams. 308 | 309 | Returns: 310 | features: global pool features. 311 | endpoints: the endpoints for each layer. 312 | 313 | Raises: 314 | When model_name specified an undefined model, raises NotImplementedError. 315 | When override_params has invalid fields, raises ValueError. 316 | """ 317 | assert isinstance(images, tf.Tensor) 318 | # For backward compatibility. 319 | if override_params and override_params.get('drop_connect_rate', None): 320 | override_params['survival_prob'] = 1 - override_params['drop_connect_rate'] 321 | 322 | blocks_args, global_params = get_model_params(model_name, override_params) 323 | 324 | with tf.variable_scope(model_name): 325 | model = efficientnet_model.Model(blocks_args, global_params) 326 | features = model(images, training=training, features_only=True) 327 | 328 | features = tf.identity(features, 'features') 329 | return features, model.endpoints 330 | -------------------------------------------------------------------------------- /external_data/original_tf/efficientnet_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Contains definitions for EfficientNet model. 16 | 17 | [1] Mingxing Tan, Quoc V. Le 18 | EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks. 19 | ICML'19, https://arxiv.org/abs/1905.11946 20 | """ 21 | 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | 26 | import collections 27 | import functools 28 | import math 29 | 30 | from absl import logging 31 | import numpy as np 32 | import six 33 | from six.moves import xrange 34 | import tensorflow.compat.v1 as tf 35 | 36 | import utils 37 | # from condconv import condconv_layers 38 | 39 | GlobalParams = collections.namedtuple('GlobalParams', [ 40 | 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 'data_format', 41 | 'num_classes', 'width_coefficient', 'depth_coefficient', 'depth_divisor', 42 | 'min_depth', 'survival_prob', 'relu_fn', 'batch_norm', 'use_se', 43 | 'local_pooling', 'condconv_num_experts', 'clip_projection_output', 44 | 'blocks_args' 45 | ]) 46 | GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields) 47 | 48 | BlockArgs = collections.namedtuple('BlockArgs', [ 49 | 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', 50 | 'expand_ratio', 'id_skip', 'strides', 'se_ratio', 'conv_type', 'fused_conv', 51 | 'super_pixel', 'condconv' 52 | ]) 53 | # defaults will be a public argument for namedtuple in Python 3.7 54 | # https://docs.python.org/3/library/collections.html#collections.namedtuple 55 | BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields) 56 | 57 | 58 | def conv_kernel_initializer(shape, dtype=None, partition_info=None): 59 | """Initialization for convolutional kernels. 60 | 61 | The main difference with tf.variance_scaling_initializer is that 62 | tf.variance_scaling_initializer uses a truncated normal with an uncorrected 63 | standard deviation, whereas here we use a normal distribution. Similarly, 64 | tf.initializers.variance_scaling uses a truncated normal with 65 | a corrected standard deviation. 66 | 67 | Args: 68 | shape: shape of variable 69 | dtype: dtype of variable 70 | partition_info: unused 71 | 72 | Returns: 73 | an initialization for the variable 74 | """ 75 | del partition_info 76 | kernel_height, kernel_width, _, out_filters = shape 77 | fan_out = int(kernel_height * kernel_width * out_filters) 78 | return tf.random_normal( 79 | shape, mean=0.0, stddev=np.sqrt(2.0 / fan_out), dtype=dtype) 80 | 81 | 82 | def dense_kernel_initializer(shape, dtype=None, partition_info=None): 83 | """Initialization for dense kernels. 84 | 85 | This initialization is equal to 86 | tf.variance_scaling_initializer(scale=1.0/3.0, mode='fan_out', 87 | distribution='uniform'). 88 | It is written out explicitly here for clarity. 
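  For example, a [1280, 1000] head kernel is sampled from
  U(-1/sqrt(1000), 1/sqrt(1000)).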
89 | 90 | Args: 91 | shape: shape of variable 92 | dtype: dtype of variable 93 | partition_info: unused 94 | 95 | Returns: 96 | an initialization for the variable 97 | """ 98 | del partition_info 99 | init_range = 1.0 / np.sqrt(shape[1]) 100 | return tf.random_uniform(shape, -init_range, init_range, dtype=dtype) 101 | 102 | 103 | def superpixel_kernel_initializer(shape, dtype='float32', partition_info=None): 104 | """Initializes superpixel kernels. 105 | 106 | This is inspired by space-to-depth transformation that is mathematically 107 | equivalent before and after the transformation. But we do the space-to-depth 108 | via a convolution. Moreover, we make the layer trainable instead of direct 109 | transform, we can initialization it this way so that the model can learn not 110 | to do anything but keep it mathematically equivalent, when improving 111 | performance. 112 | 113 | 114 | Args: 115 | shape: shape of variable 116 | dtype: dtype of variable 117 | partition_info: unused 118 | 119 | Returns: 120 | an initialization for the variable 121 | """ 122 | del partition_info 123 | # use input depth to make superpixel kernel. 124 | depth = shape[-2] 125 | filters = np.zeros([2, 2, depth, 4 * depth], dtype=dtype) 126 | i = np.arange(2) 127 | j = np.arange(2) 128 | k = np.arange(depth) 129 | mesh = np.array(np.meshgrid(i, j, k)).T.reshape(-1, 3).T 130 | filters[ 131 | mesh[0], 132 | mesh[1], 133 | mesh[2], 134 | 4 * mesh[2] + 2 * mesh[0] + mesh[1]] = 1 135 | return filters 136 | 137 | 138 | def round_filters(filters, global_params): 139 | """Round number of filters based on depth multiplier.""" 140 | orig_f = filters 141 | multiplier = global_params.width_coefficient 142 | divisor = global_params.depth_divisor 143 | min_depth = global_params.min_depth 144 | if not multiplier: 145 | return filters 146 | 147 | filters *= multiplier 148 | min_depth = min_depth or divisor 149 | new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor) 150 | # Make sure that round down does not go down by more than 10%. 151 | if new_filters < 0.9 * filters: 152 | new_filters += divisor 153 | logging.info('round_filter input=%s output=%s', orig_f, new_filters) 154 | return int(new_filters) 155 | 156 | 157 | def round_repeats(repeats, global_params): 158 | """Round number of filters based on depth multiplier.""" 159 | multiplier = global_params.depth_coefficient 160 | if not multiplier: 161 | return repeats 162 | return int(math.ceil(multiplier * repeats)) 163 | 164 | 165 | class MBConvBlock(tf.keras.layers.Layer): 166 | """A class of MBConv: Mobile Inverted Residual Bottleneck. 167 | 168 | Attributes: 169 | endpoints: dict. A list of internal tensors. 170 | """ 171 | 172 | def __init__(self, block_args, global_params): 173 | """Initializes a MBConv block. 174 | 175 | Args: 176 | block_args: BlockArgs, arguments to create a Block. 177 | global_params: GlobalParams, a set of global parameters. 
178 | """ 179 | super(MBConvBlock, self).__init__() 180 | self._block_args = block_args 181 | self._batch_norm_momentum = global_params.batch_norm_momentum 182 | self._batch_norm_epsilon = global_params.batch_norm_epsilon 183 | self._batch_norm = global_params.batch_norm 184 | self._condconv_num_experts = global_params.condconv_num_experts 185 | self._data_format = global_params.data_format 186 | if self._data_format == 'channels_first': 187 | self._channel_axis = 1 188 | self._spatial_dims = [2, 3] 189 | else: 190 | self._channel_axis = -1 191 | self._spatial_dims = [1, 2] 192 | 193 | self._relu_fn = global_params.relu_fn or tf.nn.swish 194 | self._has_se = ( 195 | global_params.use_se and self._block_args.se_ratio is not None and 196 | 0 < self._block_args.se_ratio <= 1) 197 | 198 | self._clip_projection_output = global_params.clip_projection_output 199 | 200 | self.endpoints = None 201 | 202 | self.conv_cls = tf.layers.Conv2D 203 | self.depthwise_conv_cls = utils.DepthwiseConv2D 204 | if self._block_args.condconv: 205 | self.conv_cls = functools.partial( 206 | condconv_layers.CondConv2D, num_experts=self._condconv_num_experts) 207 | self.depthwise_conv_cls = functools.partial( 208 | condconv_layers.DepthwiseCondConv2D, 209 | num_experts=self._condconv_num_experts) 210 | 211 | # Builds the block accordings to arguments. 212 | self._build() 213 | 214 | def block_args(self): 215 | return self._block_args 216 | 217 | def _build(self): 218 | """Builds block according to the arguments.""" 219 | if self._block_args.super_pixel == 1: 220 | self._superpixel = tf.layers.Conv2D( 221 | self._block_args.input_filters, 222 | kernel_size=[2, 2], 223 | strides=[2, 2], 224 | kernel_initializer=conv_kernel_initializer, 225 | padding='same', 226 | data_format=self._data_format, 227 | use_bias=False) 228 | self._bnsp = self._batch_norm( 229 | axis=self._channel_axis, 230 | momentum=self._batch_norm_momentum, 231 | epsilon=self._batch_norm_epsilon) 232 | 233 | if self._block_args.condconv: 234 | # Add the example-dependent routing function 235 | self._avg_pooling = tf.keras.layers.GlobalAveragePooling2D( 236 | data_format=self._data_format) 237 | self._routing_fn = tf.layers.Dense( 238 | self._condconv_num_experts, activation=tf.nn.sigmoid) 239 | 240 | filters = self._block_args.input_filters * self._block_args.expand_ratio 241 | kernel_size = self._block_args.kernel_size 242 | 243 | # Fused expansion phase. Called if using fused convolutions. 244 | self._fused_conv = self.conv_cls( 245 | filters=filters, 246 | kernel_size=[kernel_size, kernel_size], 247 | strides=self._block_args.strides, 248 | kernel_initializer=conv_kernel_initializer, 249 | padding='same', 250 | data_format=self._data_format, 251 | use_bias=False) 252 | 253 | # Expansion phase. Called if not using fused convolutions and expansion 254 | # phase is necessary. 255 | self._expand_conv = self.conv_cls( 256 | filters=filters, 257 | kernel_size=[1, 1], 258 | strides=[1, 1], 259 | kernel_initializer=conv_kernel_initializer, 260 | padding='same', 261 | data_format=self._data_format, 262 | use_bias=False) 263 | self._bn0 = self._batch_norm( 264 | axis=self._channel_axis, 265 | momentum=self._batch_norm_momentum, 266 | epsilon=self._batch_norm_epsilon) 267 | 268 | # Depth-wise convolution phase. Called if not using fused convolutions. 
269 | self._depthwise_conv = self.depthwise_conv_cls( 270 | kernel_size=[kernel_size, kernel_size], 271 | strides=self._block_args.strides, 272 | depthwise_initializer=conv_kernel_initializer, 273 | padding='same', 274 | data_format=self._data_format, 275 | use_bias=False) 276 | 277 | self._bn1 = self._batch_norm( 278 | axis=self._channel_axis, 279 | momentum=self._batch_norm_momentum, 280 | epsilon=self._batch_norm_epsilon) 281 | 282 | if self._has_se: 283 | num_reduced_filters = max( 284 | 1, int(self._block_args.input_filters * self._block_args.se_ratio)) 285 | # Squeeze and Excitation layer. 286 | self._se_reduce = tf.layers.Conv2D( 287 | num_reduced_filters, 288 | kernel_size=[1, 1], 289 | strides=[1, 1], 290 | kernel_initializer=conv_kernel_initializer, 291 | padding='same', 292 | data_format=self._data_format, 293 | use_bias=True) 294 | self._se_expand = tf.layers.Conv2D( 295 | filters, 296 | kernel_size=[1, 1], 297 | strides=[1, 1], 298 | kernel_initializer=conv_kernel_initializer, 299 | padding='same', 300 | data_format=self._data_format, 301 | use_bias=True) 302 | 303 | # Output phase. 304 | filters = self._block_args.output_filters 305 | self._project_conv = self.conv_cls( 306 | filters=filters, 307 | kernel_size=[1, 1], 308 | strides=[1, 1], 309 | kernel_initializer=conv_kernel_initializer, 310 | padding='same', 311 | data_format=self._data_format, 312 | use_bias=False) 313 | self._bn2 = self._batch_norm( 314 | axis=self._channel_axis, 315 | momentum=self._batch_norm_momentum, 316 | epsilon=self._batch_norm_epsilon) 317 | 318 | def _call_se(self, input_tensor): 319 | """Call Squeeze and Excitation layer. 320 | 321 | Args: 322 | input_tensor: Tensor, a single input tensor for Squeeze/Excitation layer. 323 | 324 | Returns: 325 | A output tensor, which should have the same shape as input. 326 | """ 327 | se_tensor = tf.reduce_mean(input_tensor, self._spatial_dims, keepdims=True) 328 | se_tensor = self._se_expand(self._relu_fn(self._se_reduce(se_tensor))) 329 | logging.info('Built Squeeze and Excitation with tensor shape: %s', 330 | (se_tensor.shape)) 331 | return tf.sigmoid(se_tensor) * input_tensor 332 | 333 | def call(self, inputs, training=True, survival_prob=None): 334 | """Implementation of call(). 335 | 336 | Args: 337 | inputs: the inputs tensor. 338 | training: boolean, whether the model is constructed for training. 339 | survival_prob: float, between 0 to 1, drop connect rate. 340 | 341 | Returns: 342 | A output tensor. 
343 | """ 344 | logging.info('Block input: %s shape: %s', inputs.name, inputs.shape) 345 | logging.info('Block input depth: %s output depth: %s', 346 | self._block_args.input_filters, 347 | self._block_args.output_filters) 348 | 349 | x = inputs 350 | 351 | fused_conv_fn = self._fused_conv 352 | expand_conv_fn = self._expand_conv 353 | depthwise_conv_fn = self._depthwise_conv 354 | project_conv_fn = self._project_conv 355 | 356 | if self._block_args.condconv: 357 | pooled_inputs = self._avg_pooling(inputs) 358 | routing_weights = self._routing_fn(pooled_inputs) 359 | # Capture routing weights as additional input to CondConv layers 360 | fused_conv_fn = functools.partial( 361 | self._fused_conv, routing_weights=routing_weights) 362 | expand_conv_fn = functools.partial( 363 | self._expand_conv, routing_weights=routing_weights) 364 | depthwise_conv_fn = functools.partial( 365 | self._depthwise_conv, routing_weights=routing_weights) 366 | project_conv_fn = functools.partial( 367 | self._project_conv, routing_weights=routing_weights) 368 | 369 | # creates conv 2x2 kernel 370 | if self._block_args.super_pixel == 1: 371 | with tf.variable_scope('super_pixel'): 372 | x = self._relu_fn( 373 | self._bnsp(self._superpixel(x), training=training)) 374 | logging.info( 375 | 'Block start with SuperPixel: %s shape: %s', x.name, x.shape) 376 | 377 | if self._block_args.fused_conv: 378 | # If use fused mbconv, skip expansion and use regular conv. 379 | x = self._relu_fn(self._bn1(fused_conv_fn(x), training=training)) 380 | logging.info('Conv2D: %s shape: %s', x.name, x.shape) 381 | else: 382 | # Otherwise, first apply expansion and then apply depthwise conv. 383 | if self._block_args.expand_ratio != 1: 384 | x = self._relu_fn(self._bn0(expand_conv_fn(x), training=training)) 385 | logging.info('Expand: %s shape: %s', x.name, x.shape) 386 | 387 | x = self._relu_fn(self._bn1(depthwise_conv_fn(x), training=training)) 388 | logging.info('DWConv: %s shape: %s', x.name, x.shape) 389 | 390 | if self._has_se: 391 | with tf.variable_scope('se'): 392 | x = self._call_se(x) 393 | 394 | self.endpoints = {'expansion_output': x} 395 | 396 | x = self._bn2(project_conv_fn(x), training=training) 397 | # Add identity so that quantization-aware training can insert quantization 398 | # ops correctly. 399 | x = tf.identity(x) 400 | if self._clip_projection_output: 401 | x = tf.clip_by_value(x, -6, 6) 402 | if self._block_args.id_skip: 403 | if all( 404 | s == 1 for s in self._block_args.strides 405 | ) and self._block_args.input_filters == self._block_args.output_filters: 406 | # Apply only if skip connection presents. 
407 | if survival_prob: 408 | x = utils.drop_connect(x, training, survival_prob) 409 | x = tf.add(x, inputs) 410 | logging.info('Project: %s shape: %s', x.name, x.shape) 411 | return x 412 | 413 | 414 | class MBConvBlockWithoutDepthwise(MBConvBlock): 415 | """MBConv-like block without depthwise convolution and squeeze-and-excite.""" 416 | 417 | def _build(self): 418 | """Builds block according to the arguments.""" 419 | filters = self._block_args.input_filters * self._block_args.expand_ratio 420 | if self._block_args.expand_ratio != 1: 421 | # Expansion phase: 422 | self._expand_conv = tf.layers.Conv2D( 423 | filters, 424 | kernel_size=[3, 3], 425 | strides=[1, 1], 426 | kernel_initializer=conv_kernel_initializer, 427 | padding='same', 428 | use_bias=False) 429 | self._bn0 = self._batch_norm( 430 | axis=self._channel_axis, 431 | momentum=self._batch_norm_momentum, 432 | epsilon=self._batch_norm_epsilon) 433 | 434 | # Output phase: 435 | filters = self._block_args.output_filters 436 | self._project_conv = tf.layers.Conv2D( 437 | filters, 438 | kernel_size=[1, 1], 439 | strides=self._block_args.strides, 440 | kernel_initializer=conv_kernel_initializer, 441 | padding='same', 442 | use_bias=False) 443 | self._bn1 = self._batch_norm( 444 | axis=self._channel_axis, 445 | momentum=self._batch_norm_momentum, 446 | epsilon=self._batch_norm_epsilon) 447 | 448 | def call(self, inputs, training=True, survival_prob=None): 449 | """Implementation of call(). 450 | 451 | Args: 452 | inputs: the inputs tensor. 453 | training: boolean, whether the model is constructed for training. 454 | survival_prob: float, between 0 to 1, drop connect rate. 455 | 456 | Returns: 457 | A output tensor. 458 | """ 459 | logging.info('Block input: %s shape: %s', inputs.name, inputs.shape) 460 | if self._block_args.expand_ratio != 1: 461 | x = self._relu_fn(self._bn0(self._expand_conv(inputs), training=training)) 462 | else: 463 | x = inputs 464 | logging.info('Expand: %s shape: %s', x.name, x.shape) 465 | 466 | self.endpoints = {'expansion_output': x} 467 | 468 | x = self._bn1(self._project_conv(x), training=training) 469 | # Add identity so that quantization-aware training can insert quantization 470 | # ops correctly. 471 | x = tf.identity(x) 472 | if self._clip_projection_output: 473 | x = tf.clip_by_value(x, -6, 6) 474 | 475 | if self._block_args.id_skip: 476 | if all( 477 | s == 1 for s in self._block_args.strides 478 | ) and self._block_args.input_filters == self._block_args.output_filters: 479 | # Apply only if skip connection presents. 480 | if survival_prob: 481 | x = utils.drop_connect(x, training, survival_prob) 482 | x = tf.add(x, inputs) 483 | logging.info('Project: %s shape: %s', x.name, x.shape) 484 | return x 485 | 486 | 487 | class Model(tf.keras.Model): 488 | """A class implements tf.keras.Model for MNAS-like model. 489 | 490 | Reference: https://arxiv.org/abs/1807.11626 491 | """ 492 | 493 | def __init__(self, blocks_args=None, global_params=None): 494 | """Initializes an `Model` instance. 495 | 496 | Args: 497 | blocks_args: A list of BlockArgs to construct block modules. 498 | global_params: GlobalParams, a set of global parameters. 499 | 500 | Raises: 501 | ValueError: when blocks_args is not specified as a list. 
502 | """ 503 | super(Model, self).__init__() 504 | if not isinstance(blocks_args, list): 505 | raise ValueError('blocks_args should be a list.') 506 | self._global_params = global_params 507 | self._blocks_args = blocks_args 508 | self._relu_fn = global_params.relu_fn or tf.nn.swish 509 | self._batch_norm = global_params.batch_norm 510 | 511 | self.endpoints = None 512 | 513 | self._build() 514 | 515 | def _get_conv_block(self, conv_type): 516 | conv_block_map = {0: MBConvBlock, 1: MBConvBlockWithoutDepthwise} 517 | return conv_block_map[conv_type] 518 | 519 | def _build(self): 520 | """Builds a model.""" 521 | self._blocks = [] 522 | batch_norm_momentum = self._global_params.batch_norm_momentum 523 | batch_norm_epsilon = self._global_params.batch_norm_epsilon 524 | if self._global_params.data_format == 'channels_first': 525 | channel_axis = 1 526 | self._spatial_dims = [2, 3] 527 | else: 528 | channel_axis = -1 529 | self._spatial_dims = [1, 2] 530 | 531 | # Stem part. 532 | self._conv_stem = tf.layers.Conv2D( 533 | filters=round_filters(32, self._global_params), 534 | kernel_size=[3, 3], 535 | strides=[2, 2], 536 | kernel_initializer=conv_kernel_initializer, 537 | padding='same', 538 | data_format=self._global_params.data_format, 539 | use_bias=False) 540 | self._bn0 = self._batch_norm( 541 | axis=channel_axis, 542 | momentum=batch_norm_momentum, 543 | epsilon=batch_norm_epsilon) 544 | 545 | # Builds blocks. 546 | for block_args in self._blocks_args: 547 | assert block_args.num_repeat > 0 548 | assert block_args.super_pixel in [0, 1, 2] 549 | # Update block input and output filters based on depth multiplier. 550 | input_filters = round_filters(block_args.input_filters, 551 | self._global_params) 552 | output_filters = round_filters(block_args.output_filters, 553 | self._global_params) 554 | kernel_size = block_args.kernel_size 555 | block_args = block_args._replace( 556 | input_filters=input_filters, 557 | output_filters=output_filters, 558 | num_repeat=round_repeats(block_args.num_repeat, self._global_params)) 559 | 560 | # The first block needs to take care of stride and filter size increase. 561 | conv_block = self._get_conv_block(block_args.conv_type) 562 | if not block_args.super_pixel: # no super_pixel at all 563 | self._blocks.append(conv_block(block_args, self._global_params)) 564 | else: 565 | # if superpixel, adjust filters, kernels, and strides. 
566 | depth_factor = int(4 / block_args.strides[0] / block_args.strides[1]) 567 | block_args = block_args._replace( 568 | input_filters=block_args.input_filters * depth_factor, 569 | output_filters=block_args.output_filters * depth_factor, 570 | kernel_size=((block_args.kernel_size + 1) // 2 if depth_factor > 1 571 | else block_args.kernel_size)) 572 | # if the first block has stride-2 and super_pixel trandformation 573 | if (block_args.strides[0] == 2 and block_args.strides[1] == 2): 574 | block_args = block_args._replace(strides=[1, 1]) 575 | self._blocks.append(conv_block(block_args, self._global_params)) 576 | block_args = block_args._replace( # sp stops at stride-2 577 | super_pixel=0, 578 | input_filters=input_filters, 579 | output_filters=output_filters, 580 | kernel_size=kernel_size) 581 | elif block_args.super_pixel == 1: 582 | self._blocks.append(conv_block(block_args, self._global_params)) 583 | block_args = block_args._replace(super_pixel=2) 584 | else: 585 | self._blocks.append(conv_block(block_args, self._global_params)) 586 | if block_args.num_repeat > 1: # rest of blocks with the same block_arg 587 | # pylint: disable=protected-access 588 | block_args = block_args._replace( 589 | input_filters=block_args.output_filters, strides=[1, 1]) 590 | # pylint: enable=protected-access 591 | for _ in xrange(block_args.num_repeat - 1): 592 | self._blocks.append(conv_block(block_args, self._global_params)) 593 | 594 | # Head part. 595 | self._conv_head = tf.layers.Conv2D( 596 | filters=round_filters(1280, self._global_params), 597 | kernel_size=[1, 1], 598 | strides=[1, 1], 599 | kernel_initializer=conv_kernel_initializer, 600 | padding='same', 601 | use_bias=False) 602 | self._bn1 = self._batch_norm( 603 | axis=channel_axis, 604 | momentum=batch_norm_momentum, 605 | epsilon=batch_norm_epsilon) 606 | 607 | self._avg_pooling = tf.keras.layers.GlobalAveragePooling2D( 608 | data_format=self._global_params.data_format) 609 | if self._global_params.num_classes: 610 | self._fc = tf.layers.Dense( 611 | self._global_params.num_classes, 612 | kernel_initializer=dense_kernel_initializer) 613 | else: 614 | self._fc = None 615 | 616 | if self._global_params.dropout_rate > 0: 617 | self._dropout = tf.keras.layers.Dropout(self._global_params.dropout_rate) 618 | else: 619 | self._dropout = None 620 | 621 | def call(self, 622 | inputs, 623 | training=True, 624 | features_only=None, 625 | pooled_features_only=False): 626 | """Implementation of call(). 627 | 628 | Args: 629 | inputs: input tensors. 630 | training: boolean, whether the model is constructed for training. 631 | features_only: build the base feature network only. 632 | pooled_features_only: build the base network for features extraction 633 | (after 1x1 conv layer and global pooling, but before dropout and fc 634 | head). 635 | 636 | Returns: 637 | output tensors. 638 | """ 639 | outputs = None 640 | self.endpoints = {} 641 | reduction_idx = 0 642 | # Calls Stem layers 643 | with tf.variable_scope('stem'): 644 | outputs = self._relu_fn( 645 | self._bn0(self._conv_stem(inputs), training=training)) 646 | logging.info('Built stem layers with output shape: %s', outputs.shape) 647 | self.endpoints['stem'] = outputs 648 | 649 | # Calls blocks. 650 | for idx, block in enumerate(self._blocks): 651 | is_reduction = False # reduction flag for blocks after the stem layer 652 | # If the first block has super-pixel (space-to-depth) layer, then stem is 653 | # the first reduction point. 
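      # The 'reduction_%d' endpoints record the feature map produced just before
      # each spatial downsampling step, i.e. when the following block has
      # stride > 1 or this is the last block.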
654 | if (block.block_args().super_pixel == 1 and idx == 0): 655 | reduction_idx += 1 656 | self.endpoints['reduction_%s' % reduction_idx] = outputs 657 | 658 | elif ((idx == len(self._blocks) - 1) or 659 | self._blocks[idx + 1].block_args().strides[0] > 1): 660 | is_reduction = True 661 | reduction_idx += 1 662 | 663 | with tf.variable_scope('blocks_%s' % idx): 664 | survival_prob = self._global_params.survival_prob 665 | if survival_prob: 666 | drop_rate = 1.0 - survival_prob 667 | survival_prob = 1.0 - drop_rate * float(idx) / len(self._blocks) 668 | logging.info('block_%s survival_prob: %s', idx, survival_prob) 669 | outputs = block.call( 670 | outputs, training=training, survival_prob=survival_prob) 671 | self.endpoints['block_%s' % idx] = outputs 672 | if is_reduction: 673 | self.endpoints['reduction_%s' % reduction_idx] = outputs 674 | if block.endpoints: 675 | for k, v in six.iteritems(block.endpoints): 676 | self.endpoints['block_%s/%s' % (idx, k)] = v 677 | if is_reduction: 678 | self.endpoints['reduction_%s/%s' % (reduction_idx, k)] = v 679 | self.endpoints['features'] = outputs 680 | 681 | if not features_only: 682 | # Calls final layers and returns logits. 683 | with tf.variable_scope('head'): 684 | outputs = self._relu_fn( 685 | self._bn1(self._conv_head(outputs), training=training)) 686 | self.endpoints['head_1x1'] = outputs 687 | 688 | if self._global_params.local_pooling: 689 | shape = outputs.get_shape().as_list() 690 | kernel_size = [ 691 | 1, shape[self._spatial_dims[0]], shape[self._spatial_dims[1]], 1] 692 | outputs = tf.nn.avg_pool( 693 | outputs, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID') 694 | self.endpoints['pooled_features'] = outputs 695 | if not pooled_features_only: 696 | if self._dropout: 697 | outputs = self._dropout(outputs, training=training) 698 | self.endpoints['global_pool'] = outputs 699 | if self._fc: 700 | outputs = tf.squeeze(outputs, self._spatial_dims) 701 | outputs = self._fc(outputs) 702 | self.endpoints['head'] = outputs 703 | else: 704 | outputs = self._avg_pooling(outputs) 705 | self.endpoints['pooled_features'] = outputs 706 | if not pooled_features_only: 707 | if self._dropout: 708 | outputs = self._dropout(outputs, training=training) 709 | self.endpoints['global_pool'] = outputs 710 | if self._fc: 711 | outputs = self._fc(outputs) 712 | self.endpoints['head'] = outputs 713 | return outputs 714 | -------------------------------------------------------------------------------- /external_data/original_tf/eval_ckpt_main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Eval checkpoint driver. 16 | 17 | This is an example evaluation script for users to understand the EfficientNet 18 | model checkpoints on CPU. 
To serve EfficientNet, please consider to export a 19 | `SavedModel` from checkpoints and use tf-serving to serve. 20 | """ 21 | 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | 26 | import json 27 | import sys 28 | from absl import app 29 | from absl import flags 30 | import numpy as np 31 | import tensorflow as tf 32 | 33 | 34 | import efficientnet_builder 35 | import preprocessing 36 | 37 | 38 | flags.DEFINE_string('model_name', 'efficientnet-b0', 'Model name to eval.') 39 | flags.DEFINE_string('runmode', 'examples', 'Running mode: examples or imagenet') 40 | flags.DEFINE_string('imagenet_eval_glob', None, 41 | 'Imagenet eval image glob, ' 42 | 'such as /imagenet/ILSVRC2012*.JPEG') 43 | flags.DEFINE_string('imagenet_eval_label', None, 44 | 'Imagenet eval label file path, ' 45 | 'such as /imagenet/ILSVRC2012_validation_ground_truth.txt') 46 | flags.DEFINE_string('ckpt_dir', '/tmp/ckpt/', 'Checkpoint folders') 47 | flags.DEFINE_string('example_img', '/tmp/panda.jpg', 48 | 'Filepath for a single example image.') 49 | flags.DEFINE_string('labels_map_file', '/tmp/labels_map.txt', 50 | 'Labels map from label id to its meaning.') 51 | flags.DEFINE_integer('num_images', 5000, 52 | 'Number of images to eval. Use -1 to eval all images.') 53 | FLAGS = flags.FLAGS 54 | 55 | MEAN_RGB = [0.485 * 255, 0.456 * 255, 0.406 * 255] 56 | STDDEV_RGB = [0.229 * 255, 0.224 * 255, 0.225 * 255] 57 | 58 | 59 | class EvalCkptDriver(object): 60 | """A driver for running eval inference. 61 | 62 | Attributes: 63 | model_name: str. Model name to eval. 64 | batch_size: int. Eval batch size. 65 | num_classes: int. Number of classes, default to 1000 for ImageNet. 66 | image_size: int. Input image size, determined by model name. 
67 | """ 68 | 69 | def __init__(self, model_name='efficientnet-b0', batch_size=1): 70 | """Initialize internal variables.""" 71 | self.model_name = model_name 72 | self.batch_size = batch_size 73 | self.num_classes = 1000 74 | # Model Scaling parameters 75 | _, _, self.image_size, _ = efficientnet_builder.efficientnet_params( 76 | model_name) 77 | 78 | def restore_model(self, sess, ckpt_dir): 79 | """Restore variables from checkpoint dir.""" 80 | checkpoint = tf.train.latest_checkpoint(ckpt_dir) 81 | ema = tf.train.ExponentialMovingAverage(decay=0.9999) 82 | ema_vars = tf.trainable_variables() + tf.get_collection('moving_vars') 83 | for v in tf.global_variables(): 84 | if 'moving_mean' in v.name or 'moving_variance' in v.name: 85 | ema_vars.append(v) 86 | ema_vars = list(set(ema_vars)) 87 | var_dict = ema.variables_to_restore(ema_vars) 88 | saver = tf.train.Saver(var_dict, max_to_keep=1) 89 | saver.restore(sess, checkpoint) 90 | 91 | def build_model(self, features, is_training): 92 | """Build model with input features.""" 93 | features -= tf.constant(MEAN_RGB, shape=[1, 1, 3], dtype=features.dtype) 94 | features /= tf.constant(STDDEV_RGB, shape=[1, 1, 3], dtype=features.dtype) 95 | logits, _ = efficientnet_builder.build_model( 96 | features, self.model_name, is_training) 97 | probs = tf.nn.softmax(logits) 98 | probs = tf.squeeze(probs) 99 | return probs 100 | 101 | def build_dataset(self, filenames, labels, is_training): 102 | """Build input dataset.""" 103 | filenames = tf.constant(filenames) 104 | labels = tf.constant(labels) 105 | dataset = tf.data.Dataset.from_tensor_slices((filenames, labels)) 106 | 107 | def _parse_function(filename, label): 108 | image_string = tf.read_file(filename) 109 | image_decoded = preprocessing.preprocess_image( 110 | image_string, is_training, self.image_size) 111 | image = tf.cast(image_decoded, tf.float32) 112 | return image, label 113 | 114 | dataset = dataset.map(_parse_function) 115 | dataset = dataset.batch(self.batch_size) 116 | 117 | iterator = dataset.make_one_shot_iterator() 118 | images, labels = iterator.get_next() 119 | return images, labels 120 | 121 | def run_inference(self, ckpt_dir, image_files, labels): 122 | """Build and run inference on the target images and labels.""" 123 | with tf.Graph().as_default(), tf.Session() as sess: 124 | images, labels = self.build_dataset(image_files, labels, False) 125 | probs = self.build_model(images, is_training=False) 126 | 127 | sess.run(tf.global_variables_initializer()) 128 | self.restore_model(sess, ckpt_dir) 129 | 130 | prediction_idx = [] 131 | prediction_prob = [] 132 | for _ in range(len(image_files) // self.batch_size): 133 | out_probs = sess.run(probs) 134 | idx = np.argsort(out_probs)[::-1] 135 | prediction_idx.append(idx[:5]) 136 | prediction_prob.append([out_probs[pid] for pid in idx[:5]]) 137 | 138 | # Return the top 5 predictions (idx and prob) for each image. 139 | return prediction_idx, prediction_prob 140 | 141 | 142 | def eval_example_images(model_name, ckpt_dir, image_files, labels_map_file): 143 | """Eval a list of example images. 144 | 145 | Args: 146 | model_name: str. The name of model to eval. 147 | ckpt_dir: str. Checkpoint directory path. 148 | image_files: List[str]. A list of image file paths. 149 | labels_map_file: str. The labels map file path. 150 | 151 | Returns: 152 | A tuple (pred_idx, and pred_prob), where pred_idx is the top 5 prediction 153 | index and pred_prob is the top 5 prediction probability. 
154 | """ 155 | eval_ckpt_driver = EvalCkptDriver(model_name) 156 | classes = json.loads(tf.gfile.Open(labels_map_file).read()) 157 | pred_idx, pred_prob = eval_ckpt_driver.run_inference( 158 | ckpt_dir, image_files, [0] * len(image_files)) 159 | for i in range(len(image_files)): 160 | print('predicted class for image {}: '.format(image_files[i])) 161 | for j, idx in enumerate(pred_idx[i]): 162 | print(' -> top_{} ({:4.2f}%): {} '.format( 163 | j, pred_prob[i][j] * 100, classes[str(idx)])) 164 | return pred_idx, pred_prob 165 | 166 | 167 | def eval_imagenet(model_name, 168 | ckpt_dir, 169 | imagenet_eval_glob, 170 | imagenet_eval_label, 171 | num_images): 172 | """Eval ImageNet images and report top1/top5 accuracy. 173 | 174 | Args: 175 | model_name: str. The name of model to eval. 176 | ckpt_dir: str. Checkpoint directory path. 177 | imagenet_eval_glob: str. File path glob for all eval images. 178 | imagenet_eval_label: str. File path for eval label. 179 | num_images: int. Number of images to eval: -1 means eval the whole dataset. 180 | 181 | Returns: 182 | A tuple (top1, top5) for top1 and top5 accuracy. 183 | """ 184 | eval_ckpt_driver = EvalCkptDriver(model_name) 185 | imagenet_val_labels = [int(i) for i in tf.gfile.GFile(imagenet_eval_label)] 186 | imagenet_filenames = sorted(tf.gfile.Glob(imagenet_eval_glob)) 187 | if num_images < 0: 188 | num_images = len(imagenet_filenames) 189 | image_files = imagenet_filenames[:num_images] 190 | labels = imagenet_val_labels[:num_images] 191 | 192 | pred_idx, _ = eval_ckpt_driver.run_inference(ckpt_dir, image_files, labels) 193 | top1_cnt, top5_cnt = 0.0, 0.0 194 | for i, label in enumerate(labels): 195 | top1_cnt += label in pred_idx[i][:1] 196 | top5_cnt += label in pred_idx[i][:5] 197 | if i % 100 == 0: 198 | print('Step {}: top1_acc = {:4.2f}% top5_acc = {:4.2f}%'.format( 199 | i, 100 * top1_cnt / (i + 1), 100 * top5_cnt / (i + 1))) 200 | sys.stdout.flush() 201 | top1, top5 = 100 * top1_cnt / num_images, 100 * top5_cnt / num_images 202 | print('Final: top1_acc = {:4.2f}% top5_acc = {:4.2f}%'.format(top1, top5)) 203 | return top1, top5 204 | 205 | 206 | def main(unused_argv): 207 | tf.logging.set_verbosity(tf.logging.ERROR) 208 | if FLAGS.runmode == 'examples': 209 | # Run inference for an example image. 210 | eval_example_images(FLAGS.model_name, FLAGS.ckpt_dir, [FLAGS.example_img], 211 | FLAGS.labels_map_file) 212 | elif FLAGS.runmode == 'imagenet': 213 | # Run inference for imagenet. 214 | eval_imagenet(FLAGS.model_name, FLAGS.ckpt_dir, FLAGS.imagenet_eval_glob, 215 | FLAGS.imagenet_eval_label, FLAGS.num_images) 216 | else: 217 | print('must specify runmode: examples or imagenet') 218 | 219 | 220 | if __name__ == '__main__': 221 | app.run(main) 222 | -------------------------------------------------------------------------------- /external_data/original_tf/preprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """ImageNet preprocessing.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | from absl import logging 21 | 22 | import tensorflow.compat.v1 as tf 23 | 24 | 25 | IMAGE_SIZE = 224 26 | CROP_PADDING = 32 27 | 28 | 29 | def distorted_bounding_box_crop(image_bytes, 30 | bbox, 31 | min_object_covered=0.1, 32 | aspect_ratio_range=(0.75, 1.33), 33 | area_range=(0.05, 1.0), 34 | max_attempts=100, 35 | scope=None): 36 | """Generates cropped_image using one of the bboxes randomly distorted. 37 | 38 | See `tf.image.sample_distorted_bounding_box` for more documentation. 39 | 40 | Args: 41 | image_bytes: `Tensor` of binary image data. 42 | bbox: `Tensor` of bounding boxes arranged `[1, num_boxes, coords]` 43 | where each coordinate is [0, 1) and the coordinates are arranged 44 | as `[ymin, xmin, ymax, xmax]`. If num_boxes is 0 then use the whole 45 | image. 46 | min_object_covered: An optional `float`. Defaults to `0.1`. The cropped 47 | area of the image must contain at least this fraction of any bounding 48 | box supplied. 49 | aspect_ratio_range: An optional list of `float`s. The cropped area of the 50 | image must have an aspect ratio = width / height within this range. 51 | area_range: An optional list of `float`s. The cropped area of the image 52 | must contain a fraction of the supplied image within in this range. 53 | max_attempts: An optional `int`. Number of attempts at generating a cropped 54 | region of the image of the specified constraints. After `max_attempts` 55 | failures, return the entire image. 56 | scope: Optional `str` for name scope. 57 | Returns: 58 | cropped image `Tensor` 59 | """ 60 | with tf.name_scope(scope, 'distorted_bounding_box_crop', [image_bytes, bbox]): 61 | shape = tf.image.extract_jpeg_shape(image_bytes) 62 | sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( 63 | shape, 64 | bounding_boxes=bbox, 65 | min_object_covered=min_object_covered, 66 | aspect_ratio_range=aspect_ratio_range, 67 | area_range=area_range, 68 | max_attempts=max_attempts, 69 | use_image_if_no_bounding_boxes=True) 70 | bbox_begin, bbox_size, _ = sample_distorted_bounding_box 71 | 72 | # Crop the image to the specified bounding box. 73 | offset_y, offset_x, _ = tf.unstack(bbox_begin) 74 | target_height, target_width, _ = tf.unstack(bbox_size) 75 | crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) 76 | image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3) 77 | 78 | return image 79 | 80 | 81 | def _at_least_x_are_equal(a, b, x): 82 | """At least `x` of `a` and `b` `Tensors` are equal.""" 83 | match = tf.equal(a, b) 84 | match = tf.cast(match, tf.int32) 85 | return tf.greater_equal(tf.reduce_sum(match), x) 86 | 87 | 88 | def _decode_and_random_crop(image_bytes, image_size): 89 | """Make a random crop of image_size.""" 90 | bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]) 91 | image = distorted_bounding_box_crop( 92 | image_bytes, 93 | bbox, 94 | min_object_covered=0.1, 95 | aspect_ratio_range=(3. / 4, 4. 
/ 3.),
96 |       area_range=(0.08, 1.0),
97 |       max_attempts=10,
98 |       scope=None)
99 |   original_shape = tf.image.extract_jpeg_shape(image_bytes)
100 |   bad = _at_least_x_are_equal(original_shape, tf.shape(image), 3)
101 | 
102 |   image = tf.cond(
103 |       bad,
104 |       lambda: _decode_and_center_crop(image_bytes, image_size),
105 |       lambda: tf.image.resize_bicubic([image],  # pylint: disable=g-long-lambda
106 |                                       [image_size, image_size])[0])
107 | 
108 |   return image
109 | 
110 | 
111 | def _decode_and_center_crop(image_bytes, image_size):
112 |   """Crops to center of image with padding then scales image_size."""
113 |   shape = tf.image.extract_jpeg_shape(image_bytes)
114 |   image_height = shape[0]
115 |   image_width = shape[1]
116 | 
117 |   padded_center_crop_size = tf.cast(
118 |       ((image_size / (image_size + CROP_PADDING)) *
119 |        tf.cast(tf.minimum(image_height, image_width), tf.float32)),
120 |       tf.int32)
121 | 
122 |   offset_height = ((image_height - padded_center_crop_size) + 1) // 2
123 |   offset_width = ((image_width - padded_center_crop_size) + 1) // 2
124 |   crop_window = tf.stack([offset_height, offset_width,
125 |                           padded_center_crop_size, padded_center_crop_size])
126 |   image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
127 |   image = tf.image.resize_bicubic([image], [image_size, image_size])[0]
128 |   return image
129 | 
130 | 
131 | def _flip(image):
132 |   """Random horizontal image flip."""
133 |   image = tf.image.random_flip_left_right(image)
134 |   return image
135 | 
136 | 
137 | def preprocess_for_train(image_bytes, use_bfloat16, image_size=IMAGE_SIZE,
138 |                          augment_name=None,
139 |                          randaug_num_layers=None, randaug_magnitude=None):
140 |   """Preprocesses the given image for training.
141 | 
142 |   Args:
143 |     image_bytes: `Tensor` representing an image binary of arbitrary size.
144 |     use_bfloat16: `bool` for whether to use bfloat16.
145 |     image_size: image size.
146 |     augment_name: `string` that is the name of the augmentation method
147 |       to apply to the image. `autoaugment` if AutoAugment is to be used or
148 |       `randaugment` if RandAugment is to be used. If the value is `None` no
149 |       augmentation method will be applied. See autoaugment.py for more
150 |       details.
151 |     randaug_num_layers: 'int', if RandAug is used, what should the number of
152 |       layers be. See autoaugment.py for detailed description.
153 |     randaug_magnitude: 'int', if RandAug is used, what should the magnitude
154 |       be. See autoaugment.py for detailed description.
155 | 
156 |   Returns:
157 |     A preprocessed image `Tensor`.
158 | """ 159 | image = _decode_and_random_crop(image_bytes, image_size) 160 | image = _flip(image) 161 | image = tf.reshape(image, [image_size, image_size, 3]) 162 | 163 | image = tf.image.convert_image_dtype( 164 | image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32) 165 | 166 | if augment_name: 167 | try: 168 | import autoaugment # pylint: disable=g-import-not-at-top 169 | except ImportError as e: 170 | logging.exception('Autoaugment is not supported in TF 2.x.') 171 | raise e 172 | 173 | logging.info('Apply AutoAugment policy %s', augment_name) 174 | input_image_type = image.dtype 175 | image = tf.clip_by_value(image, 0.0, 255.0) 176 | image = tf.cast(image, dtype=tf.uint8) 177 | 178 | if augment_name == 'autoaugment': 179 | logging.info('Apply AutoAugment policy %s', augment_name) 180 | image = autoaugment.distort_image_with_autoaugment(image, 'v0') 181 | elif augment_name == 'randaugment': 182 | image = autoaugment.distort_image_with_randaugment( 183 | image, randaug_num_layers, randaug_magnitude) 184 | else: 185 | raise ValueError('Invalid value for augment_name: %s' % (augment_name)) 186 | 187 | image = tf.cast(image, dtype=input_image_type) 188 | return image 189 | 190 | 191 | def preprocess_for_eval(image_bytes, use_bfloat16, image_size=IMAGE_SIZE): 192 | """Preprocesses the given image for evaluation. 193 | 194 | Args: 195 | image_bytes: `Tensor` representing an image binary of arbitrary size. 196 | use_bfloat16: `bool` for whether to use bfloat16. 197 | image_size: image size. 198 | 199 | Returns: 200 | A preprocessed image `Tensor`. 201 | """ 202 | image = _decode_and_center_crop(image_bytes, image_size) 203 | image = tf.reshape(image, [image_size, image_size, 3]) 204 | image = tf.image.convert_image_dtype( 205 | image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32) 206 | return image 207 | 208 | 209 | def preprocess_image(image_bytes, 210 | is_training=False, 211 | use_bfloat16=False, 212 | image_size=IMAGE_SIZE, 213 | augment_name=None, 214 | randaug_num_layers=None, 215 | randaug_magnitude=None): 216 | """Preprocesses the given image. 217 | 218 | Args: 219 | image_bytes: `Tensor` representing an image binary of arbitrary size. 220 | is_training: `bool` for whether the preprocessing is for training. 221 | use_bfloat16: `bool` for whether to use bfloat16. 222 | image_size: image size. 223 | augment_name: `string` that is the name of the augmentation method 224 | to apply to the image. `autoaugment` if AutoAugment is to be used or 225 | `randaugment` if RandAugment is to be used. If the value is `None` no 226 | augmentation method will be applied applied. See autoaugment.py for more 227 | details. 228 | randaug_num_layers: 'int', if RandAug is used, what should the number of 229 | layers be. See autoaugment.py for detailed description. 230 | randaug_magnitude: 'int', if RandAug is used, what should the magnitude 231 | be. See autoaugment.py for detailed description. 232 | 233 | Returns: 234 | A preprocessed image `Tensor` with value range of [0, 255]. 235 | """ 236 | if is_training: 237 | return preprocess_for_train( 238 | image_bytes, use_bfloat16, image_size, augment_name, 239 | randaug_num_layers, randaug_magnitude) 240 | else: 241 | return preprocess_for_eval(image_bytes, use_bfloat16, image_size) 242 | -------------------------------------------------------------------------------- /external_data/original_tf/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Model utilities.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import json 22 | import os 23 | import sys 24 | 25 | from absl import logging 26 | import numpy as np 27 | import tensorflow.compat.v1 as tf 28 | 29 | from tensorflow.python.tpu import tpu_function # pylint:disable=g-direct-tensorflow-import 30 | 31 | 32 | def build_learning_rate(initial_lr, 33 | global_step, 34 | steps_per_epoch=None, 35 | lr_decay_type='exponential', 36 | decay_factor=0.97, 37 | decay_epochs=2.4, 38 | total_steps=None, 39 | warmup_epochs=5): 40 | """Build learning rate.""" 41 | if lr_decay_type == 'exponential': 42 | assert steps_per_epoch is not None 43 | decay_steps = steps_per_epoch * decay_epochs 44 | lr = tf.train.exponential_decay( 45 | initial_lr, global_step, decay_steps, decay_factor, staircase=True) 46 | elif lr_decay_type == 'cosine': 47 | assert total_steps is not None 48 | lr = 0.5 * initial_lr * ( 49 | 1 + tf.cos(np.pi * tf.cast(global_step, tf.float32) / total_steps)) 50 | elif lr_decay_type == 'constant': 51 | lr = initial_lr 52 | else: 53 | assert False, 'Unknown lr_decay_type : %s' % lr_decay_type 54 | 55 | if warmup_epochs: 56 | logging.info('Learning rate warmup_epochs: %d', warmup_epochs) 57 | warmup_steps = int(warmup_epochs * steps_per_epoch) 58 | warmup_lr = ( 59 | initial_lr * tf.cast(global_step, tf.float32) / tf.cast( 60 | warmup_steps, tf.float32)) 61 | lr = tf.cond(global_step < warmup_steps, lambda: warmup_lr, lambda: lr) 62 | 63 | return lr 64 | 65 | 66 | def build_optimizer(learning_rate, 67 | optimizer_name='rmsprop', 68 | decay=0.9, 69 | epsilon=0.001, 70 | momentum=0.9): 71 | """Build optimizer.""" 72 | if optimizer_name == 'sgd': 73 | logging.info('Using SGD optimizer') 74 | optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate) 75 | elif optimizer_name == 'momentum': 76 | logging.info('Using Momentum optimizer') 77 | optimizer = tf.train.MomentumOptimizer( 78 | learning_rate=learning_rate, momentum=momentum) 79 | elif optimizer_name == 'rmsprop': 80 | logging.info('Using RMSProp optimizer') 81 | optimizer = tf.train.RMSPropOptimizer(learning_rate, decay, momentum, 82 | epsilon) 83 | else: 84 | logging.fatal('Unknown optimizer: %s', optimizer_name) 85 | 86 | return optimizer 87 | 88 | 89 | class TpuBatchNormalization(tf.layers.BatchNormalization): 90 | # class TpuBatchNormalization(tf.layers.BatchNormalization): 91 | """Cross replica batch normalization.""" 92 | 93 | def __init__(self, fused=False, **kwargs): 94 | if fused in (True, None): 95 | raise ValueError('TpuBatchNormalization does not support fused=True.') 96 | super(TpuBatchNormalization, self).__init__(fused=fused, **kwargs) 97 | 98 | def _cross_replica_average(self, t, num_shards_per_group): 99 | """Calculates the average 
value of input tensor across TPU replicas.""" 100 | num_shards = tpu_function.get_tpu_context().number_of_shards 101 | group_assignment = None 102 | if num_shards_per_group > 1: 103 | if num_shards % num_shards_per_group != 0: 104 | raise ValueError('num_shards: %d mod shards_per_group: %d, should be 0' 105 | % (num_shards, num_shards_per_group)) 106 | num_groups = num_shards // num_shards_per_group 107 | group_assignment = [[ 108 | x for x in range(num_shards) if x // num_shards_per_group == y 109 | ] for y in range(num_groups)] 110 | return tf.tpu.cross_replica_sum(t, group_assignment) / tf.cast( 111 | num_shards_per_group, t.dtype) 112 | 113 | def _moments(self, inputs, reduction_axes, keep_dims): 114 | """Compute the mean and variance: it overrides the original _moments.""" 115 | shard_mean, shard_variance = super(TpuBatchNormalization, self)._moments( 116 | inputs, reduction_axes, keep_dims=keep_dims) 117 | 118 | num_shards = tpu_function.get_tpu_context().number_of_shards or 1 119 | if num_shards <= 8: # Skip cross_replica for 2x2 or smaller slices. 120 | num_shards_per_group = 1 121 | else: 122 | num_shards_per_group = max(8, num_shards // 8) 123 | logging.info('TpuBatchNormalization with num_shards_per_group %s', 124 | num_shards_per_group) 125 | if num_shards_per_group > 1: 126 | # Compute variance using: Var[X]= E[X^2] - E[X]^2. 127 | shard_square_of_mean = tf.math.square(shard_mean) 128 | shard_mean_of_square = shard_variance + shard_square_of_mean 129 | group_mean = self._cross_replica_average( 130 | shard_mean, num_shards_per_group) 131 | group_mean_of_square = self._cross_replica_average( 132 | shard_mean_of_square, num_shards_per_group) 133 | group_variance = group_mean_of_square - tf.math.square(group_mean) 134 | return (group_mean, group_variance) 135 | else: 136 | return (shard_mean, shard_variance) 137 | 138 | 139 | class BatchNormalization(tf.layers.BatchNormalization): 140 | """Fixed default name of BatchNormalization to match TpuBatchNormalization.""" 141 | 142 | def __init__(self, name='tpu_batch_normalization', **kwargs): 143 | super(BatchNormalization, self).__init__(name=name, **kwargs) 144 | 145 | 146 | def drop_connect(inputs, is_training, survival_prob): 147 | """Drop the entire conv with given survival probability.""" 148 | # "Deep Networks with Stochastic Depth", https://arxiv.org/pdf/1603.09382.pdf 149 | if not is_training: 150 | return inputs 151 | 152 | # Compute tensor. 153 | batch_size = tf.shape(inputs)[0] 154 | random_tensor = survival_prob 155 | random_tensor += tf.random_uniform([batch_size, 1, 1, 1], dtype=inputs.dtype) 156 | binary_tensor = tf.floor(random_tensor) 157 | # Unlike conventional way that multiply survival_prob at test time, here we 158 | # divide survival_prob at training time, such that no addition compute is 159 | # needed at test time. 
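  # For example, with survival_prob = 0.8 a kept block is scaled by 1 / 0.8,
  # so E[output] = 0.8 * (inputs / 0.8) + 0.2 * 0 = inputs, which matches the
  # deterministic behavior at test time.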
160 | output = tf.div(inputs, survival_prob) * binary_tensor 161 | return output 162 | 163 | 164 | def archive_ckpt(ckpt_eval, ckpt_objective, ckpt_path): 165 | """Archive a checkpoint if the metric is better.""" 166 | ckpt_dir, ckpt_name = os.path.split(ckpt_path) 167 | 168 | saved_objective_path = os.path.join(ckpt_dir, 'best_objective.txt') 169 | saved_objective = float('-inf') 170 | if tf.gfile.Exists(saved_objective_path): 171 | with tf.gfile.GFile(saved_objective_path, 'r') as f: 172 | saved_objective = float(f.read()) 173 | if saved_objective > ckpt_objective: 174 | logging.info('Ckpt %s is worse than %s', ckpt_objective, saved_objective) 175 | return False 176 | 177 | filenames = tf.gfile.Glob(ckpt_path + '.*') 178 | if filenames is None: 179 | logging.info('No files to copy for checkpoint %s', ckpt_path) 180 | return False 181 | 182 | # Clear the old folder. 183 | dst_dir = os.path.join(ckpt_dir, 'archive') 184 | if tf.gfile.Exists(dst_dir): 185 | tf.gfile.DeleteRecursively(dst_dir) 186 | tf.gfile.MakeDirs(dst_dir) 187 | 188 | # Write checkpoints. 189 | for f in filenames: 190 | dest = os.path.join(dst_dir, os.path.basename(f)) 191 | tf.gfile.Copy(f, dest, overwrite=True) 192 | ckpt_state = tf.train.generate_checkpoint_state_proto( 193 | dst_dir, 194 | model_checkpoint_path=ckpt_name, 195 | all_model_checkpoint_paths=[ckpt_name]) 196 | with tf.gfile.GFile(os.path.join(dst_dir, 'checkpoint'), 'w') as f: 197 | f.write(str(ckpt_state)) 198 | with tf.gfile.GFile(os.path.join(dst_dir, 'best_eval.txt'), 'w') as f: 199 | f.write('%s' % ckpt_eval) 200 | 201 | # Update the best objective. 202 | with tf.gfile.GFile(saved_objective_path, 'w') as f: 203 | f.write('%f' % ckpt_objective) 204 | 205 | logging.info('Copying checkpoint %s to %s', ckpt_path, dst_dir) 206 | return True 207 | 208 | 209 | def get_ema_vars(): 210 | """Get all exponential moving average (ema) variables.""" 211 | ema_vars = tf.trainable_variables() + tf.get_collection('moving_vars') 212 | for v in tf.global_variables(): 213 | # We maintain mva for batch norm moving mean and variance as well. 214 | if 'moving_mean' in v.name or 'moving_variance' in v.name: 215 | ema_vars.append(v) 216 | return list(set(ema_vars)) 217 | 218 | 219 | class DepthwiseConv2D(tf.keras.layers.DepthwiseConv2D, tf.layers.Layer): 220 | """Wrap keras DepthwiseConv2D to tf.layers.""" 221 | 222 | pass 223 | 224 | 225 | class EvalCkptDriver(object): 226 | """A driver for running eval inference. 227 | 228 | Attributes: 229 | model_name: str. Model name to eval. 230 | batch_size: int. Eval batch size. 231 | image_size: int. Input image size, determined by model name. 232 | num_classes: int. Number of classes, default to 1000 for ImageNet. 233 | include_background_label: whether to include extra background label. 
234 | """ 235 | 236 | def __init__(self, 237 | model_name, 238 | batch_size=1, 239 | image_size=224, 240 | num_classes=1000, 241 | include_background_label=False): 242 | """Initialize internal variables.""" 243 | self.model_name = model_name 244 | self.batch_size = batch_size 245 | self.num_classes = num_classes 246 | self.include_background_label = include_background_label 247 | self.image_size = image_size 248 | 249 | def restore_model(self, sess, ckpt_dir, enable_ema=True, export_ckpt=None): 250 | """Restore variables from checkpoint dir.""" 251 | sess.run(tf.global_variables_initializer()) 252 | checkpoint = tf.train.latest_checkpoint(ckpt_dir) 253 | if enable_ema: 254 | ema = tf.train.ExponentialMovingAverage(decay=0.0) 255 | ema_vars = get_ema_vars() 256 | var_dict = ema.variables_to_restore(ema_vars) 257 | ema_assign_op = ema.apply(ema_vars) 258 | else: 259 | var_dict = get_ema_vars() 260 | ema_assign_op = None 261 | 262 | tf.train.get_or_create_global_step() 263 | sess.run(tf.global_variables_initializer()) 264 | saver = tf.train.Saver(var_dict, max_to_keep=1) 265 | saver.restore(sess, checkpoint) 266 | 267 | if export_ckpt: 268 | if ema_assign_op is not None: 269 | sess.run(ema_assign_op) 270 | saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True) 271 | saver.save(sess, export_ckpt) 272 | 273 | def build_model(self, features, is_training): 274 | """Build model with input features.""" 275 | del features, is_training 276 | raise ValueError('Must be implemented by subclasses.') 277 | 278 | def get_preprocess_fn(self): 279 | raise ValueError('Must be implemented by subclsses.') 280 | 281 | def build_dataset(self, filenames, labels, is_training): 282 | """Build input dataset.""" 283 | batch_drop_remainder = False 284 | if 'condconv' in self.model_name and not is_training: 285 | # CondConv layers can only be called with known batch dimension. Thus, we 286 | # must drop all remaining examples that do not make up one full batch. 287 | # To ensure all examples are evaluated, use a batch size that evenly 288 | # divides the number of files. 
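      # For example, with 100 eval files a batch size of 25 evaluates every
      # example, whereas a batch size of 16 would drop the last 4 files here.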
289 | batch_drop_remainder = True 290 | num_files = len(filenames) 291 | if num_files % self.batch_size != 0: 292 | tf.logging.warn('Remaining examples in last batch are not being ' 293 | 'evaluated.') 294 | filenames = tf.constant(filenames) 295 | labels = tf.constant(labels) 296 | dataset = tf.data.Dataset.from_tensor_slices((filenames, labels)) 297 | 298 | def _parse_function(filename, label): 299 | image_string = tf.read_file(filename) 300 | preprocess_fn = self.get_preprocess_fn() 301 | image_decoded = preprocess_fn( 302 | image_string, is_training, image_size=self.image_size) 303 | image = tf.cast(image_decoded, tf.float32) 304 | return image, label 305 | 306 | dataset = dataset.map(_parse_function) 307 | dataset = dataset.batch(self.batch_size, 308 | drop_remainder=batch_drop_remainder) 309 | 310 | iterator = dataset.make_one_shot_iterator() 311 | images, labels = iterator.get_next() 312 | return images, labels 313 | 314 | def run_inference(self, 315 | ckpt_dir, 316 | image_files, 317 | labels, 318 | enable_ema=True, 319 | export_ckpt=None): 320 | """Build and run inference on the target images and labels.""" 321 | label_offset = 1 if self.include_background_label else 0 322 | with tf.Graph().as_default(), tf.Session() as sess: 323 | images, labels = self.build_dataset(image_files, labels, False) 324 | probs = self.build_model(images, is_training=False) 325 | if isinstance(probs, tuple): 326 | probs = probs[0] 327 | 328 | self.restore_model(sess, ckpt_dir, enable_ema, export_ckpt) 329 | 330 | prediction_idx = [] 331 | prediction_prob = [] 332 | for _ in range(len(image_files) // self.batch_size): 333 | out_probs = sess.run(probs) 334 | idx = np.argsort(out_probs)[::-1] 335 | prediction_idx.append(idx[:5] - label_offset) 336 | prediction_prob.append([out_probs[pid] for pid in idx[:5]]) 337 | 338 | # Return the top 5 predictions (idx and prob) for each image. 339 | return prediction_idx, prediction_prob 340 | 341 | def eval_example_images(self, 342 | ckpt_dir, 343 | image_files, 344 | labels_map_file, 345 | enable_ema=True, 346 | export_ckpt=None): 347 | """Eval a list of example images. 348 | 349 | Args: 350 | ckpt_dir: str. Checkpoint directory path. 351 | image_files: List[str]. A list of image file paths. 352 | labels_map_file: str. The labels map file path. 353 | enable_ema: enable expotential moving average. 354 | export_ckpt: export ckpt folder. 355 | 356 | Returns: 357 | A tuple (pred_idx, and pred_prob), where pred_idx is the top 5 prediction 358 | index and pred_prob is the top 5 prediction probability. 359 | """ 360 | classes = json.loads(tf.gfile.Open(labels_map_file).read()) 361 | pred_idx, pred_prob = self.run_inference( 362 | ckpt_dir, image_files, [0] * len(image_files), enable_ema, export_ckpt) 363 | for i in range(len(image_files)): 364 | print('predicted class for image {}: '.format(image_files[i])) 365 | for j, idx in enumerate(pred_idx[i]): 366 | print(' -> top_{} ({:4.2f}%): {} '.format(j, pred_prob[i][j] * 100, 367 | classes[str(idx)])) 368 | return pred_idx, pred_prob 369 | 370 | def eval_imagenet(self, ckpt_dir, imagenet_eval_glob, 371 | imagenet_eval_label, num_images, enable_ema, export_ckpt): 372 | """Eval ImageNet images and report top1/top5 accuracy. 373 | 374 | Args: 375 | ckpt_dir: str. Checkpoint directory path. 376 | imagenet_eval_glob: str. File path glob for all eval images. 377 | imagenet_eval_label: str. File path for eval label. 378 | num_images: int. Number of images to eval: -1 means eval the whole 379 | dataset. 
380 | enable_ema: enable expotential moving average. 381 | export_ckpt: export checkpoint folder. 382 | 383 | Returns: 384 | A tuple (top1, top5) for top1 and top5 accuracy. 385 | """ 386 | imagenet_val_labels = [int(i) for i in tf.gfile.GFile(imagenet_eval_label)] 387 | imagenet_filenames = sorted(tf.gfile.Glob(imagenet_eval_glob)) 388 | if num_images < 0: 389 | num_images = len(imagenet_filenames) 390 | image_files = imagenet_filenames[:num_images] 391 | labels = imagenet_val_labels[:num_images] 392 | 393 | pred_idx, _ = self.run_inference( 394 | ckpt_dir, image_files, labels, enable_ema, export_ckpt) 395 | top1_cnt, top5_cnt = 0.0, 0.0 396 | for i, label in enumerate(labels): 397 | top1_cnt += label in pred_idx[i][:1] 398 | top5_cnt += label in pred_idx[i][:5] 399 | if i % 100 == 0: 400 | print('Step {}: top1_acc = {:4.2f}% top5_acc = {:4.2f}%'.format( 401 | i, 100 * top1_cnt / (i + 1), 100 * top5_cnt / (i + 1))) 402 | sys.stdout.flush() 403 | top1, top5 = 100 * top1_cnt / num_images, 100 * top5_cnt / num_images 404 | print('Final: top1_acc = {:4.2f}% top5_acc = {:4.2f}%'.format(top1, top5)) 405 | return top1, top5 406 | -------------------------------------------------------------------------------- /extract_tracks_from_videos.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import yaml 4 | import random 5 | import pickle 6 | import tqdm 7 | 8 | import cv2 9 | import numpy as np 10 | 11 | from generate_aligned_tracks import ALIGNED_TRACKS_FILE_NAME 12 | 13 | SEED = 0xDEADFACE 14 | TRACK_LENGTH = 50 15 | DETECTOR_STEP = 6 16 | BOX_MULT = 1.5 17 | 18 | TRACKS_ROOT = 'tracks' 19 | BOXES_FILE_NAME = 'boxes.float32' 20 | 21 | 22 | def main(): 23 | parser = argparse.ArgumentParser(description='Extracts tracks from videos') 24 | parser.add_argument('--num_parts', type=int, default=1, help='Number of parts') 25 | parser.add_argument('--part', type=int, default=0, help='Part') 26 | 27 | args = parser.parse_args() 28 | 29 | with open('config.yaml', 'r') as f: 30 | config = yaml.load(f) 31 | 32 | with open(os.path.join(config['ARTIFACTS_PATH'], ALIGNED_TRACKS_FILE_NAME), 'rb') as f: 33 | aligned_tracks = pickle.load(f) 34 | 35 | part_size = len(aligned_tracks) // args.num_parts + 1 36 | assert part_size * args.num_parts >= len(aligned_tracks) 37 | part_start = part_size * args.part 38 | part_end = min(part_start + part_size, len(aligned_tracks)) 39 | print('Part {} ({}, {})'.format(args.part, part_start, part_end)) 40 | 41 | random.seed(SEED) 42 | for real_video, fake_video, aligned_track in tqdm.tqdm(aligned_tracks[part_start:part_end]): 43 | if len(aligned_track) < TRACK_LENGTH // DETECTOR_STEP: 44 | continue 45 | real_boxes = [item[1] for item in aligned_track] 46 | fake_boxes = [item[2] for item in aligned_track] 47 | start_idx = random.randint(0, len(aligned_track) - TRACK_LENGTH // DETECTOR_STEP) 48 | start_frame = aligned_track[start_idx][0] * DETECTOR_STEP 49 | middle_idx = start_idx + TRACK_LENGTH // DETECTOR_STEP // 2 50 | 51 | if random.choice([False, True]): 52 | xmin, ymin, xmax, ymax = real_boxes[middle_idx] 53 | else: 54 | xmin, ymin, xmax, ymax = fake_boxes[middle_idx] 55 | 56 | width = xmax - xmin 57 | height = ymax - ymin 58 | xcenter = xmin + width / 2 59 | ycenter = ymin + height / 2 60 | width = width * BOX_MULT 61 | height = height * BOX_MULT 62 | xmin = xcenter - width / 2 63 | ymin = ycenter - height / 2 64 | xmax = xmin + width 65 | ymax = ymin + height 66 | 67 | for video, boxes in [(real_video, 
real_boxes), (fake_video, fake_boxes)]: 68 | capture = cv2.VideoCapture(os.path.join(config['DFDC_DATA_PATH'], video)) 69 | frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) 70 | if frame_count == 0: 71 | continue 72 | frame_height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) 73 | frame_width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) 74 | 75 | xmin = max(int(xmin), 0) 76 | xmax = min(int(xmax), frame_width) 77 | ymin = max(int(ymin), 0) 78 | ymax = min(int(ymax), frame_height) 79 | 80 | dst_root = os.path.join(config['ARTIFACTS_PATH'], TRACKS_ROOT, 81 | video + '_{}_{}_{}'.format(start_frame, xmin, ymin)) 82 | if os.path.exists(dst_root): 83 | continue 84 | os.makedirs(dst_root) 85 | for i in range(start_frame + TRACK_LENGTH): 86 | capture.grab() 87 | if i < start_frame: 88 | continue 89 | ret, frame = capture.retrieve() 90 | if not ret: 91 | continue 92 | face = frame[ymin:ymax, xmin:xmax] 93 | dst_path = os.path.join(dst_root, '{}.png'.format(i - start_frame)) 94 | cv2.imwrite(dst_path, face) 95 | 96 | boxes = np.array(boxes, dtype=np.float32) 97 | boxes[:, 0] -= xmin 98 | boxes[:, 1] -= ymin 99 | boxes[:, 2] -= xmin 100 | boxes[:, 3] -= ymin 101 | boxes.tofile(os.path.join(dst_root, BOXES_FILE_NAME)) 102 | 103 | 104 | if __name__ == '__main__': 105 | main() 106 | -------------------------------------------------------------------------------- /generate_aligned_tracks.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import yaml 4 | import json 5 | from collections import defaultdict 6 | import tqdm 7 | import pickle 8 | 9 | from tracker.utils import iou 10 | 11 | from generate_tracks import TRACKS_FILE_NAME 12 | 13 | MIN_TRACK_LENGTH = 5 14 | IOU_THRESHOLD = 0.5 15 | METADATA_FILE_NAME = 'metadata.json' 16 | 17 | ALIGNED_TRACKS_FILE_NAME = 'aligned_tracks.pkl' 18 | 19 | 20 | def get_track(tracks, min_track_length): 21 | good_tracks = [track for track in tracks if len(track) >= min_track_length] 22 | if len(good_tracks) == 1: 23 | return good_tracks[0] 24 | else: 25 | return None 26 | 27 | 28 | def main(): 29 | with open('config.yaml', 'r') as f: 30 | config = yaml.load(f) 31 | 32 | video_to_meta = {} 33 | 34 | for path in glob.iglob(os.path.join(config['DFDC_DATA_PATH'], '**', METADATA_FILE_NAME), recursive=True): 35 | root = os.path.basename(os.path.dirname(path)) 36 | with open(path, 'r') as f: 37 | for video, meta in json.load(f).items(): 38 | video_to_meta[os.path.join(root, video)] = meta 39 | 40 | real_video_to_fake_videos = defaultdict(list) 41 | for video in video_to_meta: 42 | root = os.path.dirname(video) 43 | meta = video_to_meta[video] 44 | if meta['label'] == 'FAKE': 45 | original_video = os.path.join(root, meta['original']) 46 | real_video_to_fake_videos[original_video].append(video) 47 | 48 | print('Total number of real videos: {}'.format(len(real_video_to_fake_videos))) 49 | print('Total number of fake videos: {}'.format(sum([len(fake_videos) for fake_videos in real_video_to_fake_videos.items()]))) 50 | 51 | with open(os.path.join(config['ARTIFACTS_PATH'], TRACKS_FILE_NAME), 'rb') as f: 52 | video_to_tracks = pickle.load(f) 53 | 54 | real_fake_aligned_tracks = [] 55 | real_videos = sorted(real_video_to_fake_videos) 56 | for real_video in tqdm.tqdm(real_videos): 57 | if real_video not in video_to_tracks: 58 | continue 59 | real_tracks = [track for track in video_to_tracks[real_video] if len(track) >= MIN_TRACK_LENGTH] 60 | 61 | for fake_video in real_video_to_fake_videos[real_video]: 62 | if 
fake_video not in video_to_tracks: 63 | continue 64 | fake_tracks = [track for track in video_to_tracks[fake_video] if len(track) >= MIN_TRACK_LENGTH] 65 | 66 | for real_track in real_tracks: 67 | real_frame_idx_to_bbox = {} 68 | for real_frame_idx, real_bbox in real_track: 69 | real_frame_idx_to_bbox[real_frame_idx] = real_bbox 70 | 71 | for fake_track in fake_tracks: 72 | fake_frame_idx_to_bbox = {} 73 | ious = [] 74 | for fake_frame_idx, fake_bbox in fake_track: 75 | fake_frame_idx_to_bbox[fake_frame_idx] = fake_bbox 76 | if fake_frame_idx in real_frame_idx_to_bbox: 77 | real_bbox = real_frame_idx_to_bbox[fake_frame_idx] 78 | ious.append(iou(real_bbox, fake_bbox)) 79 | if len(ious) > 0 and min(ious) > IOU_THRESHOLD: 80 | start_frame_idx = max(min(real_frame_idx_to_bbox), min(fake_frame_idx_to_bbox)) 81 | end_frame_idx = min(max(real_frame_idx_to_bbox), max(fake_frame_idx_to_bbox)) + 1 82 | assert start_frame_idx < end_frame_idx 83 | real_fake_aligned_track = [] 84 | for frame_idx in range(start_frame_idx, end_frame_idx): 85 | real_bbox = real_frame_idx_to_bbox[frame_idx] 86 | fake_bbox = fake_frame_idx_to_bbox[frame_idx] 87 | assert iou(real_bbox, fake_bbox) > IOU_THRESHOLD 88 | real_fake_aligned_track.append((frame_idx, real_bbox, fake_bbox)) 89 | real_fake_aligned_tracks.append((real_video, fake_video, real_fake_aligned_track)) 90 | break 91 | 92 | print('Total number of tracks: {}'.format(len(real_fake_aligned_tracks))) 93 | 94 | with open(os.path.join(config['ARTIFACTS_PATH'], ALIGNED_TRACKS_FILE_NAME), 'wb') as f: 95 | pickle.dump(real_fake_aligned_tracks, f) 96 | 97 | 98 | if __name__ == '__main__': 99 | main() 100 | -------------------------------------------------------------------------------- /generate_track_pairs.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | import json 4 | from collections import defaultdict 5 | import glob 6 | 7 | from generate_aligned_tracks import METADATA_FILE_NAME 8 | from extract_tracks_from_videos import TRACKS_ROOT 9 | 10 | TRACK_PAIRS_FILE_NAME = 'track_pairs.txt' 11 | 12 | 13 | def main(): 14 | with open('config.yaml', 'r') as f: 15 | config = yaml.load(f) 16 | 17 | video_to_tracks = defaultdict(list) 18 | 19 | for path in glob.iglob(os.path.join(config['ARTIFACTS_PATH'], TRACKS_ROOT, 'dfdc_train_part_*', '*.mp4_*')): 20 | parts = path.split('/') 21 | rel_path = '/'.join(parts[-2:]) 22 | video = '_'.join(rel_path.split('_')[:-3]) 23 | video_to_tracks[video].append(rel_path) 24 | 25 | video_to_meta = {} 26 | 27 | for path in glob.iglob(os.path.join(config['DFDC_DATA_PATH'], '**', METADATA_FILE_NAME), recursive=True): 28 | root = os.path.basename(os.path.dirname(path)) 29 | with open(path, 'r') as f: 30 | for video, meta in json.load(f).items(): 31 | video_to_meta[os.path.join(root, video)] = meta 32 | 33 | fake_video_to_real_video = {} 34 | for video in video_to_meta: 35 | root = os.path.dirname(video) 36 | meta = video_to_meta[video] 37 | if meta['label'] == 'FAKE': 38 | original_video = os.path.join(root, meta['original']) 39 | fake_video_to_real_video[video] = original_video 40 | 41 | print('Total number of fake videos: {}'.format(len(fake_video_to_real_video))) 42 | 43 | track_pairs = [] 44 | 45 | fake_videos = sorted(fake_video_to_real_video) 46 | for fake_video in fake_videos: 47 | real_video = fake_video_to_real_video[fake_video] 48 | fake_tracks = video_to_tracks[fake_video] 49 | real_tracks = video_to_tracks[real_video] 50 | 51 | for fake_track in fake_tracks: 52 | if 
not os.path.exists(os.path.join(config['ARTIFACTS_PATH'], TRACKS_ROOT, fake_track, '0.png')): 53 | continue 54 | suffix = fake_track[len(fake_video):] 55 | for real_track in real_tracks: 56 | if not os.path.exists(os.path.join(config['ARTIFACTS_PATH'], TRACKS_ROOT, real_track, '0.png')): 57 | continue 58 | if real_track.endswith(suffix): 59 | track_pairs.append((real_track, fake_track)) 60 | break 61 | 62 | print('Total number of track pairs: {}'.format(len(track_pairs))) 63 | 64 | with open(os.path.join(config['ARTIFACTS_PATH'], TRACK_PAIRS_FILE_NAME), 'w') as f: 65 | for real_track, fake_track in track_pairs: 66 | f.write('{},{}\n'.format(real_track, fake_track)) 67 | 68 | 69 | if __name__ == '__main__': 70 | main() 71 | -------------------------------------------------------------------------------- /generate_tracks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import tqdm 4 | import glob 5 | import pickle 6 | 7 | from tracker.iou_tracker import track_iou 8 | from detect_faces_on_videos import DETECTIONS_FILE_NAME, DETECTIONS_ROOT 9 | 10 | SIGMA_L = 0.3 11 | SIGMA_H = 0.9 12 | SIGMA_IOU = 0.3 13 | T_MIN = 1 14 | 15 | TRACKS_FILE_NAME = 'tracks.pkl' 16 | 17 | 18 | def get_tracks(detections): 19 | if len(detections) == 0: 20 | return [] 21 | 22 | converted_detections = [] 23 | for i, detections_per_frame in enumerate(detections): 24 | converted_detections_per_frame = [] 25 | for j, (bbox, score) in enumerate(zip(detections_per_frame['boxes'], detections_per_frame['scores'])): 26 | bbox = tuple(bbox.tolist()) 27 | converted_detections_per_frame.append({'bbox': bbox, 'score': score}) 28 | converted_detections.append(converted_detections_per_frame) 29 | 30 | tracks = track_iou(converted_detections, SIGMA_L, SIGMA_H, SIGMA_IOU, T_MIN) 31 | tracks_converted = [] 32 | for track in tracks: 33 | track_converted = [] 34 | start_frame = track['start_frame'] - 1 35 | for i, bbox in enumerate(track['bboxes']): 36 | track_converted.append((start_frame + i, bbox)) 37 | tracks_converted.append(track_converted) 38 | 39 | return tracks_converted 40 | 41 | 42 | def main(): 43 | with open('config.yaml', 'r') as f: 44 | config = yaml.load(f) 45 | 46 | root_dir = os.path.join(config['ARTIFACTS_PATH'], DETECTIONS_ROOT) 47 | detections_content = [] 48 | for path in glob.iglob(os.path.join(root_dir, '**', DETECTIONS_FILE_NAME), recursive=True): 49 | rel_path = path[len(root_dir) + 1:] 50 | detections_content.append(rel_path) 51 | 52 | detections_content = sorted(detections_content) 53 | print('Total number of videos: {}'.format(len(detections_content))) 54 | 55 | video_to_tracks = {} 56 | for rel_path in tqdm.tqdm(detections_content): 57 | video = os.path.dirname(rel_path) 58 | with open(os.path.join(root_dir, rel_path), 'rb') as f: 59 | detections = pickle.load(f) 60 | video_to_tracks[video] = get_tracks(detections) 61 | 62 | track_count = sum([len(tracks) for tracks in video_to_tracks.values()]) 63 | print('Total number of tracks: {}'.format(track_count)) 64 | 65 | with open(os.path.join(config['ARTIFACTS_PATH'], TRACKS_FILE_NAME), 'wb') as f: 66 | pickle.dump(video_to_tracks, f) 67 | 68 | 69 | if __name__ == '__main__': 70 | main() 71 | -------------------------------------------------------------------------------- /images/augmented_mixup.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NTech-Lab/deepfake-detection-challenge/52095ce4a49f298faf075a5eb28391722b9e4103/images/augmented_mixup.jpg -------------------------------------------------------------------------------- /images/clip_example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NTech-Lab/deepfake-detection-challenge/52095ce4a49f298faf075a5eb28391722b9e4103/images/clip_example.jpg -------------------------------------------------------------------------------- /images/first_and_second_model_inputs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NTech-Lab/deepfake-detection-challenge/52095ce4a49f298faf075a5eb28391722b9e4103/images/first_and_second_model_inputs.jpg -------------------------------------------------------------------------------- /images/mixup_example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NTech-Lab/deepfake-detection-challenge/52095ce4a49f298faf075a5eb28391722b9e4103/images/mixup_example.jpg -------------------------------------------------------------------------------- /images/pred_transform.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NTech-Lab/deepfake-detection-challenge/52095ce4a49f298faf075a5eb28391722b9e4103/images/pred_transform.jpg -------------------------------------------------------------------------------- /images/third_model_input.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NTech-Lab/deepfake-detection-challenge/52095ce4a49f298faf075a5eb28391722b9e4103/images/third_model_input.jpg -------------------------------------------------------------------------------- /models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NTech-Lab/deepfake-detection-challenge/52095ce4a49f298faf075a5eb28391722b9e4103/models/.gitkeep -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import glob 4 | 5 | import numpy as np 6 | import cv2 7 | 8 | import torch 9 | from torch import nn 10 | from torch.utils.data import Dataset, DataLoader 11 | 12 | from torchvision.models.detection.transform import GeneralizedRCNNTransform 13 | 14 | from albumentations import Compose, SmallestMaxSize, CenterCrop, Normalize, PadIfNeeded 15 | from albumentations.pytorch import ToTensor 16 | 17 | from dsfacedetector.face_ssd_infer import SSD 18 | from tracker.iou_tracker import track_iou 19 | from efficientnet_pytorch.model import EfficientNet, MBConvBlock 20 | 21 | DETECTOR_WEIGHTS_PATH = 'WIDERFace_DSFD_RES152.fp16.pth' 22 | DETECTOR_THRESHOLD = 0.3 23 | DETECTOR_MIN_SIZE = 512 24 | DETECTOR_MAX_SIZE = 512 25 | DETECTOR_MEAN = (104.0, 117.0, 123.0) 26 | DETECTOR_STD = (1.0, 1.0, 1.0) 27 | DETECTOR_BATCH_SIZE = 16 28 | DETECTOR_STEP = 3 29 | 30 | TRACKER_SIGMA_L = 0.3 31 | TRACKER_SIGMA_H = 0.9 32 | TRACKER_SIGMA_IOU = 0.3 33 | TRACKER_T_MIN = 7 34 | 35 | VIDEO_MODEL_BBOX_MULT = 1.5 36 | VIDEO_MODEL_MIN_SIZE = 224 37 | VIDEO_MODEL_CROP_HEIGHT = 224 38 | VIDEO_MODEL_CROP_WIDTH = 192 39 | VIDEO_FACE_MODEL_TRACK_STEP = 2 40 | VIDEO_SEQUENCE_MODEL_SEQUENCE_LENGTH = 7 41 | 
VIDEO_SEQUENCE_MODEL_TRACK_STEP = 14 42 | 43 | VIDEO_SEQUENCE_MODEL_WEIGHTS_PATH = 'efficientnet-b7_ns_seq_aa-original-mstd0.5_100k_v4_cad79a/snapshot_100000.fp16.pth' 44 | FIRST_VIDEO_FACE_MODEL_WEIGHTS_PATH = 'efficientnet-b7_ns_aa-original-mstd0.5_large_crop_100k_v4_cad79a/snapshot_100000.fp16.pth' 45 | SECOND_VIDEO_FACE_MODEL_WEIGHTS_PATH = 'efficientnet-b7_ns_aa-original-mstd0.5_re_100k_v4_cad79a/snapshot_100000.fp16.pth' 46 | 47 | VIDEO_BATCH_SIZE = 1 48 | VIDEO_TARGET_FPS = 15 49 | VIDEO_NUM_WORKERS = 0 50 | 51 | 52 | class UnlabeledVideoDataset(Dataset): 53 | def __init__(self, root_dir, content=None): 54 | self.root_dir = os.path.normpath(root_dir) 55 | if content is not None: 56 | self.content = content 57 | else: 58 | self.content = [] 59 | for path in glob.iglob(os.path.join(self.root_dir, '**', '*.mp4'), recursive=True): 60 | rel_path = path[len(self.root_dir) + 1:] 61 | self.content.append(rel_path) 62 | self.content = sorted(self.content) 63 | 64 | def __len__(self): 65 | return len(self.content) 66 | 67 | def __getitem__(self, idx): 68 | rel_path = self.content[idx] 69 | path = os.path.join(self.root_dir, rel_path) 70 | 71 | sample = { 72 | 'frames': [], 73 | 'index': idx 74 | } 75 | 76 | capture = cv2.VideoCapture(path) 77 | frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) 78 | if frame_count == 0: 79 | return sample 80 | 81 | fps = int(capture.get(cv2.CAP_PROP_FPS)) 82 | video_step = round(fps / VIDEO_TARGET_FPS) 83 | if video_step == 0: 84 | return sample 85 | 86 | for i in range(frame_count): 87 | capture.grab() 88 | if i % video_step != 0: 89 | continue 90 | ret, frame = capture.retrieve() 91 | if not ret: 92 | continue 93 | 94 | sample['frames'].append(frame) 95 | 96 | return sample 97 | 98 | 99 | class Detector(object): 100 | def __init__(self, weights_path): 101 | self.model = SSD('test') 102 | self.model.cuda().eval() 103 | 104 | state = torch.load(weights_path, map_location=lambda storage, loc: storage) 105 | state = {key: value.float() for key, value in state.items()} 106 | self.model.load_state_dict(state) 107 | 108 | self.transform = GeneralizedRCNNTransform(DETECTOR_MIN_SIZE, DETECTOR_MAX_SIZE, DETECTOR_MEAN, DETECTOR_STD) 109 | self.transform.eval() 110 | 111 | def detect(self, images): 112 | images = torch.stack([torch.from_numpy(image).cuda() for image in images]) 113 | images = images.transpose(1, 3).transpose(2, 3).float() 114 | original_image_sizes = [img.shape[-2:] for img in images] 115 | images, _ = self.transform(images, None) 116 | with torch.no_grad(): 117 | detections_batch = self.model(images.tensors).cpu().numpy() 118 | result = [] 119 | for detections, image_size in zip(detections_batch, images.image_sizes): 120 | scores = detections[1, :, 0] 121 | keep_idxs = scores > DETECTOR_THRESHOLD 122 | detections = detections[1, keep_idxs, :] 123 | detections = detections[:, [1, 2, 3, 4, 0]] 124 | detections[:, 0] *= image_size[1] 125 | detections[:, 1] *= image_size[0] 126 | detections[:, 2] *= image_size[1] 127 | detections[:, 3] *= image_size[0] 128 | result.append({ 129 | 'scores': torch.from_numpy(detections[:, 4]), 130 | 'boxes': torch.from_numpy(detections[:, :4]) 131 | }) 132 | 133 | result = self.transform.postprocess(result, images.image_sizes, original_image_sizes) 134 | return result 135 | 136 | 137 | def get_tracks(detections): 138 | if len(detections) == 0: 139 | return [] 140 | 141 | converted_detections = [] 142 | frame_bbox_to_face_idx = {} 143 | for i, detections_per_frame in enumerate(detections): 144 | 
converted_detections_per_frame = [] 145 | for j, (bbox, score) in enumerate(zip(detections_per_frame['boxes'], detections_per_frame['scores'])): 146 | bbox = tuple(bbox.tolist()) 147 | frame_bbox_to_face_idx[(i, bbox)] = j 148 | converted_detections_per_frame.append({'bbox': bbox, 'score': score}) 149 | converted_detections.append(converted_detections_per_frame) 150 | 151 | tracks = track_iou(converted_detections, TRACKER_SIGMA_L, TRACKER_SIGMA_H, TRACKER_SIGMA_IOU, TRACKER_T_MIN) 152 | tracks_converted = [] 153 | for track in tracks: 154 | start_frame = track['start_frame'] - 1 155 | bboxes = np.array(track['bboxes'], dtype=np.float32) 156 | frame_indices = np.arange(start_frame, start_frame + len(bboxes)) * DETECTOR_STEP 157 | interp_frame_indices = np.arange(frame_indices[0], frame_indices[-1] + 1) 158 | interp_bboxes = np.zeros((len(interp_frame_indices), 4), dtype=np.float32) 159 | for i in range(4): 160 | interp_bboxes[:, i] = np.interp(interp_frame_indices, frame_indices, bboxes[:, i]) 161 | 162 | track_converted = [] 163 | for frame_idx, bbox in zip(interp_frame_indices, interp_bboxes): 164 | track_converted.append((frame_idx, bbox)) 165 | tracks_converted.append(track_converted) 166 | 167 | return tracks_converted 168 | 169 | 170 | class SeqExpandConv(nn.Module): 171 | def __init__(self, in_channels, out_channels, seq_length): 172 | super(SeqExpandConv, self).__init__() 173 | self.conv = nn.Conv3d(in_channels, out_channels, kernel_size=(3, 1, 1), padding=(1, 0, 0), bias=False) 174 | self.seq_length = seq_length 175 | 176 | def forward(self, x): 177 | batch_size, in_channels, height, width = x.shape 178 | x = x.view(batch_size // self.seq_length, self.seq_length, in_channels, height, width) 179 | x = self.conv(x.transpose(1, 2).contiguous()).transpose(2, 1).contiguous() 180 | x = x.flatten(0, 1) 181 | return x 182 | 183 | 184 | class TrackSequencesClassifier(object): 185 | def __init__(self, weights_path): 186 | model = EfficientNet.from_name('efficientnet-b7', override_params={'num_classes': 1}) 187 | 188 | for module in model.modules(): 189 | if isinstance(module, MBConvBlock): 190 | if module._block_args.expand_ratio != 1: 191 | expand_conv = module._expand_conv 192 | seq_expand_conv = SeqExpandConv(expand_conv.in_channels, expand_conv.out_channels, 193 | VIDEO_SEQUENCE_MODEL_SEQUENCE_LENGTH) 194 | module._expand_conv = seq_expand_conv 195 | self.model = model.cuda().eval() 196 | 197 | normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 198 | self.transform = Compose( 199 | [SmallestMaxSize(VIDEO_MODEL_MIN_SIZE), CenterCrop(VIDEO_MODEL_CROP_HEIGHT, VIDEO_MODEL_CROP_WIDTH), 200 | normalize, ToTensor()]) 201 | 202 | state = torch.load(weights_path, map_location=lambda storage, loc: storage) 203 | state = {key: value.float() for key, value in state.items()} 204 | self.model.load_state_dict(state) 205 | 206 | def classify(self, track_sequences): 207 | track_sequences = [torch.stack([self.transform(image=face)['image'] for face in sequence]) for sequence in 208 | track_sequences] 209 | track_sequences = torch.cat(track_sequences).cuda() 210 | with torch.no_grad(): 211 | track_probs = torch.sigmoid(self.model(track_sequences)).flatten().cpu().numpy() 212 | 213 | return track_probs 214 | 215 | 216 | class TrackFacesClassifier(object): 217 | def __init__(self, first_weights_path, second_weights_path): 218 | first_model = EfficientNet.from_name('efficientnet-b7', override_params={'num_classes': 1}) 219 | self.first_model = first_model.cuda().eval() 220 | 
second_model = EfficientNet.from_name('efficientnet-b7', override_params={'num_classes': 1}) 221 | self.second_model = second_model.cuda().eval() 222 | 223 | first_normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 224 | self.first_transform = Compose( 225 | [SmallestMaxSize(VIDEO_MODEL_CROP_WIDTH), PadIfNeeded(VIDEO_MODEL_CROP_HEIGHT, VIDEO_MODEL_CROP_WIDTH), 226 | CenterCrop(VIDEO_MODEL_CROP_HEIGHT, VIDEO_MODEL_CROP_WIDTH), first_normalize, ToTensor()]) 227 | 228 | second_normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 229 | self.second_transform = Compose( 230 | [SmallestMaxSize(VIDEO_MODEL_MIN_SIZE), CenterCrop(VIDEO_MODEL_CROP_HEIGHT, VIDEO_MODEL_CROP_WIDTH), 231 | second_normalize, ToTensor()]) 232 | 233 | first_state = torch.load(first_weights_path, map_location=lambda storage, loc: storage) 234 | first_state = {key: value.float() for key, value in first_state.items()} 235 | self.first_model.load_state_dict(first_state) 236 | 237 | second_state = torch.load(second_weights_path, map_location=lambda storage, loc: storage) 238 | second_state = {key: value.float() for key, value in second_state.items()} 239 | self.second_model.load_state_dict(second_state) 240 | 241 | def classify(self, track_faces): 242 | first_track_faces = [] 243 | second_track_faces = [] 244 | for i, face in enumerate(track_faces): 245 | if i % 4 < 2: 246 | first_track_faces.append(self.first_transform(image=face)['image']) 247 | else: 248 | second_track_faces.append(self.second_transform(image=face)['image']) 249 | first_track_faces = torch.stack(first_track_faces).cuda() 250 | second_track_faces = torch.stack(second_track_faces).cuda() 251 | with torch.no_grad(): 252 | first_track_probs = torch.sigmoid(self.first_model(first_track_faces)).flatten().cpu().numpy() 253 | second_track_probs = torch.sigmoid(self.second_model(second_track_faces)).flatten().cpu().numpy() 254 | track_probs = np.concatenate((first_track_probs, second_track_probs)) 255 | 256 | return track_probs 257 | 258 | 259 | def extract_sequence(frames, start_idx, bbox, flip): 260 | frame_height, frame_width, _ = frames[start_idx].shape 261 | xmin, ymin, xmax, ymax = bbox 262 | width = xmax - xmin 263 | height = ymax - ymin 264 | xcenter = xmin + width / 2 265 | ycenter = ymin + height / 2 266 | width = width * VIDEO_MODEL_BBOX_MULT 267 | height = height * VIDEO_MODEL_BBOX_MULT 268 | xmin = xcenter - width / 2 269 | ymin = ycenter - height / 2 270 | xmax = xmin + width 271 | ymax = ymin + height 272 | 273 | xmin = max(int(xmin), 0) 274 | xmax = min(int(xmax), frame_width) 275 | ymin = max(int(ymin), 0) 276 | ymax = min(int(ymax), frame_height) 277 | 278 | sequence = [] 279 | for i in range(VIDEO_SEQUENCE_MODEL_SEQUENCE_LENGTH): 280 | face = cv2.cvtColor(frames[start_idx + i][ymin:ymax, xmin:xmax], cv2.COLOR_BGR2RGB) 281 | sequence.append(face) 282 | 283 | if flip: 284 | sequence = [face[:, ::-1] for face in sequence] 285 | 286 | return sequence 287 | 288 | 289 | def extract_face(frame, bbox, flip): 290 | frame_height, frame_width, _ = frame.shape 291 | xmin, ymin, xmax, ymax = bbox 292 | width = xmax - xmin 293 | height = ymax - ymin 294 | xcenter = xmin + width / 2 295 | ycenter = ymin + height / 2 296 | width = width * VIDEO_MODEL_BBOX_MULT 297 | height = height * VIDEO_MODEL_BBOX_MULT 298 | xmin = xcenter - width / 2 299 | ymin = ycenter - height / 2 300 | xmax = xmin + width 301 | ymax = ymin + height 302 | 303 | xmin = max(int(xmin), 0) 304 | xmax = min(int(xmax), frame_width) 305 | ymin = 
max(int(ymin), 0) 306 | ymax = min(int(ymax), frame_height) 307 | 308 | face = cv2.cvtColor(frame[ymin:ymax, xmin:xmax], cv2.COLOR_BGR2RGB) 309 | if flip: 310 | face = face[:, ::-1].copy() 311 | 312 | return face 313 | 314 | 315 | def main(): 316 | with open('config.yaml', 'r') as f: 317 | config = yaml.load(f) 318 | 319 | detector = Detector(os.path.join(config['MODELS_PATH'], DETECTOR_WEIGHTS_PATH)) 320 | track_sequences_classifier = TrackSequencesClassifier(os.path.join(config['MODELS_PATH'], VIDEO_SEQUENCE_MODEL_WEIGHTS_PATH)) 321 | track_faces_classifier = TrackFacesClassifier(os.path.join(config['MODELS_PATH'], FIRST_VIDEO_FACE_MODEL_WEIGHTS_PATH), 322 | os.path.join(config['MODELS_PATH'], SECOND_VIDEO_FACE_MODEL_WEIGHTS_PATH)) 323 | 324 | dataset = UnlabeledVideoDataset(os.path.join(config['DFDC_DATA_PATH'], 'test_videos')) 325 | print('Total number of videos: {}'.format(len(dataset))) 326 | 327 | loader = DataLoader(dataset, batch_size=VIDEO_BATCH_SIZE, shuffle=False, num_workers=VIDEO_NUM_WORKERS, 328 | collate_fn=lambda X: X, 329 | drop_last=False) 330 | 331 | video_name_to_score = {} 332 | 333 | for video_sample in loader: 334 | frames = video_sample[0]['frames'] 335 | detector_frames = frames[::DETECTOR_STEP] 336 | video_idx = video_sample[0]['index'] 337 | video_rel_path = dataset.content[video_idx] 338 | video_name = os.path.basename(video_rel_path) 339 | 340 | if len(frames) == 0: 341 | video_name_to_score[video_name] = 0.5 342 | continue 343 | 344 | detections = [] 345 | for start in range(0, len(detector_frames), DETECTOR_BATCH_SIZE): 346 | end = min(len(detector_frames), start + DETECTOR_BATCH_SIZE) 347 | detections_batch = detector.detect(detector_frames[start:end]) 348 | for detections_per_frame in detections_batch: 349 | detections.append({key: value.cpu().numpy() for key, value in detections_per_frame.items()}) 350 | 351 | tracks = get_tracks(detections) 352 | if len(tracks) == 0: 353 | video_name_to_score[video_name] = 0.5 354 | continue 355 | 356 | sequence_track_scores = [] 357 | for track in tracks: 358 | track_sequences = [] 359 | for i, (start_idx, _) in enumerate( 360 | track[:-VIDEO_SEQUENCE_MODEL_SEQUENCE_LENGTH + 1:VIDEO_SEQUENCE_MODEL_TRACK_STEP]): 361 | assert start_idx >= 0 and start_idx + VIDEO_SEQUENCE_MODEL_SEQUENCE_LENGTH <= len(frames) 362 | _, bbox = track[i * VIDEO_SEQUENCE_MODEL_TRACK_STEP + VIDEO_SEQUENCE_MODEL_SEQUENCE_LENGTH // 2] 363 | track_sequences.append(extract_sequence(frames, start_idx, bbox, i % 2 == 0)) 364 | sequence_track_scores.append(track_sequences_classifier.classify(track_sequences)) 365 | 366 | face_track_scores = [] 367 | for track in tracks: 368 | track_faces = [] 369 | for i, (frame_idx, bbox) in enumerate(track[::VIDEO_FACE_MODEL_TRACK_STEP]): 370 | face = extract_face(frames[frame_idx], bbox, i % 2 == 0) 371 | track_faces.append(face) 372 | face_track_scores.append(track_faces_classifier.classify(track_faces)) 373 | 374 | sequence_track_scores = np.concatenate(sequence_track_scores) 375 | face_track_scores = np.concatenate(face_track_scores) 376 | track_probs = np.concatenate((sequence_track_scores, face_track_scores)) 377 | 378 | delta = track_probs - 0.5 379 | sign = np.sign(delta) 380 | pos_delta = delta > 0 381 | neg_delta = delta < 0 382 | track_probs[pos_delta] = np.clip(0.5 + sign[pos_delta] * np.power(abs(delta[pos_delta]), 0.65), 0.01, 0.99) 383 | track_probs[neg_delta] = np.clip(0.5 + sign[neg_delta] * np.power(abs(delta[neg_delta]), 0.65), 0.01, 0.99) 384 | weights = np.power(abs(delta), 1.0) + 1e-4 385 | 
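# The per-crop fake probabilities were just pushed away from 0.5 before averaging:
# p' = clip(0.5 + sign(p - 0.5) * |p - 0.5| ** 0.65, 0.01, 0.99), and each prediction gets the
# confidence weight |p - 0.5| + 1e-4 (the epsilon keeps the weighted mean defined even if every
# crop scored exactly 0.5). A minimal sketch of the same mapping (hypothetical helper name, not
# part of this script):
#
#     def _rescale(p, power=0.65, eps=0.01):
#         d = p - 0.5
#         return float(np.clip(0.5 + np.sign(d) * abs(d) ** power, eps, 1.0 - eps))
#
# e.g. _rescale(0.70) ~= 0.851 and _rescale(0.55) ~= 0.643, while anything further than about
# 0.33 from 0.5 saturates at 0.99 or 0.01; confident crops therefore dominate the weighted mean
# taken on the next line.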
video_score = float((track_probs * weights).sum() / weights.sum()) 386 | 387 | video_name_to_score[video_name] = video_score 388 | print('NUM DETECTION FRAMES: {}, VIDEO SCORE: {}. {}'.format(len(detections), video_name_to_score[video_name], 389 | video_rel_path)) 390 | 391 | os.makedirs(os.path.dirname(config['SUBMISSION_PATH']), exist_ok=True) 392 | with open(config['SUBMISSION_PATH'], 'w') as f: 393 | f.write('filename,label\n') 394 | for video_name in sorted(video_name_to_score): 395 | score = video_name_to_score[video_name] 396 | f.write('{},{}\n'.format(video_name, score)) 397 | 398 | 399 | main() -------------------------------------------------------------------------------- /tracker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NTech-Lab/deepfake-detection-challenge/52095ce4a49f298faf075a5eb28391722b9e4103/tracker/__init__.py -------------------------------------------------------------------------------- /tracker/iou_tracker.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/bochinski/iou-tracker 2 | 3 | from .utils import iou 4 | 5 | 6 | def track_iou(detections, sigma_l, sigma_h, sigma_iou, t_min): 7 | """ 8 | Simple IOU based tracker. 9 | See "High-Speed Tracking-by-Detection Without Using Image Information by E. Bochinski, V. Eiselein, T. Sikora" for 10 | more information. 11 | 12 | Args: 13 | detections (list): list of detections per frame, usually generated by util.load_mot 14 | sigma_l (float): low detection threshold. 15 | sigma_h (float): high detection threshold. 16 | sigma_iou (float): IOU threshold. 17 | t_min (float): minimum track length in frames. 18 | 19 | Returns: 20 | list: list of tracks. 21 | """ 22 | 23 | tracks_active = [] 24 | tracks_finished = [] 25 | 26 | for frame_num, detections_frame in enumerate(detections, start=1): 27 | # apply low threshold to detections 28 | dets = [det for det in detections_frame if det['score'] >= sigma_l] 29 | 30 | updated_tracks = [] 31 | for track in tracks_active: 32 | if len(dets) > 0: 33 | # get det with highest iou 34 | best_match = max(dets, key=lambda x: iou(track['bboxes'][-1], x['bbox'])) 35 | if iou(track['bboxes'][-1], best_match['bbox']) >= sigma_iou: 36 | track['bboxes'].append(best_match['bbox']) 37 | track['max_score'] = max(track['max_score'], best_match['score']) 38 | 39 | updated_tracks.append(track) 40 | 41 | # remove from best matching detection from detections 42 | del dets[dets.index(best_match)] 43 | 44 | # if track was not updated 45 | if len(updated_tracks) == 0 or track is not updated_tracks[-1]: 46 | # finish track when the conditions are met 47 | if track['max_score'] >= sigma_h and len(track['bboxes']) >= t_min: 48 | tracks_finished.append(track) 49 | 50 | # create new tracks 51 | new_tracks = [{'bboxes': [det['bbox']], 'max_score': det['score'], 'start_frame': frame_num} for det in dets] 52 | tracks_active = updated_tracks + new_tracks 53 | 54 | # finish all remaining active tracks 55 | tracks_finished += [track for track in tracks_active 56 | if track['max_score'] >= sigma_h and len(track['bboxes']) >= t_min] 57 | 58 | return tracks_finished 59 | -------------------------------------------------------------------------------- /tracker/utils.py: -------------------------------------------------------------------------------- 1 | def iou(bbox1, bbox2): 2 | """ 3 | Calculates the intersection-over-union of two bounding boxes. 
4 | 5 | Args: 6 | bbox1 (numpy.array, list of floats): bounding box in format x1,y1,x2,y2. 7 | bbox2 (numpy.array, list of floats): bounding box in format x1,y1,x2,y2. 8 | 9 | Returns: 10 | float: intersection-over-union of bbox1, bbox2 11 | """ 12 | 13 | bbox1 = [float(x) for x in bbox1] 14 | bbox2 = [float(x) for x in bbox2] 15 | 16 | (x0_1, y0_1, x1_1, y1_1) = bbox1 17 | (x0_2, y0_2, x1_2, y1_2) = bbox2 18 | 19 | # get the overlap rectangle 20 | overlap_x0 = max(x0_1, x0_2) 21 | overlap_y0 = max(y0_1, y0_2) 22 | overlap_x1 = min(x1_1, x1_2) 23 | overlap_y1 = min(y1_1, y1_2) 24 | 25 | # check if there is an overlap 26 | if overlap_x1 - overlap_x0 <= 0 or overlap_y1 - overlap_y0 <= 0: 27 | return 0 28 | 29 | # if yes, calculate the ratio of the overlap to each ROI size and the unified size 30 | size_1 = (x1_1 - x0_1) * (y1_1 - y0_1) 31 | size_2 = (x1_2 - x0_2) * (y1_2 - y0_2) 32 | size_intersection = (overlap_x1 - overlap_x0) * (overlap_y1 - overlap_y0) 33 | size_union = size_1 + size_2 - size_intersection 34 | 35 | return size_intersection / size_union -------------------------------------------------------------------------------- /train_b7_ns_aa_original_large_crop_100k.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | import random 4 | import tqdm 5 | 6 | import numpy as np 7 | from PIL import Image 8 | 9 | import torch 10 | from torch import distributions 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader 13 | 14 | from torch.utils.tensorboard import SummaryWriter 15 | 16 | import ffmpeg 17 | 18 | from albumentations import ImageOnlyTransform 19 | from albumentations import SmallestMaxSize, PadIfNeeded, HorizontalFlip, Normalize, Compose, RandomCrop 20 | from albumentations.pytorch import ToTensor 21 | from efficientnet_pytorch import EfficientNet 22 | 23 | from timm.data.transforms_factory import transforms_imagenet_train 24 | 25 | from datasets import TrackPairDataset 26 | from extract_tracks_from_videos import TRACK_LENGTH, TRACKS_ROOT 27 | from generate_track_pairs import TRACK_PAIRS_FILE_NAME 28 | 29 | SEED = 30 30 | BATCH_SIZE = 8 31 | TRAIN_INDICES = [9, 13, 17, 21, 25, 29, 33, 37] 32 | INITIAL_LR = 0.005 33 | MOMENTUM = 0.9 34 | WEIGHT_DECAY = 1e-4 35 | NUM_WORKERS = 8 36 | NUM_WARMUP_ITERATIONS = 100 37 | SNAPSHOT_FREQUENCY = 1000 38 | OUTPUT_FOLDER_NAME = 'efficientnet-b7_ns_aa-original-mstd0.5_large_crop_100k' 39 | SNAPSHOT_NAME_TEMPLATE = 'snapshot_{}.pth' 40 | MAX_ITERS = 100000 41 | 42 | FPS_RANGE = (15, 30) 43 | SCALE_RANGE = (0.25, 1) 44 | CRF_RANGE = (17, 40) 45 | TUNE_VALUES = ['film', 'animation', 'grain', 'stillimage', 'fastdecode', 'zerolatency'] 46 | 47 | CROP_HEIGHT = 224 48 | CROP_WIDTH = 192 49 | 50 | PRETRAINED_WEIGHTS_PATH = 'external_data/noisy_student_efficientnet-b7.pth' 51 | SNAPSHOTS_ROOT = 'snapshots' 52 | LOGS_ROOT = 'logs' 53 | 54 | 55 | class TrackTransform(object): 56 | def __init__(self, fps_range, scale_range, crf_range, tune_values): 57 | self.fps_range = fps_range 58 | self.scale_range = scale_range 59 | self.crf_range = crf_range 60 | self.tune_values = tune_values 61 | 62 | def get_params(self, src_fps, src_height, src_width): 63 | if random.random() > 0.5: 64 | return None 65 | 66 | dst_fps = src_fps 67 | if random.random() > 0.5: 68 | dst_fps = random.randrange(*self.fps_range) 69 | 70 | scale = 1.0 71 | if random.random() > 0.5: 72 | scale = random.uniform(*self.scale_range) 73 | 74 | dst_height = round(scale * src_height) // 2 * 2 75 |
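# get_params keeps the track unchanged half of the time (returning None, which the dataset is
# assumed to treat as "skip re-encoding"); otherwise it optionally resamples the frame rate and
# spatial scale and always draws an x264 CRF and tune preset, and __call__ below re-encodes the
# PNG frames through ffmpeg with those settings to imitate the varied compression of DFDC videos.
# The height above (and the width on the next line) is rounded down to an even number, most
# likely because libx264's default 4:2:0 chroma subsampling requires even frame dimensions.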
dst_width = round(scale * src_width) // 2 * 2 76 | 77 | crf = random.randrange(*self.crf_range) 78 | tune = random.choice(self.tune_values) 79 | 80 | return dst_fps, dst_height, dst_width, crf, tune 81 | 82 | def __call__(self, track_path, src_fps, dst_fps, dst_height, dst_width, crf, tune): 83 | out, err = ( 84 | ffmpeg 85 | .input(os.path.join(track_path, '%d.png'), framerate=src_fps, start_number=0) 86 | .filter('fps', fps=dst_fps) 87 | .filter('scale', dst_width, dst_height) 88 | .output('pipe:', format='h264', vcodec='libx264', crf=crf, tune=tune) 89 | .run(capture_stdout=True, quiet=True) 90 | ) 91 | out, err = ( 92 | ffmpeg 93 | .input('pipe:', format='h264') 94 | .output('pipe:', format='rawvideo', pix_fmt='rgb24') 95 | .run(capture_stdout=True, input=out, quiet=True) 96 | ) 97 | 98 | imgs = np.frombuffer(out, dtype=np.uint8).reshape(-1, dst_height, dst_width, 3) 99 | 100 | return imgs 101 | 102 | 103 | class VisionTransform(ImageOnlyTransform): 104 | def __init__( 105 | self, transform, is_tensor=True, always_apply=False, p=1.0 106 | ): 107 | super(VisionTransform, self).__init__(always_apply, p) 108 | self.transform = transform 109 | self.is_tensor = is_tensor 110 | 111 | def apply(self, image, **params): 112 | if self.is_tensor: 113 | return self.transform(image) 114 | else: 115 | return np.array(self.transform(Image.fromarray(image))) 116 | 117 | def get_transform_init_args_names(self): 118 | return ("transform") 119 | 120 | 121 | def set_global_seed(seed): 122 | torch.manual_seed(seed) 123 | if torch.cuda.is_available(): 124 | torch.cuda.manual_seed_all(seed) 125 | random.seed(seed) 126 | np.random.seed(seed) 127 | 128 | 129 | def prepare_cudnn(deterministic=None, benchmark=None): 130 | # https://pytorch.org/docs/stable/notes/randomness.html#cudnn 131 | if deterministic is None: 132 | deterministic = os.environ.get("CUDNN_DETERMINISTIC", "True") == "True" 133 | torch.backends.cudnn.deterministic = deterministic 134 | 135 | # https://discuss.pytorch.org/t/how-should-i-disable-using-cudnn-in-my-code/38053/4 136 | if benchmark is None: 137 | benchmark = os.environ.get("CUDNN_BENCHMARK", "True") == "True" 138 | torch.backends.cudnn.benchmark = benchmark 139 | 140 | 141 | def main(): 142 | with open('config.yaml', 'r') as f: 143 | config = yaml.load(f) 144 | 145 | set_global_seed(SEED) 146 | prepare_cudnn(deterministic=True, benchmark=True) 147 | 148 | model = EfficientNet.from_name('efficientnet-b7', override_params={'num_classes': 1}) 149 | state = torch.load(PRETRAINED_WEIGHTS_PATH, map_location=lambda storage, loc: storage) 150 | state.pop('_fc.weight') 151 | state.pop('_fc.bias') 152 | res = model.load_state_dict(state, strict=False) 153 | assert set(res.missing_keys) == set(['_fc.weight', '_fc.bias']), 'issue loading pretrained weights' 154 | model = model.cuda() 155 | 156 | normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 157 | _, rand_augment, _ = transforms_imagenet_train((CROP_HEIGHT, CROP_WIDTH), auto_augment='original-mstd0.5', 158 | separate=True) 159 | 160 | train_dataset = TrackPairDataset(os.path.join(config['ARTIFACTS_PATH'], TRACKS_ROOT), 161 | os.path.join(config['ARTIFACTS_PATH'], TRACK_PAIRS_FILE_NAME), 162 | TRAIN_INDICES, 163 | track_length=TRACK_LENGTH, 164 | track_transform=TrackTransform(FPS_RANGE, SCALE_RANGE, CRF_RANGE, TUNE_VALUES), 165 | image_transform=Compose([ 166 | SmallestMaxSize(CROP_WIDTH), 167 | PadIfNeeded(CROP_HEIGHT, CROP_WIDTH), 168 | HorizontalFlip(), 169 | RandomCrop(CROP_HEIGHT, CROP_WIDTH), 170 | 
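# with probability 0.5, run timm's AutoAugment policy 'original-mstd0.5' (rand_augment above)
# on the crop via a PIL round-trip, wrapped for albumentations by VisionTransform: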
VisionTransform(rand_augment, is_tensor=False, p=0.5), 171 | normalize, 172 | ToTensor() 173 | ]), sequence_mode=False) 174 | 175 | print('Train dataset size: {}.'.format(len(train_dataset))) 176 | 177 | warmup_optimizer = torch.optim.SGD(model._fc.parameters(), INITIAL_LR, momentum=MOMENTUM, 178 | weight_decay=WEIGHT_DECAY, nesterov=True) 179 | 180 | full_optimizer = torch.optim.SGD(model.parameters(), INITIAL_LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY, 181 | nesterov=True) 182 | full_lr_scheduler = torch.optim.lr_scheduler.LambdaLR(full_optimizer, 183 | lambda iteration: (MAX_ITERS - iteration) / MAX_ITERS) 184 | 185 | snapshots_root = os.path.join(config['ARTIFACTS_PATH'], SNAPSHOTS_ROOT, OUTPUT_FOLDER_NAME) 186 | os.makedirs(snapshots_root) 187 | log_root = os.path.join(config['ARTIFACTS_PATH'], LOGS_ROOT, OUTPUT_FOLDER_NAME) 188 | os.makedirs(log_root) 189 | 190 | writer = SummaryWriter(log_root) 191 | 192 | iteration = 0 193 | if iteration < NUM_WARMUP_ITERATIONS: 194 | print('Start {} warmup iterations'.format(NUM_WARMUP_ITERATIONS)) 195 | model.eval() 196 | model._fc.train() 197 | for param in model.parameters(): 198 | param.requires_grad = False 199 | for param in model._fc.parameters(): 200 | param.requires_grad = True 201 | optimizer = warmup_optimizer 202 | else: 203 | print('Start without warmup iterations') 204 | model.train() 205 | optimizer = full_optimizer 206 | 207 | max_lr = max(param_group["lr"] for param_group in full_optimizer.param_groups) 208 | writer.add_scalar('train/max_lr', max_lr, iteration) 209 | 210 | epoch = 0 211 | fake_prob_dist = distributions.beta.Beta(0.5, 0.5) 212 | while True: 213 | epoch += 1 214 | print('Epoch {} is in progress'.format(epoch)) 215 | loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, drop_last=True) 216 | for samples in tqdm.tqdm(loader): 217 | iteration += 1 218 | fake_input_tensor = torch.cat(samples['fake']).cuda() 219 | real_input_tensor = torch.cat(samples['real']).cuda() 220 | target_fake_prob = fake_prob_dist.sample((len(fake_input_tensor),)).float().cuda() 221 | fake_weight = target_fake_prob.view(-1, 1, 1, 1) 222 | 223 | input_tensor = (1.0 - fake_weight) * real_input_tensor + fake_weight * fake_input_tensor 224 | pred = model(input_tensor).flatten() 225 | 226 | loss = F.binary_cross_entropy_with_logits(pred, target_fake_prob) 227 | 228 | optimizer.zero_grad() 229 | loss.backward() 230 | optimizer.step() 231 | if iteration > NUM_WARMUP_ITERATIONS: 232 | full_lr_scheduler.step() 233 | max_lr = max(param_group["lr"] for param_group in full_optimizer.param_groups) 234 | writer.add_scalar('train/max_lr', max_lr, iteration) 235 | 236 | writer.add_scalar('train/loss', loss.item(), iteration) 237 | 238 | if iteration == NUM_WARMUP_ITERATIONS: 239 | print('Stop warmup iterations') 240 | model.train() 241 | for param in model.parameters(): 242 | param.requires_grad = True 243 | optimizer = full_optimizer 244 | 245 | if iteration % SNAPSHOT_FREQUENCY == 0: 246 | snapshot_name = SNAPSHOT_NAME_TEMPLATE.format(iteration) 247 | snapshot_path = os.path.join(snapshots_root, snapshot_name) 248 | print('Saving snapshot to {}'.format(snapshot_path)) 249 | torch.save(model.state_dict(), snapshot_path) 250 | 251 | if iteration >= MAX_ITERS: 252 | print('Stop training due to maximum iteration exceeded') 253 | return 254 | 255 | 256 | if __name__ == '__main__': 257 | main() 258 | -------------------------------------------------------------------------------- /train_b7_ns_aa_original_re_100k.py: 
-------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | import random 4 | import tqdm 5 | 6 | import numpy as np 7 | from PIL import Image 8 | 9 | import torch 10 | from torch import distributions 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader 13 | 14 | from torch.utils.tensorboard import SummaryWriter 15 | 16 | import ffmpeg 17 | 18 | from albumentations import ImageOnlyTransform 19 | from albumentations import SmallestMaxSize, HorizontalFlip, Normalize, Compose, RandomCrop 20 | from albumentations.pytorch import ToTensor 21 | from efficientnet_pytorch import EfficientNet 22 | 23 | from timm.data.transforms_factory import transforms_imagenet_train 24 | from timm.data.random_erasing import RandomErasing 25 | 26 | from datasets import TrackPairDataset 27 | from extract_tracks_from_videos import TRACK_LENGTH, TRACKS_ROOT 28 | from generate_track_pairs import TRACK_PAIRS_FILE_NAME 29 | 30 | SEED = 10 31 | BATCH_SIZE = 8 32 | TRAIN_INDICES = [9, 13, 17, 21, 25, 29, 33, 37] 33 | INITIAL_LR = 0.005 34 | MOMENTUM = 0.9 35 | WEIGHT_DECAY = 1e-4 36 | NUM_WORKERS = 8 37 | NUM_WARMUP_ITERATIONS = 100 38 | SNAPSHOT_FREQUENCY = 1000 39 | OUTPUT_FOLDER_NAME = 'efficientnet-b7_ns_aa-original-mstd0.5_re_100k' 40 | SNAPSHOT_NAME_TEMPLATE = 'snapshot_{}.pth' 41 | MAX_ITERS = 100000 42 | 43 | FPS_RANGE = (15, 30) 44 | SCALE_RANGE = (0.25, 1) 45 | CRF_RANGE = (17, 40) 46 | TUNE_VALUES = ['film', 'animation', 'grain', 'stillimage', 'fastdecode', 'zerolatency'] 47 | 48 | RE_PROB = 0.2 49 | RE_MODE = 'pixel' 50 | RE_COUNT = 1 51 | RE_NUM_SPLITS = 0 52 | 53 | MIN_SIZE = 224 54 | CROP_HEIGHT = 224 55 | CROP_WIDTH = 192 56 | 57 | PRETRAINED_WEIGHTS_PATH = 'external_data/noisy_student_efficientnet-b7.pth' 58 | SNAPSHOTS_ROOT = 'snapshots' 59 | LOGS_ROOT = 'logs' 60 | 61 | 62 | class TrackTransform(object): 63 | def __init__(self, fps_range, scale_range, crf_range, tune_values): 64 | self.fps_range = fps_range 65 | self.scale_range = scale_range 66 | self.crf_range = crf_range 67 | self.tune_values = tune_values 68 | 69 | def get_params(self, src_fps, src_height, src_width): 70 | if random.random() > 0.5: 71 | return None 72 | 73 | dst_fps = src_fps 74 | if random.random() > 0.5: 75 | dst_fps = random.randrange(*self.fps_range) 76 | 77 | scale = 1.0 78 | if random.random() > 0.5: 79 | scale = random.uniform(*self.scale_range) 80 | 81 | dst_height = round(scale * src_height) // 2 * 2 82 | dst_width = round(scale * src_width) // 2 * 2 83 | 84 | crf = random.randrange(*self.crf_range) 85 | tune = random.choice(self.tune_values) 86 | 87 | return dst_fps, dst_height, dst_width, crf, tune 88 | 89 | def __call__(self, track_path, src_fps, dst_fps, dst_height, dst_width, crf, tune): 90 | out, err = ( 91 | ffmpeg 92 | .input(os.path.join(track_path, '%d.png'), framerate=src_fps, start_number=0) 93 | .filter('fps', fps=dst_fps) 94 | .filter('scale', dst_width, dst_height) 95 | .output('pipe:', format='h264', vcodec='libx264', crf=crf, tune=tune) 96 | .run(capture_stdout=True, quiet=True) 97 | ) 98 | out, err = ( 99 | ffmpeg 100 | .input('pipe:', format='h264') 101 | .output('pipe:', format='rawvideo', pix_fmt='rgb24') 102 | .run(capture_stdout=True, input=out, quiet=True) 103 | ) 104 | 105 | imgs = np.frombuffer(out, dtype=np.uint8).reshape(-1, dst_height, dst_width, 3) 106 | 107 | return imgs 108 | 109 | 110 | class VisionTransform(ImageOnlyTransform): 111 | def __init__( 112 | self, transform, is_tensor=True, 
always_apply=False, p=1.0 113 | ): 114 | super(VisionTransform, self).__init__(always_apply, p) 115 | self.transform = transform 116 | self.is_tensor = is_tensor 117 | 118 | def apply(self, image, **params): 119 | if self.is_tensor: 120 | return self.transform(image) 121 | else: 122 | return np.array(self.transform(Image.fromarray(image))) 123 | 124 | def get_transform_init_args_names(self): 125 | return ("transform") 126 | 127 | 128 | def set_global_seed(seed): 129 | torch.manual_seed(seed) 130 | if torch.cuda.is_available(): 131 | torch.cuda.manual_seed_all(seed) 132 | random.seed(seed) 133 | np.random.seed(seed) 134 | 135 | 136 | def prepare_cudnn(deterministic=None, benchmark=None): 137 | # https://pytorch.org/docs/stable/notes/randomness.html#cudnn 138 | if deterministic is None: 139 | deterministic = os.environ.get("CUDNN_DETERMINISTIC", "True") == "True" 140 | torch.backends.cudnn.deterministic = deterministic 141 | 142 | # https://discuss.pytorch.org/t/how-should-i-disable-using-cudnn-in-my-code/38053/4 143 | if benchmark is None: 144 | benchmark = os.environ.get("CUDNN_BENCHMARK", "True") == "True" 145 | torch.backends.cudnn.benchmark = benchmark 146 | 147 | 148 | def main(): 149 | with open('config.yaml', 'r') as f: 150 | config = yaml.load(f) 151 | 152 | set_global_seed(SEED) 153 | prepare_cudnn(deterministic=True, benchmark=True) 154 | 155 | model = EfficientNet.from_name('efficientnet-b7', override_params={'num_classes': 1}) 156 | state = torch.load(PRETRAINED_WEIGHTS_PATH, map_location=lambda storage, loc: storage) 157 | state.pop('_fc.weight') 158 | state.pop('_fc.bias') 159 | res = model.load_state_dict(state, strict=False) 160 | assert set(res.missing_keys) == set(['_fc.weight', '_fc.bias']), 'issue loading pretrained weights' 161 | model = model.cuda() 162 | 163 | normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 164 | _, rand_augment, _ = transforms_imagenet_train((CROP_HEIGHT, CROP_WIDTH), auto_augment='original-mstd0.5', 165 | separate=True) 166 | 167 | train_dataset = TrackPairDataset(os.path.join(config['ARTIFACTS_PATH'], TRACKS_ROOT), 168 | os.path.join(config['ARTIFACTS_PATH'], TRACK_PAIRS_FILE_NAME), 169 | TRAIN_INDICES, 170 | track_length=TRACK_LENGTH, 171 | track_transform=TrackTransform(FPS_RANGE, SCALE_RANGE, CRF_RANGE, TUNE_VALUES), 172 | image_transform=Compose([ 173 | SmallestMaxSize(MIN_SIZE), 174 | HorizontalFlip(), 175 | RandomCrop(CROP_HEIGHT, CROP_WIDTH), 176 | VisionTransform(rand_augment, is_tensor=False, p=0.5), 177 | normalize, 178 | ToTensor(), 179 | VisionTransform( 180 | RandomErasing(probability=RE_PROB, mode=RE_MODE, max_count=RE_COUNT, 181 | num_splits=RE_NUM_SPLITS, device='cpu'), is_tensor=True) 182 | ]), sequence_mode=False) 183 | 184 | print('Train dataset size: {}.'.format(len(train_dataset))) 185 | 186 | warmup_optimizer = torch.optim.SGD(model._fc.parameters(), INITIAL_LR, momentum=MOMENTUM, 187 | weight_decay=WEIGHT_DECAY, nesterov=True) 188 | 189 | full_optimizer = torch.optim.SGD(model.parameters(), INITIAL_LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY, 190 | nesterov=True) 191 | full_lr_scheduler = torch.optim.lr_scheduler.LambdaLR(full_optimizer, 192 | lambda iteration: (MAX_ITERS - iteration) / MAX_ITERS) 193 | 194 | snapshots_root = os.path.join(config['ARTIFACTS_PATH'], SNAPSHOTS_ROOT, OUTPUT_FOLDER_NAME) 195 | os.makedirs(snapshots_root) 196 | log_root = os.path.join(config['ARTIFACTS_PATH'], LOGS_ROOT, OUTPUT_FOLDER_NAME) 197 | os.makedirs(log_root) 198 | 199 | writer = SummaryWriter(log_root) 
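# The loop below trains in two phases: for the first NUM_WARMUP_ITERATIONS (100) steps only the
# freshly initialised head model._fc is optimised while the pretrained backbone stays frozen in
# eval() mode, after which all parameters are unfrozen and full_lr_scheduler decays the learning
# rate linearly via (MAX_ITERS - iteration) / MAX_ITERS, i.e. 0.005 at the start, 0.0025 halfway
# and close to 0 at iteration 100000. Each training example is a pixel-wise mix of a real crop
# and its paired fake crop with a weight drawn from Beta(0.5, 0.5), and that same weight is used
# as the soft target for binary cross-entropy with logits.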
200 | 201 | iteration = 0 202 | if iteration < NUM_WARMUP_ITERATIONS: 203 | print('Start {} warmup iterations'.format(NUM_WARMUP_ITERATIONS)) 204 | model.eval() 205 | model._fc.train() 206 | for param in model.parameters(): 207 | param.requires_grad = False 208 | for param in model._fc.parameters(): 209 | param.requires_grad = True 210 | optimizer = warmup_optimizer 211 | else: 212 | print('Start without warmup iterations') 213 | model.train() 214 | optimizer = full_optimizer 215 | 216 | max_lr = max(param_group["lr"] for param_group in full_optimizer.param_groups) 217 | writer.add_scalar('train/max_lr', max_lr, iteration) 218 | 219 | epoch = 0 220 | fake_prob_dist = distributions.beta.Beta(0.5, 0.5) 221 | while True: 222 | epoch += 1 223 | print('Epoch {} is in progress'.format(epoch)) 224 | loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, drop_last=True) 225 | for samples in tqdm.tqdm(loader): 226 | iteration += 1 227 | fake_input_tensor = torch.cat(samples['fake']).cuda() 228 | real_input_tensor = torch.cat(samples['real']).cuda() 229 | target_fake_prob = fake_prob_dist.sample((len(fake_input_tensor),)).float().cuda() 230 | fake_weight = target_fake_prob.view(-1, 1, 1, 1) 231 | 232 | input_tensor = (1.0 - fake_weight) * real_input_tensor + fake_weight * fake_input_tensor 233 | pred = model(input_tensor).flatten() 234 | 235 | loss = F.binary_cross_entropy_with_logits(pred, target_fake_prob) 236 | 237 | optimizer.zero_grad() 238 | loss.backward() 239 | optimizer.step() 240 | if iteration > NUM_WARMUP_ITERATIONS: 241 | full_lr_scheduler.step() 242 | max_lr = max(param_group["lr"] for param_group in full_optimizer.param_groups) 243 | writer.add_scalar('train/max_lr', max_lr, iteration) 244 | 245 | writer.add_scalar('train/loss', loss.item(), iteration) 246 | 247 | if iteration == NUM_WARMUP_ITERATIONS: 248 | print('Stop warmup iterations') 249 | model.train() 250 | for param in model.parameters(): 251 | param.requires_grad = True 252 | optimizer = full_optimizer 253 | 254 | if iteration % SNAPSHOT_FREQUENCY == 0: 255 | snapshot_name = SNAPSHOT_NAME_TEMPLATE.format(iteration) 256 | snapshot_path = os.path.join(snapshots_root, snapshot_name) 257 | print('Saving snapshot to {}'.format(snapshot_path)) 258 | torch.save(model.state_dict(), snapshot_path) 259 | 260 | if iteration >= MAX_ITERS: 261 | print('Stop training due to maximum iteration exceeded') 262 | return 263 | 264 | 265 | if __name__ == '__main__': 266 | main() 267 | -------------------------------------------------------------------------------- /train_b7_ns_seq_aa_original_100k.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | import random 4 | import tqdm 5 | 6 | import numpy as np 7 | from PIL import Image 8 | 9 | import torch 10 | from torch import nn 11 | from torch import distributions 12 | from torch.nn import functional as F 13 | from torch.utils.data import DataLoader 14 | 15 | from torch.utils.tensorboard import SummaryWriter 16 | 17 | import ffmpeg 18 | 19 | from albumentations import ImageOnlyTransform 20 | from albumentations import SmallestMaxSize, HorizontalFlip, Normalize, Compose, RandomCrop 21 | from albumentations.pytorch import ToTensor 22 | from efficientnet_pytorch import EfficientNet 23 | from efficientnet_pytorch.model import MBConvBlock 24 | 25 | from timm.data.transforms_factory import transforms_imagenet_train 26 | 27 | from datasets import TrackPairDataset 28 | from 
extract_tracks_from_videos import TRACK_LENGTH, TRACKS_ROOT 29 | from generate_track_pairs import TRACK_PAIRS_FILE_NAME 30 | 31 | SEED = 20 32 | BATCH_SIZE = 8 33 | TRAIN_INDICES = [19, 21, 23, 25, 27, 29, 31] 34 | INITIAL_LR = 0.005 35 | MOMENTUM = 0.9 36 | WEIGHT_DECAY = 1e-4 37 | NUM_WORKERS = 8 38 | NUM_WARMUP_ITERATIONS = 100 39 | SNAPSHOT_FREQUENCY = 1000 40 | OUTPUT_FOLDER_NAME = 'efficientnet-b7_ns_seq_aa-original-mstd0.5_100k' 41 | SNAPSHOT_NAME_TEMPLATE = 'snapshot_{}.pth' 42 | FINAL_SNAPSHOT_NAME = 'final.pth' 43 | MAX_ITERS = 100000 44 | 45 | FPS_RANGE = (15, 30) 46 | SCALE_RANGE = (0.25, 1) 47 | CRF_RANGE = (17, 40) 48 | TUNE_VALUES = ['film', 'animation', 'grain', 'stillimage', 'fastdecode', 'zerolatency'] 49 | 50 | MIN_SIZE = 224 51 | CROP_HEIGHT = 224 52 | CROP_WIDTH = 192 53 | 54 | PRETRAINED_WEIGHTS_PATH = 'external_data/noisy_student_efficientnet-b7.pth' 55 | SNAPSHOTS_ROOT = 'snapshots' 56 | LOGS_ROOT = 'logs' 57 | 58 | 59 | class SeqExpandConv(nn.Module): 60 | def __init__(self, in_channels, out_channels, seq_length): 61 | super(SeqExpandConv, self).__init__() 62 | self.conv = nn.Conv3d(in_channels, out_channels, kernel_size=(3, 1, 1), padding=(1, 0, 0), bias=False) 63 | self.seq_length = seq_length 64 | 65 | def forward(self, x): 66 | batch_size, in_channels, height, width = x.shape 67 | x = x.view(batch_size // self.seq_length, self.seq_length, in_channels, height, width) 68 | x = self.conv(x.transpose(1, 2).contiguous()).transpose(2, 1).contiguous() 69 | x = x.flatten(0, 1) 70 | return x 71 | 72 | 73 | class TrackTransform(object): 74 | def __init__(self, fps_range, scale_range, crf_range, tune_values): 75 | self.fps_range = fps_range 76 | self.scale_range = scale_range 77 | self.crf_range = crf_range 78 | self.tune_values = tune_values 79 | 80 | def get_params(self, src_fps, src_height, src_width): 81 | if random.random() > 0.5: 82 | return None 83 | 84 | dst_fps = src_fps 85 | if random.random() > 0.5: 86 | dst_fps = random.randrange(*self.fps_range) 87 | 88 | scale = 1.0 89 | if random.random() > 0.5: 90 | scale = random.uniform(*self.scale_range) 91 | 92 | dst_height = round(scale * src_height) // 2 * 2 93 | dst_width = round(scale * src_width) // 2 * 2 94 | 95 | crf = random.randrange(*self.crf_range) 96 | tune = random.choice(self.tune_values) 97 | 98 | return dst_fps, dst_height, dst_width, crf, tune 99 | 100 | def __call__(self, track_path, src_fps, dst_fps, dst_height, dst_width, crf, tune): 101 | out, err = ( 102 | ffmpeg 103 | .input(os.path.join(track_path, '%d.png'), framerate=src_fps, start_number=0) 104 | .filter('fps', fps=dst_fps) 105 | .filter('scale', dst_width, dst_height) 106 | .output('pipe:', format='h264', vcodec='libx264', crf=crf, tune=tune) 107 | .run(capture_stdout=True, quiet=True) 108 | ) 109 | out, err = ( 110 | ffmpeg 111 | .input('pipe:', format='h264') 112 | .output('pipe:', format='rawvideo', pix_fmt='rgb24') 113 | .run(capture_stdout=True, input=out, quiet=True) 114 | ) 115 | 116 | imgs = np.frombuffer(out, dtype=np.uint8).reshape(-1, dst_height, dst_width, 3) 117 | 118 | return imgs 119 | 120 | 121 | class VisionTransform(ImageOnlyTransform): 122 | def __init__( 123 | self, transform, always_apply=False, p=1.0 124 | ): 125 | super(VisionTransform, self).__init__(always_apply, p) 126 | self.transform = transform 127 | 128 | def apply(self, image, **params): 129 | return np.array(self.transform(Image.fromarray(image))) 130 | 131 | def get_transform_init_args_names(self): 132 | return ("transform") 133 | 134 | 135 | def 
set_global_seed(seed): 136 | torch.manual_seed(seed) 137 | if torch.cuda.is_available(): 138 | torch.cuda.manual_seed_all(seed) 139 | random.seed(seed) 140 | np.random.seed(seed) 141 | 142 | 143 | def prepare_cudnn(deterministic=None, benchmark=None): 144 | # https://pytorch.org/docs/stable/notes/randomness.html#cudnn 145 | if deterministic is None: 146 | deterministic = os.environ.get("CUDNN_DETERMINISTIC", "True") == "True" 147 | torch.backends.cudnn.deterministic = deterministic 148 | 149 | # https://discuss.pytorch.org/t/how-should-i-disable-using-cudnn-in-my-code/38053/4 150 | if benchmark is None: 151 | benchmark = os.environ.get("CUDNN_BENCHMARK", "True") == "True" 152 | torch.backends.cudnn.benchmark = benchmark 153 | 154 | 155 | def main(): 156 | with open('config.yaml', 'r') as f: 157 | config = yaml.load(f) 158 | 159 | set_global_seed(SEED) 160 | prepare_cudnn(deterministic=True, benchmark=True) 161 | 162 | model = EfficientNet.from_name('efficientnet-b7', override_params={'num_classes': 1}) 163 | state = torch.load(PRETRAINED_WEIGHTS_PATH, map_location=lambda storage, loc: storage) 164 | state.pop('_fc.weight') 165 | state.pop('_fc.bias') 166 | res = model.load_state_dict(state, strict=False) 167 | assert set(res.missing_keys) == set(['_fc.weight', '_fc.bias']), 'issue loading pretrained weights' 168 | 169 | for module in model.modules(): 170 | if isinstance(module, MBConvBlock): 171 | if module._block_args.expand_ratio != 1: 172 | expand_conv = module._expand_conv 173 | seq_expand_conv = SeqExpandConv(expand_conv.in_channels, expand_conv.out_channels, len(TRAIN_INDICES)) 174 | seq_expand_conv.conv.weight.data[:, :, 0, :, :].copy_(expand_conv.weight.data / 3) 175 | seq_expand_conv.conv.weight.data[:, :, 1, :, :].copy_(expand_conv.weight.data / 3) 176 | seq_expand_conv.conv.weight.data[:, :, 2, :, :].copy_(expand_conv.weight.data / 3) 177 | module._expand_conv = seq_expand_conv 178 | 179 | model = model.cuda() 180 | 181 | normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 182 | _, rand_augment, _ = transforms_imagenet_train((CROP_HEIGHT, CROP_WIDTH), auto_augment='original-mstd0.5', 183 | separate=True) 184 | 185 | train_dataset = TrackPairDataset(os.path.join(config['ARTIFACTS_PATH'], TRACKS_ROOT), 186 | os.path.join(config['ARTIFACTS_PATH'], TRACK_PAIRS_FILE_NAME), 187 | TRAIN_INDICES, 188 | track_length=TRACK_LENGTH, 189 | track_transform=TrackTransform(FPS_RANGE, SCALE_RANGE, CRF_RANGE, TUNE_VALUES), 190 | image_transform=Compose([ 191 | SmallestMaxSize(MIN_SIZE), 192 | HorizontalFlip(), 193 | RandomCrop(CROP_HEIGHT, CROP_WIDTH), 194 | VisionTransform(rand_augment, p=0.5), 195 | normalize, 196 | ToTensor() 197 | ]), sequence_mode=True) 198 | 199 | print('Train dataset size: {}.'.format(len(train_dataset))) 200 | 201 | warmup_optimizer = torch.optim.SGD(model._fc.parameters(), INITIAL_LR, momentum=MOMENTUM, 202 | weight_decay=WEIGHT_DECAY, nesterov=True) 203 | 204 | full_optimizer = torch.optim.SGD(model.parameters(), INITIAL_LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY, 205 | nesterov=True) 206 | full_lr_scheduler = torch.optim.lr_scheduler.LambdaLR(full_optimizer, 207 | lambda iteration: (MAX_ITERS - iteration) / MAX_ITERS) 208 | 209 | snapshots_root = os.path.join(config['ARTIFACTS_PATH'], SNAPSHOTS_ROOT, OUTPUT_FOLDER_NAME) 210 | os.makedirs(snapshots_root) 211 | log_root = os.path.join(config['ARTIFACTS_PATH'], LOGS_ROOT, OUTPUT_FOLDER_NAME) 212 | os.makedirs(log_root) 213 | 214 | writer = SummaryWriter(log_root) 215 | 216 | iteration = 0 
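# In the training loop below every sample is a whole sequence of len(TRAIN_INDICES) = 7 frames:
# torch.stack(...).transpose(0, 1) produces tensors of shape (batch, seq, C, H, W), the real/fake
# mixing weight is drawn once per sequence (hence the view(-1, 1, 1, 1, 1)), and the sequences are
# flattened to (batch * seq, C, H, W) before the forward pass so that the SeqExpandConv modules
# inserted above can fold the sequence dimension back out for their 3x1x1 temporal convolutions.
# The BCE target is repeat_interleave-d so all frames of a sequence share its soft label.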
217 | if iteration < NUM_WARMUP_ITERATIONS: 218 | print('Start {} warmup iterations'.format(NUM_WARMUP_ITERATIONS)) 219 | model.eval() 220 | model._fc.train() 221 | for param in model.parameters(): 222 | param.requires_grad = False 223 | for param in model._fc.parameters(): 224 | param.requires_grad = True 225 | optimizer = warmup_optimizer 226 | else: 227 | print('Start without warmup iterations') 228 | model.train() 229 | optimizer = full_optimizer 230 | 231 | max_lr = max(param_group["lr"] for param_group in full_optimizer.param_groups) 232 | writer.add_scalar('train/max_lr', max_lr, iteration) 233 | 234 | epoch = 0 235 | fake_prob_dist = distributions.beta.Beta(0.5, 0.5) 236 | while True: 237 | epoch += 1 238 | print('Epoch {} is in progress'.format(epoch)) 239 | loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, drop_last=True) 240 | for samples in tqdm.tqdm(loader): 241 | iteration += 1 242 | fake_input_tensor = torch.stack(samples['fake']).transpose(0, 1).cuda() 243 | real_input_tensor = torch.stack(samples['real']).transpose(0, 1).cuda() 244 | target_fake_prob = fake_prob_dist.sample((len(fake_input_tensor),)).float().cuda() 245 | fake_weight = target_fake_prob.view(-1, 1, 1, 1, 1) 246 | 247 | input_tensor = (1.0 - fake_weight) * real_input_tensor + fake_weight * fake_input_tensor 248 | pred = model(input_tensor.flatten(0, 1)).flatten() 249 | 250 | loss = F.binary_cross_entropy_with_logits(pred, target_fake_prob.repeat_interleave(len(TRAIN_INDICES))) 251 | 252 | optimizer.zero_grad() 253 | loss.backward() 254 | optimizer.step() 255 | if iteration > NUM_WARMUP_ITERATIONS: 256 | full_lr_scheduler.step() 257 | max_lr = max(param_group["lr"] for param_group in full_optimizer.param_groups) 258 | writer.add_scalar('train/max_lr', max_lr, iteration) 259 | 260 | writer.add_scalar('train/loss', loss.item(), iteration) 261 | 262 | if iteration == NUM_WARMUP_ITERATIONS: 263 | print('Stop warmup iterations') 264 | model.train() 265 | for param in model.parameters(): 266 | param.requires_grad = True 267 | optimizer = full_optimizer 268 | 269 | if iteration % SNAPSHOT_FREQUENCY == 0: 270 | snapshot_name = SNAPSHOT_NAME_TEMPLATE.format(iteration) 271 | snapshot_path = os.path.join(snapshots_root, snapshot_name) 272 | print('Saving snapshot to {}'.format(snapshot_path)) 273 | torch.save(model.state_dict(), snapshot_path) 274 | 275 | if iteration >= MAX_ITERS: 276 | print('Stop training due to maximum iteration exceeded') 277 | return 278 | 279 | 280 | if __name__ == '__main__': 281 | main() 282 | --------------------------------------------------------------------------------