├── README.md
├── latex
│   ├── .gitignore
│   ├── README.md
│   ├── build.sh
│   ├── clean.sh
│   ├── fmeasure.bib
│   ├── iccv.sty
│   ├── iccv2019fmeasure.tex
│   ├── iccv_eso.sty
│   └── ieee_fullname.bst
├── lib
│   ├── augim.py
│   ├── floss.py
│   └── pylayer.py
├── models
│   └── fdss.py
├── pytorch
│   ├── README.md
│   └── floss.py
└── train.py

/README.md:
--------------------------------------------------------------------------------
1 | 2 | 3 | 

Optimizing the F-measure for Threshold-free Salient Object Detection

4 | 5 | Code accompanying the paper **Optimizing the F-measure for Threshold-free Salient Object Detection**. 6 | 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 16 | ## Howto 17 | 1. Download and build [caffe](https://github.com/bvlc/caffe) with the Python interface; 18 | 2. Download the MSRA-B dataset to `data/` and the initial [VGG weights](http://data.kaizhao.net/projects/fmeasure-saliency/vgg16convs.caffemodel) to `model/`; 19 | 3. Generate the network and solver prototxt via `python models/fdss.py`; 20 | 4. Start training the DSS+FLoss model with `python train.py --solver tmp/fdss_beta0.80_aug_solver.pt`. 21 | 22 | ## Loss surface 23 | The proposed FLoss holds considerable gradients even in the saturated 24 | area, resulting in polarized predictions that are stable against the choice of threshold. 25 | 26 |

27 | 28 |

29 |

30 | Loss surface of FLoss (left), Log-FLoss (mid) and Cross-entropy loss (right). FLoss holds larger gradients in the saturated 31 | area, leading to high-contrast predictions. 32 |
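Below is a minimal PyTorch sketch of the relaxed F-measure loss described above. It is an illustrative re-implementation rather than the shipped `lib/floss.py` or `pytorch/floss.py`; the `beta2` default is an assumption loosely based on the `beta0.80` solver name in the Howto, and `eps` is only added for numerical stability.

```python
# Illustrative sketch of the relaxed F-measure loss (FLoss);
# not the repository's exact implementation.
import torch

def floss(pred, target, beta2=0.8, eps=1e-8, log_like=False):
    """pred: sigmoid outputs in [0, 1]; target: binary saliency map of the same shape."""
    tp = (pred * target).sum()       # relaxed true positives
    pred_pos = pred.sum()            # TP + FP
    gt_pos = target.sum()            # TP + FN
    f = (1 + beta2) * tp / (beta2 * gt_pos + pred_pos + eps)
    if log_like:
        return -torch.log(f + eps)   # Log-FLoss variant
    return 1.0 - f                   # FLoss = 1 - F

# usage sketch:
# loss = floss(torch.sigmoid(logits), gt)
# loss.backward()
```

Because the denominator couples all pixels, the gradient at any single pixel depends on the whole prediction, which is what keeps the gradient from vanishing in the saturated regions.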

33 | 34 | ## Example detection results 35 |

36 | 37 |

38 |

39 | Several detection results. Our method produces high-contrast detections. 40 |

41 | 42 | ## Stability against threshold 43 |

44 | 45 |

46 |

47 | FLoss (solid lines) achieves a high F-measure over a wider range 48 | of thresholds, demonstrating stability against changes of the threshold. 49 |
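The curves above are produced by sweeping the binarization threshold and computing the F-measure at each value. A minimal NumPy sketch of such a sweep is shown below; it is illustrative only (not the evaluation code used for the paper), and `beta2=0.3` is assumed here because that value is commonly used when evaluating SOD methods.

```python
# Illustrative F-measure vs. threshold sweep; not the paper's evaluation code.
import numpy as np

def fmeasure_curve(pred, gt, beta2=0.3, n_thresholds=255, eps=1e-8):
    """pred: saliency map in [0, 1]; gt: binary ground truth of the same shape."""
    thresholds = np.linspace(0.0, 1.0, n_thresholds)
    scores = []
    for t in thresholds:
        binarized = pred >= t
        tp = np.logical_and(binarized, gt > 0).sum()
        precision = tp / (binarized.sum() + eps)
        recall = tp / ((gt > 0).sum() + eps)
        scores.append((1 + beta2) * precision * recall /
                      (beta2 * precision + recall + eps))
    return thresholds, np.asarray(scores)
```

A flat curve over a wide range of thresholds corresponds to the polarized, high-contrast predictions discussed above.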

50 | 51 | ## Pretrained models 52 | 53 | For pretrained models and evaluation results, please visit . 54 | 55 | ___ 56 | If you have any problem using this code, please contact [Kai Zhao](http://kaizhao.net). 57 | 58 | 59 | -------------------------------------------------------------------------------- /latex/.gitignore: -------------------------------------------------------------------------------- 1 | ## Core latex/pdflatex auxiliary files: 2 | *.aux 3 | *.lof 4 | *.log 5 | *.lot 6 | *.fls 7 | *.out 8 | *.toc 9 | *.fmt 10 | *.fot 11 | *.cb 12 | *.cb2 13 | .*.lb 14 | 15 | ## Intermediate documents: 16 | *.dvi 17 | *.xdv 18 | *-converted-to.* 19 | # these rules might exclude image files for figures etc. 20 | # *.ps 21 | # *.eps 22 | *.pdf 23 | 24 | ## Generated if empty string is given at "Please type another file name for output:" 25 | .pdf 26 | 27 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 28 | *.bbl 29 | *.bcf 30 | *.blg 31 | *-blx.aux 32 | *-blx.bib 33 | *.run.xml 34 | 35 | ## Build tool auxiliary files: 36 | *.fdb_latexmk 37 | *.synctex 38 | *.synctex(busy) 39 | *.synctex.gz 40 | *.synctex.gz(busy) 41 | *.pdfsync 42 | 43 | ## Build tool directories for auxiliary files 44 | # latexrun 45 | latex.out/ 46 | 47 | ## Auxiliary and intermediate files from other packages: 48 | # algorithms 49 | *.alg 50 | *.loa 51 | 52 | # achemso 53 | acs-*.bib 54 | 55 | # amsthm 56 | *.thm 57 | 58 | # beamer 59 | *.nav 60 | *.pre 61 | *.snm 62 | *.vrb 63 | 64 | # changes 65 | *.soc 66 | 67 | # comment 68 | *.cut 69 | 70 | # cprotect 71 | *.cpt 72 | 73 | # elsarticle (documentclass of Elsevier journals) 74 | *.spl 75 | 76 | # endnotes 77 | *.ent 78 | 79 | # fixme 80 | *.lox 81 | 82 | # feynmf/feynmp 83 | *.mf 84 | *.mp 85 | *.t[1-9] 86 | *.t[1-9][0-9] 87 | *.tfm 88 | 89 | #(r)(e)ledmac/(r)(e)ledpar 90 | *.end 91 | *.?end 92 | *.[1-9] 93 | *.[1-9][0-9] 94 | *.[1-9][0-9][0-9] 95 | *.[1-9]R 96 | *.[1-9][0-9]R 97 | *.[1-9][0-9][0-9]R 98 | *.eledsec[1-9] 99 | *.eledsec[1-9]R 100 | *.eledsec[1-9][0-9] 101 | *.eledsec[1-9][0-9]R 102 | *.eledsec[1-9][0-9][0-9] 103 | *.eledsec[1-9][0-9][0-9]R 104 | 105 | # glossaries 106 | *.acn 107 | *.acr 108 | *.glg 109 | *.glo 110 | *.gls 111 | *.glsdefs 112 | *.lzo 113 | *.lzs 114 | 115 | # uncomment this for glossaries-extra (will ignore makeindex's style files!) 
116 | # *.ist 117 | 118 | # gnuplottex 119 | *-gnuplottex-* 120 | 121 | # gregoriotex 122 | *.gaux 123 | *.gtex 124 | 125 | # htlatex 126 | *.4ct 127 | *.4tc 128 | *.idv 129 | *.lg 130 | *.trc 131 | *.xref 132 | 133 | # hyperref 134 | *.brf 135 | 136 | # knitr 137 | *-concordance.tex 138 | # TODO Comment the next line if you want to keep your tikz graphics files 139 | *.tikz 140 | *-tikzDictionary 141 | 142 | # listings 143 | *.lol 144 | 145 | # luatexja-ruby 146 | *.ltjruby 147 | 148 | # makeidx 149 | *.idx 150 | *.ilg 151 | *.ind 152 | 153 | # minitoc 154 | *.maf 155 | *.mlf 156 | *.mlt 157 | *.mtc[0-9]* 158 | *.slf[0-9]* 159 | *.slt[0-9]* 160 | *.stc[0-9]* 161 | 162 | # minted 163 | _minted* 164 | *.pyg 165 | 166 | # morewrites 167 | *.mw 168 | 169 | # nomencl 170 | *.nlg 171 | *.nlo 172 | *.nls 173 | 174 | # pax 175 | *.pax 176 | 177 | # pdfpcnotes 178 | *.pdfpc 179 | 180 | # sagetex 181 | *.sagetex.sage 182 | *.sagetex.py 183 | *.sagetex.scmd 184 | 185 | # scrwfile 186 | *.wrt 187 | 188 | # sympy 189 | *.sout 190 | *.sympy 191 | sympy-plots-for-*.tex/ 192 | 193 | # pdfcomment 194 | *.upa 195 | *.upb 196 | 197 | # pythontex 198 | *.pytxcode 199 | pythontex-files-*/ 200 | 201 | # tcolorbox 202 | *.listing 203 | 204 | # thmtools 205 | *.loe 206 | 207 | # TikZ & PGF 208 | *.dpth 209 | *.md5 210 | *.auxlock 211 | 212 | # todonotes 213 | *.tdo 214 | 215 | # vhistory 216 | *.hst 217 | *.ver 218 | 219 | # easy-todo 220 | *.lod 221 | 222 | # xcolor 223 | *.xcp 224 | 225 | # xmpincl 226 | *.xmpi 227 | 228 | # xindy 229 | *.xdy 230 | 231 | # xypic precompiled matrices and outlines 232 | *.xyc 233 | *.xyd 234 | 235 | # endfloat 236 | *.ttt 237 | *.fff 238 | 239 | # Latexian 240 | TSWLatexianTemp* 241 | 242 | ## Editors: 243 | # WinEdt 244 | *.bak 245 | *.sav 246 | 247 | # Texpad 248 | .texpadtmp 249 | 250 | # LyX 251 | *.lyx~ 252 | 253 | # Kile 254 | *.backup 255 | 256 | # KBibTeX 257 | *~[0-9]* 258 | 259 | # auto folder when using emacs and auctex 260 | ./auto/* 261 | *.el 262 | 263 | # expex forward references with \gathertags 264 | *-tags.tex 265 | 266 | # standalone packages 267 | *.sta 268 | 269 | # Makeindex log files 270 | *.lpz 271 | -------------------------------------------------------------------------------- /latex/README.md: -------------------------------------------------------------------------------- 1 | LaTeX source of the ICCV2019 paper: Optimizing the F-measure for Threshold-free Salient Object Detection 2 | 3 | Compile and clean: 4 | 5 | ``` 6 | # compile 7 | bash build.sh 8 | # clean 9 | bash clean.sh 10 | ``` 11 | The output pdf and auxiliaries are at `output/`. -------------------------------------------------------------------------------- /latex/build.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | set -e 3 | if [ ! 
-e output ]; then 4 | mkdir output 5 | fi 6 | if [[ $(uname) == "Linux" || $(uname) == "Darwin" ]]; then 7 | LATEX="xelatex" 8 | else 9 | LATEX="pdflatex" 10 | fi 11 | echo "Building with "$LATEX 12 | "$LATEX" -output-directory output iccv2019fmeasure.tex 13 | bibtex output/iccv2019fmeasure.aux 14 | "$LATEX" -output-directory output iccv2019fmeasure.tex 15 | "$LATEX" -output-directory output iccv2019fmeasure.tex 16 | -------------------------------------------------------------------------------- /latex/clean.sh: -------------------------------------------------------------------------------- 1 | rm -rf output 2 | rm *.log 3 | rm *.pdf 4 | rm *.dvi 5 | rm *.bbl 6 | rm *.brf 7 | rm *.out 8 | rm *.aux 9 | rm *.blg 10 | rm *.gz 11 | -------------------------------------------------------------------------------- /latex/fmeasure.bib: -------------------------------------------------------------------------------- 1 | @String(PAMI = {IEEE TPAMI}) 2 | @string(ACMMM = {ACM Multimedia}) 3 | @String(TSMC = {IEEE TSMC}) 4 | @string(ICME = {IEEE ICME}) 5 | @string(TMM = {IEEE TMM}) 6 | @string(CSVT = {IEEE TCSVT}) 7 | 8 | @string(CVPR= {Proceedings of the IEEE conference on computer vision and pattern recognition.}) 9 | @string(CVPRW= {Proceedings of the IEEE conference on computer vision and pattern recognition Workshops.}) 10 | @string(ICCV= {Proceedings of the IEEE International Conference on Computer Vision.}) 11 | @string(NIPS= {Proceedings of Advances in Neural Information Processing Systems.}), 12 | @string(ECCV= {Proceedings of the European Conference on Computer Vision.}) 13 | @string(BMVC= {Brit. Mach. Vis. Conf.}) 14 | @string(ICIP = {Proceedings of the IEEE International Conference on Image Processing.}) 15 | @string(IJCAI= {Proceedings of the International Joint Conference on Artificial Intelligence.}) 16 | 17 | @string(ACMMM= {ACM Int. Conf. Multimedia}) 18 | @string(ICME = {Int. Conf. Multimedia and Expo}) 19 | @string(ICPR = {Int. Conf. Pattern Recog.}) 20 | @string(ICLR = {Proceedings of the International Conference on Learning Representations}) 21 | 22 | @string(SPL = {IEEE Signal Processing Letters}) 23 | @string(IJCV = {International Journal of Computer Vision.}) 24 | @string(PAMI = {IEEE Transaction on Pattern Analysis and Machine Intelligence}) 25 | @string(PR = {Pattern Recognition}) 26 | @string(TIP = {IEEE Transaction on Image Processing.}) 27 | @string(CSVT = {IEEE Trans. Circuit Syst. Video Technol.}) 28 | @string(VR = {Vis. Res.}) 29 | @string(JOV = {J. Vis.}) 30 | @string(TMM = {IEEE Trans. Multimedia}) 31 | @string(CGF = {Comput. Graph. Forum}) 32 | @string(TVC = {The Vis. Comput.}) 33 | @string(JCST = {J. Comput. Sci. Tech.}) 34 | 35 | @inproceedings{margolin2014evaluate, 36 | title={How to evaluate foreground maps?}, 37 | author={Margolin, Ran and Zelnik-Manor, Lihi and Tal, Ayellet}, 38 | booktitle={CVPR}, 39 | pages={248--255}, 40 | year={2014} 41 | } 42 | 43 | @inproceedings{busa2015online, 44 | title={Online F-measure optimization}, 45 | author={Busa-Fekete, R{\'o}bert and Sz{\"o}r{\'e}nyi, Bal{\'a}zs and Dembczynski, Krzysztof and H{\"u}llermeier, Eyke}, 46 | booktitle={NeurIPS}, 47 | pages={595--603}, 48 | year={2015} 49 | } 50 | 51 | @InProceedings{MartinFTM01, 52 | author = {D. Martin and C. Fowlkes and D. Tal and J.
Malik}, 53 | title = {A Database of Human Segmented Natural Images and its 54 | Application to Evaluating Segmentation Algorithms and 55 | Measuring Ecological Statistics}, 56 | booktitle = {ICCV}, 57 | year = {2001}, 58 | volume = {2}, 59 | pages = {416--423} 60 | } 61 | 62 | % use F-measure in saliency 63 | @inproceedings{achanta2009frequency, 64 | title={Frequency-tuned salient region detection}, 65 | author={Achanta, Radhakrishna and Hemami, Sheila and Estrada, Francisco and Susstrunk, Sabine}, 66 | booktitle={CVPR}, 67 | pages={1597--1604}, 68 | year={2009}, 69 | } 70 | 71 | @article{liu2011learning, 72 | title={Learning to detect a salient object}, 73 | author={Liu, Tie and Yuan, Zejian and Sun, Jian and Wang, Jingdong and Zheng, Nanning and Tang, Xiaoou and Shum, Heung-Yeung}, 74 | journal={IEEE PAMI}, 75 | volume={33}, 76 | number={2}, 77 | pages={353--367}, 78 | year={2011}, 79 | } 80 | 81 | # ECSSD 82 | @inproceedings{yan2013hierarchical, 83 | title={Hierarchical saliency detection}, 84 | author={Yan, Qiong and Xu, Li and Shi, Jianping and Jia, Jiaya}, 85 | booktitle={CVPR}, 86 | pages={1155--1162}, 87 | year={2013}, 88 | } 89 | 90 | @inproceedings{LiYu15, 91 | author = "Li, G. and Yu, Y.", 92 | title = "Visual Saliency Based on Multiscale Deep Features", 93 | booktitle = {CVPR}, 94 | pages = "5455-5463", 95 | year = "2015" 96 | } 97 | 98 | @inproceedings{li2014secrets, 99 | title={The secrets of salient object segmentation}, 100 | author={Li, Yin and Hou, Xiaodi and Koch, Christof and Rehg, James M and Yuille, Alan L}, 101 | booktitle = {CVPR}, 102 | pages = "280-287", 103 | year={2014}, 104 | } 105 | 106 | @inproceedings{wang2019iterative, 107 | title={An Iterative and Cooperative Top-down and Bottom-up Inference Network for Salient Object Detection}, 108 | author={Wang, Wenguan and Shen, Jianbing and Cheng, Ming-Ming and Shao, Ling}, 109 | booktitle={CVPR}, 110 | pages={5968--5977}, 111 | year={2019} 112 | } 113 | 114 | @inproceedings{wang2019salient, 115 | title={Salient Object Detection With Pyramid Attention and Salient Edges}, 116 | author={Wang, Wenguan and Zhao, Shuyang and Shen, Jianbing and Hoi, Steven CH and Borji, Ali}, 117 | booktitle={CVPR}, 118 | pages={1448--1457}, 119 | year={2019} 120 | } 121 | 122 | @inproceedings{wang2018salient, 123 | title={Salient object detection driven by fixation prediction}, 124 | author={Wang, Wenguan and Shen, Jianbing and Dong, Xingping and Borji, Ali}, 125 | booktitle={CVPR}, 126 | pages={1711--1720}, 127 | year={2018} 128 | } 129 | 130 | @inproceedings{movahedi2010design, 131 | title={Design and perceptual validation of performance measures for salient object segmentation}, 132 | author={Movahedi, Vida and Elder, James H}, 133 | booktitle={CVPR-workshop}, 134 | pages={49--56}, 135 | year={2010}, 136 | } 137 | 138 | @article{wang2019salientsurvey, 139 | title={Salient Object Detection in the Deep Learning Era: An In-Depth Survey}, 140 | author={Wang, Wenguan and Lai, Qiuxia and Fu, Huazhu and Shen, Jianbing and Ling, Haibin}, 141 | journal={arXiv preprint arXiv:1904.09146}, 142 | year={2019} 143 | } 144 | 145 | @inproceedings{long2015fully, 146 | title={Fully convolutional networks for semantic segmentation}, 147 | author={Long, Jonathan and Shelhamer, Evan and Darrell, Trevor}, 148 | booktitle={CVPR}, 149 | pages={3431--3440}, 150 | year={2015} 151 | } 152 | 153 | @inproceedings{xie2015holistically, 154 | title={Holistically-nested edge detection}, 155 | author={Xie, Saining and Tu, Zhuowen}, 156 | booktitle={ICCV}, 157 | 
pages={1395--1403}, 158 | year={2015} 159 | } 160 | 161 | @article{li2015visual, 162 | title={Visual saliency based on multiscale deep features}, 163 | author={Li, Guanbin and Yu, Yizhou}, 164 | journal={CVPR}, 165 | year={2015} 166 | } 167 | 168 | @inproceedings{jia2014caffe, 169 | title={Caffe: Convolutional architecture for fast feature embedding}, 170 | author={Jia, Yangqing and Shelhamer, Evan and Donahue, Jeff and Karayev, Sergey and Long, Jonathan and Girshick, Ross and Guadarrama, Sergio and Darrell, Trevor}, 171 | booktitle={ACMM}, 172 | pages={675--678}, 173 | year={2014}, 174 | } 175 | % RFCN 176 | @inproceedings{wang2016saliency, 177 | title={Saliency detection with recurrent fully convolutional networks}, 178 | author={Wang, Linzhao and Wang, Lijun and Lu, Huchuan and Zhang, Pingping and Ruan, Xiang}, 179 | booktitle={ECCV}, 180 | pages={825--841}, 181 | year={2016}, 182 | } 183 | 184 | % DHS 185 | @inproceedings{liu2016dhsnet, 186 | title={Dhsnet: Deep hierarchical saliency network for salient object detection}, 187 | author={Liu, Nian and Han, Junwei}, 188 | booktitle={CVPR}, 189 | pages={678--686}, 190 | year={2016}, 191 | } 192 | 193 | % DCL 194 | @inproceedings{li2016deep, 195 | title={Deep contrast learning for salient object detection}, 196 | author={Li, Guanbin and Yu, Yizhou}, 197 | booktitle={CVPR}, 198 | pages={478--487}, 199 | year={2016} 200 | } 201 | 202 | % DSS 203 | @article{hou2017deeply, 204 | title={Deeply supervised salient object detection with short connections}, 205 | author={Hou, Qibin and Cheng, Ming-Ming and Hu, Xiaowei and Borji, Ali and Tu, Zhuowen and Torr, Philip}, 206 | year = {2019}, 207 | volume={41}, 208 | number={4}, 209 | pages={815-828}, 210 | journal= {IEEE TPAMI}, 211 | doi = {10.1109/TPAMI.2018.2815688}, 212 | } 213 | 214 | % VGGNet 215 | @inproceedings{simonyan2014very, 216 | title={Very deep convolutional networks for large-scale image recognition}, 217 | author={Simonyan, Karen and Zisserman, Andrew}, 218 | booktitle={ICLR}, 219 | year={2015} 220 | } 221 | 222 | @article{zhang2017amulet, 223 | title={Amulet: Aggregating Multi-level Convolutional Features for Salient Object Detection}, 224 | author={Zhang, Pingping and Wang, Dong and Lu, Huchuan and Wang, Hongyu and Ruan, Xiang}, 225 | journal={ICCV}, 226 | year={2017} 227 | } 228 | % MSRA10K 229 | @article{cheng2015global, 230 | title={Global contrast based salient region detection}, 231 | author={Cheng, Ming-Ming and Mitra, Niloy J and Huang, Xiaolei and Torr, Philip HS and Hu, Shi-Min}, 232 | journal={IEEE PAMI}, 233 | volume={37}, 234 | number={3}, 235 | pages={569--582}, 236 | year={2015}, 237 | } 238 | 239 | @article{van1974foundation, 240 | title={Foundation of evaluation}, 241 | author={Van Rijsbergen, Cornelis Joost}, 242 | journal={Journal of Documentation}, 243 | volume={30}, 244 | number={4}, 245 | pages={365--373}, 246 | year={1974}, 247 | } 248 | 249 | % old saliency detection methods 250 | @article{itti1998model, 251 | title={A model of saliency-based visual attention for rapid scene analysis}, 252 | author={Itti, Laurent and Koch, Christof and Niebur, Ernst}, 253 | journal={IEEE PAMI}, 254 | volume={20}, 255 | number={11}, 256 | pages={1254--1259}, 257 | year={1998}, 258 | } 259 | 260 | @inproceedings{klein2011center, 261 | title={Center-surround divergence of feature statistics for salient object detection}, 262 | author={Klein, Dominik A and Frintrop, Simone}, 263 | booktitle={ICCV}, 264 | pages={2214--2219}, 265 | year={2011}, 266 | } 267 | 268 | 
@inproceedings{borji2012exploiting, 269 | title={Exploiting local and global patch rarities for saliency detection}, 270 | author={Borji, Ali and Itti, Laurent}, 271 | booktitle={CVPR}, 272 | pages={478--485}, 273 | year={2012}, 274 | } 275 | 276 | @article{borji2015salient, 277 | title={Salient object detection: A benchmark}, 278 | author={Borji, Ali and Cheng, Ming-Ming and Jiang, Huaizu and Li, Jia}, 279 | journal={TIP}, 280 | volume={24}, 281 | number={12}, 282 | pages={5706--5722}, 283 | year={2015}, 284 | } 285 | 286 | @inproceedings{dembczynski2013optimizing, 287 | title={Optimizing the F-measure in multi-label classification: Plug-in rule approach versus structured loss minimization}, 288 | author={Dembczynski, Krzysztof and Jachnik, Arkadiusz and Kotlowski, Wojciech and Waegeman, Willem and H{\"u}llermeier, Eyke}, 289 | booktitle={ICML}, 290 | pages={1130--1138}, 291 | year={2013} 292 | } 293 | @inproceedings{petterson2010reverse, 294 | title={Reverse multi-label learning}, 295 | author={Petterson, James and Caetano, Tib{\'e}rio S}, 296 | booktitle={NeurIPS}, 297 | pages={1912--1920}, 298 | year={2010} 299 | } 300 | @inproceedings{petterson2011submodular, 301 | title={Submodular multi-label learning}, 302 | author={Petterson, James and Caetano, Tib{\'e}rio S}, 303 | booktitle={NeurIPS}, 304 | pages={1512--1520}, 305 | year={2011} 306 | } 307 | @inproceedings{lewis1995evaluating, 308 | title={Evaluating and optimizing autonomous text classification systems}, 309 | author={Lewis, David D}, 310 | booktitle={Proceedings of the annual international ACM SIGIR conference on Research and development in information retrieval}, 311 | pages={246--254}, 312 | year={1995}, 313 | } 314 | @inproceedings{chai2005expectation, 315 | title={Expectation of F-measures: Tractable exact computation and some empirical observations of its properties}, 316 | author={Chai, Kian Ming Adam}, 317 | booktitle={Proceedings of the annual international ACM SIGIR conference on Research and development in information retrieval}, 318 | pages={593--594}, 319 | year={2005}, 320 | } 321 | @inproceedings{jansche2007maximum, 322 | title={A maximum expected utility framework for binary sequence labeling}, 323 | author={Jansche, Martin}, 324 | booktitle={Proceedings of the Annual Meeting of the Association of Computational Linguistics}, 325 | pages={736--743}, 326 | year={2007} 327 | } 328 | @inproceedings{dembczynski2011exact, 329 | title={An exact algorithm for F-measure maximization}, 330 | author={Dembczynski, Krzysztof J and Waegeman, Willem and Cheng, Weiwei and H{\"u}llermeier, Eyke}, 331 | booktitle={NeurIPS}, 332 | pages={1404--1412}, 333 | year={2011} 334 | } 335 | @article{quevedo2012multilabel, 336 | title={Multilabel classifiers with a probabilistic thresholding strategy}, 337 | author={Quevedo, Jos{\'e} Ram{\'o}n and Luaces, Oscar and Bahamonde, Antonio}, 338 | journal={Pattern Recognition}, 339 | volume={45}, 340 | number={2}, 341 | pages={876--883}, 342 | year={2012}, 343 | } 344 | 345 | 346 | @inproceedings{nan2012optimizing, 347 | title={Optimizing F-measures: a tale of two approaches}, 348 | author={Ye, Nan and Chai, Kian Ming A and Lee, Wee Sun and Chieu, Hai Leong}, 349 | booktitle={ICML}, 350 | pages={1555--1562}, 351 | year={2012}, 352 | } 353 | 354 | # Dense-CRF 355 | @inproceedings{krahenbuhl2011efficient, 356 | title={Efficient inference in fully connected crfs with gaussian edge potentials}, 357 | author={Kr{\"a}henb{\"u}hl, Philipp and Koltun, Vladlen}, 358 | booktitle={NeurIPS}, 359 | 
pages={109--117}, 360 | year={2011} 361 | } 362 | 363 | @inproceedings{lin2017feature, 364 | title={Feature pyramid networks for object detection}, 365 | author={Lin, Tsung-Yi and Doll{\'a}r, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge}, 366 | booktitle={CVPR}, 367 | year={2017} 368 | } 369 | 370 | # deepcontour 371 | @inproceedings{shen2015deepcontour, 372 | title={Deepcontour: A deep convolutional feature learned by positive-sharing loss for contour detection}, 373 | author={Shen, Wei and Wang, Xinggang and Wang, Yan and Bai, Xiang and Zhang, Zhijiang}, 374 | booktitle={CVPR}, 375 | pages={3982--3991}, 376 | year={2015} 377 | } 378 | # SOC 379 | @article{dpfan2018soc, 380 | author = {Deng{-}Ping Fan and 381 | Jiangjiang Liu and 382 | Shanghua Gao and 383 | Qibin Hou and 384 | Ali Borji and 385 | Ming{-}Ming Cheng}, 386 | title = {Salient Objects in Clutter: Bringing Salient Object Detection to the 387 | Foreground}, 388 | journal = {ECCV}, 389 | year = {2018}, 390 | } 391 | 392 | @article{uijlings2013selective, 393 | title={Selective search for object recognition}, 394 | author={Uijlings, Jasper RR and Van De Sande, Koen EA and Gevers, Theo and Smeulders, Arnold WM}, 395 | journal={IJCV}, 396 | volume={104}, 397 | number={2}, 398 | pages={154--171}, 399 | year={2013}, 400 | } 401 | 402 | @inproceedings{deng2018edge, 403 | title={Learning to predict crisp boundaries}, 404 | author={Ruoxi Deng and Chunhua Shen and Shengjun Liu and Huibing Wang and Xinru Liu}, 405 | booktitle={ECCV}, 406 | year={2018}, 407 | } 408 | 409 | @Article{bsds500, 410 | author = {Arbelaez, Pablo and Maire, Michael and Fowlkes, Charless and Malik, Jitendra}, 411 | title = {Contour Detection and Hierarchical Image Segmentation}, 412 | journal = {IEEE PAMI}, 413 | volume = {33}, 414 | number = {5}, 415 | year = {2011}, 416 | issn = {0162-8828}, 417 | pages = {898--916}, 418 | } 419 | 420 | @inproceedings{yang2013saliency, 421 | title={Saliency detection via graph-based manifold ranking}, 422 | author={Yang, Chuan and Zhang, Lihe and Lu, Huchuan and Ruan, Xiang and Yang, Ming-Hsuan}, 423 | booktitle={CVPR}, 424 | pages={3166--3173}, 425 | year={2013}, 426 | } 427 | 428 | @inproceedings{fan2019shifting, 429 | title={Shifting more attention to video salient object detection}, 430 | author={Fan, Deng-Ping and Wang, Wenguan and Cheng, Ming-Ming and Shen, Jianbing}, 431 | booktitle={CVPR}, 432 | pages={8554--8564}, 433 | year={2019} 434 | } 435 | 436 | @inproceedings{Liu2019PoolSal, 437 | title={A Simple Pooling-Based Design for Real-Time Salient Object Detection}, 438 | author={Jiang-Jiang Liu and Qibin Hou and Ming-Ming Cheng and Jiashi Feng and Jianmin Jiang}, 439 | booktitle={CVPR}, 440 | year={2019}, 441 | } 442 | 443 | @Article{BorjiCVM2019, 444 | author="Borji, Ali and Cheng, Ming-Ming and Hou, Qibin and Jiang, Huaizu and Li, Jia", 445 | title="Salient object detection: A survey", 446 | journal="Computational Visual Media", 447 | year="2019", 448 | volume="5", 449 | number="2", 450 | pages="117--150", 451 | issn="2096-0662", 452 | doi="10.1007/s41095-019-0149-9", 453 | } 454 | 455 | @inproceedings{zhao2018hifi, 456 | title = {Hi-{F}i: Hierarchical Feature Integration for Skeleton Detection}, 457 | author = {Kai Zhao and Wei Shen and Shanghua Gao and Dandan Li and Ming-Ming Cheng}, 458 | booktitle = {IJCAI}, 459 | pages = {1191--1197}, 460 | year = {2018}, 461 | month = {7}, 462 | doi = {10.24963/ijcai.2018/166}, 463 | url = {http://kaizhao.net/hifi}, 464 | } 465 | 466 | 
@InProceedings{zhao2019EGNet, 467 | author = {Jiaxing Zhao and Jiangjiang liu and Dengping Fan and Yang Cao and Jufeng Yang and Ming-Ming Cheng}, 468 | title = {EGNet:Edge Guidance Network for Salient Object Detection}, 469 | booktitle = {ICCV}, 470 | month = {Oct}, 471 | year = {2019} 472 | } 473 | 474 | @article{gao2019res2net, 475 | title={Res2Net: A New Multi-scale Backbone Architecture}, 476 | author={Gao, Shang-Hua and Cheng, Ming-Ming and Zhao, Kai and Zhang, Xin-Yu and Yang, Ming-Hsuan and Torr, Philip}, 477 | journal={IEEE TPAMI}, 478 | year={2019} 479 | } 480 | 481 | @article{RcfEdgePami2019, 482 | author = {Yun Liu and Ming-Ming Cheng and Xiaowei Hu and Jia-Wang Bian and Le Zhang and Xiang Bai and Jinhui Tang}, 483 | title = {Richer Convolutional Features for Edge Detection}, 484 | year = {2019}, 485 | journal= {IEEE TPAMI}, 486 | volume={41}, 487 | number={8}, 488 | pages={1939 - 1946}, 489 | doi = {10.1109/TPAMI.2018.2878849}, 490 | } 491 | 492 | @InProceedings{Wang_2019_CVPR, 493 | author = {Wang, Yukang and Xu, Yongchao and Tsogkas, Stavros and Bai, Xiang and Dickinson, Sven and Siddiqi, Kaleem}, 494 | title = {DeepFlux for Skeletons in the Wild}, 495 | booktitle = {CVPR}, 496 | month = {June}, 497 | year = {2019} 498 | } 499 | @InProceedings{Hu_2018_CVPR, 500 | author = {Hu, Xiaowei and Zhu, Lei and Fu, Chi-Wing and Qin, Jing and Heng, Pheng-Ann}, 501 | title = {Direction-Aware Spatial Context Features for Shadow Detection}, 502 | booktitle = {CVPR}, 503 | month = {June}, 504 | year = {2018} 505 | } 506 | @InProceedings{MobileNetV2, 507 | title = {MobileNetV2: Inverted Residuals and Linear Bottlenecks}, 508 | author = {Mark Sandler and 509 | Andrew G. Howard and 510 | Menglong Zhu and 511 | Andrey Zhmoginov and 512 | Liang{-}Chieh Chen}, 513 | booktitle = {CVPR}, 514 | month = {June}, 515 | year = {2018} 516 | } -------------------------------------------------------------------------------- /latex/iccv.sty: -------------------------------------------------------------------------------- 1 | % --------------------------------------------------------------- 2 | % 3 | % $Id: iccv.sty,v 1.3 2005/10/24 19:56:15 awf Exp $ 4 | % 5 | % by Paolo.Ienne@di.epfl.ch 6 | % some mods by awf@acm.org 7 | % 8 | % --------------------------------------------------------------- 9 | % 10 | % no guarantee is given that the format corresponds perfectly to 11 | % IEEE 8.5" x 11" Proceedings, but most features should be ok. 
12 | % 13 | % --------------------------------------------------------------- 14 | % with LaTeX2e: 15 | % ============= 16 | % 17 | % use as 18 | % \documentclass[times,10pt,twocolumn]{article} 19 | % \usepackage{latex8} 20 | % \usepackage{times} 21 | % 22 | % --------------------------------------------------------------- 23 | 24 | % with LaTeX 2.09: 25 | % ================ 26 | % 27 | % use as 28 | % \documentstyle[times,art10,twocolumn,latex8]{article} 29 | % 30 | % --------------------------------------------------------------- 31 | % with both versions: 32 | % =================== 33 | % 34 | % specify \iccvfinalcopy to emit the final camera-ready copy 35 | % 36 | % specify references as 37 | % \bibliographystyle{ieee} 38 | % \bibliography{...your files...} 39 | % 40 | % --------------------------------------------------------------- 41 | 42 | \usepackage{eso-pic} 43 | \usepackage{xspace} 44 | 45 | \typeout{ICCV 8.5 x 11-Inch Proceedings Style `iccv.sty'.} 46 | 47 | % ten point helvetica bold required for captions 48 | % eleven point times bold required for second-order headings 49 | % in some sites the name of the fonts may differ, 50 | % change the name here: 51 | \font\iccvtenhv = phvb at 8pt % *** IF THIS FAILS, SEE iccv.sty *** 52 | \font\elvbf = ptmb scaled 1100 53 | 54 | % If the above lines give an error message, try to comment them and 55 | % uncomment these: 56 | %\font\iccvtenhv = phvb7t at 8pt 57 | %\font\elvbf = ptmb7t scaled 1100 58 | 59 | % set dimensions of columns, gap between columns, and paragraph indent 60 | \setlength{\textheight}{8.875in} 61 | \setlength{\textwidth}{6.875in} 62 | \setlength{\columnsep}{0.3125in} 63 | \setlength{\topmargin}{0in} 64 | \setlength{\headheight}{0in} 65 | \setlength{\headsep}{0in} 66 | \setlength{\parindent}{1pc} 67 | \setlength{\oddsidemargin}{-.304in} 68 | \setlength{\evensidemargin}{-.304in} 69 | 70 | \newif\ificcvfinal 71 | \iccvfinalfalse 72 | \def\iccvfinalcopy{\global\iccvfinaltrue} 73 | 74 | % memento from size10.clo 75 | % \normalsize{\@setfontsize\normalsize\@xpt\@xiipt} 76 | % \small{\@setfontsize\small\@ixpt{11}} 77 | % \footnotesize{\@setfontsize\footnotesize\@viiipt{9.5}} 78 | % \scriptsize{\@setfontsize\scriptsize\@viipt\@viiipt} 79 | % \tiny{\@setfontsize\tiny\@vpt\@vipt} 80 | % \large{\@setfontsize\large\@xiipt{14}} 81 | % \Large{\@setfontsize\Large\@xivpt{18}} 82 | % \LARGE{\@setfontsize\LARGE\@xviipt{22}} 83 | % \huge{\@setfontsize\huge\@xxpt{25}} 84 | % \Huge{\@setfontsize\Huge\@xxvpt{30}} 85 | 86 | \def\@maketitle 87 | { 88 | \newpage 89 | \null 90 | \vskip .375in 91 | \begin{center} 92 | {\Large \bf \@title \par} 93 | % additional two empty lines at the end of the title 94 | \vspace*{24pt} 95 | { 96 | \large 97 | \lineskip .5em 98 | \begin{tabular}[t]{c} 99 | \ificcvfinal\@author\else Anonymous ICCV submission\\ 100 | \vspace*{1pt}\\%This space will need to be here in the final copy, so don't squeeze it out for the review copy. 
101 | Paper ID \iccvPaperID \fi 102 | \end{tabular} 103 | \par 104 | } 105 | % additional small space at the end of the author name 106 | \vskip .5em 107 | % additional empty line at the end of the title block 108 | \vspace*{12pt} 109 | \end{center} 110 | } 111 | 112 | \def\abstract 113 | {% 114 | \centerline{\large\bf Abstract}% 115 | \vspace*{12pt}% 116 | \it% 117 | } 118 | 119 | \def\endabstract 120 | { 121 | % additional empty line at the end of the abstract 122 | \vspace*{12pt} 123 | } 124 | 125 | \def\affiliation#1{\gdef\@affiliation{#1}} \gdef\@affiliation{} 126 | 127 | \newlength{\@ctmp} 128 | \newlength{\@figindent} 129 | \setlength{\@figindent}{1pc} 130 | 131 | \long\def\@makecaption#1#2{ 132 | \setbox\@tempboxa\hbox{\small \noindent #1.~#2} 133 | \setlength{\@ctmp}{\hsize} 134 | \addtolength{\@ctmp}{-\@figindent}\addtolength{\@ctmp}{-\@figindent} 135 | % IF longer than one indented paragraph line 136 | \ifdim \wd\@tempboxa >\@ctmp 137 | % THEN DON'T set as an indented paragraph 138 | {\small #1.~#2\par} 139 | \else 140 | % ELSE center 141 | \hbox to\hsize{\hfil\box\@tempboxa\hfil} 142 | \fi} 143 | 144 | % correct heading spacing and type 145 | \def\iccvsection{\@startsection {section}{1}{\z@} 146 | {10pt plus 2pt minus 2pt}{7pt} {\large\bf}} 147 | \def\iccvssect#1{\iccvsection*{#1}} 148 | \def\iccvsect#1{\iccvsection{\hskip -1em.~#1}} 149 | \def\section{\@ifstar\iccvssect\iccvsect} 150 | 151 | \def\iccvsubsection{\@startsection {subsection}{2}{\z@} 152 | {8pt plus 2pt minus 2pt}{6pt} {\elvbf}} 153 | \def\iccvssubsect#1{\iccvsubsection*{#1}} 154 | \def\iccvsubsect#1{\iccvsubsection{\hskip -1em.~#1}} 155 | \def\subsection{\@ifstar\iccvssubsect\iccvsubsect} 156 | 157 | %% --------- Page background marks: Ruler and confidentiality 158 | 159 | % ----- define vruler 160 | \makeatletter 161 | \newbox\iccvrulerbox 162 | \newcount\iccvrulercount 163 | \newdimen\iccvruleroffset 164 | \newdimen\cv@lineheight 165 | \newdimen\cv@boxheight 166 | \newbox\cv@tmpbox 167 | \newcount\cv@refno 168 | \newcount\cv@tot 169 | % NUMBER with left flushed zeros \fillzeros[] 170 | \newcount\cv@tmpc@ \newcount\cv@tmpc 171 | \def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi 172 | \cv@tmpc=1 % 173 | \loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi 174 | \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat 175 | \ifnum#2<0\advance\cv@tmpc1\relax-\fi 176 | \loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat 177 | \cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}% 178 | % \makevruler[][][][][] 179 | \def\makevruler[#1][#2][#3][#4][#5]{\begingroup\offinterlineskip 180 | \textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt% 181 | \global\setbox\iccvrulerbox=\vbox to \textheight{% 182 | {\parskip=0pt\hfuzz=150em\cv@boxheight=\textheight 183 | \cv@lineheight=#1\global\iccvrulercount=#2% 184 | \cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2% 185 | \cv@refno1\vskip-\cv@lineheight\vskip1ex% 186 | \loop\setbox\cv@tmpbox=\hbox to0cm{{\iccvtenhv\hfil\fillzeros[#4]\iccvrulercount}}% 187 | \ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break 188 | \advance\cv@refno1\global\advance\iccvrulercount#3\relax 189 | \ifnum\cv@refno<\cv@tot\repeat}}\endgroup}% 190 | \makeatother 191 | % ----- end of vruler 192 | 193 | % \makevruler[][][][][] 194 | \def\iccvruler#1{\makevruler[12pt][#1][1][3][0.993\textheight]\usebox{\iccvrulerbox}} 195 | \newdimen\iccvtmppos 
196 | \AddToShipoutPicture{% 197 | \ificcvfinal\else 198 | %\AtTextLowerLeft{% 199 | % \color[gray]{.15}\framebox(\LenToUnit{\textwidth},\LenToUnit{\textheight}){} 200 | %} 201 | \iccvruleroffset=\textheight 202 | \advance\iccvruleroffset by -3.7pt 203 | \color[rgb]{.5,.5,1} 204 | \AtTextUpperLeft{% 205 | \put(\LenToUnit{-35pt},\LenToUnit{-\iccvruleroffset}){%left ruler 206 | \iccvruler{\iccvrulercount}} 207 | \iccvtmppos=\textwidth\advance\iccvtmppos by 30pt 208 | \put(\LenToUnit{\iccvtmppos},\LenToUnit{-\iccvruleroffset}){%right ruler 209 | \iccvruler{\iccvrulercount}} 210 | } 211 | \def\pid{\parbox{1in}{\begin{center}\bf\sf{\small ICCV}\\\#\iccvPaperID\end{center}}} 212 | \AtTextUpperLeft{%paperID in corners 213 | \put(\LenToUnit{-65pt},\LenToUnit{45pt}){\pid} 214 | \iccvtmppos=\textwidth\advance\iccvtmppos by -8pt 215 | \put(\LenToUnit{\iccvtmppos},\LenToUnit{45pt}){\pid} 216 | } 217 | \AtTextUpperLeft{%confidential 218 | \put(0,\LenToUnit{1cm}){\parbox{\textwidth}{\centering\iccvtenhv 219 | ICCV 2019 Submission \#\iccvPaperID. CONFIDENTIAL REVIEW COPY. DO NOT DISTRIBUTE.}} 220 | } 221 | \fi 222 | } 223 | 224 | %%% Make figure placement a little more predictable. 225 | % We trust the user to move figures if this results 226 | % in ugliness. 227 | % Minimize bad page breaks at figures 228 | \renewcommand{\textfraction}{0.01} 229 | \renewcommand{\floatpagefraction}{0.99} 230 | \renewcommand{\topfraction}{0.99} 231 | \renewcommand{\bottomfraction}{0.99} 232 | \renewcommand{\dblfloatpagefraction}{0.99} 233 | \renewcommand{\dbltopfraction}{0.99} 234 | \setcounter{totalnumber}{99} 235 | \setcounter{topnumber}{99} 236 | \setcounter{bottomnumber}{99} 237 | 238 | % Add a period to the end of an abbreviation unless there's one 239 | % already, then \xspace. 240 | \makeatletter 241 | \DeclareRobustCommand\onedot{\futurelet\@let@token\@onedot} 242 | \def\@onedot{\ifx\@let@token.\else.\null\fi\xspace} 243 | 244 | \def\eg{\emph{e.g}\onedot} \def\Eg{\emph{E.g}\onedot} 245 | \def\ie{\emph{i.e}\onedot} \def\Ie{\emph{I.e}\onedot} 246 | \def\cf{\emph{c.f}\onedot} \def\Cf{\emph{C.f}\onedot} 247 | \def\etc{\emph{etc}\onedot} \def\vs{\emph{vs}\onedot} 248 | \def\wrt{w.r.t\onedot} \def\dof{d.o.f\onedot} 249 | \def\etal{\emph{et al}\onedot} 250 | \makeatother 251 | 252 | % --------------------------------------------------------------- 253 | -------------------------------------------------------------------------------- /latex/iccv2019fmeasure.tex: -------------------------------------------------------------------------------- 1 | %\documentclass[10pt,onecolumn,letterpaper]{article} 2 | \documentclass[10pt,twocolumn,letterpaper]{article} 3 | 4 | \usepackage{iccv} 5 | \usepackage{times} 6 | \usepackage{epsfig} 7 | \usepackage{graphicx, overpic} 8 | \usepackage{wrapfig,lipsum,booktabs} 9 | 10 | \usepackage{enumitem} 11 | \usepackage[breaklinks=true,colorlinks,citecolor=blue,urlcolor=blue,bookmarks=false]{hyperref} 12 | \usepackage{amsmath} 13 | \usepackage{amssymb} 14 | \newcommand{\argmin}{\mathop{\mathrm{argmin}}\limits} 15 | \newcommand{\argmax}{\mathop{\mathrm{argmax}}\limits} 16 | 17 | \newcommand{\ConfInf}{\vspace{-.7in} {\normalsize \normalfont \color{blue}{ 18 | IEEE International Conference on Computer Vision (ICCV) 2019}} \vspace{.45in} \\} 19 | 20 | % Include other packages here, before hyperref. 21 | 22 | % If you comment hyperref and then uncomment it, you should delete 23 | % egpaper.aux before re-running latex. 
(Or just hit 'q' on the first latex 24 | % run, let it finish, and you should be clear). 25 | 26 | \iccvfinalcopy % *** Uncomment this line for the final submission 27 | 28 | 29 | %\newcommand{\GramaCheck}{} 30 | \ifdefined \GramaCheck 31 | \newcommand{\CheckRmv}[1]{} 32 | \newcommand{\figref}[1]{Figure 1}% 33 | \newcommand{\tabref}[1]{Table 1}% 34 | \newcommand{\secref}[1]{Section 1} 35 | \newcommand{\equref}[1]{Equation 1} 36 | \else 37 | \newcommand{\CheckRmv}[1]{#1} 38 | \newcommand{\figref}[1]{Fig.~\ref{#1}}% 39 | \newcommand{\tabref}[1]{Tab.~\ref{#1}}% 40 | \newcommand{\secref}[1]{Sec.~\ref{#1}} 41 | \newcommand{\equref}[1]{Eq.~(\ref{#1})} 42 | \fi 43 | 44 | 45 | \def\iccvPaperID{1406} % *** Enter the ICCV Paper ID here 46 | \def\httilde{\mbox{\tt\raisebox{-.5ex}{\symbol{126}}}} 47 | 48 | % Pages are numbered in submission mode, and unnumbered in camera-ready 49 | \ificcvfinal\pagestyle{empty}\fi 50 | \begin{document} 51 | 52 | %%%%%%%%% TITLE 53 | % \title{Optimizing the F-measure for Threshold-free Salient Object Detection} 54 | \title{\ConfInf Optimizing the F-measure for Threshold-free Salient Object Detection} 55 | 56 | %%%%%%%%% AUTHORS 57 | % Kai Zhao, Shanghua Gao, Wenguan Wang, Ming-Ming Cheng 58 | \author{ 59 | Kai Zhao\textsuperscript{1}, Shanghua Gao\textsuperscript{1}, 60 | Wenguan Wang\textsuperscript{2}, 61 | Ming-Ming Cheng\textsuperscript{1}\thanks{M.M. Cheng is the corresponding author.}\\ 62 | \textsuperscript{1}TKLNDST, CS, Nankai University~~~~ 63 | \textsuperscript{2}Inception Institute of Artificial Intelligence\\ 64 | {\tt\small \{kaiz.xyz,shanghuagao,wenguanwang.ai\}@gmail.com,cmm@nankai.edu.cn} 65 | } 66 | 67 | \maketitle 68 | \thispagestyle{empty} 69 | 70 | 71 | %%%%%%%%% ABSTRACT 72 | \begin{abstract} 73 | Current CNN-based solutions to salient object detection (SOD) 74 | mainly rely on the optimization of cross-entropy loss (CELoss). 75 | % 76 | Then the quality of detected saliency maps is often evaluated in 77 | terms of F-measure. 78 | % 79 | In this paper, we investigate an interesting issue: 80 | can we consistently use the F-measure formulation in both training and evaluation 81 | for SOD? 82 | % 83 | By reformulating the standard F-measure, 84 | we propose the \emph{relaxed F-measure} which is differentiable w.r.t 85 | the posterior and can be easily appended to the back of CNNs as the loss function. 86 | % 87 | Compared to the conventional cross-entropy loss of which the gradients decrease 88 | dramatically in the saturated area, 89 | our loss function, named FLoss, holds considerable gradients even when the activation 90 | approaches the target. 91 | % 92 | Consequently, the FLoss can continuously force the network 93 | to produce polarized activations. 94 | % 95 | Comprehensive benchmarks on several popular datasets show that FLoss 96 | outperforms the state-of-the-art with a considerable margin. 97 | % 98 | More specifically, due to the polarized predictions, 99 | our method is able to obtain high-quality saliency maps without carefully tuning 100 | the optimal threshold, showing significant advantages in real-world applications. 101 | Code and pretrained models are available at \url{http://kaizhao.net/fmeasure}. 102 | \end{abstract} 103 | 104 | %%%%%%%%% BODY TEXT 105 | \section{Introduction} 106 | %------------------------------------------------------------------------------------------ 107 | We consider the task of salient object detection (SOD), where each pixel of a 108 | given image has to be classified as salient (outstanding) or not. 
109 | % 110 | The human visual system is able to perceive and process visual signals 111 | distinctively: regions of interest are perceived and analyzed with high priority 112 | while other regions draw less attention. 113 | % 114 | This capacity has been long studied in the computer vision community 115 | under the name of `salient object detection', 116 | since it can ease the procedure of scene understanding~\cite{borji2015salient}. 117 | % 118 | The performance of modern salient object detection methods is often evaluated 119 | in terms of F-measure. 120 | % 121 | Rooted in information retrieval~\cite{van1974foundation}, 122 | the F-measure is widely used as an evaluation metric 123 | in tasks where elements of a specified class have to be retrieved, 124 | especially when the relevant class is rare. 125 | % 126 | Given the per-pixel prediction $\hat{Y} (\hat{y}_i \!\in\! [0, 1], i\!=\!1,...,|Y|)$ 127 | and the ground-truth saliency map $Y (y_i \!\in\! \{0, 1\}, i\!=\!1,...,|Y|)$, 128 | a threshold $t$ is applied to obtain the binarized prediction 129 | $\dot{Y}^t (\dot{y}^t_i \!\in\! \{0, 1\}, i\!=\!1,...,|Y|)$. 130 | % 131 | The F-measure is then defined as the harmonic mean of precision and recall: 132 | \begin{equation}\small 133 | \!\!F(Y, \dot{Y}^t) \!=\! 134 | (1\!+\!\beta^2)\frac{\text{precision}(Y, \dot{Y}^t) \cdot \text{recall}(Y, \dot{Y}^t)} 135 | {\beta^2 \text{precision}(Y, \dot{Y}^t) \!+\! \text{recall}(Y, \dot{Y}^t)}, 136 | \label{eq:def-f} 137 | \end{equation} 138 | where $\beta^2\!>\!0$ is a balance factor between precision and recall. 139 | % 140 | When $\beta^2\!>\!1$, the F-measure is biased in favour of recall 141 | and otherwise in favour of precision. 142 | %the F-measure considers precision more than recall. 143 | 144 | Most CNN-based solutions for SOD 145 | \cite{hou2017deeply,li2016deep,wang2016saliency,fan2019shifting,wang2019iterative, 146 | zhao2019EGNet,wang2019salient} 147 | mainly rely on the optimization of 148 | \emph{cross-entropy loss} (CELoss) in an FCN~\cite{long2015fully} architecture, 149 | and the quality of saliency maps is often assessed by the F-measure. 150 | % 151 | Optimizing the pixel-independent CELoss can be regarded as minimizing the mean absolute 152 | error (MAE=$\frac{1}{N}\sum_i^N |\hat{y}_i - y_i|$), because in both circumstances 153 | each prediction/ground-truth pair works independently and contributes 154 | to the final score equally. 155 | % 156 | If the data labels have a biased distribution, models trained with CELoss 157 | would make biased predictions towards the majority class. 158 | % 159 | Therefore, SOD models trained with CELoss hold a biased prior and tend to 160 | predict unknown pixels as the background, 161 | consequently leading to low-recall detections. 162 | % 163 | %The same problem also exists in edge detection and some studies alleviate this problem 164 | %by class-specific loss weights~\cite{xie2015holistically}. 165 | % 166 | The F-measure~\cite{van1974foundation} is a more sophisticated 167 | and comprehensive evaluation metric which combines 168 | precision and recall into a single score 169 | and automatically offsets the imbalance between positive/negative samples. 170 | 171 | 172 | In this paper, we provide a unified formulation % investigate to 173 | in both training and evaluation for SOD. 174 | % 175 | By directly taking the evaluation metric, 176 | \emph{i.e.} the F-measure, as the optimization target, 177 | we perform F-measure maximization in an end-to-end manner.
178 | % 179 | To perform end-to-end learning, 180 | we propose the \emph{relaxed F-measure} to overcome the non-differentiability 181 | of the standard F-measure formulation. 182 | % 183 | The proposed loss function, named FLoss, is decomposable 184 | w.r.t the posterior $\hat{Y}$ and 185 | thus can be appended to the back of a CNN as supervision without effort. 186 | % 187 | We test the FLoss on several state-of-the-art SOD 188 | architectures and witness a noticeable performance gain. 189 | % 190 | Furthermore, the proposed FLoss holds considerable gradients even in the saturated area, 191 | resulting in polarized predictions that are stable against the choice of threshold. 192 | % 193 | Our proposed FLoss enjoys three favorable properties:\vspace{-3pt} 194 | \begin{itemize}[noitemsep] 195 | \item Threshold-free salient object detection. 196 | % 197 | Models trained with FLoss produce contrastive saliency maps in which 198 | the foreground and background are clearly separated. 199 | % 200 | Therefore, 201 | FLoss can achieve high performance under a wide range of thresholds. 202 | 203 | \item Being able to deal with unbalanced data. 204 | % 205 | Defined as the harmonic mean of precision and recall, the F-measure is able to establish a 206 | balance between samples of different classes. 207 | % 208 | We experimentally show that our method can find a better compromise between 209 | precision and recall. 210 | 211 | \item Fast convergence. Our method quickly learns to focus on 212 | salient object areas after only hundreds of iterations, 213 | showing fast convergence speed. 214 | \end{itemize} 215 | %------------------------------------------------------------------------------------------ 216 | \section{Related Work} 217 | %------------------------------------------------------------------------------------------ 218 | We review several CNN-based architectures for SOD and 219 | the literature related to F-measure optimization. 220 | 221 | \paragraph{Salient Object Detection (SOD).} 222 | The convolutional neural network (CNN) has proven to be dominant in many sub-areas 223 | of computer vision. 224 | % 225 | Significant progress has been achieved since the introduction of CNNs to SOD. 226 | % 227 | The DHS net~\cite{liu2016dhsnet} is one of the pioneering works using CNNs for SOD. 228 | % 229 | DHS first produces a coarse saliency map with global cues, including contrast, objectness, \etc. 230 | Then the coarse map is progressively refined with a hierarchical recurrent CNN. 231 | % 232 | The emergence of the fully convolutional network (FCN)~\cite{long2015fully} provides an 233 | elegant way to perform end-to-end pixel-wise inference. 234 | % 235 | DCL~\cite{li2016deep} uses a two-stream architecture to process contrast information 236 | at both pixel and patch levels. 237 | % 238 | % 239 | The FCN-based sub-stream produces a saliency map with pixel-wise accuracy, 240 | and the other network stream performs inference on each object segment. 241 | % 242 | Finally, a fully connected CRF~\cite{krahenbuhl2011efficient} is used to 243 | combine the pixel-level and segment-level semantics. 244 | 245 | 246 | % 247 | Rooted in HED~\cite{xie2015holistically} for edge detection, 248 | aggregating multi-scale side-outputs has proven effective in refining dense predictions, 249 | especially when detailed local structures need to be preserved. 250 | % 251 | In HED-like architectures, deeper side-outputs capture rich semantics and 252 | shallower side-outputs contain high-resolution details.
253 | % 254 | Combining these representations from different levels leads to significant performance 255 | improvements. 256 | % 257 | DSS~\cite{hou2017deeply} introduces deep-to-shallow short connections 258 | across different side-outputs to refine the shallow side-outputs with deep semantic features. 259 | % 260 | The deep-to-shallow short connections enable the shallow side-outputs to 261 | distinguish real salient objects from the background 262 | and meanwhile retain the high resolution. 263 | % 264 | Liu \etal \cite{Liu2019PoolSal} design a pooling-based module to 265 | efficiently fuse convolutional features from a top-down pathway. 266 | % 267 | The idea of imposing top-down refinement has also 268 | been adopted in Amulet~\cite{zhang2017amulet}, 269 | and enhanced by Zhao \etal~\cite{zhao2018hifi} with bi-directional refinement. 270 | Later, Wang \etal \cite{wang2018salient} propose a visual attention-driven model 271 | that bridges the gap between SOD and eye fixation prediction. 272 | % 273 | The methods mentioned above try to refine SOD by 274 | introducing more powerful network architectures, 275 | from recurrent refinement networks to multi-scale side-output fusion. 276 | % 277 | We refer the readers to a recent survey~\cite{BorjiCVM2019} for more details. 278 | 279 | \paragraph{F-measure Optimization.} 280 | Despite having been utilized as a common performance metric 281 | in many application domains, 282 | optimizing the F-measure did not draw much attention until very recently. 283 | % 284 | The works aiming at optimizing the F-measure can be divided into two subcategories 285 | ~\cite{dembczynski2013optimizing}: 286 | (a) structured loss minimization methods such as~\cite{petterson2010reverse, petterson2011submodular} 287 | which optimize the F-measure as the target during training; 288 | and (b) plug-in rule approaches which optimize the F-measure during the inference phase 289 | ~\cite{jansche2007maximum,dembczynski2011exact,quevedo2012multilabel,nan2012optimizing}. 290 | 291 | Much of the attention has been drawn to the study of the latter subcategory: 292 | finding an optimal threshold value which leads to a maximal F-measure given the predicted 293 | posterior $\hat{Y}$. 294 | % 295 | There are few articles about optimizing the F-measure during the training phase. 296 | % 297 | Petterson \etal ~\cite{petterson2010reverse} optimize the F-measure indirectly 298 | by maximizing a loss function associated with the F-measure. 299 | % 300 | In their subsequent work~\cite{petterson2011submodular} they construct an 301 | upper bound of the discrete F-measure 302 | and then maximize the F-measure by optimizing its upper bound. 303 | % 304 | These previous studies either work as post-processing, 305 | or are non-differentiable w.r.t the posteriors, 306 | making them hard to apply in a deep learning framework.
307 | 308 | %------------------------------------------------------------------------------------------ 309 | \section{Optimizing the F-measure for SOD} 310 | %------------------------------------------------------------------------------------------ 311 | 312 | %------------------------------------------------------------------------------------------ 313 | \subsection{The Relaxed F-measure} 314 | %------------------------------------------------------------------------------------------ 315 | In the standard F-measure, the true positive, 316 | false positive and false negative are defined as the numbers of corresponding samples: 317 | \begin{equation} 318 | \begin{split} 319 | TP(\dot{Y}^t, Y) &= \sum\nolimits_i 1(y_i==1 \ \text{and} \ \dot{y}^t_i==1), \\ 320 | FP(\dot{Y}^t, Y) &= \sum\nolimits_i 1(y_i==0 \ \text{and} \ \dot{y}^t_i==1), \\ 321 | FN(\dot{Y}^t, Y) &= \sum\nolimits_i 1(y_i==1 \ \text{and} \ \dot{y}^t_i==0), \\ 322 | \end{split} 323 | \label{eq:tpfp0} 324 | \end{equation} 325 | where $\dot{Y}^t$ is the prediction binarized by threshold $t$ 326 | and $Y$ is the ground-truth saliency map. 327 | % 328 | $1(\cdot)$ is an indicator function that evaluates to $1$ if its argument is true and 0 otherwise. 329 | 330 | To incorporate the F-measure into a CNN and optimize it in an end-to-end manner, 331 | we define a decomposable F-measure that is differentiable over the posterior $\hat{Y}$. 332 | % 333 | Based on this motivation, we reformulate the true positive, false positive and false negative 334 | based on the continuous posterior $\hat{Y}$: 335 | \begin{equation} 336 | \begin{split} 337 | TP(\hat{Y}, Y) &= \sum\nolimits_i \hat{y}_i \cdot y_i, \\ 338 | FP(\hat{Y}, Y) &= \sum\nolimits_i \hat{y}_i \cdot (1 - y_i), \\ 339 | FN(\hat{Y}, Y) &= \sum\nolimits_i (1-\hat{y}_i) \cdot y_i \ . \\ 340 | \end{split} 341 | \label{eq:tpfp} 342 | \end{equation} 343 | %Similar formulation has been used in~\cite{margolin2014evaluate} to evaluate the quality 344 | %of saliency maps, which is proven to be consistent with human perception. 345 | % 346 | Given the definitions in Eq.~\ref{eq:tpfp}, precision $p$ and recall $r$ are: 347 | \begin{equation} 348 | p(\hat{Y}, Y) = \frac{TP}{TP + FP},\quad r(\hat{Y}, Y) = \frac{TP}{TP + FN}. 349 | \label{pr} 350 | \end{equation} 351 | % 352 | Finally, our \emph{relaxed F-measure} can be written as: 353 | \begin{equation} 354 | \begin{split} 355 | F(\hat{Y}, Y) &= \frac{(1+\beta^2) p \cdot r}{\beta^2 p + r} ,\\ 356 | &= \frac{(1 + \beta^2)TP}{\beta^2(TP + FN) + (TP + FP)} ,\\ 357 | &= \frac{(1 + \beta^2)TP}{H}, 358 | \end{split} 359 | \label{f} 360 | \end{equation} 361 | where $H\! =\! \beta^2(TP + FN) + (TP + FP)$. 362 | Due to the relaxation in Eq.~\ref{eq:tpfp}, Eq.~\ref{f} is decomposable w.r.t the 363 | posterior $\hat{Y}$, and therefore can be integrated into a CNN architecture trained with 364 | back-prop. 365 | 366 | %------------------------------------------------------------------------------------------ 367 | \subsection{Maximizing F-measure in CNNs} 368 | %------------------------------------------------------------------------------------------ 369 | In order to maximize the \emph{relaxed F-measure} in CNNs in an end-to-end manner, 370 | we define our proposed F-measure-based loss (FLoss) function $\mathcal{L}_{F}$ as: 371 | \begin{equation} 372 | \mathcal{L}_{F}(\hat{Y}, Y) = 1 - F = 1 - \frac{(1 + \beta^2)TP}{H}\label{eq:floss}.
373 | \end{equation} 374 | Minimizing $\mathcal{L}_{F}(\hat{Y}, Y)$ is equivalent to maximizing the \emph{relaxed F-measure}. 375 | % 376 | Note again that $\mathcal{L}_{F}$ is calculated directly from the raw prediction $\hat{Y}$ without 377 | thresholding. 378 | % 379 | Therefore, $\mathcal{L}_{F}$ is differentiable 380 | over the prediction $\hat{Y}$ and can be plugged into CNNs. 381 | % 382 | The partial derivative of loss $\mathcal{L}_{F}$ over network activation $\hat{Y}$ at location $i$ is: 383 | \begin{equation} 384 | \begin{split} 385 | \frac{\partial \mathcal{L}_{F}}{\partial \hat{y}_i} 386 | &= -\frac{\partial F}{\partial \hat{y}_i} \\ 387 | &= -\Big(\frac{\partial F}{\partial TP}\cdot \frac{\partial TP}{\partial \hat{y}_i} + 388 | \frac{\partial F}{\partial H }\cdot \frac{\partial H }{\partial \hat{y}_i}\Big) \\ 389 | &= -\Big(\frac{(1+\beta^2)y_i}{H} - \frac{(1+\beta^2)TP}{H^2}\Big) \\ 390 | &= \frac{(1+\beta^2)TP}{H^2} - \frac{(1+\beta^2)y_i}{H} .\\ 391 | \end{split}\label{eq:grad-floss} 392 | \end{equation} 393 | 394 | There is an alternative to Eq.~\ref{eq:floss} which maximizes the logarithm of the F-measure: 395 | \begin{equation} 396 | \mathcal{L}_{\log F}(\hat{Y}, Y) = -\log(F)\label{eq:logfloss}, 397 | \end{equation} 398 | and the corresponding gradient is 399 | \begin{equation} 400 | \frac{\partial \mathcal{L}_{\log F}}{\partial \hat{y}_i} = 401 | \frac{1}{F}\left[\frac{(1+\beta^2)TP}{H^2} - \frac{(1+\beta^2)y_i}{H}\right]. \\ 402 | \label{eq:grad-logfloss} 403 | \end{equation} 404 | We will theoretically and experimentally analyze the advantage of 405 | FLoss over Log-FLoss and CELoss in terms of 406 | producing polarized and high-contrast saliency maps. 407 | 408 | %------------------------------------------------------------------------------------------ 409 | \subsection{FLoss vs Cross-entropy Loss}\label{sec:cel-vs-floss} 410 | %------------------------------------------------------------------------------------------ 411 | To demonstrate the superiority of our FLoss over the alternative Log-FLoss and 412 | the \emph{cross-entropy loss} (CELoss), 413 | we compare the definition, gradient and surface plots of these three loss functions. 414 | % 415 | The definition of CELoss is: 416 | \begin{equation} 417 | \mathcal{L}_{CE}(\hat{Y}, Y) \!=\! -\sum\nolimits_{i=1}^{|Y|} 418 | \left(y_i \log{\hat{y}_i} + (1\!-\!y_i) \log{(1\!-\!\hat{y}_i)}\right), 419 | \label{eq:celoss} 420 | \end{equation} 421 | where $i$ is the spatial location in the input image and $|Y|$ is the number of pixels of the input image. 422 | % 423 | The gradient of $\mathcal{L}_{CE}$ w.r.t prediction $\hat{y}_i$ is: 424 | \begin{equation} 425 | \frac{\partial \mathcal{L}_{CE}}{\partial \hat{y}_i} = \frac{1 - y_i}{1 - \hat{y}_i} - \frac{y_i}{\hat{y}_i}. 426 | \label{eq:grad-celoss} 427 | \end{equation} 428 | 429 | \CheckRmv{ 430 | \begin{figure*}[!htb] 431 | \centering 432 | \begin{overpic}[width=0.9\linewidth]{figures/loss-surface-crop} 433 | \put(7,53){FLoss (Eq.~\ref{eq:floss})} 434 | \put(42,53){Log-FLoss (Eq.~\ref{eq:logfloss})} 435 | \put(77,53){CELoss (Eq.~\ref{eq:celoss})} 436 | \put(7,25){FLoss (Eq.~\ref{eq:floss})} 437 | \put(42,25){Log-FLoss (Eq.~\ref{eq:logfloss})} 438 | \put(77,25){CELoss (Eq.~\ref{eq:celoss})} 439 | \put(-3,38){\rotatebox{90}{\small{GT=[0, 1]}}} 440 | \put(-3,10){\rotatebox{90}{\small{GT=[1, 1]}}} 441 | \end{overpic}\vspace{2pt} 442 | \caption{Surface plot of different loss functions in a 2-point 2-class classification setting.
443 | %
444 | Columns from left to right: F-measure loss defined in Eq.~\ref{eq:floss},
445 | log F-measure loss defined in Eq.~\ref{eq:logfloss} and
446 | cross-entropy loss in Eq.~\ref{eq:celoss}.
447 | %
448 | In the top row the ground-truth is [0, 1] and in the bottom row the ground-truth
449 | is [1, 1].
450 | %
451 | Compared with cross-entropy loss and Log-FLoss,
452 | FLoss holds considerable gradient even in the saturated area, which forces
453 | the network to produce polarized predictions.
454 | }\label{fig:loss-surface}\vspace{-6pt}
455 | \end{figure*}
456 | }
457 | 
458 | As revealed in Eq.~\ref{eq:grad-floss} and Eq.~\ref{eq:grad-celoss}, the gradient of CELoss
459 | $\frac{\partial \mathcal{L}_{CE}}{\partial \hat{y}_i}$ relies only on the
460 | prediction/ground-truth of a single pixel $i$,
461 | whereas in FLoss $\frac{\partial \mathcal{L}_{F}}{\partial \hat{y}_i}$
462 | is globally determined by
463 | the predictions and ground-truth of ALL pixels in the image.
464 | %
465 | We further compare the surface plots
466 | of FLoss, Log-FLoss and CELoss in a two-point binary classification problem.
467 | %
468 | The results are in Fig.~\ref{fig:loss-surface}.
469 | %
470 | The two spatial axes represent the predictions $\hat{y}_0$ and $\hat{y}_1$,
471 | and the $z$ axis indicates the loss value.
472 | 
473 | As shown in Fig.~\ref{fig:loss-surface},
474 | the gradient of FLoss differs from that of CELoss and Log-FLoss in two aspects:
475 | (1) Limited gradients: FLoss holds limited gradient values even when
476 | the predictions are far away from the ground-truth.
477 | %
478 | This is crucial for CNN training because it prevents the notorious gradient explosion problem.
479 | Consequently, FLoss allows larger learning rates in the training phase, as evidenced by
480 | our experiments.
481 | %
482 | (2) Considerable gradients in the saturated area: in CELoss, the gradient decays
483 | when the prediction gets closer to the ground-truth,
484 | while FLoss holds considerable gradients even in the saturated area.
485 | %
486 | This forces the network to produce polarized predictions.
487 | %
488 | Salient object detection examples in Fig.~\ref{fig:examples} illustrate the `high-contrast'
489 | and polarized predictions.
490 | 
491 | %------------------------------------------------------------------------------------------
492 | \section{Experiments and Analysis}
493 | %------------------------------------------------------------------------------------------
494 | 
495 | %------------------------------------------------------------------------------------------
496 | \subsection{Experimental Configurations}
497 | %------------------------------------------------------------------------------------------
498 | \textbf{Dataset and data augmentation.}
499 | We uniformly train our model and competitors on the MSRA-B~\cite{liu2011learning}
500 | training set for a fair comparison.
501 | %
502 | The MSRA-B dataset, with 5000 images in total, is equally split into training/testing
503 | subsets.
504 | %
505 | We test the trained models on 5 other SOD datasets:
506 | ECSSD~\cite{yan2013hierarchical},
507 | HKU-IS~\cite{li2015visual},
508 | PASCALS~\cite{li2014secrets},
509 | SOD~\cite{movahedi2010design},
510 | and DUT-OMRON~\cite{yang2013saliency}.
511 | %
512 | More statistics of these datasets are shown in Table~\ref{tab:dset-stats}.
513 | %
514 | It is worth mentioning that how challenging a dataset is depends on many factors,
515 | such as the number of images, the number of objects per image, the contrast of the salient objects w.r.t the background,
516 | the complexity of salient object structures, the center bias of salient objects and
517 | the size variance of images, \emph{etc}.
518 | %
519 | Analyzing these factors in detail is beyond the scope of this paper;
520 | we refer the readers to~\cite{dpfan2018soc} for a more thorough analysis of SOD datasets.
521 | 
522 | \CheckRmv{
523 | \begin{table}[!htb]
524 | \centering
525 | \renewcommand{\arraystretch}{1.00}
526 | \setlength\tabcolsep{4pt}
527 | \resizebox{0.45\textwidth}{!}{
528 | \begin{tabular}{r|c|c|c|c}
529 | \toprule[1pt]
530 | \textbf{Dataset~~~~} & \textbf{\#Images} &\textbf{Year} &\textbf{Pub.} & \textbf{Contrast} \\
531 | \midrule[1pt]
532 | MSRA-B~\cite{liu2011learning} & 5000 & 2011 & TPAMI & High\\
533 | ECSSD~\cite{yan2013hierarchical} & 1000 & 2013 & CVPR & High\\
534 | HKU-IS~\cite{li2015visual} & 1447 & 2015 & CVPR & Low \\
535 | PASCALS~\cite{li2014secrets} & 850 & 2014 & CVPR & Medium\\
536 | SOD~\cite{movahedi2010design} & 300 & 2010 & CVPRW & Low\\
537 | DUT-OMRON~\cite{yang2013saliency} & 5168 & 2013 & CVPR & Low\\
538 | \bottomrule[1pt]
539 | \end{tabular}
540 | }
541 | \vspace{4pt}
542 | \caption{Statistics of SOD datasets.
543 | `\#Images' indicates the number of images in a dataset
544 | and `Contrast' represents the general contrast between foreground and background.
545 | %
546 | The lower the contrast, the more challenging the dataset is.
547 | }\label{tab:dset-stats}\vspace{-5pt}
548 | \end{table}
549 | }
550 | 
551 | 
552 | %
553 | Data augmentation is critical to generating sufficient data for training deep CNNs.
554 | %
555 | For a fair comparison, we apply identical data augmentation to the original implementations and their FLoss variants.
556 | %
557 | For the DSS~\cite{hou2017deeply} and DHS~\cite{liu2016dhsnet} architectures we perform only
558 | horizontal flipping on both training images and saliency maps, following the original DSS.
559 | %
560 | Amulet~\cite{zhang2017amulet} only allows $256\!\times\!256$ inputs.
561 | %
562 | We randomly crop/pad the original data into square images, then resize them to meet this shape requirement.
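Apart from data preparation, the only change we make to each baseline is the loss layer.
For concreteness, the relaxed F-measure loss of Eqs.~\ref{eq:tpfp} and~\ref{eq:floss} can be
written in a few lines of PyTorch-style Python; this is a minimal illustrative sketch (the
function name \texttt{floss} and the default \texttt{beta2} value are ours, and the released
implementation may differ in details), with automatic differentiation supplying the gradient
of Eq.~\ref{eq:grad-floss}:
\begin{verbatim}
import torch

def floss(pred, target, beta2=0.3, eps=1e-8):
    # pred: posterior in [0, 1]; target: binary ground-truth of the same shape
    tp = (pred * target).sum()
    fp = (pred * (1.0 - target)).sum()
    fn = ((1.0 - pred) * target).sum()
    h = beta2 * (tp + fn) + (tp + fp)     # denominator H
    f = (1.0 + beta2) * tp / (h + eps)    # relaxed F-measure
    return 1.0 - f                        # L_F = 1 - F
\end{verbatim}
The small \texttt{eps} only guards against empty ground-truth maps; \texttt{beta2} plays the
role of $\beta^2$ and can be varied, as discussed later.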
563 | 564 | \CheckRmv{ 565 | \begin{table*}[!htb] 566 | \centering 567 | \scriptsize 568 | %\footnotesize 569 | \renewcommand{\arraystretch}{1.2} 570 | \renewcommand{\tabcolsep}{2pt} 571 | \resizebox{0.99\textwidth}{!}{ 572 | \begin{tabular}{lcc|ccc|ccc|ccc|ccc|ccc} 573 | % --------------------------------------------------------- 574 | \toprule[1pt] & 575 | \multicolumn{2}{c}{Training data} & 576 | \multicolumn{3}{c}{ECSSD~\cite{yan2013hierarchical}} & 577 | \multicolumn{3}{c}{HKU-IS~\cite{li2015visual}} & 578 | \multicolumn{3}{c}{PASCALS~\cite{li2014secrets}} & 579 | \multicolumn{3}{c}{SOD~\cite{movahedi2010design}} & 580 | \multicolumn{3}{c}{DUT-OMRON~\cite{movahedi2010design}}\\ 581 | % --------------------------------------------------------- 582 | \cmidrule(l){2-3} \cmidrule(l){4-6} \cmidrule(l){7-9} \cmidrule(l){10-13} 583 | \cmidrule(l){13-15} \cmidrule(l){16-18} 584 | % --------------------------------------------------------- 585 | Model & Train & \#Images & MaxF & MeanF & MAE & MaxF & MeanF & MAE & 586 | MaxF & MeanF & MAE & MaxF & MeanF & MAE & MaxF & MeanF & MAE \\ 587 | % --------------------------------------------------------- 588 | \midrule[1pt] 589 | % --------------------------------------------------------- 590 | % Cross enropy& 591 | % MB~\cite{liu2011learning} & 2.5K & 592 | % 0.908 & 0.889 & 0.060 & 0.899 & 0.877 & 0.048 & 593 | % 0.824 & 0.806 & 0.099 & 0.835 & 0.815 & 0.125 & 0.761 & 0.738 & 0.071 \\ 594 | % --------------------------------------------------------- 595 | Log-FLoss& 596 | MB~\cite{liu2011learning} & 2.5K & 597 | .909 & .891 & .057 & .903 & .881 & .043 & .823 & .808 & 598 | .101 & .838 & .817 & .122 & .770 & .741 & .062 \\ 599 | % --------------------------------------------------------- 600 | \textbf{FLoss}& 601 | MB~\cite{liu2011learning} & 2.5K & 602 | .914 & .903 & .050 & .908 & .896 & .038 & .829 & .818 & 603 | .091 & .843 & .838 & .111 & .777 & .755 & .067 \\ 604 | \bottomrule[1pt] 605 | \vspace{0pt} 606 | \end{tabular} 607 | } 608 | \caption{Performance comparison of Log-FLoss (Eq.~\ref{eq:logfloss}) and FLoss (Eq.~\ref{eq:floss}). 609 | % 610 | FLoss performs better than Log-FLoss on most datasets 611 | in terms of MaxF, MeanF and MAE. 612 | % 613 | Specifically FLoss enjoys a large improvement in terms of MeanF because 614 | of its high-contrast predictions. 615 | } 616 | \label{tab:floss-vs-logfloss} 617 | \end{table*} 618 | } 619 | 620 | 621 | \textbf{Network architecture and hyper-parameters.} 622 | We test our proposed FLoss on 3 baseline methods: 623 | Amulet~\cite{zhang2017amulet}, DHS~\cite{liu2011learning} and DSS~\cite{hou2017deeply}. 624 | % 625 | To verify the effectiveness of FLoss (Eq.~\ref{eq:floss}), 626 | we replace the loss functions 627 | of the original implementations with FLoss 628 | and keep all other configurations unchanged. 629 | % 630 | As explained in Sec.~\ref{sec:cel-vs-floss}, the FLoss allows a larger base learning rate due to 631 | limited gradients. 632 | % 633 | We use the base learning rate $10^4$ times the original settings. 634 | % 635 | For example, in DSS the base learning rate is $10^{-8}$, while in our F-DSS, the base learning 636 | rate is $10^{-4}$. 637 | % 638 | All other hyper-parameters are consistent with the original implementations for a fair comparison. 639 | 640 | \textbf{Evaluation metrics.} We evaluate the performance of saliency maps 641 | in terms of maximal F-measure (MaxF), mean F-measure (MeanF) and mean absolute error 642 | (MAE = $\frac{1}{N}\sum_i^N |\hat{y}_i - y_i|$). 
643 | % 644 | The factor $\beta^2$ in Eq.~\ref{eq:def-f} is set to 0.3 as suggested by 645 | ~\cite{achanta2009frequency, hou2017deeply, li2016deep, liu2016dhsnet, wang2016saliency}. 646 | % 647 | By applying series thresholds $t\in \mathcal{T}$ to the saliency map $\hat{Y}$, we obtain 648 | binarized saliency maps $\dot{Y}^t$ with different precisions, recalls and F-measures. 649 | 650 | Then the optimal threshold $t_o$ is obtained by exhaustively searching the testing set: 651 | \begin{equation} 652 | t_o = \argmax_{t\in \mathcal{T}} F(Y, \dot{Y}^t). 653 | \label{eq:optimal-t} 654 | \end{equation} 655 | 656 | Finally, we binarize the predictions with $t_o$ and evaluate the best F-measure: 657 | \begin{equation} 658 | \text{MaxF} = F(Y, \dot{Y}^{t_o}), 659 | \label{eq:maxf} 660 | \end{equation} 661 | where $\dot{Y}^{t_o}$ is a binary saliency map binarized with $t_o$. 662 | % 663 | The MeanF is the average F-measure under different thresholds: 664 | \begin{equation} 665 | \text{MeanF} = \frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}} F(Y, \dot{Y}^t), 666 | \label{eq:meanf} 667 | \end{equation} 668 | where $\mathcal{T}$ is the collection of possible thresholds. 669 | 670 | 671 | %------------------------------------------------------------------------------------------ 672 | \subsection{Log-FLoss vs FLoss} 673 | %------------------------------------------------------------------------------------------ 674 | Firstly we compare FLoss with its alternative, namely Log-FLoss defined in Eq.~\ref{eq:logfloss}, 675 | to justify our choice. 676 | % 677 | As analyzed in Sec.~\ref{sec:cel-vs-floss}, FLoss enjoys the advantage of having large gradients 678 | in the saturated area that cross-entropy loss and Log-FLoss don't have. 679 | 680 | \CheckRmv{ 681 | \begin{figure}[b] 682 | \centering 683 | \begin{overpic}[width=1\linewidth]{figures/floss-vs-logfloss3} 684 | \put(-1,41){\rotatebox{90}{Image}} 685 | \put(-1,30){\rotatebox{90}{GT}} 686 | \put(-1,14){\rotatebox{90}{\footnotesize{Log-FLoss}}} 687 | \put(-1,2){\rotatebox{90}{\small{FLoss}}} 688 | \end{overpic} 689 | \caption{Example saliency maps by FLoss (bottom) and Log-FLoss (middle). 690 | % 691 | Our proposed FLoss method produces high-contrast saliency maps. 692 | }\label{fig:examples-floss-vs-logfloss}\vspace{0pt} 693 | \end{figure} 694 | } 695 | 696 | 697 | \CheckRmv{ 698 | \begin{figure*}[!thb] 699 | \centering 700 | \begin{overpic}[width=0.95\linewidth]{figures/detection-examples} 701 | \put(3,65){Image} 702 | \put(17,65){GT} 703 | \put(26,65){DHS~\cite{liu2016dhsnet}} 704 | \put(39.5,65){\parbox{.5in}{F-DHS}} 705 | \put(50,65){Amulet~\cite{zhang2017amulet}} 706 | \put(64,65){F-Amulet} 707 | \put(78,65){DSS~\cite{hou2017deeply}} 708 | \put(90,65){\parbox{.5in}{F-DSS}} 709 | \end{overpic} 710 | \caption{Salient object detection examples on several popular datasets. 711 | F-DHS, F-Amulet and F-DSS indicate the original architectures trained with 712 | our proposed FLoss. FLoss leads to sharp salient confidence, especially 713 | on the object boundaries.}\label{fig:examples} 714 | \end{figure*} 715 | } 716 | 717 | 718 | To experimentally verify our assumption that FLoss will produce high-contrast predictions, 719 | we train the DSS~\cite{hou2017deeply} model with FLoss and Log-FLoss, respectively. 720 | % 721 | The training data is MSRA-B~\cite{liu2011learning} and 722 | hyper-parameters are kept unchanged with the original implementation, except for the base learning rate. 
723 | % 724 | We adjust the base learning rate to $10^{-4}$ since our method accept larger learning rate, as explained 725 | in Sec.~\ref{sec:cel-vs-floss}. 726 | % 727 | Quantitative results are in Table~\ref{tab:floss-vs-logfloss} and some example detected saliency maps 728 | are shown in Fig.~\ref{fig:examples-floss-vs-logfloss}. 729 | 730 | Although both of Log-FLoss 731 | and FLoss use F-measure as maximization target, 732 | FLoss derives polarized predictions with high foreground-background contrast, 733 | as shown in Fig.~\ref{fig:examples-floss-vs-logfloss}. 734 | % 735 | The same conclusion can be drawn from Table~\ref{tab:floss-vs-logfloss} where 736 | FLoss achieves higher Mean F-measure. 737 | % 738 | Which reveals that FLoss achieves higher 739 | F-measure score under a wide range of thresholds. 740 | 741 | %------------------------------------------------------------------------------------------ 742 | \subsection{Evaluation results on open Benchmarks} 743 | %------------------------------------------------------------------------------------------ 744 | 745 | \CheckRmv{ 746 | \begin{table*}[!htp] 747 | \centering 748 | \scriptsize 749 | %\footnotesize 750 | \renewcommand{\arraystretch}{1.5} 751 | %\renewcommand{\tabcolsep}{1.2mm} 752 | \renewcommand{\tabcolsep}{2pt} 753 | \resizebox{0.99\textwidth}{!}{ 754 | \begin{tabular}{lcc|ccc|ccc|ccc|ccc|ccc} 755 | \toprule[1pt] & 756 | \multicolumn{2}{c}{Training data} & 757 | \multicolumn{3}{c}{ECSSD~\cite{yan2013hierarchical}} & 758 | \multicolumn{3}{c}{HKU-IS~\cite{li2015visual}} & 759 | \multicolumn{3}{c}{PASCALS~\cite{li2014secrets}} & 760 | \multicolumn{3}{c}{SOD~\cite{movahedi2010design}} & 761 | \multicolumn{3}{c}{DUT-OMRON~\cite{movahedi2010design}}\\ 762 | % end of the raw 763 | \cmidrule(l){2-3} \cmidrule(l){4-6} \cmidrule(l){7-9} \cmidrule(l){10-13} \cmidrule(l){13-15} \cmidrule(l){16-18} 764 | Model & Train & \#Images & MaxF & MeanF & MAE & MaxF & MeanF & MAE & 765 | MaxF & MeanF & MAE & MaxF & MeanF & MAE & MaxF & MeanF & MAE \\ 766 | % end of the raw 767 | \midrule[1pt] 768 | \textbf{RFCN}~\cite{wang2016saliency} & 769 | MK~\cite{cheng2015global} & 770 | 10K & .898 & .842 & .095 & .895 & .830 & .078 & 771 | .829 & .784 & .118 & .807 & .748 & .161 & - & - & -\\ 772 | % --------------------------------------------------------- 773 | \textbf{DCL}~\cite{li2016deep} & 774 | MB~\cite{liu2011learning} & 775 | 2.5K & .897 & .847 & .077 & .893 & .837 & .063 & 776 | .807 & .761 & .115 & .833 & .780 & .131 & .733 & .690 & .095 \\ 777 | % --------------------------------------------------------- 778 | \textbf{DHS}~\cite{liu2016dhsnet} & 779 | MK~\cite{cheng2015global}+D~\cite{movahedi2010design} & 780 | 9.5K & .905 & .876 & .066 & .891 & .860 & .059 & 781 | .820 & .794 & .101 & .819 & .793 & .136 & - & - & - \\ 782 | % --------------------------------------------------------- 783 | \textbf{Amulet}~\cite{zhang2017amulet} & MK~\cite{cheng2015global} & 784 | 10K & .912 & .898 & .059 & .889 & .873 & .052 & 785 | .828 & .813 & .092 & .801 & .780 & .146 & .737 & .719 & .083 \\ 786 | % --------------------------------------------------------- 787 | \hline 788 | \textbf{DHS}~\cite{liu2016dhsnet} & 789 | MB & 790 | 2.5K & .874 & .867 & .074 & .835 & .829 & .071 & 791 | .782 & .777 & .114 & .800 & .789 & .140 & .704 & .696 & \textbf{.078} \\ 792 | % --------------------------------------------------------- 793 | \textbf{DHS+FLoss}~\cite{liu2016dhsnet} & 794 | MB & 2.5K & 795 | \textbf{.884} & \textbf{.879} & \textbf{.067} & \textbf{.859} & 
796 | \textbf{.854} & \textbf{.061} & \textbf{.792} & \textbf{.786} & 797 | \textbf{.107} & \textbf{.801} & \textbf{.795} & \textbf{.138} & 798 | \textbf{.707} & \textbf{.701} & .079 \\ 799 | % --------------------------------------------------------- 800 | \hline 801 | \textbf{Amulet}~\cite{zhang2017amulet} & 802 | MB & 2.5K & 803 | .881 & .857 & .076 & .868 & .837 & .061 & 804 | .775 & .753 & .125 & .791 & .776 & .149 & .704 & .663 & .098 \\ 805 | % --------------------------------------------------------- 806 | \textbf{Amulet-FLoss} & MB & 2.5K & 807 | \textbf{.894} & \textbf{.883} & \textbf{.063} & \textbf{.880} & 808 | \textbf{.866} & \textbf{.051} & \textbf{.791} & \textbf{.776} & 809 | \textbf{.115} & \textbf{.805} & \textbf{.800} & \textbf{.138} & 810 | \textbf{.729} & \textbf{.696} & \textbf{.097} \\ 811 | % --------------------------------------------------------- 812 | \hline 813 | \textbf{DSS}~\cite{hou2017deeply} & MB & 814 | 2.5K & .908 & .889 & .060 & .899 & .877 & .048 & 815 | .824 & .806 & .099 & .835 & .815 & .125 & .761 & .738 & .071 \\ 816 | % --------------------------------------------------------- 817 | % end of the raw 818 | \textbf{DSS+FLoss} & 819 | MB & 2.5K & 820 | \textbf{.914} & \textbf{.903} & \textbf{.050} & \textbf{.908} & 821 | \textbf{.896} & \textbf{.038} & \textbf{.829} & \textbf{.818} & 822 | \textbf{.091} & \textbf{.843} & \textbf{.838} & \textbf{.111} & 823 | \textbf{.777} & \textbf{.755} & \textbf{.067} \\ 824 | % end of the raw 825 | \bottomrule[1pt] 826 | \vspace{0.5pt} 827 | \end{tabular} 828 | }\vspace{-8pt} 829 | \caption{Quantitative comparison of different methods on 6 popular datasets. 830 | % 831 | Our proposed FLoss consistently improves performance in terms of both MAE (the smaller the better) 832 | and F-measure (the larger the better). 833 | % 834 | Especially in terms of Mean F-measure, we outperform the state-of-the-art with very 835 | clear margins, because our method is able to produce high-contrast predictions that can 836 | achieve high F-measure under a wide range of thresholds. 837 | } 838 | \label{tab:quantitative}\vspace{-12pt} 839 | \end{table*} 840 | } 841 | 842 | 843 | We compare the proposed method with several baselines on 5 popular datasets. 844 | % 845 | Some example detection results are shown in Fig.~\ref{fig:examples} and 846 | comprehensive quantitative comparisons are in Table~\ref{tab:quantitative}. 847 | % 848 | In general, FLoss-based methods can obtain considerable improvements compared with 849 | their cross-entropy loss (CELoss) based counterparts 850 | especially in terms of mean F-measure and MAE. 851 | % 852 | This is mainly because our method is stable against the threshold, leading to 853 | high-performance saliency maps under a wide threshold range. 854 | % 855 | In our detected saliency maps, the foreground (salient objects) and background are well separated, 856 | as shown in Fig.~\ref{fig:examples} and explained in Sec.~\ref{sec:cel-vs-floss}. 
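For reference, the MaxF and MeanF columns in Table~\ref{tab:quantitative} follow the
threshold-sweeping protocol of Eqs.~\ref{eq:optimal-t}--\ref{eq:meanf}.
The NumPy sketch below is a simplified illustration (the helper name \texttt{max\_mean\_f} is
ours): it pools $TP$, $FP$ and $FN$ over the whole testing set, whereas some evaluation tools
average per-image F-measures instead, so exact numbers may differ slightly across tools.
\begin{verbatim}
import numpy as np

def max_mean_f(preds, gts, beta2=0.3, num_thresholds=256):
    # preds: list of HxW saliency maps in [0, 1]; gts: binary maps in {0, 1}
    f_scores = []
    for t in np.linspace(0.0, 1.0, num_thresholds):
        tp = sum(((p >= t) & (g == 1)).sum() for p, g in zip(preds, gts))
        fp = sum(((p >= t) & (g == 0)).sum() for p, g in zip(preds, gts))
        fn = sum(((p < t) & (g == 1)).sum() for p, g in zip(preds, gts))
        f = (1 + beta2) * tp / (beta2 * (tp + fn) + tp + fp + 1e-8)
        f_scores.append(f)
    return max(f_scores), float(np.mean(f_scores))  # MaxF, MeanF
\end{verbatim}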
857 | 
858 | 
859 | %------------------------------------------------------------------------------------------
860 | \subsection{Threshold Free Salient Object Detection}\label{sec:thres-free}
861 | %------------------------------------------------------------------------------------------
862 | 
863 | State-of-the-art SOD methods \cite{hou2017deeply,li2016deep,liu2016dhsnet,zhang2017amulet}
864 | often evaluate the maximal F-measure as follows:
865 | (a) Obtain the saliency maps $\hat{Y}_i$ with a pretrained model;
866 | (b) Tune the best threshold $t_o$ by exhaustive search on the testing set (Eq.~\ref{eq:optimal-t})
867 | and binarize the predictions with $t_o$;
868 | (c) Evaluate the maximal F-measure according to Eq.~\ref{eq:maxf}.
869 | %
870 | %We point out that one significant flaw of existing popular evaluation
871 | %procedure is that they have to manually tune the optimal threshold $t_o$.
872 | %
873 | %And we experimentally confirm that conventional saliency detectors are sensitive
874 | % to the `optimal threshold'.
875 | %
876 | %The standard evaluation protocol for SOD works as below:
877 | %
878 | 
879 | 
880 | There is an obvious flaw in the above procedure:
881 | the optimal threshold is obtained via an exhaustive search on the testing set.
882 | %
883 | %However, in real-world applications it is impossible for us to
884 | %tune such `optimal threshold' on the annotated testing data.
885 | Such a procedure is impractical for real-world applications, where annotated
886 | testing data would not be available.
887 | %
888 | Moreover, even if we tuned the optimal threshold on one dataset, it cannot be directly applied
889 | to other datasets.
890 | 
891 | \CheckRmv{
892 | \begin{figure*}[t]
893 | \centering
894 | \begin{tabular}{@{}cc@{}}
895 | \begin{overpic}[width=.40\textwidth]{figures/f-thres}
896 | \put(60, 35.5){\scriptsize{\cite{zhang2017amulet}}} % amulet
897 | \put(56, 26.5){\scriptsize{\cite{liu2016dhsnet}}} % dhs
898 | \put(55, 17.5){\scriptsize{\cite{hou2017deeply}}} % dss
899 | %\put(57, 18){\scriptsize{\cite{li2016deep}}} % RFCN
900 | %\put(55, 13.3){\scriptsize{\cite{wang2016saliency}}} % dcl
901 | \put(50, -5){(a)}
902 | \end{overpic} &
903 | \begin{overpic}[width=.40\textwidth]{figures/thres-variation}
904 | \put(50, -5){(b)}
905 | \end{overpic} \\
906 | \end{tabular}\vspace{10pt}
907 | \caption{(a) F-measures under different thresholds on the ECSSD dataset.
908 | %
909 | (b) The mean and variance of the optimal threshold $t_o$.
910 | %
911 | FLoss-based methods hold stable $t_o$ across different datasets (lower $t_o$ variances) and different backbone
912 | architectures (F-DHS, F-Amulet and F-DSS hold very close mean $t_o$).
913 | }
914 | \label{fig:thres-free}
915 | \end{figure*}
916 | }
917 | 
918 | We further analyze the sensitivity of methods to the threshold in two aspects:
919 | (1) model performance under different thresholds, which reflects the stability of
920 | a method against threshold changes;
921 | and (2) the mean and variance of the optimal threshold $t_o$ on different datasets,
922 | which represent how well a $t_o$ tuned on one dataset generalizes to others.
923 | 
924 | Fig.~\ref{fig:thres-free} (a) illustrates the F-measure under different thresholds.
925 | %
926 | For most methods without FLoss, the F-measure changes sharply with the threshold,
927 | and the maximal F-measure (MaxF) is attained only within a narrow threshold span.
928 | %
929 | FLoss-based methods, in contrast, are almost immune to changes of the threshold.
930 | %
931 | 
932 | Fig.~\ref{fig:thres-free} (b) reflects the mean and variance of $t_o$
933 | across different datasets.
934 | %
935 | Conventional methods (DHS, DSS, Amulet) present unstable $t_o$ on different datasets,
936 | as evidenced by their large variances.
937 | %
938 | In contrast, the $t_o$ of FLoss-based methods (F-DHS, F-Amulet, F-DSS)
939 | stays nearly unchanged across different datasets and different backbone network architectures.
940 | %
941 | 
942 | In conclusion, the proposed FLoss is stable against the threshold $t$
943 | in three aspects:
944 | (1) it achieves high performance under a wide range of thresholds;
945 | %
946 | (2) the optimal threshold $t_o$ tuned on one dataset can be transferred to others,
947 | because $t_o$ varies only slightly across different datasets;
948 | %
949 | and (3) $t_o$ obtained from one backbone architecture can be applied to other architectures.
950 | 
951 | %------------------------------------------------------------------------------------------
952 | \subsection{The Label-unbalancing Problem in SOD}
953 | %------------------------------------------------------------------------------------------
954 | 
955 | \CheckRmv{
956 | \begin{figure}
957 | \centering
958 | \includegraphics[width=0.75\linewidth]{figures/prf-thres2}
959 | \caption{{\color{red}{\textbf{Precision}}}, {\color{green}{\textbf{Recall}}},
960 | {\color{blue}{\textbf{F-measure}}} and Maximal F-measure ({\color{blue}{$\bullet$}}) of
961 | DSS (\textbf{- - -})
962 | and F-DSS (\textbf{---}) under different thresholds.
963 | %
964 | DSS tends to predict uncertain pixels as the
965 | majority class--the background--resulting in high precision but low recall.
966 | %
967 | FLoss is able to find a better compromise between precision and recall.
968 | }\label{fig:prf-thres}
969 | \end{figure}
970 | }
971 | 
972 | 
973 | The foreground and background are unbalanced in SOD:
974 | most pixels belong to non-salient regions.
975 | %
976 | The unbalanced training data lead the model to a local minimum
977 | that tends to predict uncertain pixels as the background.
978 | %
979 | Consequently, recall becomes a bottleneck for performance during evaluation,
980 | as illustrated in Fig.~\ref{fig:prf-thres}.
981 | 
982 | Although assigning loss weights to the positive/negative samples is a simple way to
983 | offset the imbalance, an additional experiment in Table~\ref{tab:balance}
984 | reveals that our method performs better than this simple reweighting.
985 | %
986 | We define the \emph{balanced cross-entropy loss} with a weight factor between
987 | positive/negative samples:
988 | \begin{equation}
989 | \begin{split}
990 | \mathcal{L}_{balance} = -\sum\nolimits_i^{|Y|} \big(&w_1 \cdot y_i\log{\hat{y_i}} + \\
991 | &w_0 \cdot (1-y_i)\log{(1-\hat{y_i})}\big).
992 | \end{split}
993 | \label{eq:balance-cross-entropy}
994 | \end{equation}
995 | %
996 | The loss weights for positive/negative samples are determined by the
997 | positive/negative proportions in a mini-batch:
998 | $w_1 = \frac{1}{|Y|}\sum_i^{|Y|} 1(y_i\!==\!0)$ and $w_0 = \frac{1}{|Y|}\sum_i^{|Y|} 1(y_i\!==\!1)$,
999 | as suggested in~\cite{xie2015holistically} and \cite{shen2015deepcontour}.
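For clarity, this weighting scheme corresponds to the following PyTorch-style sketch
(the helper name \texttt{balanced\_bce} is ours and purely illustrative):
\begin{verbatim}
import torch

def balanced_bce(pred, target, eps=1e-8):
    # pred: posterior in [0, 1]; target: float ground-truth of 0s and 1s
    n = target.numel()
    w1 = (target == 0).sum().float() / n   # weight of the positive term
    w0 = (target == 1).sum().float() / n   # weight of the negative term
    weight = w1 * target + w0 * (1.0 - target)
    ce = target * torch.log(pred + eps) + (1.0 - target) * torch.log(1.0 - pred + eps)
    return -(weight * ce).sum()
\end{verbatim}
In contrast, FLoss requires no such hand-crafted weights, because the F-measure itself
accounts for the foreground/background proportions.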
1000 | 1001 | %------------------------------------------------------------------------------------------ 1002 | \subsection{The Compromise Between Precision and Recall} 1003 | %------------------------------------------------------------------------------------------ 1004 | 1005 | \CheckRmv{ 1006 | \begin{table*}[t] 1007 | \centering 1008 | \scriptsize 1009 | %\footnotesize 1010 | \renewcommand{\arraystretch}{1.2} 1011 | %\renewcommand{\tabcolsep}{1.5mm} 1012 | \renewcommand{\tabcolsep}{2pt} 1013 | \resizebox{0.99\textwidth}{!}{ 1014 | \begin{tabular}{lcc|ccc|ccc|ccc|ccc|ccc} 1015 | % --------------------------------------------------------- 1016 | \toprule[1pt] & 1017 | \multicolumn{2}{c}{Training data} & 1018 | \multicolumn{3}{c}{ECSSD~\cite{yan2013hierarchical}} & 1019 | \multicolumn{3}{c}{HKU-IS~\cite{li2015visual}} & 1020 | \multicolumn{3}{c}{PASCALS~\cite{li2014secrets}} & 1021 | \multicolumn{3}{c}{SOD~\cite{movahedi2010design}} & 1022 | \multicolumn{3}{c}{DUT-OMRON~\cite{movahedi2010design}}\\ 1023 | % --------------------------------------------------------- 1024 | \cmidrule(l){2-3} \cmidrule(l){4-6} \cmidrule(l){7-9} \cmidrule(l){10-13} 1025 | \cmidrule(l){13-15} \cmidrule(l){16-18} 1026 | % --------------------------------------------------------- 1027 | Model & Train & \#Images & MaxF & MeanF & MAE & MaxF & MeanF & MAE & 1028 | MaxF & MeanF & MAE & MaxF & MeanF & MAE & MaxF & MeanF & MAE \\ 1029 | % --------------------------------------------------------- 1030 | \midrule[1pt] 1031 | % --------------------------------------------------------- 1032 | \textbf{DSS}~\cite{hou2017deeply} & 1033 | MB~\cite{liu2011learning} & 2.5K & 1034 | .908 & .889 & .060 & .899 & .877 & .048 & 1035 | .824 & .806 & .099 & .835 & .815 & .125 & .761 & .738 & .071 \\ 1036 | % --------------------------------------------------------- 1037 | \textbf{DSS+Balance} & 1038 | MB~\cite{liu2011learning} & 2.5K & 1039 | .910 & .890 & .059 & .900 & .877 & .048 & .827 & 1040 | .807 & .097 & .837 & .816 & .124 & .765 & .741 & .069 \\ 1041 | % end of the raw 1042 | \textbf{DSS+FLoss} & MB~\cite{liu2011learning} & 2.5K & 1043 | \textbf{.914} & \textbf{.903} & \textbf{.050} & \textbf{.908} & \textbf{.896} & \textbf{.038} 1044 | & \textbf{.829} & 1045 | \textbf{.818} & \textbf{.091} & \textbf{.843} & \textbf{.838} & 1046 | \textbf{.111} & \textbf{.777} & \textbf{.755} & \textbf{.067} \\ 1047 | % end of the raw 1048 | \bottomrule[1pt] 1049 | \vspace{0pt} 1050 | \end{tabular} 1051 | } 1052 | \vspace{-2pt} 1053 | %\vspace{1pt} 1054 | \caption{Performance comparisons across the original cross-entropy loss (Eq.~\ref{eq:celoss}), 1055 | balanced cross-entropy loss (Eq.~\ref{eq:balance-cross-entropy}) and 1056 | our proposed FLoss (Eq.~\ref{eq:floss}). 1057 | % 1058 | Original cross-entropy learns a biased prior towards the major class (the background). 1059 | % 1060 | This is evidenced by the low recall: many positive points 1061 | are mis-predicted as negative because of biased prior. 1062 | % 1063 | By assigning loss weights on foreground/background samples, 1064 | the \emph{balanced cross-entropy loss} can alleviate the unbalancing problem. 1065 | % 1066 | Our proposed method performs better than the \emph{balanced cross-entropy loss}, 1067 | because the F-measure criterion can automatically adjust data unbalance. 1068 | } 1069 | \vspace{-5pt} 1070 | \label{tab:balance} 1071 | \end{table*} 1072 | } 1073 | 1074 | Recall and precision are two conflict metrics. 
1075 | %
1076 | In some applications we care about recall more than precision,
1077 | while in other tasks precision may be more important than recall.
1078 | %
1079 | The $\beta^2$ in Eq.~\ref{eq:def-f} balances the bias between precision and recall
1080 | when evaluating the performance on specific tasks.
1081 | %
1082 | For example, recent studies on edge detection~\cite{bsds500, xie2015holistically, shen2015deepcontour}
1083 | use $\beta^2=1$,
1084 | indicating equal consideration of precision and recall.
1085 | %
1086 | Saliency detection~\cite{achanta2009frequency, hou2017deeply, li2016deep, liu2016dhsnet, wang2016saliency}, in contrast, usually uses $\beta^2=0.3$
1087 | to emphasize precision over recall.
1088 | 
1089 | As an optimization target, FLoss should also be able to
1090 | balance precision and recall.
1091 | %
1092 | We train models with different $\beta^2$ and comprehensively evaluate
1093 | their performance in terms of precision, recall and F-measure.
1094 | %
1095 | Results in Fig.~\ref{fig:pr-beta} reveal that $\beta^2$ is a bias adjuster
1096 | between precision and recall: % during model training
1097 | a larger $\beta^2$ leads to higher recall while a lower $\beta^2$ results in higher precision.
1098 | 
1099 | \CheckRmv{
1100 | \begin{figure}[!htb]
1101 | \centering
1102 | \includegraphics[width=0.75\linewidth]{figures/pr-beta}
1103 | \caption{{\color{red}{\textbf{Precision}}}, {\color{green}{\textbf{Recall}}},
1104 | {\color{blue}{\textbf{F-measure}}} of models trained under different $\beta^2$
1105 | (Eq.~\ref{eq:def-f}).
1106 | %
1107 | Precision decreases as $\beta^2$ grows, whereas recall increases.
1108 | %
1109 | This characteristic gives us much flexibility to adjust the balance
1110 | between recall and precision:
1111 | use a larger $\beta^2$ in a recall-first application and a lower $\beta^2$
1112 | otherwise.
1113 | }\label{fig:pr-beta}\vspace{-12pt}
1114 | \end{figure}
1115 | }
1116 | 
1117 | 
1118 | %------------------------------------------------------------------------------------------
1119 | \subsection{Faster Convergence and Better Performance}
1120 | %------------------------------------------------------------------------------------------
1121 | In this experiment, we train three state-of-the-art saliency detectors (Amulet~\cite{zhang2017amulet},
1122 | DHS~\cite{liu2016dhsnet} and DSS~\cite{hou2017deeply}) and their FLoss counterparts.
1123 | %
1124 | Then we plot the performance of all the methods at each checkpoint
1125 | to determine the convergence speed and converged performance of the respective models.
1126 | %
1127 | All the models are trained on the MB~\cite{liu2011learning} dataset
1128 | and tested on the ECSSD~\cite{yan2013hierarchical} dataset.
1129 | %
1130 | The results are shown in Fig.~\ref{fig:f-iter}.
1131 | 
1132 | \CheckRmv{
1133 | \begin{figure}[!h]
1134 | \centering
1135 | \begin{overpic}[width=0.85\linewidth]{figures/f-iter-multiple}
1136 | %\put(90, 13.5){\cite{hou2017deeply}}
1137 | \end{overpic}
1138 | \caption{Performance versus training iterations.
1139 | Our method presents faster convergence and higher converged performance.}
1140 | \label{fig:f-iter}
1141 | \vspace{-10pt}
1142 | \end{figure}
1143 | }
1144 | 
1145 | We observe that FLoss offers a per-iteration performance improvement for all three saliency models.
1146 | We also find
1147 | that the FLoss-based methods quickly learn to
1148 | focus on the salient object area and achieve high F-measure scores after hundreds of iterations.
1149 | %
1150 | Cross-entropy based methods, in contrast, produce blurry outputs and cannot localize
1151 | salient areas very precisely.
1152 | %
1153 | As shown in Fig.~\ref{fig:f-iter}, FLoss-based methods converge faster than their cross-entropy
1154 | counterparts and achieve higher converged performance.
1155 | 
1156 | \vspace{-4pt}
1157 | %------------------------------------------------------------------------------------------
1158 | \section{Conclusion}
1159 | %------------------------------------------------------------------------------------------
1160 | \vspace{-4pt}
1161 | In this paper, we propose to directly maximize the F-measure for salient object detection.
1162 | %
1163 | We introduce FLoss, which is differentiable w.r.t the predicted posteriors,
1164 | as the optimization objective of CNNs.
1165 | %
1166 | The proposed method handles biased data distributions better and consequently achieves better performance.
1167 | %
1168 | Moreover, our method is stable against the threshold and able to produce high-quality saliency maps
1169 | under a wide threshold range, showing great potential in real-world applications.
1170 | %
1171 | By adjusting the $\beta^2$ factor, one can easily control the
1172 | compromise between precision and recall,
1173 | enabling the flexibility to deal with various applications.
1174 | %
1175 | Comprehensive benchmarks on several popular datasets illustrate the advantages of the proposed
1176 | method.
1177 | 
1178 | \paragraph{Future work.}
1179 | We plan to improve the performance and efficiency of the proposed method
1180 | by using recent backbone models, \eg, \cite{gao2019res2net,MobileNetV2}.
1181 | %
1182 | Besides, FLoss is potentially helpful for other binary dense prediction tasks
1183 | such as edge detection~\cite{RcfEdgePami2019}, shadow detection~\cite{Hu_2018_CVPR}
1184 | and skeleton detection~\cite{zhao2018hifi}.
1185 | 
1186 | 
1187 | \paragraph{Acknowledgment.}
1188 | This research was supported by NSFC (61572264, 61620106008),
1189 | the national youth talent support program,
1190 | and Tianjin Natural Science Foundation (17JCJQJC43700, 18ZXZNGX00110).
1191 | 
1192 | {\small
1193 | \bibliographystyle{ieee_fullname}
1194 | \bibliography{fmeasure}
1195 | }
1196 | 
1197 | \end{document}
1198 | 
-------------------------------------------------------------------------------- /latex/iccv_eso.sty: --------------------------------------------------------------------------------
1 | %%
2 | %% This is file `everyshi.sty',
3 | %% generated with the docstrip utility.
4 | %%
5 | %% The original source files were:
6 | %%
7 | %% everyshi.dtx (with options: `package')
8 | %%
9 | %% Copyright (C) [1994..1999] by Martin Schroeder. All rights reserved.
10 | %%
11 | %% This file is part of the EveryShi package
12 | %%
13 | %% This program may be redistributed and/or modified under the terms
14 | %% of the LaTeX Project Public License, either version 1.0 of this
15 | %% license, or (at your option) any later version.
16 | %% The latest version of this license is in
17 | %% CTAN:macros/latex/base/lppl.txt.
18 | %%
19 | %% Happy users are requested to send me a postcard.
:-) 20 | %% 21 | %% The EveryShi package contains these files: 22 | %% 23 | %% everyshi.asc 24 | %% everyshi.dtx 25 | %% everyshi.dvi 26 | %% everyshi.ins 27 | %% everyshi.bug 28 | %% 29 | %% Error Reports in case of UNCHANGED versions to 30 | %% 31 | %% Martin Schr"oder 32 | %% Cr"usemannallee 3 33 | %% D-28213 Bremen 34 | %% Martin.Schroeder@ACM.org 35 | %% 36 | %% File: everyshi.dtx Copyright (C) 2001 Martin Schr\"oder 37 | \NeedsTeXFormat{LaTeX2e} 38 | \ProvidesPackage{everyshi} 39 | [2001/05/15 v3.00 EveryShipout Package (MS)] 40 | %% \CharacterTable 41 | %% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z 42 | %% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z 43 | %% Digits \0\1\2\3\4\5\6\7\8\9 44 | %% Exclamation \! Double quote \" Hash (number) \# 45 | %% Dollar \$ Percent \% Ampersand \& 46 | %% Acute accent \' Left paren \( Right paren \) 47 | %% Asterisk \* Plus \+ Comma \, 48 | %% Minus \- Point \. Solidus \/ 49 | %% Colon \: Semicolon \; Less than \< 50 | %% Equals \= Greater than \> Question mark \? 51 | %% Commercial at \@ Left bracket \[ Backslash \\ 52 | %% Right bracket \] Circumflex \^ Underscore \_ 53 | %% Grave accent \` Left brace \{ Vertical bar \| 54 | %% Right brace \} Tilde \~} 55 | %% 56 | %% \iffalse meta-comment 57 | %% =================================================================== 58 | %% @LaTeX-package-file{ 59 | %% author = {Martin Schr\"oder}, 60 | %% version = "3.00", 61 | %% date = "15 May 2001", 62 | %% filename = "everyshi.sty", 63 | %% address = {Martin Schr\"oder 64 | %% Cr\"usemannallee 3 65 | %% 28213 Bremen 66 | %% Germany}, 67 | %% telephone = "+49-421-2239425", 68 | %% email = "martin@oneiros.de", 69 | %% pgp-Key = "2048 bit / KeyID 292814E5", 70 | %% pgp-fingerprint = "7E86 6EC8 97FA 2995 82C3 FEA5 2719 090E", 71 | %% docstring = "LaTeX package which provides hooks into 72 | %% \cs{shipout}. 73 | %% } 74 | %% =================================================================== 75 | %% \fi 76 | 77 | \newcommand{\@EveryShipout@Hook}{} 78 | \newcommand{\@EveryShipout@AtNextHook}{} 79 | \newcommand*{\EveryShipout}[1] 80 | {\g@addto@macro\@EveryShipout@Hook{#1}} 81 | \newcommand*{\AtNextShipout}[1] 82 | {\g@addto@macro\@EveryShipout@AtNextHook{#1}} 83 | \newcommand{\@EveryShipout@Shipout}{% 84 | \afterassignment\@EveryShipout@Test 85 | \global\setbox\@cclv= % 86 | } 87 | \newcommand{\@EveryShipout@Test}{% 88 | \ifvoid\@cclv\relax 89 | \aftergroup\@EveryShipout@Output 90 | \else 91 | \@EveryShipout@Output 92 | \fi% 93 | } 94 | \newcommand{\@EveryShipout@Output}{% 95 | \@EveryShipout@Hook% 96 | \@EveryShipout@AtNextHook% 97 | \gdef\@EveryShipout@AtNextHook{}% 98 | \@EveryShipout@Org@Shipout\box\@cclv% 99 | } 100 | \newcommand{\@EveryShipout@Org@Shipout}{} 101 | \newcommand*{\@EveryShipout@Init}{% 102 | \message{ABD: EveryShipout initializing macros}% 103 | \let\@EveryShipout@Org@Shipout\shipout 104 | \let\shipout\@EveryShipout@Shipout 105 | } 106 | \AtBeginDocument{\@EveryShipout@Init} 107 | \endinput 108 | %% 109 | %% End of file `everyshi.sty'. 110 | -------------------------------------------------------------------------------- /latex/ieee_fullname.bst: -------------------------------------------------------------------------------- 1 | % This is a modification to the normal ieee.bst used by CVPR to render 2 | % first names in the bibliography as "Firstname Lastname" rather than 3 | % "F. Lastname". 4 | % 5 | % Jonathan T. 
Barron, 12/5/2018, jonbarron@gmail.com 6 | 7 | % --------------------------------------------------------------- 8 | % 9 | % ieee.bst,v 1.0 2002/04/16 10 | % 11 | % by Glenn Paulley (paulley@acm.org) 12 | % 13 | % Modified from latex8.bst 1995/09/15 15:13:49 ienne Exp $ 14 | % 15 | % by Paolo.Ienne@di.epfl.ch 16 | % 17 | % 18 | % --------------------------------------------------------------- 19 | % 20 | % no guarantee is given that the format corresponds perfectly to 21 | % IEEE 8.5" x 11" Proceedings, but most features should be ok. 22 | % 23 | % --------------------------------------------------------------- 24 | % 25 | % `ieee' from BibTeX standard bibliography style `abbrv' 26 | % version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09. 27 | % Copyright (C) 1985, all rights reserved. 28 | % Copying of this file is authorized only if either 29 | % (1) you make absolutely no changes to your copy, including name, or 30 | % (2) if you do make changes, you name it something other than 31 | % btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst. 32 | % This restriction helps ensure that all standard styles are identical. 33 | % The file btxbst.doc has the documentation for this style. 34 | 35 | ENTRY 36 | { address 37 | author 38 | booktitle 39 | chapter 40 | edition 41 | editor 42 | howpublished 43 | institution 44 | journal 45 | key 46 | month 47 | note 48 | number 49 | organization 50 | pages 51 | publisher 52 | school 53 | series 54 | title 55 | type 56 | volume 57 | year 58 | } 59 | {} 60 | { label } 61 | 62 | INTEGERS { output.state before.all mid.sentence after.sentence after.block } 63 | 64 | FUNCTION {init.state.consts} 65 | { #0 'before.all := 66 | #1 'mid.sentence := 67 | #2 'after.sentence := 68 | #3 'after.block := 69 | } 70 | 71 | STRINGS { s t } 72 | 73 | FUNCTION {output.nonnull} 74 | { 's := 75 | output.state mid.sentence = 76 | { ", " * write$ } 77 | { output.state after.block = 78 | { add.period$ write$ 79 | newline$ 80 | "\newblock " write$ 81 | } 82 | { output.state before.all = 83 | 'write$ 84 | { add.period$ " " * write$ } 85 | if$ 86 | } 87 | if$ 88 | mid.sentence 'output.state := 89 | } 90 | if$ 91 | s 92 | } 93 | 94 | FUNCTION {output} 95 | { duplicate$ empty$ 96 | 'pop$ 97 | 'output.nonnull 98 | if$ 99 | } 100 | 101 | FUNCTION {output.check} 102 | { 't := 103 | duplicate$ empty$ 104 | { pop$ "empty " t * " in " * cite$ * warning$ } 105 | 'output.nonnull 106 | if$ 107 | } 108 | 109 | FUNCTION {output.bibitem} 110 | { newline$ 111 | "\bibitem{" write$ 112 | cite$ write$ 113 | "}" write$ 114 | newline$ 115 | "" 116 | before.all 'output.state := 117 | } 118 | 119 | FUNCTION {fin.entry} 120 | { add.period$ 121 | write$ 122 | newline$ 123 | } 124 | 125 | FUNCTION {new.block} 126 | { output.state before.all = 127 | 'skip$ 128 | { after.block 'output.state := } 129 | if$ 130 | } 131 | 132 | FUNCTION {new.sentence} 133 | { output.state after.block = 134 | 'skip$ 135 | { output.state before.all = 136 | 'skip$ 137 | { after.sentence 'output.state := } 138 | if$ 139 | } 140 | if$ 141 | } 142 | 143 | FUNCTION {not} 144 | { { #0 } 145 | { #1 } 146 | if$ 147 | } 148 | 149 | FUNCTION {and} 150 | { 'skip$ 151 | { pop$ #0 } 152 | if$ 153 | } 154 | 155 | FUNCTION {or} 156 | { { pop$ #1 } 157 | 'skip$ 158 | if$ 159 | } 160 | 161 | FUNCTION {new.block.checka} 162 | { empty$ 163 | 'skip$ 164 | 'new.block 165 | if$ 166 | } 167 | 168 | FUNCTION {new.block.checkb} 169 | { empty$ 170 | swap$ empty$ 171 | and 172 | 'skip$ 173 | 'new.block 174 | if$ 175 | } 176 | 177 | 
FUNCTION {new.sentence.checka} 178 | { empty$ 179 | 'skip$ 180 | 'new.sentence 181 | if$ 182 | } 183 | 184 | FUNCTION {new.sentence.checkb} 185 | { empty$ 186 | swap$ empty$ 187 | and 188 | 'skip$ 189 | 'new.sentence 190 | if$ 191 | } 192 | 193 | FUNCTION {field.or.null} 194 | { duplicate$ empty$ 195 | { pop$ "" } 196 | 'skip$ 197 | if$ 198 | } 199 | 200 | FUNCTION {emphasize} 201 | { duplicate$ empty$ 202 | { pop$ "" } 203 | { "{\em " swap$ * "}" * } 204 | if$ 205 | } 206 | 207 | INTEGERS { nameptr namesleft numnames } 208 | 209 | FUNCTION {format.names} 210 | { 's := 211 | #1 'nameptr := 212 | s num.names$ 'numnames := 213 | numnames 'namesleft := 214 | { namesleft #0 > } 215 | % Formerly { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't := 216 | { s nameptr "{ff }{vv }{ll}{, jj}" format.name$ 't := 217 | nameptr #1 > 218 | { namesleft #1 > 219 | { ", " * t * } 220 | { numnames #2 > 221 | { "," * } 222 | 'skip$ 223 | if$ 224 | t "others" = 225 | { " et~al." * } 226 | { " and " * t * } 227 | if$ 228 | } 229 | if$ 230 | } 231 | 't 232 | if$ 233 | nameptr #1 + 'nameptr := 234 | 235 | namesleft #1 - 'namesleft := 236 | } 237 | while$ 238 | } 239 | 240 | FUNCTION {format.authors} 241 | { author empty$ 242 | { "" } 243 | { author format.names } 244 | if$ 245 | } 246 | 247 | FUNCTION {format.editors} 248 | { editor empty$ 249 | { "" } 250 | { editor format.names 251 | editor num.names$ #1 > 252 | { ", editors" * } 253 | { ", editor" * } 254 | if$ 255 | } 256 | if$ 257 | } 258 | 259 | FUNCTION {format.title} 260 | { title empty$ 261 | { "" } 262 | { title "t" change.case$ } 263 | if$ 264 | } 265 | 266 | FUNCTION {n.dashify} 267 | { 't := 268 | "" 269 | { t empty$ not } 270 | { t #1 #1 substring$ "-" = 271 | { t #1 #2 substring$ "--" = not 272 | { "--" * 273 | t #2 global.max$ substring$ 't := 274 | } 275 | { { t #1 #1 substring$ "-" = } 276 | { "-" * 277 | t #2 global.max$ substring$ 't := 278 | } 279 | while$ 280 | } 281 | if$ 282 | } 283 | { t #1 #1 substring$ * 284 | t #2 global.max$ substring$ 't := 285 | } 286 | if$ 287 | } 288 | while$ 289 | } 290 | 291 | FUNCTION {format.date} 292 | { year empty$ 293 | { month empty$ 294 | { "" } 295 | { "there's a month but no year in " cite$ * warning$ 296 | month 297 | } 298 | if$ 299 | } 300 | { month empty$ 301 | 'year 302 | { month " " * year * } 303 | if$ 304 | } 305 | if$ 306 | } 307 | 308 | FUNCTION {format.btitle} 309 | { title emphasize 310 | } 311 | 312 | FUNCTION {tie.or.space.connect} 313 | { duplicate$ text.length$ #3 < 314 | { "~" } 315 | { " " } 316 | if$ 317 | swap$ * * 318 | } 319 | 320 | FUNCTION {either.or.check} 321 | { empty$ 322 | 'pop$ 323 | { "can't use both " swap$ * " fields in " * cite$ * warning$ } 324 | if$ 325 | } 326 | 327 | FUNCTION {format.bvolume} 328 | { volume empty$ 329 | { "" } 330 | { "volume" volume tie.or.space.connect 331 | series empty$ 332 | 'skip$ 333 | { " of " * series emphasize * } 334 | if$ 335 | "volume and number" number either.or.check 336 | } 337 | if$ 338 | } 339 | 340 | FUNCTION {format.number.series} 341 | { volume empty$ 342 | { number empty$ 343 | { series field.or.null } 344 | { output.state mid.sentence = 345 | { "number" } 346 | { "Number" } 347 | if$ 348 | number tie.or.space.connect 349 | series empty$ 350 | { "there's a number but no series in " cite$ * warning$ } 351 | { " in " * series * } 352 | if$ 353 | } 354 | if$ 355 | } 356 | { "" } 357 | if$ 358 | } 359 | 360 | FUNCTION {format.edition} 361 | { edition empty$ 362 | { "" } 363 | { output.state mid.sentence = 364 | { edition "l" 
change.case$ " edition" * } 365 | { edition "t" change.case$ " edition" * } 366 | if$ 367 | } 368 | if$ 369 | } 370 | 371 | INTEGERS { multiresult } 372 | 373 | FUNCTION {multi.page.check} 374 | { 't := 375 | #0 'multiresult := 376 | { multiresult not 377 | t empty$ not 378 | and 379 | } 380 | { t #1 #1 substring$ 381 | duplicate$ "-" = 382 | swap$ duplicate$ "," = 383 | swap$ "+" = 384 | or or 385 | { #1 'multiresult := } 386 | { t #2 global.max$ substring$ 't := } 387 | if$ 388 | } 389 | while$ 390 | multiresult 391 | } 392 | 393 | FUNCTION {format.pages} 394 | { pages empty$ 395 | { "" } 396 | { pages multi.page.check 397 | { "pages" pages n.dashify tie.or.space.connect } 398 | { "page" pages tie.or.space.connect } 399 | if$ 400 | } 401 | if$ 402 | } 403 | 404 | FUNCTION {format.vol.num.pages} 405 | { volume field.or.null 406 | number empty$ 407 | 'skip$ 408 | { "(" number * ")" * * 409 | volume empty$ 410 | { "there's a number but no volume in " cite$ * warning$ } 411 | 'skip$ 412 | if$ 413 | } 414 | if$ 415 | pages empty$ 416 | 'skip$ 417 | { duplicate$ empty$ 418 | { pop$ format.pages } 419 | { ":" * pages n.dashify * } 420 | if$ 421 | } 422 | if$ 423 | } 424 | 425 | FUNCTION {format.chapter.pages} 426 | { chapter empty$ 427 | 'format.pages 428 | { type empty$ 429 | { "chapter" } 430 | { type "l" change.case$ } 431 | if$ 432 | chapter tie.or.space.connect 433 | pages empty$ 434 | 'skip$ 435 | { ", " * format.pages * } 436 | if$ 437 | } 438 | if$ 439 | } 440 | 441 | FUNCTION {format.in.ed.booktitle} 442 | { booktitle empty$ 443 | { "" } 444 | { editor empty$ 445 | { "In " booktitle emphasize * } 446 | { "In " format.editors * ", " * booktitle emphasize * } 447 | if$ 448 | } 449 | if$ 450 | } 451 | 452 | FUNCTION {empty.misc.check} 453 | 454 | { author empty$ title empty$ howpublished empty$ 455 | month empty$ year empty$ note empty$ 456 | and and and and and 457 | key empty$ not and 458 | { "all relevant fields are empty in " cite$ * warning$ } 459 | 'skip$ 460 | if$ 461 | } 462 | 463 | FUNCTION {format.thesis.type} 464 | { type empty$ 465 | 'skip$ 466 | { pop$ 467 | type "t" change.case$ 468 | } 469 | if$ 470 | } 471 | 472 | FUNCTION {format.tr.number} 473 | { type empty$ 474 | { "Technical Report" } 475 | 'type 476 | if$ 477 | number empty$ 478 | { "t" change.case$ } 479 | { number tie.or.space.connect } 480 | if$ 481 | } 482 | 483 | FUNCTION {format.article.crossref} 484 | { key empty$ 485 | { journal empty$ 486 | { "need key or journal for " cite$ * " to crossref " * crossref * 487 | warning$ 488 | "" 489 | } 490 | { "In {\em " journal * "\/}" * } 491 | if$ 492 | } 493 | { "In " key * } 494 | if$ 495 | " \cite{" * crossref * "}" * 496 | } 497 | 498 | FUNCTION {format.crossref.editor} 499 | { editor #1 "{vv~}{ll}" format.name$ 500 | editor num.names$ duplicate$ 501 | #2 > 502 | { pop$ " et~al." * } 503 | { #2 < 504 | 'skip$ 505 | { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = 506 | { " et~al." 
* } 507 | { " and " * editor #2 "{vv~}{ll}" format.name$ * } 508 | if$ 509 | } 510 | if$ 511 | } 512 | if$ 513 | } 514 | 515 | FUNCTION {format.book.crossref} 516 | { volume empty$ 517 | { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ 518 | "In " 519 | } 520 | { "Volume" volume tie.or.space.connect 521 | " of " * 522 | } 523 | if$ 524 | editor empty$ 525 | editor field.or.null author field.or.null = 526 | or 527 | { key empty$ 528 | { series empty$ 529 | { "need editor, key, or series for " cite$ * " to crossref " * 530 | crossref * warning$ 531 | "" * 532 | } 533 | { "{\em " * series * "\/}" * } 534 | if$ 535 | } 536 | { key * } 537 | if$ 538 | } 539 | { format.crossref.editor * } 540 | if$ 541 | " \cite{" * crossref * "}" * 542 | } 543 | 544 | FUNCTION {format.incoll.inproc.crossref} 545 | { editor empty$ 546 | editor field.or.null author field.or.null = 547 | or 548 | { key empty$ 549 | { booktitle empty$ 550 | { "need editor, key, or booktitle for " cite$ * " to crossref " * 551 | crossref * warning$ 552 | "" 553 | } 554 | { "In {\em " booktitle * "\/}" * } 555 | if$ 556 | } 557 | { "In " key * } 558 | if$ 559 | } 560 | { "In " format.crossref.editor * } 561 | if$ 562 | " \cite{" * crossref * "}" * 563 | } 564 | 565 | FUNCTION {article} 566 | { output.bibitem 567 | format.authors "author" output.check 568 | new.block 569 | format.title "title" output.check 570 | new.block 571 | crossref missing$ 572 | { journal emphasize "journal" output.check 573 | format.vol.num.pages output 574 | format.date "year" output.check 575 | } 576 | { format.article.crossref output.nonnull 577 | format.pages output 578 | } 579 | if$ 580 | new.block 581 | note output 582 | fin.entry 583 | } 584 | 585 | FUNCTION {book} 586 | { output.bibitem 587 | author empty$ 588 | { format.editors "author and editor" output.check } 589 | { format.authors output.nonnull 590 | crossref missing$ 591 | { "author and editor" editor either.or.check } 592 | 'skip$ 593 | if$ 594 | } 595 | if$ 596 | new.block 597 | format.btitle "title" output.check 598 | crossref missing$ 599 | { format.bvolume output 600 | new.block 601 | format.number.series output 602 | new.sentence 603 | publisher "publisher" output.check 604 | address output 605 | } 606 | { new.block 607 | format.book.crossref output.nonnull 608 | } 609 | if$ 610 | format.edition output 611 | format.date "year" output.check 612 | new.block 613 | note output 614 | fin.entry 615 | } 616 | 617 | FUNCTION {booklet} 618 | { output.bibitem 619 | format.authors output 620 | new.block 621 | format.title "title" output.check 622 | howpublished address new.block.checkb 623 | howpublished output 624 | address output 625 | format.date output 626 | new.block 627 | note output 628 | fin.entry 629 | } 630 | 631 | FUNCTION {inbook} 632 | { output.bibitem 633 | author empty$ 634 | { format.editors "author and editor" output.check } 635 | { format.authors output.nonnull 636 | 637 | crossref missing$ 638 | { "author and editor" editor either.or.check } 639 | 'skip$ 640 | if$ 641 | } 642 | if$ 643 | new.block 644 | format.btitle "title" output.check 645 | crossref missing$ 646 | { format.bvolume output 647 | format.chapter.pages "chapter and pages" output.check 648 | new.block 649 | format.number.series output 650 | new.sentence 651 | publisher "publisher" output.check 652 | address output 653 | } 654 | { format.chapter.pages "chapter and pages" output.check 655 | new.block 656 | format.book.crossref output.nonnull 657 | } 658 | if$ 659 | format.edition output 660 | 
format.date "year" output.check 661 | new.block 662 | note output 663 | fin.entry 664 | } 665 | 666 | FUNCTION {incollection} 667 | { output.bibitem 668 | format.authors "author" output.check 669 | new.block 670 | format.title "title" output.check 671 | new.block 672 | crossref missing$ 673 | { format.in.ed.booktitle "booktitle" output.check 674 | format.bvolume output 675 | format.number.series output 676 | format.chapter.pages output 677 | new.sentence 678 | publisher "publisher" output.check 679 | address output 680 | format.edition output 681 | format.date "year" output.check 682 | } 683 | { format.incoll.inproc.crossref output.nonnull 684 | format.chapter.pages output 685 | } 686 | if$ 687 | new.block 688 | note output 689 | fin.entry 690 | } 691 | 692 | FUNCTION {inproceedings} 693 | { output.bibitem 694 | format.authors "author" output.check 695 | new.block 696 | format.title "title" output.check 697 | new.block 698 | crossref missing$ 699 | { format.in.ed.booktitle "booktitle" output.check 700 | format.bvolume output 701 | format.number.series output 702 | format.pages output 703 | address empty$ 704 | { organization publisher new.sentence.checkb 705 | organization output 706 | publisher output 707 | format.date "year" output.check 708 | } 709 | { address output.nonnull 710 | format.date "year" output.check 711 | new.sentence 712 | organization output 713 | publisher output 714 | } 715 | if$ 716 | } 717 | { format.incoll.inproc.crossref output.nonnull 718 | format.pages output 719 | } 720 | if$ 721 | new.block 722 | note output 723 | fin.entry 724 | } 725 | 726 | FUNCTION {conference} { inproceedings } 727 | 728 | FUNCTION {manual} 729 | { output.bibitem 730 | author empty$ 731 | { organization empty$ 732 | 'skip$ 733 | { organization output.nonnull 734 | address output 735 | } 736 | if$ 737 | } 738 | { format.authors output.nonnull } 739 | if$ 740 | new.block 741 | format.btitle "title" output.check 742 | author empty$ 743 | { organization empty$ 744 | { address new.block.checka 745 | address output 746 | } 747 | 'skip$ 748 | if$ 749 | } 750 | { organization address new.block.checkb 751 | organization output 752 | address output 753 | } 754 | if$ 755 | format.edition output 756 | format.date output 757 | new.block 758 | note output 759 | fin.entry 760 | } 761 | 762 | FUNCTION {mastersthesis} 763 | { output.bibitem 764 | format.authors "author" output.check 765 | new.block 766 | format.title "title" output.check 767 | new.block 768 | "Master's thesis" format.thesis.type output.nonnull 769 | school "school" output.check 770 | address output 771 | format.date "year" output.check 772 | new.block 773 | note output 774 | fin.entry 775 | } 776 | 777 | FUNCTION {misc} 778 | { output.bibitem 779 | format.authors output 780 | title howpublished new.block.checkb 781 | format.title output 782 | howpublished new.block.checka 783 | howpublished output 784 | format.date output 785 | new.block 786 | note output 787 | fin.entry 788 | empty.misc.check 789 | } 790 | 791 | FUNCTION {phdthesis} 792 | { output.bibitem 793 | format.authors "author" output.check 794 | new.block 795 | format.btitle "title" output.check 796 | new.block 797 | "PhD thesis" format.thesis.type output.nonnull 798 | school "school" output.check 799 | address output 800 | format.date "year" output.check 801 | new.block 802 | note output 803 | fin.entry 804 | } 805 | 806 | FUNCTION {proceedings} 807 | { output.bibitem 808 | editor empty$ 809 | { organization output } 810 | { format.editors output.nonnull } 811 | 812 | if$ 813 | 
new.block 814 | format.btitle "title" output.check 815 | format.bvolume output 816 | format.number.series output 817 | address empty$ 818 | { editor empty$ 819 | { publisher new.sentence.checka } 820 | { organization publisher new.sentence.checkb 821 | organization output 822 | } 823 | if$ 824 | publisher output 825 | format.date "year" output.check 826 | } 827 | { address output.nonnull 828 | format.date "year" output.check 829 | new.sentence 830 | editor empty$ 831 | 'skip$ 832 | { organization output } 833 | if$ 834 | publisher output 835 | } 836 | if$ 837 | new.block 838 | note output 839 | fin.entry 840 | } 841 | 842 | FUNCTION {techreport} 843 | { output.bibitem 844 | format.authors "author" output.check 845 | new.block 846 | format.title "title" output.check 847 | new.block 848 | format.tr.number output.nonnull 849 | institution "institution" output.check 850 | address output 851 | format.date "year" output.check 852 | new.block 853 | note output 854 | fin.entry 855 | } 856 | 857 | FUNCTION {unpublished} 858 | { output.bibitem 859 | format.authors "author" output.check 860 | new.block 861 | format.title "title" output.check 862 | new.block 863 | note "note" output.check 864 | format.date output 865 | fin.entry 866 | } 867 | 868 | FUNCTION {default.type} { misc } 869 | 870 | MACRO {jan} {"Jan."} 871 | 872 | MACRO {feb} {"Feb."} 873 | 874 | MACRO {mar} {"Mar."} 875 | 876 | MACRO {apr} {"Apr."} 877 | 878 | MACRO {may} {"May"} 879 | 880 | MACRO {jun} {"June"} 881 | 882 | MACRO {jul} {"July"} 883 | 884 | MACRO {aug} {"Aug."} 885 | 886 | MACRO {sep} {"Sept."} 887 | 888 | MACRO {oct} {"Oct."} 889 | 890 | MACRO {nov} {"Nov."} 891 | 892 | MACRO {dec} {"Dec."} 893 | 894 | MACRO {acmcs} {"ACM Comput. Surv."} 895 | 896 | MACRO {acta} {"Acta Inf."} 897 | 898 | MACRO {cacm} {"Commun. ACM"} 899 | 900 | MACRO {ibmjrd} {"IBM J. Res. Dev."} 901 | 902 | MACRO {ibmsj} {"IBM Syst.~J."} 903 | 904 | MACRO {ieeese} {"IEEE Trans. Softw. Eng."} 905 | 906 | MACRO {ieeetc} {"IEEE Trans. Comput."} 907 | 908 | MACRO {ieeetcad} 909 | {"IEEE Trans. Comput.-Aided Design Integrated Circuits"} 910 | 911 | MACRO {ipl} {"Inf. Process. Lett."} 912 | 913 | MACRO {jacm} {"J.~ACM"} 914 | 915 | MACRO {jcss} {"J.~Comput. Syst. Sci."} 916 | 917 | MACRO {scp} {"Sci. Comput. Programming"} 918 | 919 | MACRO {sicomp} {"SIAM J. Comput."} 920 | 921 | MACRO {tocs} {"ACM Trans. Comput. Syst."} 922 | 923 | MACRO {tods} {"ACM Trans. Database Syst."} 924 | 925 | MACRO {tog} {"ACM Trans. Gr."} 926 | 927 | MACRO {toms} {"ACM Trans. Math. Softw."} 928 | 929 | MACRO {toois} {"ACM Trans. Office Inf. Syst."} 930 | 931 | MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."} 932 | 933 | MACRO {tcs} {"Theoretical Comput. 
Sci."} 934 | 935 | READ 936 | 937 | FUNCTION {sortify} 938 | { purify$ 939 | "l" change.case$ 940 | } 941 | 942 | INTEGERS { len } 943 | 944 | FUNCTION {chop.word} 945 | { 's := 946 | 'len := 947 | s #1 len substring$ = 948 | { s len #1 + global.max$ substring$ } 949 | 's 950 | if$ 951 | } 952 | 953 | FUNCTION {sort.format.names} 954 | { 's := 955 | #1 'nameptr := 956 | "" 957 | s num.names$ 'numnames := 958 | numnames 'namesleft := 959 | { namesleft #0 > } 960 | { nameptr #1 > 961 | { " " * } 962 | 'skip$ 963 | if$ 964 | s nameptr "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" format.name$ 't := 965 | nameptr numnames = t "others" = and 966 | { "et al" * } 967 | { t sortify * } 968 | if$ 969 | nameptr #1 + 'nameptr := 970 | namesleft #1 - 'namesleft := 971 | } 972 | while$ 973 | } 974 | 975 | FUNCTION {sort.format.title} 976 | { 't := 977 | "A " #2 978 | "An " #3 979 | "The " #4 t chop.word 980 | chop.word 981 | chop.word 982 | sortify 983 | #1 global.max$ substring$ 984 | } 985 | 986 | FUNCTION {author.sort} 987 | { author empty$ 988 | { key empty$ 989 | { "to sort, need author or key in " cite$ * warning$ 990 | "" 991 | } 992 | { key sortify } 993 | if$ 994 | } 995 | { author sort.format.names } 996 | if$ 997 | } 998 | 999 | FUNCTION {author.editor.sort} 1000 | { author empty$ 1001 | { editor empty$ 1002 | { key empty$ 1003 | { "to sort, need author, editor, or key in " cite$ * warning$ 1004 | "" 1005 | } 1006 | { key sortify } 1007 | if$ 1008 | } 1009 | { editor sort.format.names } 1010 | if$ 1011 | } 1012 | { author sort.format.names } 1013 | if$ 1014 | } 1015 | 1016 | FUNCTION {author.organization.sort} 1017 | { author empty$ 1018 | 1019 | { organization empty$ 1020 | { key empty$ 1021 | { "to sort, need author, organization, or key in " cite$ * warning$ 1022 | "" 1023 | } 1024 | { key sortify } 1025 | if$ 1026 | } 1027 | { "The " #4 organization chop.word sortify } 1028 | if$ 1029 | } 1030 | { author sort.format.names } 1031 | if$ 1032 | } 1033 | 1034 | FUNCTION {editor.organization.sort} 1035 | { editor empty$ 1036 | { organization empty$ 1037 | { key empty$ 1038 | { "to sort, need editor, organization, or key in " cite$ * warning$ 1039 | "" 1040 | } 1041 | { key sortify } 1042 | if$ 1043 | } 1044 | { "The " #4 organization chop.word sortify } 1045 | if$ 1046 | } 1047 | { editor sort.format.names } 1048 | if$ 1049 | } 1050 | 1051 | FUNCTION {presort} 1052 | { type$ "book" = 1053 | type$ "inbook" = 1054 | or 1055 | 'author.editor.sort 1056 | { type$ "proceedings" = 1057 | 'editor.organization.sort 1058 | { type$ "manual" = 1059 | 'author.organization.sort 1060 | 'author.sort 1061 | if$ 1062 | } 1063 | if$ 1064 | } 1065 | if$ 1066 | " " 1067 | * 1068 | year field.or.null sortify 1069 | * 1070 | " " 1071 | * 1072 | title field.or.null 1073 | sort.format.title 1074 | * 1075 | #1 entry.max$ substring$ 1076 | 'sort.key$ := 1077 | } 1078 | 1079 | ITERATE {presort} 1080 | 1081 | SORT 1082 | 1083 | STRINGS { longest.label } 1084 | 1085 | INTEGERS { number.label longest.label.width } 1086 | 1087 | FUNCTION {initialize.longest.label} 1088 | { "" 'longest.label := 1089 | #1 'number.label := 1090 | #0 'longest.label.width := 1091 | } 1092 | 1093 | FUNCTION {longest.label.pass} 1094 | { number.label int.to.str$ 'label := 1095 | number.label #1 + 'number.label := 1096 | label width$ longest.label.width > 1097 | { label 'longest.label := 1098 | label width$ 'longest.label.width := 1099 | } 1100 | 'skip$ 1101 | if$ 1102 | } 1103 | 1104 | EXECUTE {initialize.longest.label} 1105 | 1106 | ITERATE 
{longest.label.pass} 1107 | 1108 | FUNCTION {begin.bib} 1109 | { preamble$ empty$ 1110 | 'skip$ 1111 | { preamble$ write$ newline$ } 1112 | if$ 1113 | "\begin{thebibliography}{" longest.label * "}" * 1114 | "\itemsep=-1pt" * % Compact the entries a little. 1115 | write$ newline$ 1116 | } 1117 | 1118 | EXECUTE {begin.bib} 1119 | 1120 | EXECUTE {init.state.consts} 1121 | 1122 | ITERATE {call.type$} 1123 | 1124 | FUNCTION {end.bib} 1125 | { newline$ 1126 | "\end{thebibliography}" write$ newline$ 1127 | } 1128 | 1129 | EXECUTE {end.bib} 1130 | 1131 | % end of file ieee.bst 1132 | % --------------------------------------------------------------- 1133 | 1134 | 1135 | 1136 | -------------------------------------------------------------------------------- /lib/augim.py: -------------------------------------------------------------------------------- 1 | # ================================================= 2 | # * Licensed under The MIT License 3 | # * Written by KAI-ZHAO 4 | # ================================================= 5 | # AugIm: Image Augmentation 6 | # Usage: 7 | # im = np.zeros((100, 100, 3), dtype=np.uint8) 8 | # label_map = np.zeros((100, 100), dtype=np.uint8) 9 | # im, label_map = augim.rescale([im, label_map], scales=[0.6, 0.8, 1.0, 1.2]) 10 | import numpy as np 11 | from PIL import Image 12 | import skimage.transform as transform 13 | import cv2 14 | USE_CV2_RESIZE = True 15 | 16 | def resizeim(x, h, w): 17 | if USE_CV2_RESIZE: 18 | return cv2.resize(x, (w, h)) 19 | else: 20 | pil_im = Image.fromarray(x.astype(np.uint8)) 21 | pil_im = pil_im.resize((w, h)) 22 | return np.array(pil_im) 23 | 24 | def resizegt(x, h, w): 25 | """ 26 | resize a BINARY map 27 | """ 28 | assert np.unique(x).size == 2 29 | if USE_CV2_RESIZE: 30 | im1 = cv2.resize(x, (w, h), interpolation=cv2.INTER_NEAREST) 31 | thres = float(im1.max() - im1.min()) / 2 32 | im1[im1 < thres] = 0 33 | im1[im1 != 0] = 1 34 | return im1 35 | else: 36 | pil_im = Image.fromarray(x) 37 | pil_im = pil_im.resize((w, h), resample=Image.NEAREST) 38 | return np.array(pil_im) 39 | 40 | def shape_match(im, gt): 41 | return im.shape[0] == gt.shape[0] and im.shape[1] == gt.shape[1] 42 | 43 | #=======================================================# 44 | # Operations that can be performed on BOTH images and 45 | # label-maps 46 | #=======================================================# 47 | def rescale(x, scales, keep=True): 48 | """ 49 | rescale (resize) image and ground-truth map 50 | x: input image (or image/label pair) 51 | keep: keep width-height ratio or not 52 | """ 53 | assert isinstance(x, list) or isinstance(x, np.ndarray) 54 | assert isinstance(scales, list) or isinstance(scales, np.ndarray) 55 | if isinstance(scales, list): 56 | scales = np.array(scales) 57 | s = np.random.choice(scales) 58 | s1 = np.random.choice(scales) 59 | if isinstance(x, list): 60 | assert len(x) == 2 61 | im, gt = x 62 | h, w, c = im.shape 63 | assert c == 3 64 | assert h == gt.shape[0] and w == gt.shape[1] 65 | if keep: 66 | h1, w1 = int(h * s), int(w * s) 67 | else: 68 | h1, w1 = int(h * s), int(w * s1) 69 | return [resizeim(im, h1, w1), resizegt(gt, h1, w1)] 70 | elif isinstance(x, np.ndarray): 71 | h, w, c = x.shape 72 | assert c == 3 73 | if keep: 74 | h1, w1 = int(h * s), int(w * s) 75 | else: 76 | h1, w1 = int(h * s), int(w * s1) 77 | return resizeim(x, h1, w1) 78 | else: 79 | raise TypeError("Error!") 80 | 81 | def fliplr(x, p=0.5): 82 | """ 83 | Flip left-right 84 | """ 85 | assert isinstance(x, list) or isinstance(x, np.ndarray) 86 | flag = 
np.random.binomial(1, p) 87 | if flag: 88 | if isinstance(x, list): 89 | im, gt = x 90 | assert im.ndim == 3 and gt.ndim == 2 91 | assert im.shape[0] == gt.shape[0] and im.shape[1] == gt.shape[1] \ 92 | and im.shape[2] == 3 93 | return [im[:, ::-1, :], gt[:, ::-1]] 94 | else: 95 | assert x.ndim == 3 96 | return x[:, ::-1, :] 97 | else: 98 | return x 99 | 100 | def crop(x, offset=20): 101 | assert isinstance(x, list) or isinstance(x, np.ndarray) 102 | assert offset > 0 103 | if isinstance(x, list): 104 | im, gt = x 105 | has_gt = True 106 | else: 107 | im = x 108 | has_gt = False 109 | h, w, c = im.shape 110 | if has_gt: 111 | assert gt.shape[0] == h and gt.shape[1] == w 112 | assert offset < h // 2 and offset < w // 2 113 | xstart = np.random.choice(np.arange(1, offset)) 114 | xend = w - np.random.choice(np.arange(1, offset)) 115 | ystart = np.random.choice(np.arange(1, offset)) 116 | yend = h - np.random.choice(np.arange(1, offset)) 117 | if has_gt: 118 | return [im[ystart:yend, xstart:xend, :], gt[ystart:yend, xstart:xend]] 119 | else: 120 | return im[ystart:yend, xstart:xend] 121 | 122 | def rescale_crop(x, size=[256 ,256]): 123 | h0, w0 = map(lambda x: int(x), list(size)) 124 | assert isinstance(x, list) or isinstance(x, np.ndarray) 125 | r = -1 126 | if isinstance(x, list): 127 | im, lb = x 128 | assert shape_match(im, lb) 129 | pil_im = Image.fromarray(im.astype(np.uint8)) 130 | pil_lb = Image.fromarray(lb.astype(np.uint8)) 131 | else: 132 | im = x 133 | assert im.ndim == 3 134 | pil_im = Image.fromarray(im.astype(np.uint8)) 135 | h, w = im.shape[:2] 136 | # print("Input shape (%d, %d)" % (h, w)) 137 | r_h, r_w = np.float32(h) / h0, np.float32(w) / w0 138 | if r_h <= r_w: 139 | r = r_h 140 | else: 141 | r = r_w 142 | assert r > 0, "r = %f" % r 143 | new_w = int(np.round(w / r)) 144 | new_h = int(np.round(h / r)) 145 | assert new_w >= w0 and new_h >= h0, "(%d, %d) vs (%d, %d)" % (new_h, new_w, h0, w0) 146 | pil_im = pil_im.resize((new_w, new_h)) 147 | if isinstance(x, list): 148 | pil_lb = pil_lb.resize((new_w, new_h)) 149 | xstart, ystart = -1, -1 150 | if new_w == w0: 151 | xstart = 0 152 | else: 153 | xstart = int(np.random.choice(new_w - w0, 1)) 154 | if new_h == h0: 155 | ystart = 0 156 | else: 157 | ystart = int(np.random.choice(new_h - h0, 1)) 158 | im = np.array(pil_im) 159 | # print("Rescaled shape: ", im.shape) 160 | im = im[ystart:ystart+h0, xstart: xstart+w0, :] 161 | if isinstance(x, list): 162 | lb = np.array(pil_lb) 163 | lb = lb[ystart:ystart+h0, xstart: xstart+w0] 164 | return [im, lb] 165 | 166 | def rotate(x, angle=[-45, 45], expand=True): 167 | """ 168 | Rotate images (and label-maps) at any angle 169 | """ 170 | angle = np.array(angle) 171 | angle = np.random.randint(low=angle.min(), high=angle.max()) 172 | if isinstance(x, list): 173 | assert len(x) == 2 174 | im, gt = x 175 | im = im.astype(np.uint8) 176 | gt = gt.astype(np.uint8) 177 | assert shape_match(im, gt) 178 | islogical = np.unique(gt).size <= 2 179 | # fill the rim of the rotated image with symmetric values rather than 0 180 | im_rotated = transform.rotate(im, angle, mode="symmetric") 181 | gt_rotated = transform.rotate(gt, angle, mode="constant", preserve_range=True) 182 | im_rotated = np.uint8(im_rotated * 255) 183 | if islogical: 184 | gt_rotated = binary(gt_rotated) 185 | return [im_rotated, gt_rotated] 186 | else: 187 | assert isinstance(x, np.ndarray) 188 | im = x.astype(np.uint8) 189 | im_rotated = transform.rotate(im, angle, mode="edge") 190 | return im_rotated 191 | 192 | def rotate90(x, 
p=[0.3, 0.4, 0.3]): 193 | """ 194 | Randomly rotate image&label by 90deg 195 | Clockwise 90deg or 0deg or Counter-clockwise 90deg 196 | probabilities specified by p 197 | For example: p=[0.25, 0.5, 0.25] 198 | """ 199 | assert isinstance(x, list) or isinstance(x, np.ndarray) 200 | if p is not None: 201 | assert isinstance(p, list) or isinstance(p, np.ndarray) 202 | p = np.array(p) 203 | k = np.random.choice([-1, 0, 1], 1, p=p) 204 | if isinstance(x, list): 205 | assert len(x) == 2 206 | im, lb = x 207 | return [np.rot90(im, k=k), np.rot90(lb, k=k)] 208 | elif isinstance(x, np.ndarray): 209 | return np.rot90(x, k=k) 210 | else: 211 | raise TypeError("Invalid type") 212 | 213 | def crop_object_rim(x, margin=5): 214 | """ 215 | NOTE!!! 216 | This is for saliency detection or other segmentation tasks 217 | Crop an image to make the object on the border of image 218 | """ 219 | assert len(x) == 2 220 | im, gt = x 221 | assert np.unique(gt).size == 2, "len(np.unique(gt)) = %d" % len(np.unique(gt)) 222 | h, w, c = im.shape 223 | assert im.size / gt.size == c 224 | [y, x] = np.where(gt != 0) 225 | xmin = max(x.min() - margin, 0) 226 | ymin = max(y.min() - margin, 0) 227 | xmax = min(x.max() + margin, w) 228 | ymax = min(y.max() + margin, h) 229 | if xmin == 0: 230 | xstart = 0 231 | else: 232 | xstart = np.random.choice(np.arange(0, xmin)) 233 | if ymin == 0: 234 | ystart = 0 235 | else: 236 | ystart = np.random.choice(np.arange(0, ymin)) 237 | if xmax == w: 238 | xend = w 239 | else: 240 | xend = np.random.choice(np.arange(xmax, w)) 241 | if ymax == h: 242 | yend = h 243 | else: 244 | yend = np.random.choice(np.arange(ymax, h)) 245 | return [im[ystart:yend, xstart:xend, :], gt[ystart:yend, xstart:xend]] 246 | 247 | 248 | def pad_object_rim(x, margin=5): 249 | """ 250 | NOTE!!! 251 | This is for saliency detection or other segmentation tasks 252 | Pad an image to make the object on the border of image 253 | """ 254 | assert len(x) == 2 255 | im, gt = x 256 | assert np.unique(gt).size == 2, "len(np.unique(gt)) = %d" % len(np.unique(gt)) 257 | h, w, c = im.shape 258 | assert im.size / gt.size == c 259 | [y, x] = np.where(gt != 0) 260 | xmin = max(x.min() - margin, 0) 261 | ymin = max(y.min() - margin, 0) 262 | xmax = min(x.max() + margin, w) 263 | ymax = min(y.max() + margin, h) 264 | if xmin == 0: 265 | wbefore = 0 266 | else: 267 | wbefore = np.random.choice(np.arange(0, xmin)) 268 | if ymin == 0: 269 | hbefore = 0 270 | else: 271 | hbefore = np.random.choice(np.arange(0, ymin)) 272 | if xmax == w: 273 | wafter = 0 274 | else: 275 | wafter = np.random.choice(np.arange(0, w-xmax)) 276 | if ymax == h: 277 | hafter = 0 278 | else: 279 | hafter = np.random.choice(np.arange(0, h-ymax)) 280 | impad = np.pad(im, ((hbefore, hafter), (wbefore, wafter), (0, 0)), mode="symmetric") 281 | if gt.ndim == 2: 282 | gtpad = np.pad(gt, ((hbefore, hafter), (wbefore, wafter)), mode="symmetric") 283 | elif gt.ndim == 3: 284 | gtpad = np.pad(gt, ((hbefore, hafter), (wbefore, wafter), (0, 0)), mode="symmetric") 285 | else: 286 | raise ValueError("Invalid gt shape!") 287 | return [impad, gtpad] 288 | 289 | #=======================================================# 290 | # Operations performed on images ONLY! 
291 | #=======================================================# 292 | def shuffle_channel(x, p=0.5): 293 | """ 294 | Random shuffle image channels 295 | x: ndarray with shape [h, w, channel] 296 | """ 297 | assert isinstance(x, np.ndarray) 298 | flag = np.random.binomial(1, p) 299 | if x.ndim == 2 or flag == 0: 300 | return x 301 | else: 302 | assert x.ndim == 3 303 | h, w, c = x.shape 304 | order = np.arange(c) 305 | np.random.shuffle(order) 306 | x = x[:, :, order] 307 | return x 308 | 309 | def addgaus(x, sigma=1): 310 | assert isinstance(x, np.ndarray) 311 | x = x.astype(np.float32) + np.random.rand(*x.shape) * sigma 312 | return x 313 | 314 | def add_per_channel(x, sigma=5): 315 | assert isinstance(x, np.ndarray) 316 | assert x.ndim == 3 317 | nchannels = x.shape[2] 318 | noise = np.random.randint(low=-np.abs(sigma), high=np.abs(sigma), size=nchannels) 319 | return x + noise 320 | 321 | def add(x, sigma=5): 322 | assert isinstance(x, np.ndarray) 323 | noise = np.random.randint(low=-np.abs(sigma), high=np.abs(sigma)) 324 | return x + noise 325 | 326 | def binary(x): 327 | assert isinstance(x, np.ndarray) 328 | assert x.ndim == 2 or x.ndim == 3 329 | x[x < (x.max() - x.min()) / 2] = 0 330 | x[x != 0] = 1 331 | return x 332 | -------------------------------------------------------------------------------- /lib/floss.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | from numpy import logical_and as land, logical_or as lor, logical_not as lnot 4 | import caffe 5 | FLT_MIN=1e-16 6 | 7 | class FmeasureLossLayer(caffe.Layer): 8 | def setup(self, bottom, top): 9 | if len(bottom) != 2: 10 | raise Exception("Need two inputs to compute distance.") 11 | params = eval(self.param_str) 12 | self.log = False 13 | if 'log' in params: 14 | self.log = bool(params['log']) 15 | self.counter=0 16 | self.beta = np.float(params['beta']) 17 | self.DEBUG = True 18 | 19 | def reshape(self, bottom, top): 20 | if bottom[0].count != bottom[1].count: 21 | raise Exception("Inputs must have the same dimension.") 22 | self.diff = np.zeros_like(bottom[0].data, dtype=np.float32) 23 | top[0].reshape(1) 24 | 25 | def forward(self, bottom, top): 26 | """ 27 | F = \frac{(1+\beta)pr}{\beta p + r} 28 | loss = 1 - F 29 | p = \frac{TP}{TP + FP} 30 | r = \frac{TP}{TP + FN} 31 | See http://kaizhao.net/fmeasure 32 | """ 33 | pred = np.squeeze(bottom[0].data[...]) 34 | target = np.squeeze(bottom[1].data[...]) 35 | target = target > 0 36 | h, w = target.shape 37 | assert pred.max() <= 1 and pred.min() >= 0, "pred.max = %f, pred.min = %f" % (pred.max(), pred.min()) 38 | self.TP=np.sum(target * pred) 39 | self.H = self.beta * target.sum() + pred.sum() 40 | self.fmeasure = (1 + self.beta) * self.TP / (self.H + FLT_MIN) 41 | if self.log: 42 | # loss = -\log{F-measure} 43 | loss = -np.log(self.fmeasure + FLT_MIN) 44 | else: 45 | # loss = 1 - F-measure 46 | loss = 1 - self.fmeasure 47 | top[0].data[0] = loss 48 | 49 | def backward(self, top, propagate_down, bottom): 50 | """ 51 | grad[i] = \frac{(1+\beta)TP}{H^2} - \frac{(1+\beta)y_i}{H} 52 | See http://kaizhao.net/fmeasure 53 | """ 54 | pred = bottom[0].data[...] 55 | target = bottom[1].data[...] 56 | grad = (1 + self.beta) * self.TP / (self.H**2 + FLT_MIN) - \ 57 | (1+self.beta) * target / (self.H + FLT_MIN) 58 | if self.log: 59 | grad /= (self.fmeasure + FLT_MIN) 60 | bottom[0].diff[...] 
= grad 61 | -------------------------------------------------------------------------------- /lib/pylayer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Code written by KAI ZHAO (http://kaiz.xyz) 4 | import caffe 5 | import numpy as np 6 | from os.path import join, isfile, splitext 7 | import random, cv2 8 | import augim 9 | 10 | class ImageLabelmapDataLayer(caffe.Layer): 11 | """ 12 | Python data layer 13 | """ 14 | def setup(self, bottom, top): 15 | params = eval(self.param_str) 16 | self.root = params['root'] 17 | self.source = params['source'] 18 | self.shuffle = bool(params['shuffle']) 19 | self.mean = np.array((104.00699, 116.66877, 122.67892)) 20 | self.aug = False 21 | if 'aug' in params: 22 | self.aug = bool(params['aug']) 23 | with open(join(self.root, self.source), 'r') as f: 24 | self.filelist = f.readlines() 25 | if self.shuffle: 26 | random.shuffle(self.filelist) 27 | self.idx = 0 28 | top[0].reshape(1, 3, 100, 100) # im 29 | top[1].reshape(1, 1, 100, 100) # lb 30 | 31 | def reshape(self, bottom, top): 32 | """ 33 | Will reshape in forward() 34 | """ 35 | 36 | def forward(self, bottom, top): 37 | """ 38 | Load data 39 | """ 40 | filename = splitext(self.filelist[self.idx])[0] 41 | imfn = join(self.root, 'images', filename+".jpg") 42 | lbfn = join(self.root, 'annotations', filename+".png") 43 | assert isfile(imfn), "file %s doesn't exist!" % imfn 44 | assert isfile(lbfn), "file %s doesn't exist!" % lbfn 45 | im = cv2.imread(imfn).astype(np.float32) 46 | lb = cv2.imread(lbfn, 0).astype(np.float32) 47 | if self.aug: 48 | im, lb = augim.rescale([im, lb], np.linspace(0.5, 1.5, 11)) 49 | if np.random.binomial(1, 0.2): 50 | im, lb = augim.rotate([im, lb], angle=[-10, 10], expand=False) 51 | im, lb = augim.fliplr([im, lb]) 52 | assert np.unique(lb).size == 2, "unique(lb).size = %d" % np.unique(lb).size 53 | lb[lb != 0] = 1 54 | im, lb = map(lambda x:np.float32(x), [im, lb]) 55 | if im.ndim == 2: 56 | im = im[:,:,np.newaxis] 57 | im = np.repeat(im, 3, 2) 58 | im -= self.mean 59 | im = np.transpose(im, (2, 0, 1)) 60 | im = im[np.newaxis, :, :, :] 61 | assert lb.ndim == 2, "lb.ndim = %d" % lb.ndim 62 | h, w = lb.shape 63 | assert im.shape[2] == h and im.shape[3] == w, "Image and GT shape mismatch." 64 | lb = lb[np.newaxis, np.newaxis, :, :] 65 | if np.count_nonzero(lb) == 0: 66 | print "Warning: all zero label map!" 67 | top[0].reshape(1, 3, h, w) 68 | top[1].reshape(1, 1, h, w) 69 | top[0].data[...] = im 70 | top[1].data[...] = lb 71 | if self.idx == len(self.filelist)-1: 72 | # we've reached the end, restart. 73 | print "Restarting data prefetching from start." 
74 | if self.shuffle: 75 | random.shuffle(self.filelist) 76 | self.idx = 0 77 | else: 78 | self.idx = self.idx + 1 79 | 80 | def backward(self, top, propagate_down, bottom): 81 | """ 82 | Data layer doesn't need back propagate 83 | """ 84 | pass 85 | -------------------------------------------------------------------------------- /models/fdss.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | sys.path.insert(0, 'lib') 3 | from os.path import join, abspath, isdir 4 | import caffe 5 | from caffe import layers as L, params as P 6 | from caffe.coord_map import crop 7 | import numpy as np 8 | from math import ceil 9 | parser = argparse.ArgumentParser(description='DSS') 10 | parser.add_argument('--lossnorm', type=str, help='Normalize Loss', default="False") 11 | parser.add_argument('--beta', type=float, help='Value of beta', default=0.8) 12 | parser.add_argument('--aug', type=str, help='Data augmentation', default="True") 13 | TMP_DIR = abspath('tmp') 14 | SNAPSHOTS_DIR = abspath('snapshots') 15 | if not isdir(TMP_DIR): 16 | os.makedirs(TMP_DIR) 17 | def str2bool(str1): 18 | if str1.lower() == 'true' or str1.lower() == '1': 19 | return True 20 | elif str1.lower() == 'false' or str1.lower() == '0': 21 | return False 22 | else: 23 | raise ValueError('Error!') 24 | 25 | args = parser.parse_args() 26 | args.lossnorm = str2bool(args.lossnorm) 27 | args.aug = str2bool(args.aug) 28 | 29 | def conv_relu(bottom, nout, ks=3, stride=1, pad=1, mult=[1,1,2,0]): 30 | conv = L.Convolution(bottom, kernel_size=ks, stride=stride, 31 | num_output=nout, pad=pad, weight_filler=dict(type='gaussian',std=0.01), 32 | param=[dict(lr_mult=mult[0], decay_mult=mult[1]), dict(lr_mult=mult[2], decay_mult=mult[3])]) 33 | return conv, L.ReLU(conv, in_place=True) 34 | 35 | def max_pool(bottom, ks=2, stride=2): 36 | return L.Pooling(bottom, pool=P.Pooling.MAX, kernel_size=ks, stride=stride) 37 | 38 | def conv1x1(bottom, name, lr=1, wf=dict(type='gaussian',std=0.01)): 39 | return L.Convolution(bottom, name=name, kernel_size=1,num_output=1, weight_filler=wf, 40 | param=[dict(lr_mult=0.1*lr, decay_mult=1), dict(lr_mult=0.2*lr, decay_mult=0)]) 41 | 42 | def upsample(bottom, name,stride): 43 | s, k, pad = stride, 2 * stride, int(ceil(stride-1)/2) 44 | #name = "upsample%d"%s 45 | return L.Deconvolution(bottom, name=name, convolution_param=dict(num_output=1, 46 | kernel_size=k, stride=s, pad=pad, weight_filler = dict(type="bilinear")), 47 | param=[dict(lr_mult=0, decay_mult=0), dict(lr_mult=0, decay_mult=0)]) 48 | 49 | def net(split): 50 | n = caffe.NetSpec() 51 | if split=='train': 52 | data_params = dict(mean=(104.00699, 116.66877, 122.67892)) 53 | data_params['root'] = './data/MSRA-B/' 54 | data_params['source'] = "train_list.txt" 55 | data_params['shuffle'] = True 56 | data_params['aug'] = args.aug 57 | data_params['ignore_label'] = -1 # ignore label 58 | n.data, n.label = L.Python(module='pylayer', layer='ImageLabelmapDataLayer', ntop=2, \ 59 | param_str=str(data_params)) 60 | loss_param = dict(normalize=args.lossnorm) 61 | if data_params.has_key('ignore_label'): 62 | loss_param['ignore_label'] = data_params['ignore_label'] 63 | elif split == 'test': 64 | n.data = L.Input(name = 'data', input_param=dict(shape=dict(dim=[1,3,500,500]))) 65 | else: 66 | raise Exception("Invalid phase") 67 | 68 | n.conv1_1, n.relu1_1 = conv_relu(n.data, 64, pad=5) 69 | n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64) 70 | n.pool1 = max_pool(n.relu1_2) 71 | 72 | n.conv2_1, n.relu2_1 = 
conv_relu(n.pool1, 128) 73 | n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128) 74 | n.pool2 = max_pool(n.relu2_2) 75 | 76 | n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256) 77 | n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256) 78 | n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256) 79 | n.pool3 = max_pool(n.relu3_3) 80 | 81 | n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512) 82 | n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512) 83 | n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512) 84 | n.pool4 = max_pool(n.relu4_3) 85 | 86 | n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512) 87 | n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512) 88 | n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512) 89 | n.pool5 = max_pool(n.relu5_3) 90 | n.pool5a = L.Pooling(n.pool5, pool=P.Pooling.AVE, kernel_size=3, stride=1,pad=1) 91 | ###DSN conv 6### 92 | n.conv1_dsn6,n.relu1_dsn6=conv_relu(n.pool5a,512,ks=7, pad=3) 93 | n.conv2_dsn6,n.relu2_dsn6=conv_relu(n.relu1_dsn6,512,ks=7, pad=3) 94 | n.conv3_dsn6=conv1x1(n.relu2_dsn6, 'conv3_dsn6') 95 | n.score_dsn6_up = upsample(n.conv3_dsn6, stride=32,name='upsample32_in_dsn6') 96 | n.upscore_dsn6 = crop(n.score_dsn6_up, n.data) 97 | if split=='train': 98 | n.sigmoid_dsn6 = L.Sigmoid(n.upscore_dsn6) 99 | floss_param = dict() 100 | floss_param['name']='dsn6' 101 | floss_param['beta']=args.beta 102 | n.loss_dsn6 = L.Python(n.sigmoid_dsn6,n.label,module='floss', layer='FmeasureLossLayer',param_str=str(floss_param),ntop=1,loss_weight=1) 103 | else: 104 | n.sigmoid_dsn6 = L.Sigmoid(n.upscore_dsn6) 105 | ###DSN conv 5### 106 | n.conv1_dsn5,n.relu1_dsn5=conv_relu(n.conv5_3,512,ks=5, pad=2) 107 | n.conv2_dsn5,n.relu2_dsn5=conv_relu(n.relu1_dsn5,512,ks=5, pad=2) 108 | n.conv3_dsn5=conv1x1(n.relu2_dsn5, 'conv3_dsn5') 109 | n.score_dsn5_up = upsample(n.conv3_dsn5, stride=16,name='upsample16_in_dsn5') 110 | n.upscore_dsn5 = crop(n.score_dsn5_up, n.data) 111 | if split=='train': 112 | n.sigmoid_dsn5 = L.Sigmoid(n.upscore_dsn5) 113 | floss_param['name']='dsn5' 114 | floss_param['beta']=args.beta 115 | n.loss_dsn5 = L.Python(n.sigmoid_dsn5,n.label,module='floss', layer='FmeasureLossLayer',param_str=str(floss_param),ntop=1,loss_weight=1) 116 | else: 117 | n.sigmoid_dsn5 = L.Sigmoid(n.upscore_dsn5) 118 | ###DSN conv 4### 119 | n.conv1_dsn4,n.relu1_dsn4=conv_relu(n.conv4_3,256,ks=5, pad=2) 120 | n.conv2_dsn4,n.relu2_dsn4=conv_relu(n.relu1_dsn4,256,ks=5, pad=2) 121 | n.conv3_dsn4=conv1x1(n.relu2_dsn4, 'conv3_dsn4') 122 | 123 | n.score_dsn6_up_4 = upsample(n.conv3_dsn6, stride=4,name='upsample4_dsn6') 124 | n.upscore_dsn6_4 = crop(n.score_dsn6_up_4, n.conv3_dsn4) 125 | n.score_dsn5_up_4 = upsample(n.conv3_dsn5, stride=2,name='upsample2_dsn5') 126 | n.upscore_dsn5_4 = crop(n.score_dsn5_up_4, n.conv3_dsn4) 127 | n.concat_dsn4 = L.Eltwise(n.conv3_dsn4, 128 | n.upscore_dsn6_4, 129 | n.upscore_dsn5_4, 130 | name="concat_dsn4") 131 | n.conv4_dsn4=conv1x1(n.concat_dsn4, 'conv4_dsn4') 132 | n.score_dsn4_up = upsample(n.conv4_dsn4, stride=8,name='upsample8_in_dsn4') 133 | n.upscore_dsn4 = crop(n.score_dsn4_up, n.data) 134 | if split=='train': 135 | n.sigmoid_dsn4 = L.Sigmoid(n.upscore_dsn4) 136 | floss_param['name']='dsn4' 137 | floss_param['beta']=args.beta 138 | n.loss_dsn4 = L.Python(n.sigmoid_dsn4,n.label,module='floss', layer='FmeasureLossLayer',param_str=str(floss_param),ntop=1,loss_weight=1) 139 | else: 140 | n.sigmoid_dsn4 = L.Sigmoid(n.upscore_dsn4) 141 | ### DSN conv 3 ### 142 | n.conv1_dsn3,n.relu1_dsn3=conv_relu(n.conv3_3,256,ks=5, pad=2) 143 | n.conv2_dsn3,n.relu2_dsn3=conv_relu(n.relu1_dsn3,256,ks=5, 
pad=2) 144 | n.conv3_dsn3=conv1x1(n.relu2_dsn3, 'conv3_dsn3') 145 | 146 | n.score_dsn6_up_3 = upsample(n.conv3_dsn6, stride=8,name='upsample8_dsn6') 147 | n.upscore_dsn6_3 = crop(n.score_dsn6_up_3, n.conv3_dsn3) 148 | n.score_dsn5_up_3 = upsample(n.conv3_dsn5, stride=4,name='upsample4_dsn5') 149 | n.upscore_dsn5_3 = crop(n.score_dsn5_up_3, n.conv3_dsn3) 150 | n.concat_dsn3 = L.Eltwise(n.conv3_dsn3, 151 | n.upscore_dsn6_3, 152 | n.upscore_dsn5_3, 153 | name='concat') 154 | n.conv4_dsn3=conv1x1(n.concat_dsn3, 'conv4_dsn3') 155 | n.score_dsn3_up = upsample(n.conv4_dsn3, stride=4,name='upsample4_in_dsn3') 156 | n.upscore_dsn3 = crop(n.score_dsn3_up, n.data) 157 | if split=='train': 158 | n.sigmoid_dsn3 = L.Sigmoid(n.upscore_dsn3) 159 | floss_param['name']='dsn3' 160 | floss_param['beta']=args.beta 161 | n.loss_dsn3 = L.Python(n.sigmoid_dsn3,n.label,module='floss', layer='FmeasureLossLayer',param_str=str(floss_param),ntop=1,loss_weight=1) 162 | else: 163 | n.sigmoid_dsn3 = L.Sigmoid(n.upscore_dsn3) 164 | ### DSN conv 2 ### 165 | n.conv1_dsn2,n.relu1_dsn2=conv_relu(n.conv2_2,128,ks=3, pad=1) 166 | n.conv2_dsn2,n.relu2_dsn2=conv_relu(n.relu1_dsn2,128,ks=3, pad=1) 167 | n.conv3_dsn2=conv1x1(n.relu2_dsn2, 'conv3_dsn2') 168 | 169 | n.score_dsn6_up_2 = upsample(n.conv3_dsn6, stride=16,name='upsample16_dsn6') 170 | n.upscore_dsn6_2 = crop(n.score_dsn6_up_2, n.conv3_dsn2) 171 | n.score_dsn5_up_2 = upsample(n.conv3_dsn5, stride=8,name='upsample8_dsn5') 172 | n.upscore_dsn5_2 = crop(n.score_dsn5_up_2, n.conv3_dsn2) 173 | n.score_dsn4_up_2 = upsample(n.conv4_dsn4, stride=4,name='upsample4_dsn4') 174 | n.upscore_dsn4_2 = crop(n.score_dsn4_up_2, n.conv3_dsn2) 175 | n.score_dsn3_up_2 = upsample(n.conv4_dsn3, stride=2,name='upsample2_dsn3') 176 | n.upscore_dsn3_2 = crop(n.score_dsn3_up_2, n.conv3_dsn2) 177 | n.concat_dsn2 = L.Eltwise(n.conv3_dsn2, 178 | n.upscore_dsn5_2, 179 | n.upscore_dsn4_2, 180 | n.upscore_dsn6_2, 181 | n.upscore_dsn3_2, 182 | name='concat') 183 | n.conv4_dsn2=conv1x1(n.concat_dsn2, 'conv4_dsn2') 184 | n.score_dsn2_up = upsample(n.conv4_dsn2, stride=2,name='upsample2_in_dsn2') 185 | n.upscore_dsn2 = crop(n.score_dsn2_up, n.data) 186 | if split=='train': 187 | n.sigmoid_dsn2 = L.Sigmoid(n.upscore_dsn2) 188 | floss_param['name']='dsn2' 189 | floss_param['beta']=args.beta 190 | n.loss_dsn2 = L.Python(n.sigmoid_dsn2,n.label,module='floss', layer='FmeasureLossLayer',param_str=str(floss_param),ntop=1,loss_weight=1) 191 | else: 192 | n.sigmoid_dsn2 = L.Sigmoid(n.upscore_dsn2) 193 | ## DSN conv 1 ### 194 | n.conv1_dsn1,n.relu1_dsn1=conv_relu(n.conv1_2,128,ks=3, pad=1) 195 | n.conv2_dsn1,n.relu2_dsn1=conv_relu(n.relu1_dsn1,128,ks=3, pad=1) 196 | n.conv3_dsn1=conv1x1(n.relu2_dsn1, 'conv3_dsn1') 197 | 198 | n.score_dsn6_up_1 = upsample(n.conv3_dsn6, stride=32,name='upsample32_dsn6') 199 | n.upscore_dsn6_1 = crop(n.score_dsn6_up_1, n.conv3_dsn1) 200 | n.score_dsn5_up_1 = upsample(n.conv3_dsn5, stride=16,name='upsample16_dsn5') 201 | n.upscore_dsn5_1 = crop(n.score_dsn5_up_1, n.conv3_dsn1) 202 | n.score_dsn4_up_1 = upsample(n.conv4_dsn4, stride=8,name='upsample8_dsn4') 203 | n.upscore_dsn4_1 = crop(n.score_dsn4_up_1, n.conv3_dsn1) 204 | n.score_dsn3_up_1 = upsample(n.conv4_dsn3, stride=4,name='upsample4_dsn3') 205 | n.upscore_dsn3_1 = crop(n.score_dsn3_up_1, n.conv3_dsn1) 206 | 207 | n.concat_dsn1 = L.Eltwise(n.conv3_dsn1, 208 | n.upscore_dsn5_1, 209 | n.upscore_dsn4_1, 210 | n.upscore_dsn6_1, 211 | n.upscore_dsn3_1, 212 | name='concat') 213 | n.score_dsn1_up=conv1x1(n.concat_dsn1, 'conv4_dsn1') 214 | 
n.upscore_dsn1 = crop(n.score_dsn1_up, n.data) 215 | if split=='train': 216 | n.sigmoid_dsn1 = L.Sigmoid(n.upscore_dsn1) 217 | floss_param['name']='dsn1' 218 | floss_param['beta']=args.beta 219 | n.loss_dsn1 = L.Python(n.sigmoid_dsn1,n.label,module='floss', layer='FmeasureLossLayer',param_str=str(floss_param),ntop=1,loss_weight=1) 220 | else: 221 | n.sigmoid_dsn1 = L.Sigmoid(n.upscore_dsn1) 222 | ### Eltwise and multiscale weight layer ### 223 | n.concat_upscore = L.Eltwise(n.upscore_dsn1, 224 | n.upscore_dsn2, 225 | n.upscore_dsn3, 226 | n.upscore_dsn4, 227 | n.upscore_dsn5, 228 | n.upscore_dsn6, 229 | name='concat') 230 | n.upscore_fuse=conv1x1(n.concat_upscore, 'new_score_weighting', wf=dict({'type': 'constant', 'value':np.float(1)/6 })) 231 | if split=='train': 232 | n.sigmoid_fuse = L.Sigmoid(n.upscore_fuse) 233 | floss_param['name']='fuse' 234 | floss_param['beta']=args.beta 235 | n.loss_fuse = L.Python(n.sigmoid_fuse,n.label,module='floss', layer='FmeasureLossLayer',param_str=str(floss_param),ntop=1,loss_weight=1) 236 | else: 237 | n.sigmoid_fuse = L.Sigmoid(n.upscore_fuse) 238 | return n.to_proto() 239 | 240 | pt_filename = join(TMP_DIR, 'fdss') 241 | snapshot_filename = join(SNAPSHOTS_DIR, 'fdss') 242 | pt_filename = "%s_beta%.2f" % (pt_filename, args.beta) 243 | snapshot_filename = "%s_beta%.2f" % (snapshot_filename, args.beta) 244 | 245 | if args.lossnorm: 246 | pt_filename += "_lossnorm" 247 | snapshot_filename += "_lossnorm" 248 | if args.aug: 249 | pt_filename += "_aug" 250 | snapshot_filename += "_aug" 251 | 252 | print("%s\n%s" % (pt_filename, snapshot_filename)) 253 | def make_net(): 254 | with open('%s_train.pt' %(pt_filename), 'w') as f: 255 | f.write(str(net('train'))) 256 | with open('%s_test.pt' %(pt_filename), 'w') as f: 257 | f.write(str(net('test'))) 258 | def make_solver(): 259 | sp = {} 260 | sp['net'] = '"%s_train.pt"' %(pt_filename) 261 | if args.lossnorm: 262 | sp['base_lr'] = '1e-3' 263 | else: 264 | sp['base_lr'] = '1e-3' 265 | sp['lr_policy'] = '"step"' 266 | sp['momentum'] = '0.9' 267 | sp['weight_decay'] = '0.0001' 268 | sp['iter_size'] = '10' 269 | sp['stepsize'] = '5000' 270 | sp['display'] = '10' 271 | sp['snapshot'] = '2000' 272 | sp['snapshot_prefix'] = '"%s"' % snapshot_filename 273 | sp['gamma'] = '0.1' 274 | sp['max_iter'] = '40000' 275 | sp['solver_mode'] = 'GPU' 276 | f = open('%s_solver.pt' % pt_filename, 'w') 277 | for k, v in sorted(sp.items()): 278 | if not(type(v) is str): 279 | raise TypeError('All solver parameters must be strings') 280 | f.write('%s: %s\n'%(k, v)) 281 | f.close() 282 | 283 | def make_all(): 284 | make_net() 285 | make_solver() 286 | 287 | if __name__ == '__main__': 288 | make_all() 289 | -------------------------------------------------------------------------------- /pytorch/README.md: -------------------------------------------------------------------------------- 1 | pytorch implementation of FLoss. 
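
A minimal usage sketch (illustrative only: `logits` and `target` below are random stand-ins for network outputs and binary ground-truth maps; `FLoss` expects probabilities in [0, 1], so apply a sigmoid first, and the loss is returned per sample, so reduce it before calling `backward()`):

```python
import torch
from floss import FLoss  # pytorch/floss.py

criterion = FLoss(beta=0.3)  # beta multiplies the ground-truth term, i.e. it plays the role of beta^2 in the F-measure
logits = torch.randn(2, 1, 100, 100, requires_grad=True)  # stand-in for raw network outputs
target = (torch.rand(2, 1, 100, 100) > 0.5).float()       # stand-in for binary ground-truth maps
prediction = torch.sigmoid(logits)                         # per-pixel probabilities in [0, 1]
loss = criterion(prediction, target).mean()                # FLoss returns one value per sample
loss.backward()
print("loss = %.4f" % loss.item())
```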
2 | -------------------------------------------------------------------------------- /pytorch/floss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | class FLoss(nn.Module): 4 | def __init__(self, beta=0.3, log_like=False): 5 | super(FLoss, self).__init__() 6 | self.beta = beta 7 | self.log_like = log_like 8 | 9 | def forward(self, prediction, target): 10 | EPS = 1e-10 11 | N = prediction.size(0) 12 | TP = (prediction * target).view(N, -1).sum(dim=1) 13 | H = self.beta * target.view(N, -1).sum(dim=1) + prediction.view(N, -1).sum(dim=1) 14 | fmeasure = (1 + self.beta) * TP / (H + EPS) 15 | if self.log_like: 16 | floss = -torch.log(fmeasure) 17 | else: 18 | floss = (1 - fmeasure) 19 | return floss 20 | 21 | def floss(prediction, target, beta=0.3, log_like=False): 22 | EPS = 1e-10 23 | N = prediction.size(0) 24 | TP = (prediction * target).view(N, -1).sum(dim=1) 25 | H = beta * target.view(N, -1).sum(dim=1) + prediction.view(N, -1).sum(dim=1) 26 | fmeasure = (1 + beta) * TP / (H + EPS) 27 | if log_like: 28 | floss = -torch.log(fmeasure) 29 | else: 30 | floss = (1 - fmeasure) 31 | return floss 32 | 33 | 34 | if __name__=="__main__": 35 | import torch 36 | fl = FLoss() 37 | prediction = torch.rand(1, 1, 100, 100) 38 | target = (torch.rand(1, 1, 100, 100) >= 0.5).float() 39 | print("FLoss (Module)=%.7f" % fl(prediction, target).item()) 40 | print("FLoss (Functional)=%.7f" % floss(prediction, target).item()) -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import matplotlib 3 | matplotlib.use('Agg') 4 | import numpy as np 5 | import sys, os, argparse 6 | from scipy.io import savemat 7 | import datetime 8 | sys.path.insert(0, 'lib') 9 | from os.path import isfile, join, isdir, abspath 10 | import cv2 11 | import caffe 12 | from caffe.proto import caffe_pb2 13 | from google.protobuf import text_format 14 | parser = argparse.ArgumentParser(description='Training DSS.') 15 | parser.add_argument('--gpu', type=int, help='gpu ID', default=0) 16 | parser.add_argument('--solver', type=str, help='solver', default='models/floss_solver.prototxt') 17 | parser.add_argument('--weights', type=str, help='base model', default='models/vgg16convs.caffemodel') 18 | parser.add_argument('--debug', type=str, help='debug mode', default='False') 19 | def str2bool(str1): 20 | if "true" in str1.lower() or "1" in str1.lower(): 21 | return True 22 | elif "false" in str1.lower() or "0" in str1.lower(): 23 | return False 24 | args = parser.parse_args() 25 | assert isfile(args.solver) 26 | assert isfile(args.weights) 27 | DEBUG = str2bool(args.debug) 28 | CACHE_FREQ = 1 29 | CACHE_DIR = abspath('data/cache') 30 | if not isdir(CACHE_DIR): 31 | os.makedirs(CACHE_DIR) 32 | if DEBUG: 33 | from pytools.image import overlay 34 | from pytools.misc import blob2im 35 | import matplotlib.pyplot as plt 36 | import matplotlib.cm as cm 37 | def upsample_filt(size): 38 | factor = (size + 1) // 2 39 | if size % 2 == 1: 40 | center = factor - 1 41 | else: 42 | center = factor - 0.5 43 | og = np.ogrid[:size, :size] 44 | return (1 - abs(og[0] - center) / factor) * \ 45 | (1 - abs(og[1] - center) / factor) 46 | def interp_surgery(net, layers): 47 | for l in layers: 48 | m, k, h, w = net.params[l][0].data.shape 49 | if m != k: 50 | print('input + output channels need to be the same') 51 | raise 52 | if h != w: 53 | 
print('filters need to be square') 54 | raise 55 | filt = upsample_filt(h) 56 | net.params[l][0].data[range(m), range(k), :, :] = filt 57 | caffe.set_mode_gpu() 58 | caffe.set_device(args.gpu) 59 | if not isdir('snapshots'): 60 | os.makedirs('snapshots') 61 | solver = caffe.SGDSolver(args.solver) 62 | # get snapshot_prefix 63 | solver_param = caffe_pb2.SolverParameter() 64 | with open(args.solver, 'rb') as f: 65 | text_format.Merge(f.read(), solver_param) 66 | max_iter = solver_param.max_iter 67 | # net surgery 68 | interp_layers = [k for k in solver.net.params.keys() if 'up' in k] 69 | interp_surgery(solver.net, interp_layers) 70 | solver.net.copy_from(args.weights) 71 | for p in solver.net.params: 72 | param = solver.net.params[p] 73 | for i in range(len(param)): 74 | print(p, "param[%d]: mean=%.5f, std=%.5f"%(i, solver.net.params[p][i].data.mean(), \ 75 | solver.net.params[p][i].data.std())) 76 | if DEBUG: 77 | now = datetime.datetime.now() 78 | cache_dir = join(CACHE_DIR, "%s-%s-%dH-%dM-%dS" % (args.solver.split(os.sep)[-1], str(now.date()), now.hour, now.minute, 79 | now.second)) 80 | if not isdir(cache_dir): 81 | os.makedirs(cache_dir) 82 | for i in range(1, max_iter + 1, CACHE_FREQ): 83 | cache_fn = join(cache_dir, "iter%d" % i) 84 | solver.step(CACHE_FREQ) 85 | keys = [None] * 7 86 | for i in range(len(keys)): 87 | if i <= 5: 88 | keys[i] = "sigmoid_dsn%d" % (i + 1) 89 | else: 90 | keys[i] = "sigmoid_fuse" 91 | mat_dict = dict() 92 | for k in keys: 93 | mat_dict[k + "_data"] = np.squeeze(solver.net.blobs[k].data) 94 | mat_dict[k + "_grad"] = np.squeeze(solver.net.blobs[k].diff) 95 | im = blob2im(solver.net.blobs['data'].data) 96 | mat_dict["image"] = im 97 | lb = np.squeeze(solver.net.blobs['label'].data) 98 | mat_dict["label"] = lb 99 | savemat(cache_fn, mat_dict) 100 | im = overlay(im, lb) 101 | dsn1 = np.squeeze(solver.net.blobs['sigmoid_dsn1'].data) 102 | dsn2 = np.squeeze(solver.net.blobs['sigmoid_dsn2'].data) 103 | dsn3 = np.squeeze(solver.net.blobs['sigmoid_dsn3'].data) 104 | dsn4 = np.squeeze(solver.net.blobs['sigmoid_dsn4'].data) 105 | dsn5 = np.squeeze(solver.net.blobs['sigmoid_dsn5'].data) 106 | dsn6 = np.squeeze(solver.net.blobs['sigmoid_dsn6'].data) 107 | fuse = np.squeeze(solver.net.blobs['sigmoid_fuse'].data) 108 | dss_fuse = (dsn3 + dsn4 + dsn5 + fuse) / 4 109 | fig, axes = plt.subplots(3, 3, figsize=(16, 16)) 110 | axes[0, 0].imshow(im) 111 | axes[0, 0].set_title("image and label") 112 | axes[0, 1].imshow(dsn1, cmap=cm.Greys_r) 113 | axes[0, 1].set_title("DSN1") 114 | axes[0, 2].imshow(dsn2, cmap=cm.Greys_r) 115 | axes[0, 2].set_title("DSN2") 116 | axes[1, 0].imshow(dsn3, cmap=cm.Greys_r) 117 | axes[1, 0].set_title("DSN3") 118 | axes[1, 1].imshow(dsn4, cmap=cm.Greys_r) 119 | axes[1, 1].set_title("DSN4") 120 | axes[1, 2].imshow(dsn5, cmap=cm.Greys_r) 121 | axes[1, 2].set_title("DSN5") 122 | axes[2, 0].imshow(dsn6, cmap=cm.Greys_r) 123 | axes[2, 0].set_title("DSN6") 124 | axes[2, 1].imshow(fuse, cmap=cm.Greys_r) 125 | axes[2, 1].set_title("fuse (dsn1~6)") 126 | axes[2, 2].imshow(dss_fuse, cmap=cm.Greys_r) 127 | axes[2, 2].set_title("DSS style fuse (dsn3~5 + fuse)") 128 | plt.savefig(cache_fn+'.jpg') 129 | plt.close(fig) 130 | print("Saving cache file to %s" % cache_fn) 131 | else: 132 | solver.solve() 133 | --------------------------------------------------------------------------------
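
As a closing sanity check, the analytic gradient documented in `lib/floss.py` (`grad[i] = (1+beta)*TP/H^2 - (1+beta)*y_i/H`) can be compared against autograd on the PyTorch formulation. A small, illustrative sketch (random inputs, one sample, `log_like=False`; all tensors below are made-up stand-ins for the check):

```python
import torch

beta, EPS = 0.3, 1e-10
pred = torch.rand(1, 1, 8, 8, requires_grad=True)   # random "prediction" in [0, 1]
target = (torch.rand(1, 1, 8, 8) > 0.5).float()     # random binary "ground truth"

# forward pass, mirroring pytorch/floss.py: loss = 1 - (1 + beta) * TP / H
TP = (pred * target).sum()
H = beta * target.sum() + pred.sum()
loss = 1 - (1 + beta) * TP / (H + EPS)
loss.backward()

# analytic gradient from the Caffe layer's backward() docstring
with torch.no_grad():
    grad = (1 + beta) * TP / (H + EPS) ** 2 - (1 + beta) * target / (H + EPS)
print(torch.allclose(pred.grad, grad, atol=1e-6))  # expected: True
```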