├── BibTex.md
├── README.md
└── img
    └── definition.png

/BibTex.md:
--------------------------------------------------------------------------------
@article{suo2023text,
  title={Text Augmented Spatial-aware Zero-shot Referring Image Segmentation},
  author={Suo, Yucheng and Zhu, Linchao and Yang, Yi},
  journal={arXiv preprint arXiv:2310.18049},
  year={2023}
}

@inproceedings{shi2023unsupervised,
  title={Unsupervised Domain Adaptation for Referring Semantic Segmentation},
  author={Shi, Haonan and Pan, Wenwen and Zhao, Zhou and Zhang, Mingmin and Wu, Fei},
  booktitle={Proceedings of the 31st ACM International Conference on Multimedia},
  pages={5807--5818},
  year={2023}
}

@inproceedings{liu2023caris,
  title={CARIS: Context-Aware Referring Image Segmentation},
  author={Liu, Sun-Ao and Zhang, Yiheng and Qiu, Zhaofan and Xie, Hongtao and Zhang, Yongdong and Yao, Ting},
  booktitle={Proceedings of the 31st ACM International Conference on Multimedia},
  pages={779--788},
  year={2023}
}

@inproceedings{kim2023shatter,
  title={Shatter and Gather: Learning Referring Image Segmentation with Text Supervision},
  author={Kim, Dongwon and Kim, Namyup and Lan, Cuiling and Kwak, Suha},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={15547--15557},
  year={2023}
}

@inproceedings{wu2023advancing,
  title={Advancing referring expression segmentation beyond single image},
  author={Wu, Yixuan and Zhang, Zhao and Xie, Chi and Zhu, Feng and Zhao, Rui},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={2628--2638},
  year={2023}
}

@inproceedings{xu2023bridging,
  title={Bridging vision and language encoders: Parameter-efficient tuning for referring image segmentation},
  author={Xu, Zunnan and Chen, Zhihong and Zhang, Yong and Song, Yibing and Wan, Xiang and Li, Guanbin},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={17503--17512},
  year={2023}
}

@inproceedings{liu2023referring,
  title={Referring image segmentation using text supervision},
  author={Liu, Fang and Liu, Yuhao and Kong, Yuqiu and Xu, Ke and Zhang, Lihe and Yin, Baocai and Hancke, Gerhard and Lau, Rynson},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={22124--22134},
  year={2023}
}

@inproceedings{hu2023beyond,
  title={Beyond One-to-One: Rethinking the Referring Image Segmentation},
  author={Hu, Yutao and Wang, Qixiong and Shao, Wenqi and Xie, Enze and Li, Zhenguo and Han, Jungong and Luo, Ping},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={4067--4077},
  year={2023}
}

@article{ding2023bilateral,
  title={Bilateral Knowledge Interaction Network for Referring Image Segmentation},
  author={Ding, Haixin and Zhang, Shengchuan and Wu, Qiong and Yu, Songlin and Hu, Jie and Cao, Liujuan and Ji, Rongrong},
  journal={IEEE Transactions on Multimedia},
  year={2023},
  publisher={IEEE}
}

@inproceedings{ouyang23slvit,
  title={Slvit: Scale-wise language-guided vision transformer for referring image segmentation},
  author={Ouyang, Shuyi and Wang, Hongyi and Xie, Shiao and Niu, Ziwei and Tong, Ruofeng and Chen, Yen-Wei and Lin, Lanfen},
  booktitle={Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, IJCAI-23},
  pages={1294--1302},
  year={2023}
}

@article{cheng2023wico,
  title={WiCo: Win-win Cooperation of Bottom-up and Top-down Referring Image Segmentation},
  author={Cheng, Zesen and Jin, Peng and Li, Hao and Li, Kehan and Li, Siheng and Ji, Xiangyang and Liu, Chang and Chen, Jie},
  journal={arXiv preprint arXiv:2306.10750},
  year={2023}
}

@article{wang2023cm,
  title={CM-MaskSD: Cross-Modality Masked Self-Distillation for Referring Image Segmentation},
  author={Wang, Wenxuan and Liu, Jing and He, Xingjian and Zhang, Yisi and Chen, Chen and Shen, Jiachen and Zhang, Yan and Li, Jiangyun},
  journal={arXiv preprint arXiv:2305.11481},
  year={2023}
}

@inproceedings{tang2023contrastive,
  title={Contrastive Grouping with Transformer for Referring Image Segmentation},
  author={Tang, Jiajin and Zheng, Ge and Shi, Cheng and Yang, Sibei},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={23570--23580},
  year={2023}
}

@inproceedings{qu2023learning,
  title={Learning to Segment Every Referring Object Point by Point},
  author={Qu, Mengxue and Wu, Yu and Wei, Yunchao and Liu, Wu and Liang, Xiaodan and Zhao, Yao},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={3021--3030},
  year={2023}
}

@inproceedings{yu2023zero,
  title={Zero-shot Referring Image Segmentation with Global-Local Context Features},
  author={Yu, Seonghoon and Seo, Paul Hongsuck and Son, Jeany},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={19456--19465},
  year={2023}
}

@inproceedings{xu2023meta,
  title={Meta compositional referring expression segmentation},
  author={Xu, Li and Huang, Mark He and Shang, Xindi and Yuan, Zehuan and Sun, Ying and Liu, Jun},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={19478--19487},
  year={2023}
}

@inproceedings{liu2023polyformer,
  title={PolyFormer: Referring image segmentation as sequential polygon generation},
  author={Liu, Jiang and Ding, Hui and Cai, Zhaowei and Zhang, Yuting and Satzoda, Ravi Kumar and Mahadevan, Vijay and Manmatha, R},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={18653--18663},
  year={2023}
}

@inproceedings{liu2023gres,
  title={GRES: Generalized referring expression segmentation},
  author={Liu, Chang and Ding, Henghui and Jiang, Xudong},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={23592--23601},
  year={2023}
}

@article{yang2023semantics,
  title={Semantics-Aware Dynamic Localization and Refinement for Referring Image Segmentation},
  author={Yang, Zhao and Wang, Jiaqi and Tang, Yansong and Chen, Kai and Zhao, Hengshuang and Torr, Philip HS},
  journal={arXiv preprint arXiv:2303.06345},
  year={2023}
}

@article{chen2022position,
  title={Position-aware contrastive alignment for referring image segmentation},
  author={Chen, Bo and Hu, Zhiwei and Ji, Zhilong and Bai, Jinfeng and Zuo, Wangmeng},
  journal={arXiv preprint arXiv:2212.13419},
  year={2022}
}

@article{zhang2022coupalign,
  title={Coupalign: Coupling word-pixel with sentence-mask alignments for referring image segmentation},
  author={Zhang, Zicheng and Zhu, Yi and Liu, Jianzhuang and Liang, Xiaodan and Ke, Wei},
  journal={Advances in Neural Information Processing Systems},
  volume={35},
  pages={14729--14742},
  year={2022}
}

@article{shang2022cross,
  title={Cross-Modal Recurrent Semantic Comprehension for Referring Image Segmentation},
  author={Shang, Chao and Li, Hongliang and Qiu, Heqian and Wu, Qingbo and Meng, Fanman and Zhao, Taijin and Ngan, King Ngi},
  journal={IEEE Transactions on Circuits and Systems for Video Technology},
  year={2022},
  publisher={IEEE}
}

@article{liu2023local,
  title={Local-global coordination with transformers for referring image segmentation},
  author={Liu, Fang and Kong, Yuqiu and Zhang, Lihe and Feng, Guang and Yin, Baocai},
  journal={Neurocomputing},
  volume={522},
  pages={39--52},
  year={2023},
  publisher={Elsevier}
}

@article{huang2022unified,
  title={A unified mutual supervision framework for referring expression segmentation and generation},
  author={Huang, Shijia and Li, Feng and Zhang, Hao and Liu, Shilong and Zhang, Lei and Wang, Liwei},
  journal={arXiv preprint arXiv:2211.07919},
  year={2022}
}

@article{ding2022vlt,
  title={Vlt: Vision-language transformer and query generation for referring segmentation},
  author={Ding, Henghui and Liu, Chang and Wang, Suchen and Jiang, Xudong},
  journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year={2022},
  publisher={IEEE}
}

@article{feng2022learning,
  title={Learning from box annotations for referring image segmentation},
  author={Feng, Guang and Zhang, Lihe and Hu, Zhiwei and Lu, Huchuan},
  journal={IEEE Transactions on Neural Networks and Learning Systems},
  year={2022},
  publisher={IEEE}
}

@article{liu2022instance,
  title={Instance-specific feature propagation for referring segmentation},
  author={Liu, Chang and Jiang, Xudong and Ding, Henghui},
  journal={IEEE Transactions on Multimedia},
  year={2022},
  publisher={IEEE}
}

@inproceedings{zhu2022seqtr,
  title={Seqtr: A simple yet universal network for visual grounding},
  author={Zhu, Chaoyang and Zhou, Yiyi and Shen, Yunhang and Luo, Gen and Pan, Xingjia and Lin, Mingbao and Chen, Chao and Cao, Liujuan and Sun, Xiaoshuai and Ji, Rongrong},
  booktitle={European Conference on Computer Vision},
  pages={598--615},
  year={2022},
  organization={Springer}
}

@inproceedings{yang2022lavt,
  title={Lavt: Language-aware vision transformer for referring image segmentation},
  author={Yang, Zhao and Wang, Jiaqi and Tang, Yansong and Chen, Kai and Zhao, Hengshuang and Torr, Philip HS},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={18155--18165},
  year={2022}
}

@inproceedings{wang2022cris,
  title={Cris: Clip-driven referring image segmentation},
  author={Wang, Zhaoqing and Lu, Yu and Li, Qiang and Tao, Xunqiang and Guo, Yandong and Gong, Mingming and Liu, Tongliang},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={11686--11695},
  year={2022}
}

@inproceedings{kim2022restr,
  title={Restr: Convolution-free referring image segmentation using transformers},
  author={Kim, Namyup and Kim, Dongwon and Lan, Cuiling and Zeng, Wenjun and Kwak, Suha},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={18145--18154},
  year={2022}
}

@article{feng2021bidirectional,
  title={Bidirectional relationship inferring network for referring image localization and segmentation},
  author={Feng, Guang and Hu, Zhiwei and Zhang, Lihe and Sun, Jiayu and Lu, Huchuan},
  journal={IEEE Transactions on Neural Networks and Learning Systems},
  year={2021},
  publisher={IEEE}
}

@article{li2021referring,
  title={Referring transformer: A one-step approach to multi-task visual grounding},
  author={Li, Muchen and Sigal, Leonid},
  journal={Advances in Neural Information Processing Systems},
  volume={34},
  pages={19652--19664},
  year={2021}
}

@inproceedings{jiao2021two,
  title={Two-stage visual cues enhancement network for referring image segmentation},
  author={Jiao, Yang and Jie, Zequn and Luo, Weixin and Chen, Jingjing and Jiang, Yu-Gang and Wei, Xiaolin and Ma, Lin},
  booktitle={Proceedings of the 29th ACM International Conference on Multimedia},
  pages={1331--1340},
  year={2021}
}

@inproceedings{ding2021vision,
  title={Vision-language transformer and query generation for referring segmentation},
  author={Ding, Henghui and Liu, Chang and Wang, Suchen and Jiang, Xudong},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={16321--16330},
  year={2021}
}

@inproceedings{kamath2021mdetr,
  title={Mdetr-modulated detection for end-to-end multi-modal understanding},
  author={Kamath, Aishwarya and Singh, Mannat and LeCun, Yann and Synnaeve, Gabriel and Misra, Ishan and Carion, Nicolas},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={1780--1790},
  year={2021}
}

@inproceedings{feng2021encoder,
  title={Encoder fusion network with co-attention embedding for referring image segmentation},
  author={Feng, Guang and Hu, Zhiwei and Zhang, Lihe and Lu, Huchuan},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={15506--15515},
  year={2021}
}

@inproceedings{yang2021bottom,
  title={Bottom-up shift and reasoning for referring image segmentation},
  author={Yang, Sibei and Xia, Meng and Li, Guanbin and Zhou, Hong-Yu and Yu, Yizhou},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={11266--11275},
  year={2021}
}

@inproceedings{jing2021locate,
  title={Locate then segment: A strong pipeline for referring image segmentation},
  author={Jing, Ya and Kong, Tao and Wang, Wei and Wang, Liang and Li, Lei and Tan, Tieniu},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={9858--9867},
  year={2021}
}

@inproceedings{luo2020cascade,
  title={Cascade grouped attention network for referring expression segmentation},
  author={Luo, Gen and Zhou, Yiyi and Ji, Rongrong and Sun, Xiaoshuai and Su, Jinsong and Lin, Chia-Wen and Tian, Qi},
  booktitle={Proceedings of the 28th ACM International Conference on Multimedia},
  pages={1274--1282},
  year={2020}
}

@inproceedings{hui2020linguistic,
  title={Linguistic structure guided context modeling for referring image segmentation},
  author={Hui, Tianrui and Liu, Si and Huang, Shaofei and Li, Guanbin and Yu, Sansi and Zhang, Faxi and Han, Jizhong},
  booktitle={Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part X 16},
  pages={59--75},
  year={2020},
  organization={Springer}
}

@inproceedings{huang2020referring,
  title={Referring image segmentation via cross-modal progressive comprehension},
  author={Huang, Shaofei and Hui, Tianrui and Liu, Si and Li, Guanbin and Wei, Yunchao and Han, Jizhong and Liu, Luoqi and Li, Bo},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={10488--10497},
  year={2020}
}

@inproceedings{hu2020bi,
  title={Bi-directional relationship inferring network for referring image segmentation},
  author={Hu, Zhiwei and Feng, Guang and Sun, Jiayu and Zhang, Lihe and Lu, Huchuan},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={4424--4433},
  year={2020}
}

@inproceedings{wu2020phrasecut,
  title={Phrasecut: Language-based image segmentation in the wild},
  author={Wu, Chenyun and Lin, Zhe and Cohen, Scott and Bui, Trung and Maji, Subhransu},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={10216--10225},
  year={2020}
}

@inproceedings{luo2020multi,
  title={Multi-task collaborative network for joint referring expression comprehension and segmentation},
  author={Luo, Gen and Zhou, Yiyi and Sun, Xiaoshuai and Cao, Liujuan and Wu, Chenglin and Deng, Cheng and Ji, Rongrong},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={10034--10043},
  year={2020}
}

@article{ye2020dual,
  title={Dual convolutional lstm network for referring image segmentation},
  author={Ye, Linwei and Liu, Zhi and Wang, Yang},
  journal={IEEE Transactions on Multimedia},
  volume={22},
  number={12},
  pages={3224--3235},
  year={2020},
  publisher={IEEE}
}

@article{chen2019referring,
  title={Referring expression object segmentation with caption-aware consistency},
  author={Chen, Yi-Wen and Tsai, Yi-Hsuan and Wang, Tiantian and Lin, Yen-Yu and Yang, Ming-Hsuan},
  journal={arXiv preprint arXiv:1910.04748},
  year={2019}
}

@inproceedings{chen2019see,
  title={See-through-text grouping for referring image segmentation},
  author={Chen, Ding-Jie and Jia, Songhao and Lo, Yi-Chen and Chen, Hwann-Tzong and Liu, Tyng-Luh},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={7454--7463},
  year={2019}
}

@inproceedings{ye2019cross,
  title={Cross-modal self-attention network for referring image segmentation},
  author={Ye, Linwei and Rochan, Mrigank and Liu, Zhi and Wang, Yang},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={10502--10511},
  year={2019}
}

@inproceedings{shi2018key,
  title={Key-word-aware network for referring expression image segmentation},
  author={Shi, Hengcan and Li, Hongliang and Meng, Fanman and Wu, Qingbo},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  pages={38--54},
  year={2018}
}

@inproceedings{margffoy2018dynamic,
  title={Dynamic multimodal instance segmentation guided by natural language queries},
  author={Margffoy-Tuay, Edgar and P{\'e}rez, Juan C and Botero, Emilio and Arbel{\'a}ez, Pablo},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  pages={630--645},
  year={2018}
}

@inproceedings{li2018referring,
  title={Referring image segmentation via recurrent refinement networks},
  author={Li, Ruiyu and Li, Kaican and Kuo, Yi-Chun and Shu, Michelle and Qi, Xiaojuan and Shen, Xiaoyong and Jia, Jiaya},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5745--5753},
  year={2018}
}

@inproceedings{yu2018mattnet,
  title={Mattnet: Modular attention network for referring expression comprehension},
  author={Yu, Licheng and Lin, Zhe and Shen, Xiaohui and Yang, Jimei and Lu, Xin and Bansal, Mohit and Berg, Tamara L},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={1307--1315},
  year={2018}
}

@inproceedings{liu2017recurrent,
  title={Recurrent multimodal interaction for referring image segmentation},
  author={Liu, Chenxi and Lin, Zhe and Shen, Xiaohui and Yang, Jimei and Lu, Xin and Yuille, Alan},
  booktitle={Proceedings of the IEEE International Conference on Computer Vision},
  pages={1271--1280},
  year={2017}
}

@inproceedings{hu2016segmentation,
  title={Segmentation from natural language expressions},
  author={Hu, Ronghang and Rohrbach, Marcus and Darrell, Trevor},
  booktitle={Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part I 14},
  pages={108--124},
  year={2016},
  organization={Springer}
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# **RIS-Learning-List**

## **Introduction**

This repository introduces the **Referring Image Segmentation** task and collects related works.

## **Content**

- [Definition](#Definition)
- [Datasets](#Datasets)
- [Evaluation Metric](#Evaluation-Metric)
- [Related Works](#Related-Works)
- [Performance](#Performance)
- [Reference](#Reference)

## **Definition**

**Referring Image Segmentation (RIS)** is a challenging problem at the intersection of computer vision and natural language processing. Given an image and a natural language expression, the goal is to produce a segmentation mask in the image corresponding to the objects referred to by the natural language expression.

![](https://github.com/Huntersxsx/RIS-Learning-List/blob/main/img/definition.png)
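In code, the task contract is simply "image + expression in, binary mask out". The snippet below is a minimal illustrative sketch of that interface, not any particular paper's method; `model` is a hypothetical stand-in for an RIS network that returns per-pixel logits:

```python
import numpy as np

def refer_segment(model, image: np.ndarray, expression: str) -> np.ndarray:
    """Run a (hypothetical) RIS model on one image-expression pair.

    image:      H x W x 3 RGB array
    expression: e.g. "the woman in the red coat on the left"
    returns:    H x W binary mask, 1 on pixels of the referred object
    """
    logits = model(image, expression)      # per-pixel scores, shape H x W
    return (logits > 0).astype(np.uint8)   # sigmoid(logit) > 0.5 iff logit > 0
```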
## **Datasets**

- [**RefCOCO**](https://arxiv.org/pdf/1608.00272): It contains **19,994 images** with **142,210 referring expressions** for **50,000 objects**, collected from MSCOCO via a two-player game. The dataset is split into 120,624 train, 10,834 validation, 5,657 test A, and 5,095 test B samples.
- [**RefCOCO+**](https://arxiv.org/pdf/1608.00272): It contains **141,564 language expressions** with **49,856 objects** in **19,992 images**. The dataset is split into train, validation, test A, and test B with 120,624, 10,758, 5,726, and 4,889 samples, respectively. Compared with the RefCOCO dataset, some kinds of **absolute-location words are excluded** from RefCOCO+.
- [**G-Ref**](https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Mao_Generation_and_Comprehension_CVPR_2016_paper.pdf): It includes **104,560 referring expressions** for **54,822 objects** in **26,711 images**.
- Expressions in RefCOCO and RefCOCO+ are very succinct (3.5 words on average). In contrast, **expressions in G-Ref are more complex** (8.4 words on average). Conversely, **RefCOCO and RefCOCO+ tend to have more objects of the same category per image** (3.9 on average) compared to G-Ref (1.6 on average).
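All three datasets can be browsed with Licheng Yu's [refer](https://github.com/lichengunc/refer) toolkit. The sketch below assumes its annotation files are unpacked under `data/` as described in that repository; the dataset and split names follow its conventions (treat the exact paths as assumptions):

```python
from refer import REFER

# RefCOCO / RefCOCO+ use splitBy='unc'; G-Ref uses 'umd' or 'google'
refer = REFER('data', dataset='refcoco', splitBy='unc')

ref_ids = refer.getRefIds(split='testA')
print(len(ref_ids), 'referred objects in testA')

ref = refer.loadRefs(ref_ids[0])[0]
print([s['sent'] for s in ref['sentences']])  # expressions for this object
mask = refer.getMask(ref)['mask']             # H x W binary ground-truth mask
```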
## **Evaluation Metric**

- **overall IoU (oIoU):** the total intersection area divided by the total union area, where both areas are accumulated over all test samples (each test sample is an image paired with a referring expression).
- **mean IoU (mIoU):** the IoU between the prediction and the ground truth, averaged across all test samples.
- **Precision@X:** the percentage of test samples with an IoU score higher than the threshold X ∈ {0.5, 0.6, 0.7, 0.8, 0.9}.
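The first two metrics differ in where the division happens: overall IoU accumulates intersections and unions over the whole test set before dividing (so large objects dominate), while mean IoU averages per-sample IoUs (so every sample counts equally). A minimal NumPy sketch of all three metrics, following the definitions above:

```python
import numpy as np

def ris_metrics(preds, gts, thresholds=(0.5, 0.6, 0.7, 0.8, 0.9)):
    """preds, gts: lists of H x W binary masks, one pair per test sample."""
    inter_sum, union_sum, ious = 0, 0, []
    for p, g in zip(preds, gts):
        p, g = p.astype(bool), g.astype(bool)
        inter = np.logical_and(p, g).sum()
        union = np.logical_or(p, g).sum()
        inter_sum += inter
        union_sum += union
        ious.append(inter / union if union > 0 else 1.0)  # empty pred vs. empty gt
    ious = np.array(ious)
    return {
        'oIoU': inter_sum / union_sum,  # accumulate first, divide once
        'mIoU': float(ious.mean()),     # divide per sample, then average
        **{f'Prec@{t}': float((ious > t).mean()) for t in thresholds},
    }
```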
## **Related Works**

- **MagNet:** [Mask Grounding for Referring Image Segmentation](https://arxiv.org/pdf/2312.12198.pdf). *in arXiv 2023*.
- **MRES:** [Unveiling Parts Beyond Objects: Towards Finer-Granularity Referring Expression Segmentation](https://arxiv.org/pdf/2312.08007.pdf). *in arXiv 2023*. [code](https://github.com/Rubics-Xuan/MRES)
- [Towards Generalizable Referring Image Segmentation via Target Prompt and Visual Coherence](https://arxiv.org/pdf/2312.00452.pdf). *in arXiv 2023*.
- **BTMAE:** [Synchronizing Vision and Language: Bidirectional Token-Masking AutoEncoder for Referring Image Segmentation](https://arxiv.org/pdf/2311.17952.pdf). *in arXiv 2023*.
- **MARIS:** [MARIS: Referring Image Segmentation via Mutual-Aware Attention Features](https://arxiv.org/pdf/2311.15727.pdf). *in arXiv 2023*.
- **Omni-RES:** [Towards Omni-supervised Referring Expression Segmentation](https://arxiv.org/pdf/2311.00397.pdf). *in arXiv 2023*. [code](https://github.com/nineblu/omni-res)
- **JMCELN:** [Referring Image Segmentation via Joint Mask Contextual Embedding Learning and Progressive Alignment Network](https://aclanthology.org/2023.emnlp-main.481/). *in EMNLP 2023*. [code](https://github.com/toyottttttt/referring-segmentation)
- **TAS:** [Text Augmented Spatial-aware Zero-shot Referring Image Segmentation](https://arxiv.org/pdf/2310.18049.pdf). *in EMNLP 2023 Findings*.
- **CVMN:** [Unsupervised Domain Adaptation for Referring Semantic Segmentation](https://dl.acm.org/doi/abs/10.1145/3581783.3611879). *in ACM MM 2023*. [code](https://github.com/asudahkzj/CVMN)
- **CARIS:** [CARIS: Context-Aware Referring Image Segmentation](https://dl.acm.org/doi/abs/10.1145/3581783.3612117). *in ACM MM 2023*. [code](https://github.com/lsa1997/CARIS)
- **Shatter and Gather:** [Shatter and Gather: Learning Referring Image Segmentation with Text Supervision](https://arxiv.org/pdf/2308.15512v1.pdf). *in ICCV 2023*.
- **Group-RES:** [Advancing Referring Expression Segmentation Beyond Single Image](https://arxiv.org/pdf/2305.12452.pdf). *in ICCV 2023*. [code](https://github.com/yixuan730/group-res)
- **ETRIS:** [Bridging Vision and Language Encoders: Parameter-Efficient Tuning for Referring Image Segmentation](https://arxiv.org/pdf/2307.11545.pdf). *in ICCV 2023*. [code](https://github.com/kkakkkka/ETRIS)
- **TRIS:** [Referring Image Segmentation Using Text Supervision](https://arxiv.org/pdf/2308.14575.pdf). *in ICCV 2023*. [code](https://github.com/fawnliu/TRIS)
- **RIS-DMMI:** [Beyond One-to-One: Rethinking the Referring Image Segmentation](https://arxiv.org/pdf/2308.13853.pdf). *in ICCV 2023*. [code](https://github.com/toggle1995/RIS-DMMI)
- **BKINet:** [Bilateral Knowledge Interaction Network for Referring Image Segmentation](https://ieeexplore.ieee.org/abstract/document/10227590). *in TMM 2023*. [code](https://github.com/dhding/BKINet)
- **SLViT:** [SLViT: Scale-Wise Language-Guided Vision Transformer for Referring Image Segmentation](https://www.ijcai.org/proceedings/2023/0144.pdf). *in IJCAI 2023*. [code](https://github.com/NaturalKnight/SLViT)
- **WiCo:** [WiCo: Win-win Cooperation of Bottom-up and Top-down Referring Image Segmentation](https://www.ijcai.org/proceedings/2023/0071.pdf). *in IJCAI 2023*.
- **CM-MaskSD:** [CM-MaskSD: Cross-Modality Masked Self-Distillation for Referring Image Segmentation](https://arxiv.org/pdf/2305.11481.pdf). *in arXiv 2023*.
- **CGFormer:** [Contrastive Grouping with Transformer for Referring Image Segmentation](https://openaccess.thecvf.com/content/CVPR2023/papers/Tang_Contrastive_Grouping_With_Transformer_for_Referring_Image_Segmentation_CVPR_2023_paper.pdf). *in CVPR 2023*. [code](https://github.com/Toneyaya/CGFormer)
- **Partial-RES:** [Learning to Segment Every Referring Object Point by Point](https://openaccess.thecvf.com/content/CVPR2023/papers/Qu_Learning_To_Segment_Every_Referring_Object_Point_by_Point_CVPR_2023_paper.pdf). *in CVPR 2023*. [code](https://github.com/qumengxue/Partial-RES.git)
- **Zero-shot RIS:** [Zero-shot Referring Image Segmentation with Global-Local Context Features](https://openaccess.thecvf.com/content/CVPR2023/papers/Yu_Zero-Shot_Referring_Image_Segmentation_With_Global-Local_Context_Features_CVPR_2023_paper.pdf). *in CVPR 2023*. [code](https://github.com/Seonghoon-Yu/Zero-shot-RIS)
- **MCRES:** [Meta Compositional Referring Expression Segmentation](https://openaccess.thecvf.com/content/CVPR2023/papers/Xu_Meta_Compositional_Referring_Expression_Segmentation_CVPR_2023_paper.pdf). *in CVPR 2023*.
- **PolyFormer:** [PolyFormer: Referring Image Segmentation as Sequential Polygon Generation](https://arxiv.org/pdf/2302.07387.pdf). *in CVPR 2023*. [project](https://polyformer.github.io/)
- **GRES:** [Generalized Referring Expression Segmentation](https://openaccess.thecvf.com/content/CVPR2023/papers/Liu_GRES_Generalized_Referring_Expression_Segmentation_CVPR_2023_paper.pdf). *in CVPR 2023*. [project](https://henghuiding.github.io/GRES)
- **SADLR:** [Semantics-Aware Dynamic Localization and Refinement for Referring Image Segmentation](https://arxiv.org/pdf/2303.06345.pdf). *in AAAI 2023*.
- **PCAN:** [Position-Aware Contrastive Alignment for Referring Image Segmentation](https://arxiv.org/pdf/2212.13419.pdf). *in arXiv 2022*.
- **CoupAlign:** [CoupAlign: Coupling Word-Pixel with Sentence-Mask Alignments for Referring Image Segmentation](https://arxiv.org/pdf/2212.01769.pdf). *in NeurIPS 2022*. [code](https://gitee.com/mindspore/models/tree/master/research/cv/CoupAlign)
- **CRSCNet:** [Cross-Modal Recurrent Semantic Comprehension for Referring Image Segmentation](https://ieeexplore.ieee.org/abstract/document/9998537). *in TCSVT 2022*.
- **LGCT:** [Local-global coordination with transformers for referring image segmentation](https://www.sciencedirect.com/science/article/pii/S0925231222015119). *in Neurocomputing 2022*.
- **RES®:** [A Unified Mutual Supervision Framework for Referring Expression Segmentation and Generation](https://arxiv.org/pdf/2211.07919.pdf). *in arXiv 2022*.
- **VLT:** [VLT: Vision-Language Transformer and Query Generation for Referring Segmentation](https://arxiv.org/pdf/2210.15871.pdf). *in TPAMI 2022*. [code](https://github.com/henghuiding/Vision-Language-Transformer)
- [Learning From Box Annotations for Referring Image Segmentation](https://ieeexplore.ieee.org/abstract/document/9875225). *in TNNLS 2022*. [code](https://github.com/fengguang94/Weakly-Supervised-RIS)
- [Instance-Specific Feature Propagation for Referring Segmentation](https://ieeexplore.ieee.org/abstract/document/9745353). *in TMM 2022*.
- **SeqTR:** [SeqTR: A Simple Yet Universal Network for Visual Grounding](https://arxiv.org/pdf/2203.16265.pdf). *in ECCV 2022*. [code](https://github.com/sean-zhuh/SeqTR)
- **LAVT:** [LAVT: Language-Aware Vision Transformer for Referring Image Segmentation](https://arxiv.org/abs/2112.02244). *in CVPR 2022*. [code](https://github.com/yz93/LAVT-RIS)
- **CRIS:** [CRIS: CLIP-Driven Referring Image Segmentation](https://arxiv.org/abs/2111.15174). *in CVPR 2022*. [code](https://github.com/DerrickWang005/CRIS.pytorch)
- **ReSTR:** [ReSTR: Convolution-free Referring Image Segmentation Using Transformers](https://www.microsoft.com/en-us/research/uploads/prod/2022/03/01404.pdf). *in CVPR 2022*. [project](http://cvlab.postech.ac.kr/research/restr/)
- [Bidirectional relationship inferring network for referring image localization and segmentation](https://ieeexplore.ieee.org/document/9526878). *in TNNLS 2021*.
- **RefTR:** [Referring Transformer: A One-step Approach to Multi-task Visual Grounding](https://openreview.net/pdf?id=J64lDCrYGi). *in NeurIPS 2021*.
- **TV-Net:** [Two-stage Visual Cues Enhancement Network for Referring Image Segmentation](https://arxiv.org/abs/2110.04435). *in ACM MM 2021*. [code](https://github.com/sxjyjay/tv-net)
- **VLT:** [Vision-Language Transformer and Query Generation for Referring Segmentation](https://arxiv.org/abs/2108.05565). *in ICCV 2021*. [code](https://github.com/henghuiding/Vision-Language-Transformer)
- **MDETR:** [MDETR - Modulated Detection for End-to-End Multi-Modal Understanding](https://arxiv.org/abs/2104.12763). *in ICCV 2021*. [code](https://github.com/ashkamath/mdetr)
- **EFNet:** [Encoder Fusion Network with Co-Attention Embedding for Referring Image Segmentation](https://openaccess.thecvf.com/content/CVPR2021/papers/Feng_Encoder_Fusion_Network_With_Co-Attention_Embedding_for_Referring_Image_Segmentation_CVPR_2021_paper.pdf). *in CVPR 2021*. [code](https://github.com/fengguang94/CEFNet)
- **BUSNet:** [Bottom-Up Shift and Reasoning for Referring Image Segmentation](https://openaccess.thecvf.com/content/CVPR2021/papers/Yang_Bottom-Up_Shift_and_Reasoning_for_Referring_Image_Segmentation_CVPR_2021_paper.pdf). *in CVPR 2021*. [code](https://github.com/incredibleXM/BUSNet)
- **LTS:** [Locate then Segment: A Strong Pipeline for Referring Image Segmentation](https://openaccess.thecvf.com/content/CVPR2021/papers/Jing_Locate_Then_Segment_A_Strong_Pipeline_for_Referring_Image_Segmentation_CVPR_2021_paper.pdf). *in CVPR 2021*.
- **CGAN:** [Cascade Grouped Attention Network for Referring Expression Segmentation](https://dl.acm.org/doi/abs/10.1145/3394171.3414006). *in ACM MM 2020*.
- **LSCM:** [Linguistic Structure Guided Context Modeling for Referring Image Segmentation](http://colalab.org/media/paper/Linguistic_Structure_Guided_Context_Modeling_for_Referring_Image_Segmentation.pdf). *in ECCV 2020*.
- **CMPC-Refseg:** [Referring Image Segmentation via Cross-Modal Progressive Comprehension](http://openaccess.thecvf.com/content_CVPR_2020/papers/Huang_Referring_Image_Segmentation_via_Cross-Modal_Progressive_Comprehension_CVPR_2020_paper.pdf). *in CVPR 2020*. [code](https://github.com/spyflying/CMPC-Refseg)
- **BRINet:** [Bi-directional Relationship Inferring Network for Referring Image Segmentation](http://openaccess.thecvf.com/content_CVPR_2020/papers/Hu_Bi-Directional_Relationship_Inferring_Network_for_Referring_Image_Segmentation_CVPR_2020_paper.pdf). *in CVPR 2020*. [code](https://github.com/fengguang94/CVPR2020-BRINet)
- **PhraseCut:** [PhraseCut: Language-based Image Segmentation in the Wild](https://people.cs.umass.edu/~smaji/papers/phrasecut+supp-cvpr20.pdf). *in CVPR 2020*. [code](https://github.com/ChenyunWu/PhraseCutDataset)
- **MCN:** [Multi-task Collaborative Network for Joint Referring Expression Comprehension and Segmentation](https://arxiv.org/abs/2003.08813). *in CVPR 2020*. [code](https://github.com/luogen1996/MCN)
- [Dual Convolutional LSTM Network for Referring Image Segmentation](https://arxiv.org/abs/2001.11561). *in TMM 2020*.
- **lang2seg:** [Referring Expression Object Segmentation with Caption-Aware Consistency](https://arxiv.org/pdf/1910.04748.pdf). *in BMVC 2019*. [code](https://github.com/wenz116/lang2seg)
- **STEP:** [See-Through-Text Grouping for Referring Image Segmentation](http://openaccess.thecvf.com/content_ICCV_2019/papers/Chen_See-Through-Text_Grouping_for_Referring_Image_Segmentation_ICCV_2019_paper.pdf). *in ICCV 2019*.
- **CMSA-Net:** [Cross-Modal Self-Attention Network for Referring Image Segmentation](https://arxiv.org/pdf/1904.04745.pdf). *in CVPR 2019*. [code](https://github.com/lwye/CMSA-Net)
- **KWA:** [Key-Word-Aware Network for Referring Expression Image Segmentation](http://openaccess.thecvf.com/content_ECCV_2018/papers/Hengcan_Shi_Key-Word-Aware_Network_for_ECCV_2018_paper.pdf). *in ECCV 2018*. [code](https://github.com/shihengcan/key-word-aware-network-pycaffe)
- **DMN:** [Dynamic Multimodal Instance Segmentation Guided by Natural Language Queries](http://openaccess.thecvf.com/content_ECCV_2018/papers/Edgar_Margffoy-Tuay_Dynamic_Multimodal_Instance_ECCV_2018_paper.pdf). *in ECCV 2018*. [code](https://github.com/BCV-Uniandes/DMS)
- **RRN:** [Referring Image Segmentation via Recurrent Refinement Networks](http://openaccess.thecvf.com/content_cvpr_2018/papers/Li_Referring_Image_Segmentation_CVPR_2018_paper.pdf). *in CVPR 2018*. [code](https://github.com/liruiyu/referseg_rrn)
- **MAttNet:** [MAttNet: Modular Attention Network for Referring Expression Comprehension](http://openaccess.thecvf.com/content_cvpr_2018/papers/Yu_MAttNet_Modular_Attention_CVPR_2018_paper.pdf). *in CVPR 2018*. [code](https://github.com/lichengunc/MAttNet)
- **RMI:** [Recurrent Multimodal Interaction for Referring Image Segmentation](http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Recurrent_Multimodal_Interaction_ICCV_2017_paper.pdf). *in ICCV 2017*. [code](https://github.com/chenxi116/TF-phrasecut-public)
- **LSTM-CNN:** [Segmentation from natural language expressions](https://arxiv.org/pdf/1603.06180.pdf). *in ECCV 2016*. [code](https://github.com/ronghanghu/text_objseg)

## **Performance**


## **Reference**

[MarkMoHR / Awesome-Referring-Image-Segmentation](https://github.com/MarkMoHR/Awesome-Referring-Image-Segmentation)
--------------------------------------------------------------------------------
/img/definition.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Huntersxsx/RIS-Learning-List/dfc63f149f4516087ec316228d01803d525b7347/img/definition.png
--------------------------------------------------------------------------------