├── BibTex.md
├── README.md
└── img
    └── definition.png

/BibTex.md:
--------------------------------------------------------------------------------
@article{suo2023text,
  title={Text Augmented Spatial-aware Zero-shot Referring Image Segmentation},
  author={Suo, Yucheng and Zhu, Linchao and Yang, Yi},
  journal={arXiv preprint arXiv:2310.18049},
  year={2023}
}

@inproceedings{shi2023unsupervised,
  title={Unsupervised Domain Adaptation for Referring Semantic Segmentation},
  author={Shi, Haonan and Pan, Wenwen and Zhao, Zhou and Zhang, Mingmin and Wu, Fei},
  booktitle={Proceedings of the 31st ACM International Conference on Multimedia},
  pages={5807--5818},
  year={2023}
}

@inproceedings{liu2023caris,
  title={CARIS: Context-Aware Referring Image Segmentation},
  author={Liu, Sun-Ao and Zhang, Yiheng and Qiu, Zhaofan and Xie, Hongtao and Zhang, Yongdong and Yao, Ting},
  booktitle={Proceedings of the 31st ACM International Conference on Multimedia},
  pages={779--788},
  year={2023}
}

@inproceedings{kim2023shatter,
  title={Shatter and Gather: Learning Referring Image Segmentation with Text Supervision},
  author={Kim, Dongwon and Kim, Namyup and Lan, Cuiling and Kwak, Suha},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={15547--15557},
  year={2023}
}

@inproceedings{wu2023advancing,
  title={Advancing referring expression segmentation beyond single image},
  author={Wu, Yixuan and Zhang, Zhao and Xie, Chi and Zhu, Feng and Zhao, Rui},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={2628--2638},
  year={2023}
}

@inproceedings{xu2023bridging,
  title={Bridging vision and language encoders: Parameter-efficient tuning for referring image segmentation},
  author={Xu, Zunnan and Chen, Zhihong and Zhang, Yong and Song, Yibing and Wan, Xiang and Li, Guanbin},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={17503--17512},
  year={2023}
}

@inproceedings{liu2023referring,
  title={Referring image segmentation using text supervision},
  author={Liu, Fang and Liu, Yuhao and Kong, Yuqiu and Xu, Ke and Zhang, Lihe and Yin, Baocai and Hancke, Gerhard and Lau, Rynson},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={22124--22134},
  year={2023}
}

@inproceedings{hu2023beyond,
  title={Beyond One-to-One: Rethinking the Referring Image Segmentation},
  author={Hu, Yutao and Wang, Qixiong and Shao, Wenqi and Xie, Enze and Li, Zhenguo and Han, Jungong and Luo, Ping},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={4067--4077},
  year={2023}
}

@article{ding2023bilateral,
  title={Bilateral Knowledge Interaction Network for Referring Image Segmentation},
  author={Ding, Haixin and Zhang, Shengchuan and Wu, Qiong and Yu, Songlin and Hu, Jie and Cao, Liujuan and Ji, Rongrong},
  journal={IEEE Transactions on Multimedia},
  year={2023},
  publisher={IEEE}
}

@inproceedings{ouyang23slvit,
  title={Slvit: Scale-wise language-guided vision transformer for referring image segmentation},
  author={Ouyang, Shuyi and Wang, Hongyi and Xie, Shiao and Niu, Ziwei and Tong, Ruofeng and Chen, Yen-Wei and Lin, Lanfen},
  booktitle={Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, IJCAI-23},
  pages={1294--1302},
  year={2023}
}

@article{cheng2023wico,
  title={WiCo: Win-win Cooperation of Bottom-up and Top-down Referring Image Segmentation},
  author={Cheng, Zesen and Jin, Peng and Li, Hao and Li, Kehan and Li, Siheng and Ji, Xiangyang and Liu, Chang and Chen, Jie},
  journal={arXiv preprint arXiv:2306.10750},
  year={2023}
}

@article{wang2023cm,
  title={CM-MaskSD: Cross-Modality Masked Self-Distillation for Referring Image Segmentation},
  author={Wang, Wenxuan and Liu, Jing and He, Xingjian and Zhang, Yisi and Chen, Chen and Shen, Jiachen and Zhang, Yan and Li, Jiangyun},
  journal={arXiv preprint arXiv:2305.11481},
  year={2023}
}

@inproceedings{tang2023contrastive,
  title={Contrastive Grouping with Transformer for Referring Image Segmentation},
  author={Tang, Jiajin and Zheng, Ge and Shi, Cheng and Yang, Sibei},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={23570--23580},
  year={2023}
}

@inproceedings{qu2023learning,
  title={Learning to Segment Every Referring Object Point by Point},
  author={Qu, Mengxue and Wu, Yu and Wei, Yunchao and Liu, Wu and Liang, Xiaodan and Zhao, Yao},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={3021--3030},
  year={2023}
}

@inproceedings{yu2023zero,
  title={Zero-shot Referring Image Segmentation with Global-Local Context Features},
  author={Yu, Seonghoon and Seo, Paul Hongsuck and Son, Jeany},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={19456--19465},
  year={2023}
}

@inproceedings{xu2023meta,
  title={Meta compositional referring expression segmentation},
  author={Xu, Li and Huang, Mark He and Shang, Xindi and Yuan, Zehuan and Sun, Ying and Liu, Jun},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={19478--19487},
  year={2023}
}

@inproceedings{liu2023polyformer,
  title={PolyFormer: Referring image segmentation as sequential polygon generation},
  author={Liu, Jiang and Ding, Hui and Cai, Zhaowei and Zhang, Yuting and Satzoda, Ravi Kumar and Mahadevan, Vijay and Manmatha, R},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={18653--18663},
  year={2023}
}

@inproceedings{liu2023gres,
  title={GRES: Generalized referring expression segmentation},
  author={Liu, Chang and Ding, Henghui and Jiang, Xudong},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={23592--23601},
  year={2023}
}

@article{yang2023semantics,
  title={Semantics-Aware Dynamic Localization and Refinement for Referring Image Segmentation},
  author={Yang, Zhao and Wang, Jiaqi and Tang, Yansong and Chen, Kai and Zhao, Hengshuang and Torr, Philip HS},
  journal={arXiv preprint arXiv:2303.06345},
  year={2023}
}

@article{chen2022position,
  title={Position-aware contrastive alignment for referring image segmentation},
  author={Chen, Bo and Hu, Zhiwei and Ji, Zhilong and Bai, Jinfeng and Zuo, Wangmeng},
  journal={arXiv preprint arXiv:2212.13419},
  year={2022}
}

@article{zhang2022coupalign,
  title={Coupalign: Coupling word-pixel with sentence-mask alignments for referring image segmentation},
  author={Zhang, Zicheng and Zhu, Yi and Liu, Jianzhuang and Liang, Xiaodan and Ke, Wei},
  journal={Advances in Neural Information Processing Systems},
  volume={35},
  pages={14729--14742},
  year={2022}
}

@article{shang2022cross,
  title={Cross-Modal Recurrent Semantic Comprehension for Referring Image Segmentation},
  author={Shang, Chao and Li, Hongliang and Qiu, Heqian and Wu, Qingbo and Meng, Fanman and Zhao, Taijin and Ngan, King Ngi},
  journal={IEEE Transactions on Circuits and Systems for Video Technology},
  year={2022},
  publisher={IEEE}
}

@article{liu2023local,
  title={Local-global coordination with transformers for referring image segmentation},
  author={Liu, Fang and Kong, Yuqiu and Zhang, Lihe and Feng, Guang and Yin, Baocai},
  journal={Neurocomputing},
  volume={522},
  pages={39--52},
  year={2023},
  publisher={Elsevier}
}

@article{huang2022unified,
  title={A unified mutual supervision framework for referring expression segmentation and generation},
  author={Huang, Shijia and Li, Feng and Zhang, Hao and Liu, Shilong and Zhang, Lei and Wang, Liwei},
  journal={arXiv preprint arXiv:2211.07919},
  year={2022}
}

@article{ding2022vlt,
  title={Vlt: Vision-language transformer and query generation for referring segmentation},
  author={Ding, Henghui and Liu, Chang and Wang, Suchen and Jiang, Xudong},
  journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year={2022},
  publisher={IEEE}
}

@article{feng2022learning,
  title={Learning from box annotations for referring image segmentation},
  author={Feng, Guang and Zhang, Lihe and Hu, Zhiwei and Lu, Huchuan},
  journal={IEEE Transactions on Neural Networks and Learning Systems},
  year={2022},
  publisher={IEEE}
}

@article{liu2022instance,
  title={Instance-specific feature propagation for referring segmentation},
  author={Liu, Chang and Jiang, Xudong and Ding, Henghui},
  journal={IEEE Transactions on Multimedia},
  year={2022},
  publisher={IEEE}
}

@inproceedings{zhu2022seqtr,
  title={Seqtr: A simple yet universal network for visual grounding},
  author={Zhu, Chaoyang and Zhou, Yiyi and Shen, Yunhang and Luo, Gen and Pan, Xingjia and Lin, Mingbao and Chen, Chao and Cao, Liujuan and Sun, Xiaoshuai and Ji, Rongrong},
  booktitle={European Conference on Computer Vision},
  pages={598--615},
  year={2022},
  organization={Springer}
}

@inproceedings{yang2022lavt,
  title={Lavt: Language-aware vision transformer for referring image segmentation},
  author={Yang, Zhao and Wang, Jiaqi and Tang, Yansong and Chen, Kai and Zhao, Hengshuang and Torr, Philip HS},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={18155--18165},
  year={2022}
}

@inproceedings{wang2022cris,
  title={Cris: Clip-driven referring image segmentation},
  author={Wang, Zhaoqing and Lu, Yu and Li, Qiang and Tao, Xunqiang and Guo, Yandong and Gong, Mingming and Liu, Tongliang},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={11686--11695},
  year={2022}
}

@inproceedings{kim2022restr,
  title={Restr: Convolution-free referring image segmentation using transformers},
  author={Kim, Namyup and Kim, Dongwon and Lan, Cuiling and Zeng, Wenjun and Kwak, Suha},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={18145--18154},
  year={2022}
}

@article{feng2021bidirectional,
  title={Bidirectional relationship inferring network for referring image localization and segmentation},
  author={Feng, Guang and Hu, Zhiwei and Zhang, Lihe and Sun, Jiayu and Lu, Huchuan},
  journal={IEEE Transactions on Neural Networks and Learning Systems},
  year={2021},
  publisher={IEEE}
}

@article{li2021referring,
  title={Referring transformer: A one-step approach to multi-task visual grounding},
  author={Li, Muchen and Sigal, Leonid},
  journal={Advances in Neural Information Processing Systems},
  volume={34},
  pages={19652--19664},
  year={2021}
}

@inproceedings{jiao2021two,
  title={Two-stage visual cues enhancement network for referring image segmentation},
  author={Jiao, Yang and Jie, Zequn and Luo, Weixin and Chen, Jingjing and Jiang, Yu-Gang and Wei, Xiaolin and Ma, Lin},
  booktitle={Proceedings of the 29th ACM International Conference on Multimedia},
  pages={1331--1340},
  year={2021}
}

@inproceedings{ding2021vision,
  title={Vision-language transformer and query generation for referring segmentation},
  author={Ding, Henghui and Liu, Chang and Wang, Suchen and Jiang, Xudong},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={16321--16330},
  year={2021}
}

@inproceedings{kamath2021mdetr,
  title={Mdetr-modulated detection for end-to-end multi-modal understanding},
  author={Kamath, Aishwarya and Singh, Mannat and LeCun, Yann and Synnaeve, Gabriel and Misra, Ishan and Carion, Nicolas},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={1780--1790},
  year={2021}
}

@inproceedings{feng2021encoder,
  title={Encoder fusion network with co-attention embedding for referring image segmentation},
  author={Feng, Guang and Hu, Zhiwei and Zhang, Lihe and Lu, Huchuan},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={15506--15515},
  year={2021}
}

@inproceedings{yang2021bottom,
  title={Bottom-up shift and reasoning for referring image segmentation},
  author={Yang, Sibei and Xia, Meng and Li, Guanbin and Zhou, Hong-Yu and Yu, Yizhou},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={11266--11275},
  year={2021}
}

@inproceedings{jing2021locate,
  title={Locate then segment: A strong pipeline for referring image segmentation},
  author={Jing, Ya and Kong, Tao and Wang, Wei and Wang, Liang and Li, Lei and Tan, Tieniu},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={9858--9867},
  year={2021}
}

@inproceedings{luo2020cascade,
  title={Cascade grouped attention network for referring expression segmentation},
  author={Luo, Gen and Zhou, Yiyi and Ji, Rongrong and Sun, Xiaoshuai and Su, Jinsong and Lin, Chia-Wen and Tian, Qi},
  booktitle={Proceedings of the 28th ACM International Conference on Multimedia},
  pages={1274--1282},
  year={2020}
}

@inproceedings{hui2020linguistic,
  title={Linguistic structure guided context modeling for referring image segmentation},
  author={Hui, Tianrui and Liu, Si and Huang, Shaofei and Li, Guanbin and Yu, Sansi and Zhang, Faxi and Han, Jizhong},
  booktitle={Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part X 16},
  pages={59--75},
  year={2020},
  organization={Springer}
}

@inproceedings{huang2020referring,
  title={Referring image segmentation via cross-modal progressive comprehension},
  author={Huang, Shaofei and Hui, Tianrui and Liu, Si and Li, Guanbin and Wei, Yunchao and Han, Jizhong and Liu, Luoqi and Li, Bo},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={10488--10497},
  year={2020}
}

@inproceedings{hu2020bi,
  title={Bi-directional relationship inferring network for referring image segmentation},
  author={Hu, Zhiwei and Feng, Guang and Sun, Jiayu and Zhang, Lihe and Lu, Huchuan},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={4424--4433},
  year={2020}
}

@inproceedings{wu2020phrasecut,
  title={Phrasecut: Language-based image segmentation in the wild},
  author={Wu, Chenyun and Lin, Zhe and Cohen, Scott and Bui, Trung and Maji, Subhransu},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={10216--10225},
  year={2020}
}

@inproceedings{luo2020multi,
  title={Multi-task collaborative network for joint referring expression comprehension and segmentation},
  author={Luo, Gen and Zhou, Yiyi and Sun, Xiaoshuai and Cao, Liujuan and Wu, Chenglin and Deng, Cheng and Ji, Rongrong},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={10034--10043},
  year={2020}
}

@article{ye2020dual,
  title={Dual convolutional lstm network for referring image segmentation},
  author={Ye, Linwei and Liu, Zhi and Wang, Yang},
  journal={IEEE Transactions on Multimedia},
  volume={22},
  number={12},
  pages={3224--3235},
  year={2020},
  publisher={IEEE}
}

@article{chen2019referring,
  title={Referring expression object segmentation with caption-aware consistency},
  author={Chen, Yi-Wen and Tsai, Yi-Hsuan and Wang, Tiantian and Lin, Yen-Yu and Yang, Ming-Hsuan},
  journal={arXiv preprint arXiv:1910.04748},
  year={2019}
}

@inproceedings{chen2019see,
  title={See-through-text grouping for referring image segmentation},
  author={Chen, Ding-Jie and Jia, Songhao and Lo, Yi-Chen and Chen, Hwann-Tzong and Liu, Tyng-Luh},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={7454--7463},
  year={2019}
}

@inproceedings{ye2019cross,
  title={Cross-modal self-attention network for referring image segmentation},
  author={Ye, Linwei and Rochan, Mrigank and Liu, Zhi and Wang, Yang},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={10502--10511},
  year={2019}
}

@inproceedings{shi2018key,
  title={Key-word-aware network for referring expression image segmentation},
  author={Shi, Hengcan and Li, Hongliang and Meng, Fanman and Wu, Qingbo},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  pages={38--54},
  year={2018}
}

@inproceedings{margffoy2018dynamic,
  title={Dynamic multimodal instance segmentation guided by natural language queries},
  author={Margffoy-Tuay, Edgar and P{\'e}rez, Juan C and Botero, Emilio and Arbel{\'a}ez, Pablo},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  pages={630--645},
  year={2018}
}

@inproceedings{li2018referring,
  title={Referring image segmentation via recurrent refinement networks},
  author={Li, Ruiyu and Li, Kaican and Kuo, Yi-Chun and Shu, Michelle and Qi, Xiaojuan and Shen, Xiaoyong and Jia, Jiaya},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5745--5753},
  year={2018}
}

@inproceedings{yu2018mattnet,
  title={Mattnet: Modular attention network for referring expression comprehension},
  author={Yu, Licheng and Lin, Zhe and Shen, Xiaohui and Yang, Jimei and Lu, Xin and Bansal, Mohit and Berg, Tamara L},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={1307--1315},
  year={2018}
}

@inproceedings{liu2017recurrent,
  title={Recurrent multimodal interaction for referring image segmentation},
  author={Liu, Chenxi and Lin, Zhe and Shen, Xiaohui and Yang, Jimei and Lu, Xin and Yuille, Alan},
  booktitle={Proceedings of the IEEE International Conference on Computer Vision},
  pages={1271--1280},
  year={2017}
}

@inproceedings{hu2016segmentation,
  title={Segmentation from natural language expressions},
  author={Hu, Ronghang and Rohrbach, Marcus and Darrell, Trevor},
  booktitle={Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part I 14},
  pages={108--124},
  year={2016},
  organization={Springer}
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# **RIS-Learning-List**

## **Introduction**

This repository introduces the **Referring Image Segmentation** task and collects related works.

## **Content**

- [Definition](#Definition)
- [Datasets](#Datasets)
- [Evaluation Metric](#Evaluation-Metric)
- [Related Works](#Related-Works)
- [Performance](#Performance)
- [Reference](#Reference)

## **Definition**

**Referring Image Segmentation (RIS)** is a challenging problem at the intersection of computer vision and natural language processing. Given an image and a natural language expression, the goal is to produce a segmentation mask in the image corresponding to the objects referred to by the natural language expression.

![](https://github.com/Huntersxsx/RIS-Learning-List/blob/main/img/definition.png)
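In code, the task contract is simply "image + expression in, binary mask out". The snippet below is a minimal illustrative sketch of that interface, not any particular paper's method; `model` is a hypothetical stand-in for an RIS network that returns per-pixel logits:

```python
import numpy as np

def refer_segment(model, image: np.ndarray, expression: str) -> np.ndarray:
    """Run a (hypothetical) RIS model on one image-expression pair.

    image:      H x W x 3 RGB array
    expression: e.g. "the woman in the red coat on the left"
    returns:    H x W binary mask, 1 on pixels of the referred object
    """
    logits = model(image, expression)      # per-pixel scores, shape H x W
    return (logits > 0).astype(np.uint8)   # sigmoid(logit) > 0.5 iff logit > 0
```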
## **Datasets**

- [**RefCOCO**](https://arxiv.org/pdf/1608.00272): It contains **19,994 images** with **142,210 referring expressions** for **50,000 objects**, collected from MSCOCO via a two-player game. The dataset is split into 120,624 train, 10,834 validation, 5,657 test A, and 5,095 test B samples.
- [**RefCOCO+**](https://arxiv.org/pdf/1608.00272): It contains **141,564 language expressions** with **49,856 objects** in **19,992 images**. The dataset is split into train, validation, test A, and test B with 120,624, 10,758, 5,726, and 4,889 samples, respectively. Compared with the RefCOCO dataset, some kinds of **absolute-location words are excluded** from RefCOCO+.
- [**G-Ref**](https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Mao_Generation_and_Comprehension_CVPR_2016_paper.pdf): It includes **104,560 referring expressions** for **54,822 objects** in **26,711 images**.
- Expressions in RefCOCO and RefCOCO+ are very succinct (3.5 words on average). In contrast, **expressions in G-Ref are more complex** (8.4 words on average). Conversely, **RefCOCO and RefCOCO+ tend to have more objects of the same category per image** (3.9 on average) compared to G-Ref (1.6 on average).
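All three datasets can be browsed with Licheng Yu's [refer](https://github.com/lichengunc/refer) toolkit. The sketch below assumes its annotation files are unpacked under `data/` as described in that repository; the dataset and split names follow its conventions (treat the exact paths as assumptions):

```python
from refer import REFER

# RefCOCO / RefCOCO+ use splitBy='unc'; G-Ref uses 'umd' or 'google'
refer = REFER('data', dataset='refcoco', splitBy='unc')

ref_ids = refer.getRefIds(split='testA')
print(len(ref_ids), 'referred objects in testA')

ref = refer.loadRefs(ref_ids[0])[0]
print([s['sent'] for s in ref['sentences']])  # expressions for this object
mask = refer.getMask(ref)['mask']             # H x W binary ground-truth mask
```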
## **Evaluation Metric**

- **overall IoU (oIoU):** the total intersection area divided by the total union area, where both areas are accumulated over all test samples (each test sample is an image paired with a referring expression).
- **mean IoU (mIoU):** the IoU between the prediction and the ground truth, averaged across all test samples.
- **Precision@X:** the percentage of test samples with an IoU score higher than the threshold X ∈ {0.5, 0.6, 0.7, 0.8, 0.9}.
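The first two metrics differ in where the division happens: overall IoU accumulates intersections and unions over the whole test set before dividing (so large objects dominate), while mean IoU averages per-sample IoUs (so every sample counts equally). A minimal NumPy sketch of all three metrics, following the definitions above:

```python
import numpy as np

def ris_metrics(preds, gts, thresholds=(0.5, 0.6, 0.7, 0.8, 0.9)):
    """preds, gts: lists of H x W binary masks, one pair per test sample."""
    inter_sum, union_sum, ious = 0, 0, []
    for p, g in zip(preds, gts):
        p, g = p.astype(bool), g.astype(bool)
        inter = np.logical_and(p, g).sum()
        union = np.logical_or(p, g).sum()
        inter_sum += inter
        union_sum += union
        ious.append(inter / union if union > 0 else 1.0)  # empty pred vs. empty gt
    ious = np.array(ious)
    return {
        'oIoU': inter_sum / union_sum,  # accumulate first, divide once
        'mIoU': float(ious.mean()),     # divide per sample, then average
        **{f'Prec@{t}': float((ious > t).mean()) for t in thresholds},
    }
```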
## **Related Works**

- **MagNet:** [Mask Grounding for Referring Image Segmentation](https://arxiv.org/pdf/2312.12198.pdf). *in arXiv 2023*.
- **MRES:** [Unveiling Parts Beyond Objects: Towards Finer-Granularity Referring Expression Segmentation](https://arxiv.org/pdf/2312.08007.pdf). *in arXiv 2023*. [code](https://github.com/Rubics-Xuan/MRES)
- [Towards Generalizable Referring Image Segmentation via Target Prompt and Visual Coherence](https://arxiv.org/pdf/2312.00452.pdf). *in arXiv 2023*.
- **BTMAE:** [Synchronizing Vision and Language: Bidirectional Token-Masking AutoEncoder for Referring Image Segmentation](https://arxiv.org/pdf/2311.17952.pdf). *in arXiv 2023*.
- **MARIS:** [MARIS: Referring Image Segmentation via Mutual-Aware Attention Features](https://arxiv.org/pdf/2311.15727.pdf). *in arXiv 2023*.
- **Omni-RES:** [Towards Omni-supervised Referring Expression Segmentation](https://arxiv.org/pdf/2311.00397.pdf). *in arXiv 2023*. [code](https://github.com/nineblu/omni-res)
- **JMCELN:** [Referring Image Segmentation via Joint Mask Contextual Embedding Learning and Progressive Alignment Network](https://aclanthology.org/2023.emnlp-main.481/). *in EMNLP 2023*. [code](https://github.com/toyottttttt/referring-segmentation)
- **TAS:** [Text Augmented Spatial-aware Zero-shot Referring Image Segmentation](https://arxiv.org/pdf/2310.18049.pdf). *in EMNLP 2023 Findings*.
- **CVMN:** [Unsupervised Domain Adaptation for Referring Semantic Segmentation](https://dl.acm.org/doi/abs/10.1145/3581783.3611879). *in ACM MM 2023*. [code](https://github.com/asudahkzj/CVMN)
- **CARIS:** [CARIS: Context-Aware Referring Image Segmentation](https://dl.acm.org/doi/abs/10.1145/3581783.3612117). *in ACM MM 2023*. [code](https://github.com/lsa1997/CARIS)
- **Shatter and Gather:** [Shatter and Gather: Learning Referring Image Segmentation with Text Supervision](https://arxiv.org/pdf/2308.15512v1.pdf). *in ICCV 2023*.
- **Group-RES:** [Advancing Referring Expression Segmentation Beyond Single Image](https://arxiv.org/pdf/2305.12452.pdf). *in ICCV 2023*. [code](https://github.com/yixuan730/group-res)
- **ETRIS:** [Bridging Vision and Language Encoders: Parameter-Efficient Tuning for Referring Image Segmentation](https://arxiv.org/pdf/2307.11545.pdf). *in ICCV 2023*. [code](https://github.com/kkakkkka/ETRIS)
- **TRIS:** [Referring Image Segmentation Using Text Supervision](https://arxiv.org/pdf/2308.14575.pdf). *in ICCV 2023*. [code](https://github.com/fawnliu/TRIS)
- **RIS-DMMI:** [Beyond One-to-One: Rethinking the Referring Image Segmentation](https://arxiv.org/pdf/2308.13853.pdf). *in ICCV 2023*. [code](https://github.com/toggle1995/RIS-DMMI)
- **BKINet:** [Bilateral Knowledge Interaction Network for Referring Image Segmentation](https://ieeexplore.ieee.org/abstract/document/10227590). *in TMM 2023*. [code](https://github.com/dhding/BKINet)
- **SLViT:** [SLViT: Scale-Wise Language-Guided Vision Transformer for Referring Image Segmentation](https://www.ijcai.org/proceedings/2023/0144.pdf). *in IJCAI 2023*. [code](https://github.com/NaturalKnight/SLViT)
- **WiCo:** [WiCo: Win-win Cooperation of Bottom-up and Top-down Referring Image Segmentation](https://www.ijcai.org/proceedings/2023/0071.pdf). *in IJCAI 2023*.
- **CM-MaskSD:** [CM-MaskSD: Cross-Modality Masked Self-Distillation for Referring Image Segmentation](https://arxiv.org/pdf/2305.11481.pdf). *in arXiv 2023*.
- **CGFormer:** [Contrastive Grouping with Transformer for Referring Image Segmentation](https://openaccess.thecvf.com/content/CVPR2023/papers/Tang_Contrastive_Grouping_With_Transformer_for_Referring_Image_Segmentation_CVPR_2023_paper.pdf). *in CVPR 2023*. [code](https://github.com/Toneyaya/CGFormer)
- **Partial-RES:** [Learning to Segment Every Referring Object Point by Point](https://openaccess.thecvf.com/content/CVPR2023/papers/Qu_Learning_To_Segment_Every_Referring_Object_Point_by_Point_CVPR_2023_paper.pdf). *in CVPR 2023*. [code](https://github.com/qumengxue/Partial-RES.git)
- **Zero-shot RIS:** [Zero-shot Referring Image Segmentation with Global-Local Context Features](https://openaccess.thecvf.com/content/CVPR2023/papers/Yu_Zero-Shot_Referring_Image_Segmentation_With_Global-Local_Context_Features_CVPR_2023_paper.pdf). *in CVPR 2023*. [code](https://github.com/Seonghoon-Yu/Zero-shot-RIS)
- **MCRES:** [Meta Compositional Referring Expression Segmentation](https://openaccess.thecvf.com/content/CVPR2023/papers/Xu_Meta_Compositional_Referring_Expression_Segmentation_CVPR_2023_paper.pdf). *in CVPR 2023*.
- **PolyFormer:** [PolyFormer: Referring Image Segmentation as Sequential Polygon Generation](https://arxiv.org/pdf/2302.07387.pdf). *in CVPR 2023*. [project](https://polyformer.github.io/)
- **GRES:** [Generalized Referring Expression Segmentation](https://openaccess.thecvf.com/content/CVPR2023/papers/Liu_GRES_Generalized_Referring_Expression_Segmentation_CVPR_2023_paper.pdf). *in CVPR 2023*. [project](https://henghuiding.github.io/GRES)
- **SADLR:** [Semantics-Aware Dynamic Localization and Refinement for Referring Image Segmentation](https://arxiv.org/pdf/2303.06345.pdf). *in AAAI 2023*.
- **PCAN:** [Position-Aware Contrastive Alignment for Referring Image Segmentation](https://arxiv.org/pdf/2212.13419.pdf). *in arXiv 2022*.
- **CoupAlign:** [CoupAlign: Coupling Word-Pixel with Sentence-Mask Alignments for Referring Image Segmentation](https://arxiv.org/pdf/2212.01769.pdf). *in NeurIPS 2022*. [code](https://gitee.com/mindspore/models/tree/master/research/cv/CoupAlign)
- **CRSCNet:** [Cross-Modal Recurrent Semantic Comprehension for Referring Image Segmentation](https://ieeexplore.ieee.org/abstract/document/9998537). *in TCSVT 2022*.
- **LGCT:** [Local-global coordination with transformers for referring image segmentation](https://www.sciencedirect.com/science/article/pii/S0925231222015119). *in Neurocomputing 2022*.
- **RES®:** [A Unified Mutual Supervision Framework for Referring Expression Segmentation and Generation](https://arxiv.org/pdf/2211.07919.pdf). *in arXiv 2022*.
- **VLT:** [VLT: Vision-Language Transformer and Query Generation for Referring Segmentation](https://arxiv.org/pdf/2210.15871.pdf). *in TPAMI 2022*. [code](https://github.com/henghuiding/Vision-Language-Transformer)
- [Learning From Box Annotations for Referring Image Segmentation](https://ieeexplore.ieee.org/abstract/document/9875225). *in TNNLS 2022*. [code](https://github.com/fengguang94/Weakly-Supervised-RIS)
- [Instance-Specific Feature Propagation for Referring Segmentation](https://ieeexplore.ieee.org/abstract/document/9745353). *in TMM 2022*.
- **SeqTR:** [SeqTR: A Simple Yet Universal Network for Visual Grounding](https://arxiv.org/pdf/2203.16265.pdf). *in ECCV 2022*. [code](https://github.com/sean-zhuh/SeqTR)
- **LAVT:** [LAVT: Language-Aware Vision Transformer for Referring Image Segmentation](https://arxiv.org/abs/2112.02244). *in CVPR 2022*. [code](https://github.com/yz93/LAVT-RIS)
- **CRIS:** [CRIS: CLIP-Driven Referring Image Segmentation](https://arxiv.org/abs/2111.15174). *in CVPR 2022*. [code](https://github.com/DerrickWang005/CRIS.pytorch)
- **ReSTR:** [ReSTR: Convolution-free Referring Image Segmentation Using Transformers](https://www.microsoft.com/en-us/research/uploads/prod/2022/03/01404.pdf). *in CVPR 2022*. [project](http://cvlab.postech.ac.kr/research/restr/)
- [Bidirectional relationship inferring network for referring image localization and segmentation](https://ieeexplore.ieee.org/document/9526878). *in TNNLS 2021*.
- **RefTR:** [Referring Transformer: A One-step Approach to Multi-task Visual Grounding](https://openreview.net/pdf?id=J64lDCrYGi). *in NeurIPS 2021*.
- **TV-Net:** [Two-stage Visual Cues Enhancement Network for Referring Image Segmentation](https://arxiv.org/abs/2110.04435). *in ACM MM 2021*. [code](https://github.com/sxjyjay/tv-net)
- **VLT:** [Vision-Language Transformer and Query Generation for Referring Segmentation](https://arxiv.org/abs/2108.05565). *in ICCV 2021*. [code](https://github.com/henghuiding/Vision-Language-Transformer)
- **MDETR:** [MDETR - Modulated Detection for End-to-End Multi-Modal Understanding](https://arxiv.org/abs/2104.12763). *in ICCV 2021*. [code](https://github.com/ashkamath/mdetr)
- **EFNet:** [Encoder Fusion Network with Co-Attention Embedding for Referring Image Segmentation](https://openaccess.thecvf.com/content/CVPR2021/papers/Feng_Encoder_Fusion_Network_With_Co-Attention_Embedding_for_Referring_Image_Segmentation_CVPR_2021_paper.pdf). *in CVPR 2021*. [code](https://github.com/fengguang94/CEFNet)
- **BUSNet:** [Bottom-Up Shift and Reasoning for Referring Image Segmentation](https://openaccess.thecvf.com/content/CVPR2021/papers/Yang_Bottom-Up_Shift_and_Reasoning_for_Referring_Image_Segmentation_CVPR_2021_paper.pdf). *in CVPR 2021*. [code](https://github.com/incredibleXM/BUSNet)
- **LTS:** [Locate then Segment: A Strong Pipeline for Referring Image Segmentation](https://openaccess.thecvf.com/content/CVPR2021/papers/Jing_Locate_Then_Segment_A_Strong_Pipeline_for_Referring_Image_Segmentation_CVPR_2021_paper.pdf). *in CVPR 2021*.
- **CGAN:** [Cascade Grouped Attention Network for Referring Expression Segmentation](https://dl.acm.org/doi/abs/10.1145/3394171.3414006). *in ACM MM 2020*.
- **LSCM:** [Linguistic Structure Guided Context Modeling for Referring Image Segmentation](http://colalab.org/media/paper/Linguistic_Structure_Guided_Context_Modeling_for_Referring_Image_Segmentation.pdf). *in ECCV 2020*.
- **CMPC-Refseg:** [Referring Image Segmentation via Cross-Modal Progressive Comprehension](http://openaccess.thecvf.com/content_CVPR_2020/papers/Huang_Referring_Image_Segmentation_via_Cross-Modal_Progressive_Comprehension_CVPR_2020_paper.pdf). *in CVPR 2020*. [code](https://github.com/spyflying/CMPC-Refseg)
- **BRINet:** [Bi-directional Relationship Inferring Network for Referring Image Segmentation](http://openaccess.thecvf.com/content_CVPR_2020/papers/Hu_Bi-Directional_Relationship_Inferring_Network_for_Referring_Image_Segmentation_CVPR_2020_paper.pdf). *in CVPR 2020*. [code](https://github.com/fengguang94/CVPR2020-BRINet)
- **PhraseCut:** [PhraseCut: Language-based Image Segmentation in the Wild](https://people.cs.umass.edu/~smaji/papers/phrasecut+supp-cvpr20.pdf). *in CVPR 2020*. [code](https://github.com/ChenyunWu/PhraseCutDataset)
- **MCN:** [Multi-task Collaborative Network for Joint Referring Expression Comprehension and Segmentation](https://arxiv.org/abs/2003.08813). *in CVPR 2020*. [code](https://github.com/luogen1996/MCN)
- [Dual Convolutional LSTM Network for Referring Image Segmentation](https://arxiv.org/abs/2001.11561). *in TMM 2020*.
- **lang2seg:** [Referring Expression Object Segmentation with Caption-Aware Consistency](https://arxiv.org/pdf/1910.04748.pdf). *in BMVC 2019*. [code](https://github.com/wenz116/lang2seg)
- **STEP:** [See-Through-Text Grouping for Referring Image Segmentation](http://openaccess.thecvf.com/content_ICCV_2019/papers/Chen_See-Through-Text_Grouping_for_Referring_Image_Segmentation_ICCV_2019_paper.pdf). *in ICCV 2019*.
- **CMSA-Net:** [Cross-Modal Self-Attention Network for Referring Image Segmentation](https://arxiv.org/pdf/1904.04745.pdf). *in CVPR 2019*. [code](https://github.com/lwye/CMSA-Net)
- **KWA:** [Key-Word-Aware Network for Referring Expression Image Segmentation](http://openaccess.thecvf.com/content_ECCV_2018/papers/Hengcan_Shi_Key-Word-Aware_Network_for_ECCV_2018_paper.pdf). *in ECCV 2018*. [code](https://github.com/shihengcan/key-word-aware-network-pycaffe)
- **DMN:** [Dynamic Multimodal Instance Segmentation Guided by Natural Language Queries](http://openaccess.thecvf.com/content_ECCV_2018/papers/Edgar_Margffoy-Tuay_Dynamic_Multimodal_Instance_ECCV_2018_paper.pdf). *in ECCV 2018*. [code](https://github.com/BCV-Uniandes/DMS)
- **RRN:** [Referring Image Segmentation via Recurrent Refinement Networks](http://openaccess.thecvf.com/content_cvpr_2018/papers/Li_Referring_Image_Segmentation_CVPR_2018_paper.pdf). *in CVPR 2018*. [code](https://github.com/liruiyu/referseg_rrn)
- **MAttNet:** [MAttNet: Modular Attention Network for Referring Expression Comprehension](http://openaccess.thecvf.com/content_cvpr_2018/papers/Yu_MAttNet_Modular_Attention_CVPR_2018_paper.pdf). *in CVPR 2018*. [code](https://github.com/lichengunc/MAttNet)
- **RMI:** [Recurrent Multimodal Interaction for Referring Image Segmentation](http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Recurrent_Multimodal_Interaction_ICCV_2017_paper.pdf). *in ICCV 2017*. [code](https://github.com/chenxi116/TF-phrasecut-public)
- **LSTM-CNN:** [Segmentation from natural language expressions](https://arxiv.org/pdf/1603.06180.pdf). *in ECCV 2016*. [code](https://github.com/ronghanghu/text_objseg)

## **Performance**


## **Reference**

[MarkMoHR / Awesome-Referring-Image-Segmentation](https://github.com/MarkMoHR/Awesome-Referring-Image-Segmentation)
--------------------------------------------------------------------------------
/img/definition.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Huntersxsx/RIS-Learning-List/dfc63f149f4516087ec316228d01803d525b7347/img/definition.png
--------------------------------------------------------------------------------