% Conference paper in the EMBC 2025 proceedings (IEEE).
% The citation key is the exporter-generated identifier and is kept unchanged so existing \cite commands still resolve.
% "language" and "address" were exported in Chinese; they are stored in English (ASCII) here for classic BibTeX.
% NOTE(review): "address" holds the publisher country as exported, not a city - confirm the publisher location if a city is required.
@inproceedings{43c2d1afc3d34f6fa43a7da8616cb8b9,
  author    = {Tao, Yuechen and Zhu, Xiaobo and Wu, Shiwei and Sun, He and Liu, Jiangang and An, Yu and Tian, Jie and Liu, Zhenyu},
  title     = {{VMSIS}: A Pre-trained Vision Transformer with {Mamba} Decoder for Surgical Instrument Segmentation},
  booktitle = {2025 47th Annual International Conference of the {IEEE} Engineering in Medicine and Biology Society, {EMBC} 2025 - Proceedings},
  series    = {Proceedings of the Annual International Conference of the {IEEE} Engineering in Medicine and Biology Society, {EMBS}},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2025},
  doi       = {10.1109/EMBC58623.2025.11252986},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2025 IEEE.; 47th Annual International Conference of the IEEE Engineering in Medicine and Biology Society, EMBC 2025 ; Conference date: 14-07-2025 Through 18-07-2025},
  abstract  = {Accurate surgical instrument segmentation plays a vital role in robot assisted surgery. We present VMSIS, a hybrid architecture that combines the visual representation capabilities of self-supervised DINOv2 with the efficient sequence modeling of Mamba for surgical instrument segmentation. Our approach trained DINOv2 backbone with over 900,000 frames of RGB surgical videos and introduces a Mamba-based decoder that effectively captures temporal dependencies in surgical video sequences with backbone frozen. By processing 10 consecutive frames, our model achieves accurate instrument segmentation while maintaining temporal consistency. Experiments on 4 reorganized public datasets demonstrate the effectiveness of our approach, achieving competitive results with fewer trainable parameters compared to traditional methods.},
}