@inproceedings{098ea0c9ad1746c38bef7680d9532c23,
title = "SDPT: Synchronous Dual Prompt Tuning for Fusion-Based Visual-Language Pre-trained Models",
abstract = "Prompt tuning methods have achieved remarkable success in parameter-efficient fine-tuning on large pre-trained models. However, their application to dual-modal fusion-based visual-language pre-trained models (VLPMs), such as GLIP, has encountered issues. Existing prompt tuning methods have not effectively addressed the modal mapping and aligning problem for tokens in different modalities, leading to poor transfer generalization. To address this issue, we propose Synchronous Dual Prompt Tuning (SDPT). SDPT initializes a single set of learnable unified prototype tokens in the established modal aligning space to represent the aligned semantics of text and image modalities for downstream tasks. Furthermore, SDPT establishes inverse linear projections that require no training to embed the information of unified prototype tokens into the input space of different modalities. The inverse linear projections allow the unified prototype token to synchronously represent the two modalities and enable SDPT to share the unified semantics of text and image for downstream tasks across different modal prompts. Experimental results demonstrate that SDPT assists fusion-based VLPMs to achieve superior outcomes with only 0.04\% of model parameters for training across various scenarios, outperforming other single- or dual-modal methods. The code will be released at https://github.com/wuyongjianCODE/SDPT.",
keywords = "Parameter-efficient fine-tuning, Prompt tuning, Visual-language pre-trained models",
author = "Yang Zhou and Yongjian Wu and Jiya Saiyin and Bingzheng Wei and Maode Lai and Eric Chang and Yan Xu",
note = "Publisher Copyright: {\textcopyright} The Author(s), under exclusive license to Springer Nature Switzerland AG 2025.; 18th European Conference on Computer Vision, ECCV 2024 ; Conference date: 29-09-2024 Through 04-10-2024",
year = "2025",
doi = "10.1007/978-3-031-72967-6\_19",
language = "英语",
isbn = "9783031729669",
series = "Lecture Notes in Computer Science",
publisher = "Springer Science and Business Media Deutschland GmbH",
pages = "340--356",
editor = "Ale{\v s} Leonardis and Elisa Ricci and Stefan Roth and Olga Russakovsky and Torsten Sattler and G{\"u}l Varol",
booktitle = "Computer Vision – ECCV 2024 - 18th European Conference, Proceedings",
address = "德国",
}