% Conference paper: "Cross-Modal Omni Interaction Modeling for Phrase Grounding"
% (28th ACM International Conference on Multimedia, MM 2020).
% Maintenance notes:
%   - language is stored as the identifier "english" (the export carried a
%     localized, non-ASCII display string instead).
%   - no series field: the exported value was a verbatim copy of booktitle and
%     made standard styles print the proceedings title twice.
%   - abstract and note are kept exactly as exported.
@inproceedings{de6811d927994ea5abfacc78cfdeeffc,
  author    = {Yu, Tianyu and Hui, Tianrui and Yu, Zhihao and Liao, Yue and Yu, Sansi and Zhang, Faxi and Liu, Si},
  title     = {Cross-Modal Omni Interaction Modeling for Phrase Grounding},
  booktitle = {{MM} 2020 - Proceedings of the 28th {ACM} International Conference on Multimedia},
  publisher = {Association for Computing Machinery, Inc},
  pages     = {1725--1734},
  year      = {2020},
  month     = oct,
  day       = {12},
  doi       = {10.1145/3394171.3413846},
  language  = {english},
  keywords  = {attention mechanism, phrase grounding, supervised learning, visual and language},
  abstract  = {Phrase grounding aims to localize the objects described by phrases in a natural language specification. Previous works model the interaction of inputs from text modality and visual modality only in the intra-modal global level and consequently lacks the ability to capture the precise and complete context information. In this paper, we propose a novel Cross-Modal Omni Interaction network (COI Net) composed of a neighboring interaction module, a global interaction module, a cross-modal interaction module and a multilevel alignment module. Our approach formulates the complex spatial and semantic relationship among image regions and phrases through multi-level multi-modal interaction. We capture the local relationship using the interaction among neighboring regions and then collect the global context through the interaction among all regions using a transformer encoder. We further use a co-attention module to apply the interaction between two modalities to gather the cross-modal context for all image regions and phrases. In addition to the omni interaction modeling, we also leverage a straightforward yet effective multilevel alignment regularization to formulate the dependencies among all grounding decisions. We extensively validate the effectiveness of our model. Experiments show that our approach outperforms existing state-of-the-art methods by large margins on two popular datasets in terms of accuracy: 6.15\% on Flickr30K Entities (71.36\% increased to 77.51\%) and 21.25\% on ReferItGame (44.91\% increased to 66.16\%). The code of our implementation is available at https://github.com/yiranyyu/Phrase-Grounding.},
  note      = {Publisher Copyright: {\textcopyright} 2020 ACM.; 28th ACM International Conference on Multimedia, MM 2020 ; Conference date: 12-10-2020 Through 16-10-2020},
}