% Conference paper in the ECCV 2022 proceedings (see the booktitle and note fields).
% Conventions used in this record:
%   - author/editor names are in "Last, First" form, so multi-word given names
%     need no protective braces;
%   - the DOI is stored bare, without LaTeX escaping;
%   - field values are ASCII, with accents written as LaTeX escapes.
% NOTE(review): address holds a country, not the publisher's city; replace it
% with the city if it can be confirmed.
@inproceedings{5e7e6704f6e142de8f7250ca473e40de,
  author    = {Liu, Yang and Zhou, Lei and Zhang, Pengcheng and Bai, Xiao and Gu, Lin and Yu, Xiaohan and Zhou, Jun and Hancock, Edwin R.},
  title     = {Where to Focus: Investigating Hierarchical Attention Relationship for Fine-Grained Visual Classification},
  booktitle = {Computer Vision -- {ECCV} 2022 - 17th European Conference, Proceedings},
  editor    = {Avidan, Shai and Brostow, Gabriel and Ciss{\'e}, Moustapha and Farinella, Giovanni Maria and Hassner, Tal},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  pages     = {57--73},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  address   = {Germany},
  year      = {2022},
  doi       = {10.1007/978-3-031-20053-3_4},
  isbn      = {9783031200526},
  language  = {English},
  keywords  = {Fine-grained visual classification, Human attention, Multi-granularity, Orthogonal fusion},
  abstract  = {Object categories are often grouped into a multi-granularity taxonomic hierarchy. Classifying objects at coarser-grained hierarchy requires global and common characteristics, while finer-grained hierarchy classification relies on local and discriminative features. Therefore, humans should also subconsciously focus on different object regions when classifying different hierarchies. This granularity-wise attention is confirmed by our collected human real-time gaze data on different hierarchy classifications. To leverage this mechanism, we propose a Cross-Hierarchical Region Feature (CHRF) learning framework. Specifically, we first design a region feature mining module that imitates humans to learn different granularity-wise attention regions with multi-grained classification tasks. To explore how human attention shifts from one hierarchy to another, we further present a cross-hierarchical orthogonal fusion module to enhance the region feature representation by blending the original feature and an orthogonal component extracted from adjacent hierarchies. Experiments on five hierarchical fine-grained datasets demonstrate the effectiveness of CHRF compared with the state-of-the-art methods. Ablation study and visualization results also consistently verify the advantages of our human attention-oriented modules. The code and dataset are available at https://github.com/visiondom/CHRF.},
  note      = {Publisher Copyright: {\textcopyright} 2022, The Author(s), under exclusive license to Springer Nature Switzerland AG.; 17th European Conference on Computer Vision, ECCV 2022 ; Conference date: 23-10-2022 Through 27-10-2022},
}