@inproceedings{c410688343584d62b08abf41f97c881f,
title = "Progressive Self-supervised Spatio-temporal Feature Learning Based on Video Sequence Saliency",
abstract = "We observe that videos have different levels of frame/clip sequence saliency, and the effective utilization of frame/clip sequence saliency is beneficial to spatio-temporal feature learning. Therefore, we propose a new concept called video sequence saliency (VSS) to measure the degree of difficulty of models in identifying the correct frames/clip orders of videos. Accordingly, we developed a novel method named progressive self-supervised spatio-temporal feature learning based on VSS (PSSFL-VSS). For the pretext task of clip order prediction, the videos are input into networks in descending order by VSS values rather than randomly, as in traditional methods. In addition, we update the VSS value of each video based on clip order prediction results. The effectiveness of our pre-trained models is verified by carrying out the downstream tasks of clip/video retrieval and action recognition, and experimental results show that our method achieves apparent improvements over the state-of-the-art methods.",
keywords = "Action Recognition, Clip/Video Retrieval, Self-supervised Learning, Spatio-temporal Feature Learning, Video Sequence Saliency",
author = "Kang, Jinlong and Xu, Tao and Qu, Boting and Wang, Xiang and Lian, Xiaoli and Guo, Jing and Gao, Yuan",
note = "Publisher Copyright: {\textcopyright} 2025 SPIE.; 8th International Conference on Video and Image Processing, ICVIP 2024 ; Conference date: 13-12-2024 Through 15-12-2024",
year = "2025",
doi = "10.1117/12.3059113",
language = "English",
series = "Proceedings of SPIE - The International Society for Optical Engineering",
publisher = "SPIE",
editor = "Liang, Xuefeng",
booktitle = "Eighth International Conference on Video and Image Processing, ICVIP 2024",
address = "United States",
}