% Conference paper published in the CAC 2021 proceedings (IEEE).
% The citation key is kept unchanged so existing citations keep resolving.
% The abstract is reproduced verbatim from the exported record.
% NOTE(review): address holds the publisher country as exported; replace
% with the publisher city if it can be confirmed.
@inproceedings{ddaab5e8e55e4b808be45e433b5d18eb,
  author    = {Li, Zexian and Wang, Tian and Zhu, Aichun and Liu, Kexin and Shi, Peng and Snoussi, Hichem},
  title     = {{STD-TR}: End-to-End Spatio-Temporal Action Detection with Transformers},
  booktitle = {Proceeding - 2021 China Automation Congress, {CAC} 2021},
  pages     = {7615--7620},
  year      = {2021},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  doi       = {10.1109/CAC53003.2021.9727692},
  language  = {English},
  keywords  = {Action Detection, End-to-End, Spatio-Temporal Action Detection, Transformers},
  abstract  = {Spatio-temporal action detection methods locate human actions in both spatial and temporal dimension, which usually follow a two-stage structure. In this paper, We propose STD-TR, a novel spatio-temporal action detection framework with an end-to-end transformer structure. STD-TR employs two branches to extract feature from video clip and key frame concurrently, then sends the aggregated feature to the transformer encoder-decoder. View spatio-temporal action detection as a set matching and prediction problem, STD-TR employs learned object queries to model the relation of feature context, and directly outputs all predictions at one inference time. Our method remove all hand-designed and can be optimized by a joint loss. Besides, a Hungarian algorithm and a upgraded linking strategy are used for bipartite set matching and action tube generation respectively. Convincing experiment result on challenging dataset demonstrates the superiority of our method.},
  note      = {Publisher Copyright: {\textcopyright} 2021 IEEE; 2021 China Automation Congress, CAC 2021 ; Conference date: 22-10-2021 Through 24-10-2021},
}