Towards Robust Multimodal AU Detection: STN-Enhanced Visual Encoding and Audio-Visual Spatial-Temporal Alignment
Abstract
Facial Action Unit (AU) detection is crucial for understanding human emotional expressions and for enhancing the perceptual capabilities of intelligent systems, particularly in in-the-wild environments. The key challenge lies in accurately extracting subtle motion features and effectively integrating complex facial dynamics. Existing methods, however, are hampered by the complexity of multimodal data and by varying environmental conditions, leading to suboptimal performance. To address these shortcomings, we propose a novel multimodal domain alignment strategy. Specifically, we design an audio-visual feature extraction and fusion framework that analyzes pre-processed audio and video frames at both the global and local levels. For video frames, we employ ConvNeXt integrated with a Spatial Transformer Network (STN) as the image encoder; for audio, we extract features from Mel spectrograms using Whisper. In addition, we introduce a Temporal Convolutional Network (TCN) for temporal modeling of the multimodal features. Experimental results demonstrate that our approach achieves superior performance on the Aff-Wild2 dataset, significantly improving AU detection accuracy over existing methods and delivering state-of-the-art detection capability. Notably, our method secured first place in the AU Detection track of the 8th Competition on Affective Behavior Analysis in-the-Wild (ABAW8), further underscoring its effectiveness and robustness in real-world scenarios.
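
The abstract describes the visual path only at a high level: a Spatial Transformer Network placed in front of a ConvNeXt encoder. The sketch below is a minimal PyTorch rendering of that idea, not the authors' implementation; the localization-network architecture, the ConvNeXt-Tiny variant from torchvision, and the 224x224 input size are all assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import convnext_tiny


class STN(nn.Module):
    """Spatial Transformer: predicts a 2x3 affine transform and warps the
    input frame before it reaches the backbone (architecture assumed)."""

    def __init__(self):
        super().__init__()
        self.loc = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=7, stride=2), nn.ReLU(),
            nn.Conv2d(8, 16, kernel_size=5, stride=2), nn.ReLU(),
            nn.AdaptiveAvgPool2d(4), nn.Flatten(),
            nn.Linear(16 * 4 * 4, 32), nn.ReLU(),
            nn.Linear(32, 6),
        )
        # Start from the identity transform so early training sees raw frames.
        self.loc[-1].weight.data.zero_()
        self.loc[-1].bias.data.copy_(
            torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float))

    def forward(self, x):                          # x: (B, 3, H, W)
        theta = self.loc(x).view(-1, 2, 3)
        grid = F.affine_grid(theta, x.size(), align_corners=False)
        return F.grid_sample(x, grid, align_corners=False)


class VisualEncoder(nn.Module):
    """STN-enhanced ConvNeXt image encoder (ConvNeXt-Tiny assumed)."""

    def __init__(self):
        super().__init__()
        self.stn = STN()
        self.backbone = convnext_tiny(weights=None)
        self.backbone.classifier[-1] = nn.Identity()  # expose 768-d features

    def forward(self, frames):                     # frames: (B, 3, 224, 224)
        return self.backbone(self.stn(frames))     # -> (B, 768)
```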
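For the audio path, the abstract states only that Mel spectrogram features are extracted with Whisper. A minimal sketch using the openai-whisper package, assuming the base model (512-d encoder states) and Whisper's fixed 30-second analysis window; the audio file name is hypothetical:

```python
import torch
import whisper  # pip install openai-whisper

model = whisper.load_model("base")
audio = whisper.load_audio("clip.wav")       # hypothetical path; 16 kHz mono
audio = whisper.pad_or_trim(audio)           # pad/trim to the 30 s window
dtype = next(model.parameters()).dtype       # fp16 on GPU, fp32 on CPU
mel = whisper.log_mel_spectrogram(audio).to(model.device, dtype)  # (80, 3000)
with torch.no_grad():
    # Encoder states: (1, 1500, 512) for the base model, i.e. 50 frames/s.
    audio_feats = model.encoder(mel.unsqueeze(0))
```

These 50 Hz encoder states would still need resampling or interpolation to the video frame rate before fusion; the abstract does not spell out the paper's alignment procedure.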
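Finally, the abstract names a Temporal Convolutional Network for temporal modeling of the multimodal features. The block below is one standard TCN formulation (dilated residual 1-D convolutions) applied to concatenated per-frame audio-visual features; the dimensions (768-d visual, 512-d audio, 12 AUs as annotated in Aff-Wild2), the hidden width, and fusion by concatenation are assumptions, not the paper's reported design.

```python
import torch
import torch.nn as nn


class TemporalBlock(nn.Module):
    """Residual block of two dilated 1-D convolutions (length-preserving:
    kernel 3 with padding == dilation keeps the sequence length)."""

    def __init__(self, channels, dilation):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(channels, channels, 3, padding=dilation, dilation=dilation),
            nn.ReLU(),
            nn.Conv1d(channels, channels, 3, padding=dilation, dilation=dilation),
        )
        self.relu = nn.ReLU()

    def forward(self, x):                          # x: (B, C, T)
        return self.relu(x + self.net(x))


class AVFusionTCN(nn.Module):
    """Concatenate aligned per-frame visual and audio features, project,
    run a dilated TCN over time, and emit per-frame AU logits."""

    def __init__(self, vis_dim=768, aud_dim=512, hidden=256, num_aus=12):
        super().__init__()
        self.proj = nn.Linear(vis_dim + aud_dim, hidden)
        self.tcn = nn.Sequential(*[TemporalBlock(hidden, d) for d in (1, 2, 4)])
        self.head = nn.Linear(hidden, num_aus)

    def forward(self, vis, aud):                   # (B, T, vis_dim), (B, T, aud_dim)
        x = self.proj(torch.cat([vis, aud], dim=-1))       # (B, T, hidden)
        x = self.tcn(x.transpose(1, 2)).transpose(1, 2)    # convolve over T
        return self.head(x)                                # (B, T, num_aus)
```

A multi-label binary cross-entropy loss over the per-frame AU logits would be the usual training objective for a head of this shape.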
Related Material
@InProceedings{Yu_2025_CVPR,
    author    = {Yu, Jun and Zhang, Yunxiang and Sun, Fengzhao and Wang, Leilei and Lu, Renjie and Zhu, Lingsi and Lu, Xilong and Zheng, Yang and Wang, Yongqi},
    title     = {Towards Robust Multimodal AU Detection: STN-Enhanced Visual Encoding and Audio-Visual Spatial-Temporal Alignment},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = {June},
    year      = {2025},
    pages     = {5734-5741}
}