Papers
- Back
Seeing the World through Your Eyes-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Alzayer_2024_CVPR,
  author    = {Alzayer, Hadi and Zhang, Kevin and Feng, Brandon and Metzler, Christopher A. and Huang, Jia-Bin},
  title     = {Seeing the World through Your Eyes},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {4864--4873}
}
Ungeneralizable Examples-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ye_2024_CVPR,
  author    = {Ye, Jingwen and Wang, Xinchao},
  title     = {Ungeneralizable Examples},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {11944--11953}
}
LaneCPP: Continuous 3D Lane Detection using Physical Priors-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Pittner_2024_CVPR,
  author    = {Pittner, Maximilian and Janai, Joel and Condurache, Alexandru P.},
  title     = {{LaneCPP}: Continuous {3D} Lane Detection using Physical Priors},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {10639--10648}
}
CityDreamer: Compositional Generative Model of Unbounded 3D Cities-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xie_2024_CVPR,
  author    = {Xie, Haozhe and Chen, Zhaoxi and Hong, Fangzhou and Liu, Ziwei},
  title     = {{CityDreamer}: Compositional Generative Model of Unbounded {3D} Cities},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {9666--9675}
}
Action Detection via an Image Diffusion Process-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Foo_2024_CVPR,
  author    = {Foo, Lin Geng and Li, Tianjiao and Rahmani, Hossein and Liu, Jun},
  title     = {Action Detection via an Image Diffusion Process},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {18351--18361}
}
ConTex-Human: Free-View Rendering of Human from a Single Image with Texture-Consistent Synthesis-
[pdf]
[supp]
[bibtex]@InProceedings{Gao_2024_CVPR,
  author    = {Gao, Xiangjun and Li, Xiaoyu and Zhang, Chaopeng and Zhang, Qi and Cao, Yanpei and Shan, Ying and Quan, Long},
  title     = {{ConTex-Human}: Free-View Rendering of Human from a Single Image with Texture-Consistent Synthesis},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {10084--10094}
}
Streaming Dense Video Captioning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhou_2024_CVPR,
  author    = {Zhou, Xingyi and Arnab, Anurag and Buch, Shyamal and Yan, Shen and Myers, Austin and Xiong, Xuehan and Nagrani, Arsha and Schmid, Cordelia},
  title     = {Streaming Dense Video Captioning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {18243--18252}
}
Rethinking Inductive Biases for Surface Normal Estimation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Bae_2024_CVPR,
  author    = {Bae, Gwangbin and Davison, Andrew J.},
  title     = {Rethinking Inductive Biases for Surface Normal Estimation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {9535--9545}
}
Fair Federated Learning under Domain Skew with Local Consistency and Domain Diversity-
[pdf]
[arXiv]
[bibtex]@InProceedings{Chen_2024_CVPR,
  author    = {Chen, Yuhang and Huang, Wenke and Ye, Mang},
  title     = {Fair Federated Learning under Domain Skew with Local Consistency and Domain Diversity},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {12077--12086}
}
HIG: Hierarchical Interlacement Graph Approach to Scene Graph Generation in Video Understanding-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Nguyen_2024_CVPR,
  author    = {Nguyen, Trong-Thuan and Nguyen, Pha and Luu, Khoa},
  title     = {{HIG}: Hierarchical Interlacement Graph Approach to Scene Graph Generation in Video Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {18384--18394}
}
OOSTraj: Out-of-Sight Trajectory Prediction With Vision-Positioning Denoising-
[pdf]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR,
  author    = {Zhang, Haichao and Xu, Yi and Lu, Hongsheng and Shimizu, Takayuki and Fu, Yun},
  title     = {{OOSTraj}: Out-of-Sight Trajectory Prediction With Vision-Positioning Denoising},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {14802--14811}
}
FADES: Fair Disentanglement with Sensitive Relevance-
[pdf]
[supp]
[bibtex]@InProceedings{Jang_2024_CVPR,
  author    = {Jang, Taeuk and Wang, Xiaoqian},
  title     = {{FADES}: Fair Disentanglement with Sensitive Relevance},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {12067--12076}
}
Self-Supervised Class-Agnostic Motion Prediction with Spatial and Temporal Consistency Regularizations-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wang_2024_CVPR,
  author    = {Wang, Kewei and Wu, Yizheng and Cen, Jun and Pan, Zhiyu and Li, Xingyi and Wang, Zhe and Cao, Zhiguo and Lin, Guosheng},
  title     = {Self-Supervised Class-Agnostic Motion Prediction with Spatial and Temporal Consistency Regularizations},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {14638--14647}
}
CAT: Exploiting Inter-Class Dynamics for Domain Adaptive Object Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Kennerley_2024_CVPR,
  author    = {Kennerley, Mikhail and Wang, Jian-Gang and Veeravalli, Bharadwaj and Tan, Robby T.},
  title     = {{CAT}: Exploiting Inter-Class Dynamics for Domain Adaptive Object Detection},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {16541--16550}
}
An Empirical Study of Scaling Law for Scene Text Recognition-
[pdf]
[supp]
[bibtex]@InProceedings{Rang_2024_CVPR,
  author    = {Rang, Miao and Bi, Zhenni and Liu, Chuanjian and Wang, Yunhe and Han, Kai},
  title     = {An Empirical Study of Scaling Law for Scene Text Recognition},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {15619--15629}
}
Text2Loc: 3D Point Cloud Localization from Natural Language-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xia_2024_CVPR,
  author    = {Xia, Yan and Shi, Letian and Ding, Zifeng and Henriques, Joao F. and Cremers, Daniel},
  title     = {{Text2Loc}: {3D} Point Cloud Localization from Natural Language},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {14958--14967}
}
Decomposing Disease Descriptions for Enhanced Pathology Detection: A Multi-Aspect Vision-Language Pre-training Framework-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Phan_2024_CVPR,
  author    = {Phan, Vu Minh Hieu and Xie, Yutong and Qi, Yuankai and Liu, Lingqiao and Liu, Liyang and Zhang, Bowen and Liao, Zhibin and Wu, Qi and To, Minh-Son and Verjans, Johan W.},
  title     = {Decomposing Disease Descriptions for Enhanced Pathology Detection: A Multi-Aspect Vision-Language Pre-training Framework},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {11492--11501}
}
Fusing Personal and Environmental Cues for Identification and Segmentation of First-Person Camera Wearers in Third-Person Views-
[pdf]
[supp]
[bibtex]@InProceedings{Zhao_2024_CVPR,
  author    = {Zhao, Ziwei and Wang, Yuchen and Wang, Chuhua},
  title     = {Fusing Personal and Environmental Cues for Identification and Segmentation of First-Person Camera Wearers in Third-Person Views},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {16477--16487}
}
Desigen: A Pipeline for Controllable Design Template Generation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Weng_2024_CVPR,
  author    = {Weng, Haohan and Huang, Danqing and Qiao, Yu and Hu, Zheng and Lin, Chin-Yew and Zhang, Tong and Chen, C. L. Philip},
  title     = {{Desigen}: A Pipeline for Controllable Design Template Generation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {12721--12732}
}
Multi-criteria Token Fusion with One-step-ahead Attention for Efficient Vision Transformers-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Lee_2024_CVPR,
  author    = {Lee, Sanghyeok and Choi, Joonmyung and Kim, Hyunwoo J.},
  title     = {Multi-criteria Token Fusion with One-step-ahead Attention for Efficient Vision Transformers},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {15741--15750}
}
ViewFusion: Towards Multi-View Consistency via Interpolated Denoising-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Yang_2024_CVPR,
  author    = {Yang, Xianghui and Zuo, Yan and Ramasinghe, Sameera and Bazzani, Loris and Avraham, Gil and van den Hengel, Anton},
  title     = {{ViewFusion}: Towards Multi-View Consistency via Interpolated Denoising},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {9870--9880}
}
SketchINR: A First Look into Sketches as Implicit Neural Representations-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Bandyopadhyay_2024_CVPR,
  author    = {Bandyopadhyay, Hmrishav and Bhunia, Ayan Kumar and Chowdhury, Pinaki Nath and Sain, Aneeshan and Xiang, Tao and Hospedales, Timothy and Song, Yi-Zhe},
  title     = {{SketchINR}: A First Look into Sketches as Implicit Neural Representations},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {12565--12574}
}
MatchU: Matching Unseen Objects for 6D Pose Estimation from RGB-D Images-
[pdf]
[supp]
[bibtex]@InProceedings{Huang_2024_CVPR,
  author    = {Huang, Junwen and Yu, Hao and Yu, Kuan-Ting and Navab, Nassir and Ilic, Slobodan and Busam, Benjamin},
  title     = {{MatchU}: Matching Unseen Objects for {6D} Pose Estimation from {RGB-D} Images},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {10095--10105}
}
Towards High-fidelity Artistic Image Vectorization via Texture-Encapsulated Shape Parameterization-
[pdf]
[bibtex]@InProceedings{Chen_2024_CVPR,
  author    = {Chen, Ye and Ni, Bingbing and Liu, Jinfan and Huang, Xiaoyang and Chen, Xuanhong},
  title     = {Towards High-fidelity Artistic Image Vectorization via Texture-Encapsulated Shape Parameterization},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {15877--15886}
}
EfficientSAM: Leveraged Masked Image Pretraining for Efficient Segment Anything-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xiong_2024_CVPR,
  author    = {Xiong, Yunyang and Varadarajan, Bala and Wu, Lemeng and Xiang, Xiaoyu and Xiao, Fanyi and Zhu, Chenchen and Dai, Xiaoliang and Wang, Dilin and Sun, Fei and Iandola, Forrest and Krishnamoorthi, Raghuraman and Chandra, Vikas},
  title     = {{EfficientSAM}: Leveraged Masked Image Pretraining for Efficient Segment Anything},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {16111--16121}
}
ChatScene: Knowledge-Enabled Safety-Critical Scenario Generation for Autonomous Vehicles-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR,
  author    = {Zhang, Jiawei and Xu, Chejian and Li, Bo},
  title     = {{ChatScene}: Knowledge-Enabled Safety-Critical Scenario Generation for Autonomous Vehicles},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {15459--15469}
}
Teeth-SEG: An Efficient Instance Segmentation Framework for Orthodontic Treatment based on Multi-Scale Aggregation and Anthropic Prior Knowledge-
[pdf]
[supp]
[bibtex]@InProceedings{Zou_2024_CVPR,
  author    = {Zou, Bo and Wang, Shaofeng and Liu, Hao and Sun, Gaoyue and Wang, Yajie and Zuo, FeiFei and Quan, Chengbin and Zhao, Youjian},
  title     = {{Teeth-SEG}: An Efficient Instance Segmentation Framework for Orthodontic Treatment based on Multi-Scale Aggregation and Anthropic Prior Knowledge},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {11601--11610}
}
Bayesian Diffusion Models for 3D Shape Reconstruction-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xu_2024_CVPR,
  author    = {Xu, Haiyang and Lei, Yu and Chen, Zeyuan and Zhang, Xiang and Zhao, Yue and Wang, Yilin and Tu, Zhuowen},
  title     = {Bayesian Diffusion Models for {3D} Shape Reconstruction},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {10628--10638}
}
CrossKD: Cross-Head Knowledge Distillation for Object Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wang_2024_CVPR,
  author    = {Wang, Jiabao and Chen, Yuming and Zheng, Zhaohui and Li, Xiang and Cheng, Ming-Ming and Hou, Qibin},
  title     = {{CrossKD}: Cross-Head Knowledge Distillation for Object Detection},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {16520--16530}
}
Bi-level Learning of Task-Specific Decoders for Joint Registration and One-Shot Medical Image Segmentation-
[pdf]
[supp]
[bibtex]@InProceedings{Fan_2024_CVPR,
  author    = {Fan, Xin and Wang, Xiaolin and Gao, Jiaxin and Wang, Jia and Luo, Zhongxuan and Liu, Risheng},
  title     = {Bi-level Learning of Task-Specific Decoders for Joint Registration and One-Shot Medical Image Segmentation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {11726--11735}
}
EscherNet: A Generative Model for Scalable View Synthesis-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Kong_2024_CVPR,
  author    = {Kong, Xin and Liu, Shikun and Lyu, Xiaoyang and Taher, Marwan and Qi, Xiaojuan and Davison, Andrew J.},
  title     = {{EscherNet}: A Generative Model for Scalable View Synthesis},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {9503--9513}
}
MeaCap: Memory-Augmented Zero-shot Image Captioning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zeng_2024_CVPR,
  author    = {Zeng, Zequn and Xie, Yan and Zhang, Hao and Chen, Chiyu and Chen, Bo and Wang, Zhengjue},
  title     = {{MeaCap}: Memory-Augmented Zero-shot Image Captioning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {14100--14110}
}
Elite360D: Towards Efficient 360 Depth Estimation via Semantic- and Distance-Aware Bi-Projection Fusion-
[pdf]
[supp]
[bibtex]@InProceedings{Ai_2024_CVPR,
  author    = {Ai, Hao and Wang, Lin},
  title     = {{Elite360D}: Towards Efficient 360 Depth Estimation via Semantic- and Distance-Aware Bi-Projection Fusion},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {9926--9935}
}
Curriculum Point Prompting for Weakly-Supervised Referring Image Segmentation-
[pdf]
[arXiv]
[bibtex]@InProceedings{Dai_2024_CVPR,
  author    = {Dai, Qiyuan and Yang, Sibei},
  title     = {Curriculum Point Prompting for Weakly-Supervised Referring Image Segmentation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {13711--13722}
}
EventDance: Unsupervised Source-free Cross-modal Adaptation for Event-based Object Recognition-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zheng_2024_CVPR,
  author    = {Zheng, Xu and Wang, Lin},
  title     = {{EventDance}: Unsupervised Source-free Cross-modal Adaptation for Event-based Object Recognition},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {17448--17458}
}
CycleINR: Cycle Implicit Neural Representation for Arbitrary-Scale Volumetric Super-Resolution of Medical Data-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Fang_2024_CVPR,
  author    = {Fang, Wei and Tang, Yuxing and Guo, Heng and Yuan, Mingze and Mok, Tony C. W. and Yan, Ke and Yao, Jiawen and Chen, Xin and Liu, Zaiyi and Lu, Le and Zhang, Ling and Xu, Minfeng},
  title     = {{CycleINR}: Cycle Implicit Neural Representation for Arbitrary-Scale Volumetric Super-Resolution of Medical Data},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {11631--11641}
}
Holistic Autonomous Driving Understanding by Bird's-Eye-View Injected Multi-Modal Large Models-
[pdf]
[supp]
[bibtex]@InProceedings{Ding_2024_CVPR,
  author    = {Ding, Xinpeng and Han, Jianhua and Xu, Hang and Liang, Xiaodan and Zhang, Wei and Li, Xiaomeng},
  title     = {Holistic Autonomous Driving Understanding by Bird's-Eye-View Injected Multi-Modal Large Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {13668--13677}
}
Extreme Point Supervised Instance Segmentation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Lee_2024_CVPR,
  author    = {Lee, Hyeonjun and Hwang, Sehyun and Kwak, Suha},
  title     = {Extreme Point Supervised Instance Segmentation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {17212--17222}
}
MedM2G: Unifying Medical Multi-Modal Generation via Cross-Guided Diffusion with Visual Invariant-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhan_2024_CVPR,
  author    = {Zhan, Chenlu and Lin, Yu and Wang, Gaoang and Wang, Hongwei and Wu, Jian},
  title     = {{MedM2G}: Unifying Medical Multi-Modal Generation via Cross-Guided Diffusion with Visual Invariant},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {11502--11512}
}
Neural Parametric Gaussians for Monocular Non-Rigid Object Reconstruction-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Das_2024_CVPR,
  author    = {Das, Devikalyan and Wewer, Christopher and Yunus, Raza and Ilg, Eddy and Lenssen, Jan Eric},
  title     = {Neural Parametric {Gaussians} for Monocular Non-Rigid Object Reconstruction},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {10715--10725}
}
PH-Net: Semi-Supervised Breast Lesion Segmentation via Patch-wise Hardness-
[pdf]
[supp]
[bibtex]@InProceedings{Jiang_2024_CVPR,
  author    = {Jiang, Siyao and Wu, Huisi and Chen, Junyang and Zhang, Qin and Qin, Jing},
  title     = {{PH-Net}: Semi-Supervised Breast Lesion Segmentation via Patch-wise Hardness},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {11418--11427}
}
ExACT: Language-guided Conceptual Reasoning and Uncertainty Estimation for Event-based Action Recognition and More-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhou_2024_CVPR,
  author    = {Zhou, Jiazhou and Zheng, Xu and Lyu, Yuanhuiyi and Wang, Lin},
  title     = {{ExACT}: Language-guided Conceptual Reasoning and Uncertainty Estimation for Event-based Action Recognition and More},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {18633--18643}
}
Improving Visual Recognition with Hyperbolical Visual Hierarchy Mapping-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Kwon_2024_CVPR,
  author    = {Kwon, Hyeongjun and Jang, Jinhyun and Kim, Jin and Kim, Kwonyoung and Sohn, Kwanghoon},
  title     = {Improving Visual Recognition with Hyperbolical Visual Hierarchy Mapping},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {17364--17374}
}
ParameterNet: Parameters Are All You Need for Large-scale Visual Pretraining of Mobile Networks-
[pdf]
[bibtex]@InProceedings{Han_2024_CVPR,
  author    = {Han, Kai and Wang, Yunhe and Guo, Jianyuan and Wu, Enhua},
  title     = {{ParameterNet}: Parameters Are All You Need for Large-scale Visual Pretraining of Mobile Networks},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {15751--15761}
}
Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ke_2024_CVPR,
  author    = {Ke, Bingxin and Obukhov, Anton and Huang, Shengyu and Metzger, Nando and Daudt, Rodrigo Caye and Schindler, Konrad},
  title     = {Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {9492--9502}
}
LLMs are Good Sign Language Translators-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Gong_2024_CVPR,
  author    = {Gong, Jia and Foo, Lin Geng and He, Yixuan and Rahmani, Hossein and Liu, Jun},
  title     = {{LLMs} are Good Sign Language Translators},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {18362--18372}
}
Revisiting the Domain Shift and Sample Uncertainty in Multi-source Active Domain Transfer-
[pdf]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR,
  author    = {Zhang, Wenqiao and Lv, Zheqi and Zhou, Hao and Liu, Jia-Wei and Li, Juncheng and Li, Mengze and Li, Yunfei and Zhang, Dongping and Zhuang, Yueting and Tang, Siliang},
  title     = {Revisiting the Domain Shift and Sample Uncertainty in Multi-source Active Domain Transfer},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {16751--16761}
}
Learning Continual Compatible Representation for Re-indexing Free Lifelong Person Re-identification-
[pdf]
[supp]
[bibtex]@InProceedings{Cui_2024_CVPR,
  author    = {Cui, Zhenyu and Zhou, Jiahuan and Wang, Xun and Zhu, Manyu and Peng, Yuxin},
  title     = {Learning Continual Compatible Representation for Re-indexing Free Lifelong Person Re-identification},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {16614--16623}
}
CORES: Convolutional Response-based Score for Out-of-distribution Detection-
[pdf]
[bibtex]@InProceedings{Tang_2024_CVPR,
  author    = {Tang, Keke and Hou, Chao and Peng, Weilong and Chen, Runnan and Zhu, Peican and Wang, Wenping and Tian, Zhihong},
  title     = {{CORES}: Convolutional Response-based Score for Out-of-distribution Detection},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {10916--10925}
}
Accurate Spatial Gene Expression Prediction by Integrating Multi-Resolution Features-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Chung_2024_CVPR,
  author    = {Chung, Youngmin and Ha, Ji Hun and Im, Kyeong Chan and Lee, Joo Sang},
  title     = {Accurate Spatial Gene Expression Prediction by Integrating Multi-Resolution Features},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {11591--11600}
}
Behind the Veil: Enhanced Indoor 3D Scene Reconstruction with Occluded Surfaces Completion-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Sun_2024_CVPR,
  author    = {Sun, Su and Zhao, Cheng and Guo, Yuliang and Wang, Ruoyu and Huang, Xinyu and Chen, Yingjie Victor and Ren, Liu},
  title     = {Behind the Veil: Enhanced Indoor {3D} Scene Reconstruction with Occluded Surfaces Completion},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {12744--12753}
}
VideoGrounding-DINO: Towards Open-Vocabulary Spatio-Temporal Video Grounding-
[pdf]
[bibtex]@InProceedings{Wasim_2024_CVPR,
  author    = {Wasim, Syed Talal and Naseer, Muzammal and Khan, Salman and Yang, Ming-Hsuan and Khan, Fahad Shahbaz},
  title     = {{VideoGrounding-DINO}: Towards Open-Vocabulary Spatio-Temporal Video Grounding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {18909--18918}
}
Think Twice Before Selection: Federated Evidential Active Learning for Medical Image Analysis with Domain Shifts-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Chen_2024_CVPR,
  author    = {Chen, Jiayi and Ma, Benteng and Cui, Hengfei and Xia, Yong},
  title     = {Think Twice Before Selection: Federated Evidential Active Learning for Medical Image Analysis with Domain Shifts},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {11439--11449}
}
ViTamin: Designing Scalable Vision Models in the Vision-Language Era-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Chen_2024_CVPR,
  author    = {Chen, Jieneng and Yu, Qihang and Shen, Xiaohui and Yuille, Alan and Chen, Liang-Chieh},
  title     = {{ViTamin}: Designing Scalable Vision Models in the Vision-Language Era},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {12954--12966}
}
Seeing the Unseen: Visual Common Sense for Semantic Placement-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ramrakhya_2024_CVPR,
  author    = {Ramrakhya, Ram and Kembhavi, Aniruddha and Batra, Dhruv and Kira, Zsolt and Zeng, Kuo-Hao and Weihs, Luca},
  title     = {Seeing the Unseen: Visual Common Sense for Semantic Placement},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {16273--16283}
}
LLaMA-Excitor: General Instruction Tuning via Indirect Feature Interaction-
[pdf]
[supp]
[bibtex]@InProceedings{Zou_2024_CVPR,
  author    = {Zou, Bo and Yang, Chao and Qiao, Yu and Quan, Chengbin and Zhao, Youjian},
  title     = {{LLaMA-Excitor}: General Instruction Tuning via Indirect Feature Interaction},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {14089--14099}
}
Steerers: A Framework for Rotation Equivariant Keypoint Descriptors-
[pdf]
[supp]
[bibtex]@InProceedings{Bokman_2024_CVPR,
  author    = {B{\"o}kman, Georg and Edstedt, Johan and Felsberg, Michael and Kahl, Fredrik},
  title     = {Steerers: A Framework for Rotation Equivariant Keypoint Descriptors},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {4885--4895}
}
Efficient Dataset Distillation via Minimax Diffusion-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Gu_2024_CVPR,
  author    = {Gu, Jianyang and Vahidian, Saeed and Kungurtsev, Vyacheslav and Wang, Haonan and Jiang, Wei and You, Yang and Chen, Yiran},
  title     = {Efficient Dataset Distillation via Minimax Diffusion},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {15793--15803}
}
Posterior Distillation Sampling-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Koo_2024_CVPR,
  author    = {Koo, Juil and Park, Chanho and Sung, Minhyuk},
  title     = {Posterior Distillation Sampling},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {13352--13361}
}
HOISDF: Constraining 3D Hand-Object Pose Estimation with Global Signed Distance Fields-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Qi_2024_CVPR,
  author    = {Qi, Haozhe and Zhao, Chen and Salzmann, Mathieu and Mathis, Alexander},
  title     = {{HOISDF}: Constraining {3D} Hand-Object Pose Estimation with Global Signed Distance Fields},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {10392--10402}
}
DiffPortrait3D: Controllable Diffusion for Zero-Shot Portrait View Synthesis-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Gu_2024_CVPR,
  author    = {Gu, Yuming and Xu, Hongyi and Xie, You and Song, Guoxian and Shi, Yichun and Chang, Di and Yang, Jing and Luo, Linjie},
  title     = {{DiffPortrait3D}: Controllable Diffusion for Zero-Shot Portrait View Synthesis},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {10456--10465}
}
H-ViT: A Hierarchical Vision Transformer for Deformable Image Registration-
[pdf]
[supp]
[bibtex]@InProceedings{Ghahremani_2024_CVPR,
  author    = {Ghahremani, Morteza and Khateri, Mohammad and Jian, Bailiang and Wiestler, Benedikt and Adeli, Ehsan and Wachinger, Christian},
  title     = {{H-ViT}: A Hierarchical Vision Transformer for Deformable Image Registration},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {11513--11523}
}
VideoLLM-online: Online Video Large Language Model for Streaming Video-
[pdf]
[supp]
[bibtex]@InProceedings{Chen_2024_CVPR,
  author    = {Chen, Joya and Lv, Zhaoyang and Wu, Shiwei and Lin, Kevin Qinghong and Song, Chenan and Gao, Difei and Liu, Jia-Wei and Gao, Ziteng and Mao, Dongxing and Shou, Mike Zheng},
  title     = {{VideoLLM-online}: Online Video Large Language Model for Streaming Video},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {18407--18418}
}
Towards Better Vision-Inspired Vision-Language Models-
[pdf]
[bibtex]@InProceedings{Cao_2024_CVPR,
  author    = {Cao, Yun-Hao and Ji, Kaixiang and Huang, Ziyuan and Zheng, Chuanyang and Liu, Jiajia and Wang, Jian and Chen, Jingdong and Yang, Ming},
  title     = {Towards Better Vision-Inspired Vision-Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {13537--13547}
}
VSRD: Instance-Aware Volumetric Silhouette Rendering for Weakly Supervised 3D Object Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Liu_2024_CVPR,
  author    = {Liu, Zihua and Sakuma, Hiroki and Okutomi, Masatoshi},
  title     = {{VSRD}: Instance-Aware Volumetric Silhouette Rendering for Weakly Supervised {3D} Object Detection},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {17354--17363}
}
RILA: Reflective and Imaginative Language Agent for Zero-Shot Semantic Audio-Visual Navigation-
[pdf]
[supp]
[bibtex]@InProceedings{Yang_2024_CVPR,
  author    = {Yang, Zeyuan and Liu, Jiageng and Chen, Peihao and Cherian, Anoop and Marks, Tim K. and Le Roux, Jonathan and Gan, Chuang},
  title     = {{RILA}: Reflective and Imaginative Language Agent for Zero-Shot Semantic Audio-Visual Navigation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  month     = jun,
  year      = {2024},
  pages     = {16251--16261}
}
Endow SAM with Keen Eyes: Temporal-spatial Prompt Learning for Video Camouflaged Object Detection-
[pdf]
[bibtex]@InProceedings{Hui_2024_CVPR, author = {Hui, Wenjun and Zhu, Zhenfeng and Zheng, Shuai and Zhao, Yao}, title = {Endow SAM with Keen Eyes: Temporal-spatial Prompt Learning for Video Camouflaged Object Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {19058-19067} }
Forgery-aware Adaptive Transformer for Generalizable Synthetic Image Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Liu_2024_CVPR, author = {Liu, Huan and Tan, Zichang and Tan, Chuangchuang and Wei, Yunchao and Wang, Jingdong and Zhao, Yao}, title = {Forgery-aware Adaptive Transformer for Generalizable Synthetic Image Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10770-10780} }
PostureHMR: Posture Transformation for 3D Human Mesh Recovery-
[pdf]
[supp]
[bibtex]@InProceedings{Song_2024_CVPR, author = {Song, Yu-Pei and Wu, Xiao and Yuan, Zhaoquan and Qiao, Jian-Jun and Peng, Qiang}, title = {PostureHMR: Posture Transformation for 3D Human Mesh Recovery}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9732-9741} }
Dynamic Adapter Meets Prompt Tuning: Parameter-Efficient Transfer Learning for Point Cloud Analysis-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhou_2024_CVPR, author = {Zhou, Xin and Liang, Dingkang and Xu, Wei and Zhu, Xingkui and Xu, Yihan and Zou, Zhikang and Bai, Xiang}, title = {Dynamic Adapter Meets Prompt Tuning: Parameter-Efficient Transfer Learning for Point Cloud Analysis}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14707-14717} }
Wonder3D: Single Image to 3D using Cross-Domain Diffusion-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Long_2024_CVPR, author = {Long, Xiaoxiao and Guo, Yuan-Chen and Lin, Cheng and Liu, Yuan and Dou, Zhiyang and Liu, Lingjie and Ma, Yuexin and Zhang, Song-Hai and Habermann, Marc and Theobalt, Christian and Wang, Wenping}, title = {Wonder3D: Single Image to 3D using Cross-Domain Diffusion}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9970-9980} }
RichDreamer: A Generalizable Normal-Depth Diffusion Model for Detail Richness in Text-to-3D-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Qiu_2024_CVPR, author = {Qiu, Lingteng and Chen, Guanying and Gu, Xiaodong and Zuo, Qi and Xu, Mutian and Wu, Yushuang and Yuan, Weihao and Dong, Zilong and Bo, Liefeng and Han, Xiaoguang}, title = {RichDreamer: A Generalizable Normal-Depth Diffusion Model for Detail Richness in Text-to-3D}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9914-9925} }
Zero-shot Referring Expression Comprehension via Structural Similarity Between Images and Captions-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Han_2024_CVPR, author = {Han, Zeyu and Zhu, Fangrui and Lao, Qianru and Jiang, Huaizu}, title = {Zero-shot Referring Expression Comprehension via Structural Similarity Between Images and Captions}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14364-14374} }
Triplane Meets Gaussian Splatting: Fast and Generalizable Single-View 3D Reconstruction with Transformers-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zou_2024_CVPR, author = {Zou, Zi-Xin and Yu, Zhipeng and Guo, Yuan-Chen and Li, Yangguang and Liang, Ding and Cao, Yan-Pei and Zhang, Song-Hai}, title = {Triplane Meets Gaussian Splatting: Fast and Generalizable Single-View 3D Reconstruction with Transformers}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10324-10335} }
WateRF: Robust Watermarks in Radiance Fields for Protection of Copyrights-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Jang_2024_CVPR, author = {Jang, Youngdong and Lee, Dong In and Jang, MinHyuk and Kim, Jong Wook and Yang, Feng and Kim, Sangpil}, title = {WateRF: Robust Watermarks in Radiance Fields for Protection of Copyrights}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12087-12097} }
Instance-aware Contrastive Learning for Occluded Human Mesh Reconstruction-
[pdf]
[supp]
[bibtex]@InProceedings{Gwon_2024_CVPR, author = {Gwon, Mi-Gyeong and Um, Gi-Mun and Cheong, Won-Sik and Kim, Wonjun}, title = {Instance-aware Contrastive Learning for Occluded Human Mesh Reconstruction}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10553-10562} }
Robust Noisy Correspondence Learning with Equivariant Similarity Consistency-
[pdf]
[supp]
[bibtex]@InProceedings{Yang_2024_CVPR, author = {Yang, Yuchen and Wang, Likai and Yang, Erkun and Deng, Cheng}, title = {Robust Noisy Correspondence Learning with Equivariant Similarity Consistency}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17700-17709} }
Compositional Video Understanding with Spatiotemporal Structure-based Transformers-
[pdf]
[supp]
[bibtex]@InProceedings{Yun_2024_CVPR, author = {Yun, Hoyeoung and Ahn, Jinwoo and Kim, Minseo and Kim, Eun-Sol}, title = {Compositional Video Understanding with Spatiotemporal Structure-based Transformers}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18751-18760} }
3D LiDAR Mapping in Dynamic Environments using a 4D Implicit Neural Representation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhong_2024_CVPR, author = {Zhong, Xingguang and Pan, Yue and Stachniss, Cyrill and Behley, Jens}, title = {3D LiDAR Mapping in Dynamic Environments using a 4D Implicit Neural Representation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15417-15427} }
What When and Where? Self-Supervised Spatio-Temporal Grounding in Untrimmed Multi-Action Videos from Narrated Instructions-
[pdf]
[supp]
[bibtex]@InProceedings{Chen_2024_CVPR, author = {Chen, Brian and Shvetsova, Nina and Rouditchenko, Andrew and Kondermann, Daniel and Thomas, Samuel and Chang, Shih-Fu and Feris, Rogerio and Glass, James and Kuehne, Hilde}, title = {What When and Where? Self-Supervised Spatio-Temporal Grounding in Untrimmed Multi-Action Videos from Narrated Instructions}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18419-18429} }
FoundationPose: Unified 6D Pose Estimation and Tracking of Novel Objects-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wen_2024_CVPR, author = {Wen, Bowen and Yang, Wei and Kautz, Jan and Birchfield, Stan}, title = {FoundationPose: Unified 6D Pose Estimation and Tracking of Novel Objects}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17868-17879} }
Hyperbolic Anomaly Detection-
[pdf]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Huimin and Chen, Zhentao and Xu, Yunhao and Hu, Junlin}, title = {Hyperbolic Anomaly Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17511-17520} }
VLP: Vision Language Planning for Autonomous Driving-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Pan_2024_CVPR, author = {Pan, Chenbin and Yaman, Burhaneddin and Nesti, Tommaso and Mallik, Abhirup and Allievi, Alessandro G. and Velipasalar, Senem and Ren, Liu}, title = {VLP: Vision Language Planning for Autonomous Driving}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14760-14769} }
ProMark: Proactive Diffusion Watermarking for Causal Attribution-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Asnani_2024_CVPR, author = {Asnani, Vishal and Collomosse, John and Bui, Tu and Liu, Xiaoming and Agarwal, Shruti}, title = {ProMark: Proactive Diffusion Watermarking for Causal Attribution}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10802-10811} }
Consistency and Uncertainty: Identifying Unreliable Responses From Black-Box Vision-Language Models for Selective Visual Question Answering-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Khan_2024_CVPR, author = {Khan, Zaid and Fu, Yun}, title = {Consistency and Uncertainty: Identifying Unreliable Responses From Black-Box Vision-Language Models for Selective Visual Question Answering}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10854-10863} }
Implicit Motion Function-
[pdf]
[supp]
[bibtex]@InProceedings{Gao_2024_CVPR, author = {Gao, Yue and Li, Jiahao and Chu, Lei and Lu, Yan}, title = {Implicit Motion Function}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {19278-19289} }
MultiDiff: Consistent Novel View Synthesis from a Single Image-
[pdf]
[supp]
[bibtex]@InProceedings{Muller_2024_CVPR, author = {M\"uller, Norman and Schwarz, Katja and R\"ossle, Barbara and Porzi, Lorenzo and Rota Bul\`o, Samuel and Nie{\ss}ner, Matthias and Kontschieder, Peter}, title = {MultiDiff: Consistent Novel View Synthesis from a Single Image}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10258-10268} }
Atom-Level Optical Chemical Structure Recognition with Limited Supervision-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Oldenhof_2024_CVPR, author = {Oldenhof, Martijn and De Brouwer, Edward and Arany, Adam and Moreau, Yves}, title = {Atom-Level Optical Chemical Structure Recognition with Limited Supervision}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17669-17678} }
LiDAR-based Person Re-identification-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Guo_2024_CVPR, author = {Guo, Wenxuan and Pan, Zhiyu and Liang, Yingping and Xi, Ziheng and Zhong, Zhicheng and Feng, Jianjiang and Zhou, Jie}, title = {LiDAR-based Person Re-identification}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17437-17447} }
Model Adaptation for Time Constrained Embodied Control-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Song_2024_CVPR, author = {Song, Jaehyun and Yoo, Minjong and Woo, Honguk}, title = {Model Adaptation for Time Constrained Embodied Control}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16499-16508} }
ActiveDC: Distribution Calibration for Active Finetuning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xu_2024_CVPR, author = {Xu, Wenshuai and Hu, Zhenghui and Lu, Yu and Meng, Jinzhou and Liu, Qingjie and Wang, Yunhong}, title = {ActiveDC: Distribution Calibration for Active Finetuning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16996-17005} }
Seeing Unseen: Discover Novel Biomedical Concepts via Geometry-Constrained Probabilistic Modeling-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Fan_2024_CVPR, author = {Fan, Jianan and Liu, Dongnan and Chang, Hang and Huang, Heng and Chen, Mei and Cai, Weidong}, title = {Seeing Unseen: Discover Novel Biomedical Concepts via Geometry-Constrained Probabilistic Modeling}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11524-11534} }
Communication-Efficient Federated Learning with Accelerated Client Gradient-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Kim_2024_CVPR, author = {Kim, Geeho and Kim, Jinkyu and Han, Bohyung}, title = {Communication-Efficient Federated Learning with Accelerated Client Gradient}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12385-12394} }
LLMs are Good Action Recognizers-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Qu_2024_CVPR, author = {Qu, Haoxuan and Cai, Yujun and Liu, Jun}, title = {LLMs are Good Action Recognizers}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18395-18406} }
Interactive Continual Learning: Fast and Slow Thinking-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Qi_2024_CVPR, author = {Qi, Biqing and Chen, Xinquan and Gao, Junqi and Li, Dong and Liu, Jianxing and Wu, Ligang and Zhou, Bowen}, title = {Interactive Continual Learning: Fast and Slow Thinking}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12882-12892} }
Towards Learning a Generalist Model for Embodied Navigation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zheng_2024_CVPR, author = {Zheng, Duo and Huang, Shijia and Zhao, Lin and Zhong, Yiwu and Wang, Liwei}, title = {Towards Learning a Generalist Model for Embodied Navigation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13624-13634} }
Splatter Image: Ultra-Fast Single-View 3D Reconstruction-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Szymanowicz_2024_CVPR, author = {Szymanowicz, Stanislaw and Rupprecht, Christian and Vedaldi, Andrea}, title = {Splatter Image: Ultra-Fast Single-View 3D Reconstruction}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10208-10217} }
Modeling Collaborator: Enabling Subjective Vision Classification With Minimal Human Effort via LLM Tool-Use-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Toubal_2024_CVPR, author = {Toubal, Imad Eddine and Avinash, Aditya and Alldrin, Neil Gordon and Dlabal, Jan and Zhou, Wenlei and Luo, Enming and Stretcu, Otilia and Xiong, Hao and Lu, Chun-Ta and Zhou, Howard and Krishna, Ranjay and Fuxman, Ariel and Duerig, Tom}, title = {Modeling Collaborator: Enabling Subjective Vision Classification With Minimal Human Effort via LLM Tool-Use}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17553-17563} }
GeoReF: Geometric Alignment Across Shape Variation for Category-level Object Pose Refinement-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zheng_2024_CVPR, author = {Zheng, Linfang and Tse, Tze Ho Elden and Wang, Chen and Sun, Yinghan and Chen, Hua and Leonardis, Ales and Zhang, Wei and Chang, Hyung Jin}, title = {GeoReF: Geometric Alignment Across Shape Variation for Category-level Object Pose Refinement}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10693-10703} }
Learning Group Activity Features Through Person Attribute Prediction-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Nakatani_2024_CVPR, author = {Nakatani, Chihiro and Kawashima, Hiroaki and Ukita, Norimichi}, title = {Learning Group Activity Features Through Person Attribute Prediction}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18233-18242} }
Plug-and-Play Diffusion Distillation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Hsiao_2024_CVPR, author = {Hsiao, Yi-Ting and Khodadadeh, Siavash and Duarte, Kevin and Lin, Wei-An and Qu, Hui and Kwon, Mingi and Kalarot, Ratheesh}, title = {Plug-and-Play Diffusion Distillation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13743-13752} }
MindBridge: A Cross-Subject Brain Decoding Framework-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Shizun and Liu, Songhua and Tan, Zhenxiong and Wang, Xinchao}, title = {MindBridge: A Cross-Subject Brain Decoding Framework}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11333-11342} }
MM-Narrator: Narrating Long-form Videos with Multimodal In-Context Learning-
[pdf]
[supp]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Chaoyi and Lin, Kevin and Yang, Zhengyuan and Wang, Jianfeng and Li, Linjie and Lin, Chung-Ching and Liu, Zicheng and Wang, Lijuan}, title = {MM-Narrator: Narrating Long-form Videos with Multimodal In-Context Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13647-13657} }
Morphable Diffusion: 3D-Consistent Diffusion for Single-image Avatar Creation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Chen_2024_CVPR, author = {Chen, Xiyi and Mihajlovic, Marko and Wang, Shaofei and Prokudin, Sergey and Tang, Siyu}, title = {Morphable Diffusion: 3D-Consistent Diffusion for Single-image Avatar Creation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10359-10370} }
Fully Convolutional Slice-to-Volume Reconstruction for Single-Stack MRI-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Young_2024_CVPR, author = {Young, Sean I. and Balbastre, Yael and Fischl, Bruce and Golland, Polina and Iglesias, Juan Eugenio}, title = {Fully Convolutional Slice-to-Volume Reconstruction for Single-Stack MRI}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11535-11545} }
Enhance Image Classification via Inter-Class Image Mixup with Diffusion Model-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Zhicai and Wei, Longhui and Wang, Tan and Chen, Heyu and Hao, Yanbin and Wang, Xiang and He, Xiangnan and Tian, Qi}, title = {Enhance Image Classification via Inter-Class Image Mixup with Diffusion Model}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17223-17233} }
Alpha-CLIP: A CLIP Model Focusing on Wherever You Want-
[pdf]
[supp]
[bibtex]@InProceedings{Sun_2024_CVPR, author = {Sun, Zeyi and Fang, Ye and Wu, Tong and Zhang, Pan and Zang, Yuhang and Kong, Shu and Xiong, Yuanjun and Lin, Dahua and Wang, Jiaqi}, title = {Alpha-CLIP: A CLIP Model Focusing on Wherever You Want}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13019-13029} }
ADA-Track: End-to-End Multi-Camera 3D Multi-Object Tracking with Alternating Detection and Association-
[pdf]
[supp]
[bibtex]@InProceedings{Ding_2024_CVPR, author = {Ding, Shuxiao and Schneider, Lukas and Cordts, Marius and Gall, Juergen}, title = {ADA-Track: End-to-End Multi-Camera 3D Multi-Object Tracking with Alternating Detection and Association}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15184-15194} }
Mind The Edge: Refining Depth Edges in Sparsely-Supervised Monocular Depth Estimation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Talker_2024_CVPR, author = {Talker, Lior and Cohen, Aviad and Yosef, Erez and Dana, Alexandra and Dinerstein, Michael}, title = {Mind The Edge: Refining Depth Edges in Sparsely-Supervised Monocular Depth Estimation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10606-10616} }
Attention-Driven Training-Free Efficiency Enhancement of Diffusion Models-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Hongjie and Liu, Difan and Kang, Yan and Li, Yijun and Lin, Zhe and Jha, Niraj K. and Liu, Yuchen}, title = {Attention-Driven Training-Free Efficiency Enhancement of Diffusion Models}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16080-16089} }
CPR: Retrieval Augmented Generation for Copyright Protection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Golatkar_2024_CVPR, author = {Golatkar, Aditya and Achille, Alessandro and Zancato, Luca and Wang, Yu-Xiang and Swaminathan, Ashwin and Soatto, Stefano}, title = {CPR: Retrieval Augmented Generation for Copyright Protection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12374-12384} }
Vision-and-Language Navigation via Causal Learning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Liuyi and He, Zongtao and Dang, Ronghao and Shen, Mengjiao and Liu, Chengju and Chen, Qijun}, title = {Vision-and-Language Navigation via Causal Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13139-13150} }
Unveiling Parts Beyond Objects: Towards Finer-Granularity Referring Expression Segmentation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Wenxuan and Yue, Tongtian and Zhang, Yisi and Guo, Longteng and He, Xingjian and Wang, Xinlong and Liu, Jing}, title = {Unveiling Parts Beyond Objects: Towards Finer-Granularity Referring Expression Segmentation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12998-13008} }
Differentiable Display Photometric Stereo-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Choi_2024_CVPR, author = {Choi, Seokjun and Yoon, Seungwoo and Nam, Giljoo and Lee, Seungyong and Baek, Seung-Hwan}, title = {Differentiable Display Photometric Stereo}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11831-11840} }
In-distribution Public Data Synthesis with Diffusion Models for Differentially Private Image Classification-
[pdf]
[supp]
[bibtex]@InProceedings{Park_2024_CVPR, author = {Park, Jinseong and Choi, Yujin and Lee, Jaewook}, title = {In-distribution Public Data Synthesis with Diffusion Models for Differentially Private Image Classification}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12236-12246} }
LSK3DNet: Towards Effective and Efficient 3D Perception with Large Sparse Kernels-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Feng_2024_CVPR, author = {Feng, Tuo and Wang, Wenguan and Ma, Fan and Yang, Yi}, title = {LSK3DNet: Towards Effective and Efficient 3D Perception with Large Sparse Kernels}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14916-14927} }
Diversified and Personalized Multi-rater Medical Image Segmentation-
[pdf]
[arXiv]
[bibtex]@InProceedings{Wu_2024_CVPR, author = {Wu, Yicheng and Luo, Xiangde and Xu, Zhe and Guo, Xiaoqing and Ju, Lie and Ge, Zongyuan and Liao, Wenjun and Cai, Jianfei}, title = {Diversified and Personalized Multi-rater Medical Image Segmentation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11470-11479} }
Discover and Mitigate Multiple Biased Subgroups in Image Classifiers-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Zeliang and Feng, Mingqian and Li, Zhiheng and Xu, Chenliang}, title = {Discover and Mitigate Multiple Biased Subgroups in Image Classifiers}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10906-10915} }
ExMap: Leveraging Explainability Heatmaps for Unsupervised Group Robustness to Spurious Correlations-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Chakraborty_2024_CVPR, author = {Chakraborty, Rwiddhi and Sletten, Adrian and Kampffmeyer, Michael C.}, title = {ExMap: Leveraging Explainability Heatmaps for Unsupervised Group Robustness to Spurious Correlations}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12017-12026} }
Learning to Segment Referred Objects from Narrated Egocentric Videos-
[pdf]
[supp]
[bibtex]@InProceedings{Shen_2024_CVPR, author = {Shen, Yuhan and Wang, Huiyu and Yang, Xitong and Feiszli, Matt and Elhamifar, Ehsan and Torresani, Lorenzo and Mavroudi, Effrosyni}, title = {Learning to Segment Referred Objects from Narrated Egocentric Videos}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14510-14520} }
Adapting Visual-Language Models for Generalizable Anomaly Detection in Medical Images-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Huang_2024_CVPR, author = {Huang, Chaoqin and Jiang, Aofan and Feng, Jinghao and Zhang, Ya and Wang, Xinchao and Wang, Yanfeng}, title = {Adapting Visual-Language Models for Generalizable Anomaly Detection in Medical Images}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11375-11385} }
Depth-aware Test-Time Training for Zero-shot Video Object Segmentation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Liu_2024_CVPR, author = {Liu, Weihuang and Shen, Xi and Li, Haolun and Bi, Xiuli and Liu, Bo and Pun, Chi-Man and Cun, Xiaodong}, title = {Depth-aware Test-Time Training for Zero-shot Video Object Segmentation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {19218-19227} }
RMem: Restricted Memory Banks Improve Video Object Segmentation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhou_2024_CVPR, author = {Zhou, Junbao and Pang, Ziqi and Wang, Yu-Xiong}, title = {RMem: Restricted Memory Banks Improve Video Object Segmentation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18602-18611} }
Zero-TPrune: Zero-Shot Token Pruning through Leveraging of the Attention Graph in Pre-Trained Transformers-
[pdf]
[supp]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Hongjie and Dedhia, Bhishma and Jha, Niraj K.}, title = {Zero-TPrune: Zero-Shot Token Pruning through Leveraging of the Attention Graph in Pre-Trained Transformers}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16070-16079} }
DIBS: Enhancing Dense Video Captioning with Unlabeled Videos via Pseudo Boundary Enrichment and Online Refinement-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wu_2024_CVPR, author = {Wu, Hao and Liu, Huabin and Qiao, Yu and Sun, Xiao}, title = {DIBS: Enhancing Dense Video Captioning with Unlabeled Videos via Pseudo Boundary Enrichment and Online Refinement}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18699-18708} }
SOK-Bench: A Situated Video Reasoning Benchmark with Aligned Open-World Knowledge-
[pdf]
[supp]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Andong and Wu, Bo and Chen, Sunli and Chen, Zhenfang and Guan, Haotian and Lee, Wei-Ning and Li, Li Erran and Gan, Chuang}, title = {SOK-Bench: A Situated Video Reasoning Benchmark with Aligned Open-World Knowledge}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13384-13394} }
LORS: Low-rank Residual Structure for Parameter-Efficient Network Stacking-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Jialin and Nie, Qiang and Fu, Weifu and Lin, Yuhuan and Tao, Guangpin and Liu, Yong and Wang, Chengjie}, title = {LORS: Low-rank Residual Structure for Parameter-Efficient Network Stacking}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15866-15876} }
Multi-modal In-Context Learning Makes an Ego-evolving Scene Text Recognizer-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhao_2024_CVPR, author = {Zhao, Zhen and Tang, Jingqun and Lin, Chunhui and Wu, Binghong and Huang, Can and Liu, Hao and Tan, Xin and Zhang, Zhizhong and Xie, Yuan}, title = {Multi-modal In-Context Learning Makes an Ego-evolving Scene Text Recognizer}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15567-15576} }
Training Diffusion Models Towards Diverse Image Generation with Reinforcement Learning-
[pdf]
[supp]
[bibtex]@InProceedings{Miao_2024_CVPR, author = {Miao, Zichen and Wang, Jiang and Wang, Ze and Yang, Zhengyuan and Wang, Lijuan and Qiu, Qiang and Liu, Zicheng}, title = {Training Diffusion Models Towards Diverse Image Generation with Reinforcement Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10844-10853} }
LASIL: Learner-Aware Supervised Imitation Learning For Long-term Microscopic Traffic Simulation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Guo_2024_CVPR, author = {Guo, Ke and Miao, Zhenwei and Jing, Wei and Liu, Weiwei and Li, Weizi and Hao, Dayang and Pan, Jia}, title = {LASIL: Learner-Aware Supervised Imitation Learning For Long-term Microscopic Traffic Simulation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15386-15395} }
SeaBird: Segmentation in Bird's View with Dice Loss Improves Monocular 3D Detection of Large Objects-
[pdf]
[supp]
[bibtex]@InProceedings{Kumar_2024_CVPR, author = {Kumar, Abhinav and Guo, Yuliang and Huang, Xinyu and Ren, Liu and Liu, Xiaoming}, title = {SeaBird: Segmentation in Bird's View with Dice Loss Improves Monocular 3D Detection of Large Objects}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10269-10280} }
NOPE: Novel Object Pose Estimation from a Single Image-
[pdf]
[supp]
[bibtex]@InProceedings{Nguyen_2024_CVPR, author = {Nguyen, Van Nguyen and Groueix, Thibault and Ponimatkin, Georgy and Hu, Yinlin and Marlet, Renaud and Salzmann, Mathieu and Lepetit, Vincent}, title = {NOPE: Novel Object Pose Estimation from a Single Image}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17923-17932} }
Dual-View Visual Contextualization for Web Navigation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Kil_2024_CVPR, author = {Kil, Jihyung and Song, Chan Hee and Zheng, Boyuan and Deng, Xiang and Su, Yu and Chao, Wei-Lun}, title = {Dual-View Visual Contextualization for Web Navigation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14445-14454} }
Language-driven Grasp Detection-
[pdf]
[supp]
[bibtex]@InProceedings{Vuong_2024_CVPR, author = {Vuong, An Dinh and Vu, Minh Nhat and Huang, Baoru and Nguyen, Nghia and Le, Hieu and Vo, Thieu and Nguyen, Anh}, title = {Language-driven Grasp Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17902-17912} }
Towards Modern Image Manipulation Localization: A Large-Scale Dataset and Novel Methods-
[pdf]
[supp]
[bibtex]@InProceedings{Qu_2024_CVPR, author = {Qu, Chenfan and Zhong, Yiwu and Liu, Chongyu and Xu, Guitao and Peng, Dezhi and Guo, Fengjun and Jin, Lianwen}, title = {Towards Modern Image Manipulation Localization: A Large-Scale Dataset and Novel Methods}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10781-10790} }
Object Recognition as Next Token Prediction-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Yue_2024_CVPR, author = {Yue, Kaiyu and Chen, Bor-Chun and Geiping, Jonas and Li, Hengduo and Goldstein, Tom and Lim, Ser-Nam}, title = {Object Recognition as Next Token Prediction}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16645-16656} }
Transcriptomics-guided Slide Representation Learning in Computational Pathology-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Jaume_2024_CVPR, author = {Jaume, Guillaume and Oldenburg, Lukas and Vaidya, Anurag and Chen, Richard J. and Williamson, Drew F.K. and Peeters, Thomas and Song, Andrew H. and Mahmood, Faisal}, title = {Transcriptomics-guided Slide Representation Learning in Computational Pathology}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9632-9644} }
CLIP-BEVFormer: Enhancing Multi-View Image-Based BEV Detector with Ground Truth Flow-
[pdf]
[supp]
[bibtex]@InProceedings{Pan_2024_CVPR, author = {Pan, Chenbin and Yaman, Burhaneddin and Velipasalar, Senem and Ren, Liu}, title = {CLIP-BEVFormer: Enhancing Multi-View Image-Based BEV Detector with Ground Truth Flow}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15216-15225} }
CLOVA: A Closed-LOop Visual Assistant with Tool Usage and Update-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Gao_2024_CVPR, author = {Gao, Zhi and Du, Yuntao and Zhang, Xintong and Ma, Xiaojian and Han, Wenjuan and Zhu, Song-Chun and Li, Qing}, title = {CLOVA: A Closed-LOop Visual Assistant with Tool Usage and Update}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13258-13268} }
Depth Prompting for Sensor-Agnostic Depth Estimation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Park_2024_CVPR, author = {Park, Jin-Hwi and Jeong, Chanhwi and Lee, Junoh and Jeon, Hae-Gon}, title = {Depth Prompting for Sensor-Agnostic Depth Estimation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9859-9869} }
G3DR: Generative 3D Reconstruction in ImageNet-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Reddy_2024_CVPR, author = {Reddy, Pradyumna and Elezi, Ismail and Deng, Jiankang}, title = {G3DR: Generative 3D Reconstruction in ImageNet}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9655-9665} }
Hyperspherical Classification with Dynamic Label-to-Prototype Assignment-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Saadabadi_2024_CVPR, author = {Saadabadi, Mohammad Saeed Ebrahimi and Dabouei, Ali and Malakshan, Sahar Rahimi and Nasrabadi, Nasser M.}, title = {Hyperspherical Classification with Dynamic Label-to-Prototype Assignment}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17333-17342} }
VTimeLLM: Empower LLM to Grasp Video Moments-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Huang_2024_CVPR, author = {Huang, Bin and Wang, Xin and Chen, Hong and Song, Zihan and Zhu, Wenwu}, title = {VTimeLLM: Empower LLM to Grasp Video Moments}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14271-14280} }
FLHetBench: Benchmarking Device and State Heterogeneity in Federated Learning-
[pdf]
[supp]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Junyuan and Zeng, Shuang and Zhang, Miao and Wang, Runxi and Wang, Feifei and Zhou, Yuyin and Liang, Paul Pu and Qu, Liangqiong}, title = {FLHetBench: Benchmarking Device and State Heterogeneity in Federated Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12098-12108} }
Privacy-Preserving Optics for Enhancing Protection in Face De-Identification-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Lopez_2024_CVPR, author = {Lopez, Jhon and Hinojosa, Carlos and Arguello, Henry and Ghanem, Bernard}, title = {Privacy-Preserving Optics for Enhancing Protection in Face De-Identification}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12120-12129} }
SmartRefine: A Scenario-Adaptive Refinement Framework for Efficient Motion Prediction-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhou_2024_CVPR, author = {Zhou, Yang and Shao, Hao and Wang, Letian and Waslander, Steven L. and Li, Hongsheng and Liu, Yu}, title = {SmartRefine: A Scenario-Adaptive Refinement Framework for Efficient Motion Prediction}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15281-15290} }
Multi-Scale Video Anomaly Detection by Multi-Grained Spatio-Temporal Representation Learning-
[pdf]
[supp]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Menghao and Wang, Jingyu and Qi, Qi and Sun, Haifeng and Zhuang, Zirui and Ren, Pengfei and Ma, Ruilong and Liao, Jianxin}, title = {Multi-Scale Video Anomaly Detection by Multi-Grained Spatio-Temporal Representation Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17385-17394} }
Generative Multimodal Models are In-Context Learners-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Sun_2024_CVPR, author = {Sun, Quan and Cui, Yufeng and Zhang, Xiaosong and Zhang, Fan and Yu, Qiying and Wang, Yueze and Rao, Yongming and Liu, Jingjing and Huang, Tiejun and Wang, Xinlong}, title = {Generative Multimodal Models are In-Context Learners}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14398-14409} }
Feature Re-Embedding: Towards Foundation Model-Level Performance in Computational Pathology-
[pdf]
[supp]
[bibtex]@InProceedings{Tang_2024_CVPR, author = {Tang, Wenhao and Zhou, Fengtao and Huang, Sheng and Zhu, Xiang and Zhang, Yi and Liu, Bo}, title = {Feature Re-Embedding: Towards Foundation Model-Level Performance in Computational Pathology}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11343-11352} }
Text Prompt with Normality Guidance for Weakly Supervised Video Anomaly Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Yang_2024_CVPR, author = {Yang, Zhiwei and Liu, Jing and Wu, Peng}, title = {Text Prompt with Normality Guidance for Weakly Supervised Video Anomaly Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18899-18908} }
SparseOcc: Rethinking Sparse Latent Representation for Vision-Based Semantic Occupancy Prediction-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Tang_2024_CVPR, author = {Tang, Pin and Wang, Zhongdao and Wang, Guoqing and Zheng, Jilai and Ren, Xiangxuan and Feng, Bailan and Ma, Chao}, title = {SparseOcc: Rethinking Sparse Latent Representation for Vision-Based Semantic Occupancy Prediction}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15035-15044} }
Frequency Decoupling for Motion Magnification via Multi-Level Isomorphic Architecture-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Fei and Guo, Dan and Li, Kun and Zhong, Zhun and Wang, Meng}, title = {Frequency Decoupling for Motion Magnification via Multi-Level Isomorphic Architecture}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18984-18994} }
Hyperbolic Learning with Synthetic Captions for Open-World Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Kong_2024_CVPR, author = {Kong, Fanjie and Chen, Yanbei and Cai, Jiarui and Modolo, Davide}, title = {Hyperbolic Learning with Synthetic Captions for Open-World Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16762-16771} }
Interpretable Measures of Conceptual Similarity by Complexity-Constrained Descriptive Auto-Encoding-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Achille_2024_CVPR, author = {Achille, Alessandro and Steeg, Greg Ver and Liu, Tian Yu and Trager, Matthew and Klingenberg, Carson and Soatto, Stefano}, title = {Interpretable Measures of Conceptual Similarity by Complexity-Constrained Descriptive Auto-Encoding}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11062-11071} }
3D Feature Tracking via Event Camera-
[pdf]
[supp]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Siqi and Zhou, Zhikuan and Xue, Zhou and Li, Yipeng and Du, Shaoyi and Gao, Yue}, title = {3D Feature Tracking via Event Camera}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18974-18983} }
MaxQ: Multi-Axis Query for N:M Sparsity Network-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xiang_2024_CVPR, author = {Xiang, Jingyang and Li, Siqi and Chen, Junhao and Chen, Zhuangzhi and Huang, Tianxin and Peng, Linpeng and Liu, Yong}, title = {MaxQ: Multi-Axis Query for N:M Sparsity Network}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15845-15854} }
Part-aware Unified Representation of Language and Skeleton for Zero-shot Action Recognition-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhu_2024_CVPR, author = {Zhu, Anqi and Ke, Qiuhong and Gong, Mingming and Bailey, James}, title = {Part-aware Unified Representation of Language and Skeleton for Zero-shot Action Recognition}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18761-18770} }
Composing Object Relations and Attributes for Image-Text Matching-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Pham_2024_CVPR, author = {Pham, Khoi and Huynh, Chuong and Lim, Ser-Nam and Shrivastava, Abhinav}, title = {Composing Object Relations and Attributes for Image-Text Matching}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14354-14363} }
Previously on ... From Recaps to Story Summarization-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Singh_2024_CVPR, author = {Singh, Aditya Kumar and Srivastava, Dhruv and Tapaswi, Makarand}, title = {Previously on ... From Recaps to Story Summarization}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13635-13646} }
mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration-
[pdf]
[supp]
[bibtex]@InProceedings{Ye_2024_CVPR, author = {Ye, Qinghao and Xu, Haiyang and Ye, Jiabo and Yan, Ming and Hu, Anwen and Liu, Haowei and Qian, Qi and Zhang, Ji and Huang, Fei}, title = {mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13040-13051} }
Learning by Correction: Efficient Tuning Task for Zero-Shot Generative Vision-Language Reasoning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Rongjie and Wu, Yu and He, Xuming}, title = {Learning by Correction: Efficient Tuning Task for Zero-Shot Generative Vision-Language Reasoning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13428-13437} }
Supervised Anomaly Detection for Complex Industrial Images-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Baitieva_2024_CVPR, author = {Baitieva, Aimira and Hurych, David and Besnier, Victor and Bernard, Olivier}, title = {Supervised Anomaly Detection for Complex Industrial Images}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17754-17762} }
Open3DSG: Open-Vocabulary 3D Scene Graphs from Point Clouds with Queryable Objects and Open-Set Relationships-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Koch_2024_CVPR, author = {Koch, Sebastian and Vaskevicius, Narunas and Colosi, Mirco and Hermosilla, Pedro and Ropinski, Timo}, title = {Open3DSG: Open-Vocabulary 3D Scene Graphs from Point Clouds with Queryable Objects and Open-Set Relationships}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14183-14193} }
SURE: SUrvey REcipes for building reliable and robust deep networks-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Yuting and Chen, Yingyi and Yu, Xuanlong and Chen, Dexiong and Shen, Xi}, title = {SURE: SUrvey REcipes for building reliable and robust deep networks}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17500-17510} }
PolarRec: Improving Radio Interferometric Data Reconstruction Using Polar Coordinates-
[pdf]
[supp]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Ruoqi and Chen, Zhuoyang and Zhu, Jiayi and Luo, Qiong and Wang, Feng}, title = {PolarRec: Improving Radio Interferometric Data Reconstruction Using Polar Coordinates}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12841-12850} }
Summarize the Past to Predict the Future: Natural Language Descriptions of Context Boost Multimodal Object Interaction Anticipation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Pasca_2024_CVPR, author = {Pasca, Razvan-George and Gavryushin, Alexey and Hamza, Muhammad and Kuo, Yen-Ling and Mo, Kaichun and Van Gool, Luc and Hilliges, Otmar and Wang, Xi}, title = {Summarize the Past to Predict the Future: Natural Language Descriptions of Context Boost Multimodal Object Interaction Anticipation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18286-18296} }
Towards CLIP-driven Language-free 3D Visual Grounding via 2D-3D Relational Enhancement and Consistency-
[pdf]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Yuqi and Luo, Han and Lei, Yinjie}, title = {Towards CLIP-driven Language-free 3D Visual Grounding via 2D-3D Relational Enhancement and Consistency}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13063-13072} }
Optimal Transport Aggregation for Visual Place Recognition-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Izquierdo_2024_CVPR, author = {Izquierdo, Sergio and Civera, Javier}, title = {Optimal Transport Aggregation for Visual Place Recognition}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17658-17668} }
Aligning and Prompting Everything All at Once for Universal Visual Perception-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Shen_2024_CVPR, author = {Shen, Yunhang and Fu, Chaoyou and Chen, Peixian and Zhang, Mengdan and Li, Ke and Sun, Xing and Wu, Yunsheng and Lin, Shaohui and Ji, Rongrong}, title = {Aligning and Prompting Everything All at Once for Universal Visual Perception}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13193-13203} }
Correlation-Decoupled Knowledge Distillation for Multimodal Sentiment Analysis with Incomplete Modalities-
[pdf]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Mingcheng and Yang, Dingkang and Zhao, Xiao and Wang, Shuaibing and Wang, Yan and Yang, Kun and Sun, Mingyang and Kou, Dongliang and Qian, Ziyun and Zhang, Lihua}, title = {Correlation-Decoupled Knowledge Distillation for Multimodal Sentiment Analysis with Incomplete Modalities}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12458-12468} }
LoSh: Long-Short Text Joint Prediction Network for Referring Video Object Segmentation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Yuan_2024_CVPR, author = {Yuan, Linfeng and Shi, Miaojing and Yue, Zijie and Chen, Qijun}, title = {LoSh: Long-Short Text Joint Prediction Network for Referring Video Object Segmentation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14001-14010} }
Dual Prototype Attention for Unsupervised Video Object Segmentation-
[pdf]
[arXiv]
[bibtex]@InProceedings{Cho_2024_CVPR, author = {Cho, Suhwan and Lee, Minhyeok and Lee, Seunghoon and Lee, Dogyoon and Choi, Heeseung and Kim, Ig-Jae and Lee, Sangyoun}, title = {Dual Prototype Attention for Unsupervised Video Object Segmentation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {19238-19247} }
Navigate Beyond Shortcuts: Debiased Learning Through the Lens of Neural Collapse-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Yining and Sun, Junjie and Wang, Chenyue and Zhang, Mi and Yang, Min}, title = {Navigate Beyond Shortcuts: Debiased Learning Through the Lens of Neural Collapse}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12322-12331} }
A Subspace-Constrained Tyler's Estimator and its Applications to Structure from Motion-
[pdf]
[supp]
[bibtex]@InProceedings{Yu_2024_CVPR, author = {Yu, Feng and Zhang, Teng and Lerman, Gilad}, title = {A Subspace-Constrained Tyler's Estimator and its Applications to Structure from Motion}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14575-14584} }
CAD: Photorealistic 3D Generation via Adversarial Distillation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wan_2024_CVPR, author = {Wan, Ziyu and Paschalidou, Despoina and Huang, Ian and Liu, Hongyu and Shen, Bokui and Xiang, Xiaoyu and Liao, Jing and Guibas, Leonidas}, title = {CAD: Photorealistic 3D Generation via Adversarial Distillation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10194-10207} }
Enhancing Vision-Language Pre-training with Rich Supervisions-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Gao_2024_CVPR, author = {Gao, Yuan and Shi, Kunyu and Zhu, Pengkai and Belval, Edouard and Nuriel, Oren and Appalaraju, Srikar and Ghadar, Shabnam and Tu, Zhuowen and Mahadevan, Vijay and Soatto, Stefano}, title = {Enhancing Vision-Language Pre-training with Rich Supervisions}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13480-13491} }
Adaptive VIO: Deep Visual-Inertial Odometry with Online Continual Learning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Pan_2024_CVPR, author = {Pan, Youqi and Zhou, Wugen and Cao, Yingdian and Zha, Hongbin}, title = {Adaptive VIO: Deep Visual-Inertial Odometry with Online Continual Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18019-18028} }
Generalized Large-Scale Data Condensation via Various Backbone and Statistical Matching-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Shao_2024_CVPR, author = {Shao, Shitong and Yin, Zeyuan and Zhou, Muxin and Zhang, Xindong and Shen, Zhiqiang}, title = {Generalized Large-Scale Data Condensation via Various Backbone and Statistical Matching}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16709-16718} }
On Train-Test Class Overlap and Detection for Image Retrieval-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Song_2024_CVPR, author = {Song, Chull Hwan and Yoon, Jooyoung and Hwang, Taebaek and Choi, Shunghyun and Gu, Yeong Hyeon and Avrithis, Yannis}, title = {On Train-Test Class Overlap and Detection for Image Retrieval}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17375-17384} }
AttriHuman-3D: Editable 3D Human Avatar Generation with Attribute Decomposition and Indexing-
[pdf]
[supp]
[bibtex]@InProceedings{Yang_2024_CVPR, author = {Yang, Fan and Chen, Tianyi and He, Xiaosheng and Cai, Zhongang and Yang, Lei and Wu, Si and Lin, Guosheng}, title = {AttriHuman-3D: Editable 3D Human Avatar Generation with Attribute Decomposition and Indexing}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10596-10605} }
Learning Object State Changes in Videos: An Open-World Perspective-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xue_2024_CVPR, author = {Xue, Zihui and Ashutosh, Kumar and Grauman, Kristen}, title = {Learning Object State Changes in Videos: An Open-World Perspective}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18493-18503} }
SCoFT: Self-Contrastive Fine-Tuning for Equitable Image Generation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Liu_2024_CVPR, author = {Liu, Zhixuan and Schaldenbrand, Peter and Okogwu, Beverley-Claire and Peng, Wenxuan and Yun, Youngsik and Hundt, Andrew and Kim, Jihie and Oh, Jean}, title = {SCoFT: Self-Contrastive Fine-Tuning for Equitable Image Generation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10822-10832} }
Iterated Learning Improves Compositionality in Large Vision-Language Models-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zheng_2024_CVPR, author = {Zheng, Chenhao and Zhang, Jieyu and Kembhavi, Aniruddha and Krishna, Ranjay}, title = {Iterated Learning Improves Compositionality in Large Vision-Language Models}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13785-13795} }
Event Stream-based Visual Object Tracking: A High-Resolution Benchmark Dataset and A Novel Baseline-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Xiao and Wang, Shiao and Tang, Chuanming and Zhu, Lin and Jiang, Bo and Tian, Yonghong and Tang, Jin}, title = {Event Stream-based Visual Object Tracking: A High-Resolution Benchmark Dataset and A Novel Baseline}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {19248-19257} }
Dual DETRs for Multi-Label Temporal Action Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhu_2024_CVPR, author = {Zhu, Yuhan and Zhang, Guozhen and Tan, Jing and Wu, Gangshan and Wang, Limin}, title = {Dual DETRs for Multi-Label Temporal Action Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18559-18569} }
Virtual Immunohistochemistry Staining for Histological Images Assisted by Weakly-supervised Learning-
[pdf]
[supp]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Jiahan and Dong, Jiuyang and Huang, Shenjin and Li, Xi and Jiang, Junjun and Fan, Xiaopeng and Zhang, Yongbing}, title = {Virtual Immunohistochemistry Staining for Histological Images Assisted by Weakly-supervised Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11259-11268} }
DeCoTR: Enhancing Depth Completion with 2D and 3D Attentions-
[pdf]
[arXiv]
[bibtex]@InProceedings{Shi_2024_CVPR, author = {Shi, Yunxiao and Singh, Manish Kumar and Cai, Hong and Porikli, Fatih}, title = {DeCoTR: Enhancing Depth Completion with 2D and 3D Attentions}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10736-10746} }
Utility-Fairness Trade-Offs and How to Find Them-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Dehdashtian_2024_CVPR, author = {Dehdashtian, Sepehr and Sadeghi, Bashir and Boddeti, Vishnu Naresh}, title = {Utility-Fairness Trade-Offs and How to Find Them}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12037-12046} }
SAOR: Single-View Articulated Object Reconstruction-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Aygun_2024_CVPR, author = {Aygun, Mehmet and Mac Aodha, Oisin}, title = {SAOR: Single-View Articulated Object Reconstruction}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10382-10391} }
A Theory of Joint Light and Heat Transport for Lambertian Scenes-
[pdf]
[supp]
[bibtex]@InProceedings{Ramanagopal_2024_CVPR, author = {Ramanagopal, Mani and Narayanan, Sriram and Sankaranarayanan, Aswin C. and Narasimhan, Srinivasa G.}, title = {A Theory of Joint Light and Heat Transport for Lambertian Scenes}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11924-11933} }
iKUN: Speak to Trackers without Retraining-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Du_2024_CVPR, author = {Du, Yunhao and Lei, Cheng and Zhao, Zhicheng and Su, Fei}, title = {iKUN: Speak to Trackers without Retraining}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {19135-19144} }
Facial Identity Anonymization via Intrinsic and Extrinsic Attention Distraction-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Kuang_2024_CVPR, author = {Kuang, Zhenzhong and Yang, Xiaochen and Shen, Yingjie and Hu, Chao and Yu, Jun}, title = {Facial Identity Anonymization via Intrinsic and Extrinsic Attention Distraction}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12406-12415} }
3D-SceneDreamer: Text-Driven 3D-Consistent Scene Generation-
[pdf]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Songchun and Zhang, Yibo and Zheng, Quan and Ma, Rui and Hua, Wei and Bao, Hujun and Xu, Weiwei and Zou, Changqing}, title = {3D-SceneDreamer: Text-Driven 3D-Consistent Scene Generation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10170-10180} }
VMINer: Versatile Multi-view Inverse Rendering with Near- and Far-field Light Sources-
[pdf]
[supp]
[bibtex]@InProceedings{Fei_2024_CVPR, author = {Fei, Fan and Tang, Jiajun and Tan, Ping and Shi, Boxin}, title = {VMINer: Versatile Multi-view Inverse Rendering with Near- and Far-field Light Sources}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11800-11809} }
RoHM: Robust Human Motion Reconstruction via Diffusion-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Siwei and Bhatnagar, Bharat Lal and Xu, Yuanlu and Winkler, Alexander and Kadlecek, Petr and Tang, Siyu and Bogo, Federica}, title = {RoHM: Robust Human Motion Reconstruction via Diffusion}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14606-14617} }
Do You Remember? Dense Video Captioning with Cross-Modal Memory Retrieval-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Kim_2024_CVPR, author = {Kim, Minkuk and Kim, Hyeon Bae and Moon, Jinyoung and Choi, Jinwoo and Kim, Seong Tae}, title = {Do You Remember? Dense Video Captioning with Cross-Modal Memory Retrieval}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13894-13904} }
SPAD: Spatially Aware Multi-View Diffusers-
[pdf]
[supp]
[bibtex]@InProceedings{Kant_2024_CVPR, author = {Kant, Yash and Siarohin, Aliaksandr and Wu, Ziyi and Vasilkovsky, Michael and Qian, Guocheng and Ren, Jian and Guler, Riza Alp and Ghanem, Bernard and Tulyakov, Sergey and Gilitschenski, Igor}, title = {SPAD: Spatially Aware Multi-View Diffusers}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10026-10038} }
Gradient Reweighting: Towards Imbalanced Class-Incremental Learning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{He_2024_CVPR, author = {He, Jiangpeng}, title = {Gradient Reweighting: Towards Imbalanced Class-Incremental Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16668-16677} }
Gaussian Splatting SLAM-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Matsuki_2024_CVPR, author = {Matsuki, Hidenobu and Murai, Riku and Kelly, Paul H.J. and Davison, Andrew J.}, title = {Gaussian Splatting SLAM}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18039-18048} }
Not All Classes Stand on Same Embeddings: Calibrating a Semantic Distance with Metric Tensor-
[pdf]
[supp]
[bibtex]@InProceedings{Park_2024_CVPR, author = {Park, Jae Hyeon and Lee, Gyoomin and Park, Seunggi and Cho, Sung In}, title = {Not All Classes Stand on Same Embeddings: Calibrating a Semantic Distance with Metric Tensor}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17722-17731} }
A Simple Recipe for Contrastively Pre-training Video-First Encoders Beyond 16 Frames-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Papalampidi_2024_CVPR, author = {Papalampidi, Pinelopi and Koppula, Skanda and Pathak, Shreya and Chiu, Justin and Heyward, Joe and Patraucean, Viorica and Shen, Jiajun and Miech, Antoine and Zisserman, Andrew and Nematzdeh, Aida}, title = {A Simple Recipe for Contrastively Pre-training Video-First Encoders Beyond 16 Frames}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14386-14397} }
Hierarchical Diffusion Policy for Kinematics-Aware Multi-Task Robotic Manipulation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ma_2024_CVPR, author = {Ma, Xiao and Patidar, Sumit and Haughton, Iain and James, Stephen}, title = {Hierarchical Diffusion Policy for Kinematics-Aware Multi-Task Robotic Manipulation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18081-18090} }
Benchmarking the Robustness of Temporal Action Detection Models Against Temporal Corruptions-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zeng_2024_CVPR, author = {Zeng, Runhao and Chen, Xiaoyong and Liang, Jiaming and Wu, Huisi and Cao, Guangzhong and Guo, Yong}, title = {Benchmarking the Robustness of Temporal Action Detection Models Against Temporal Corruptions}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18263-18274} }
Open-World Human-Object Interaction Detection via Multi-modal Prompts-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Yang_2024_CVPR, author = {Yang, Jie and Li, Bingliang and Zeng, Ailing and Zhang, Lei and Zhang, Ruimao}, title = {Open-World Human-Object Interaction Detection via Multi-modal Prompts}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16954-16964} }
UniMODE: Unified Monocular 3D Object Detection-
[pdf]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Zhuoling and Xu, Xiaogang and Lim, SerNam and Zhao, Hengshuang}, title = {UniMODE: Unified Monocular 3D Object Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16561-16570} }
Multi-agent Collaborative Perception via Motion-aware Robust Communication Network-
[pdf]
[bibtex]@InProceedings{Hong_2024_CVPR, author = {Hong, Shixin and Liu, Yu and Li, Zhi and Li, Shaohui and He, You}, title = {Multi-agent Collaborative Perception via Motion-aware Robust Communication Network}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15301-15310} }
The Manga Whisperer: Automatically Generating Transcriptions for Comics-
[pdf]
[arXiv]
[bibtex]@InProceedings{Sachdeva_2024_CVPR, author = {Sachdeva, Ragav and Zisserman, Andrew}, title = {The Manga Whisperer: Automatically Generating Transcriptions for Comics}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12967-12976} }
Exploring Region-Word Alignment in Built-in Detector for Open-Vocabulary Object Detection-
[pdf]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Heng and Zhao, Qiuyu and Zheng, Linyu and Zeng, Hao and Ge, Zhiwei and Li, Tianhao and Xu, Sulong}, title = {Exploring Region-Word Alignment in Built-in Detector for Open-Vocabulary Object Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16975-16984} }
MovieChat: From Dense Token to Sparse Memory for Long Video Understanding-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Song_2024_CVPR, author = {Song, Enxin and Chai, Wenhao and Wang, Guanhong and Zhang, Yucheng and Zhou, Haoyang and Wu, Feiyang and Chi, Haozhe and Guo, Xun and Ye, Tian and Zhang, Yanting and Lu, Yan and Hwang, Jenq-Neng and Wang, Gaoang}, title = {MovieChat: From Dense Token to Sparse Memory for Long Video Understanding}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18221-18232} }
Comparing the Decision-Making Mechanisms by Transformers and CNNs via Explanation Methods-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Jiang_2024_CVPR, author = {Jiang, Mingqi and Khorram, Saeed and Fuxin, Li}, title = {Comparing the Decision-Making Mechanisms by Transformers and CNNs via Explanation Methods}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9546-9555} }
Atlantis: Enabling Underwater Depth Estimation with Stable Diffusion-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Fan and You, Shaodi and Li, Yu and Fu, Ying}, title = {Atlantis: Enabling Underwater Depth Estimation with Stable Diffusion}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11852-11861} }
Matching Anything by Segmenting Anything-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Siyuan and Ke, Lei and Danelljan, Martin and Piccinelli, Luigi and Segu, Mattia and Van Gool, Luc and Yu, Fisher}, title = {Matching Anything by Segmenting Anything}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18963-18973} }
Decoupled Pseudo-labeling for Semi-Supervised Monocular 3D Object Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Jiacheng and Li, Jiaming and Lin, Xiangru and Zhang, Wei and Tan, Xiao and Han, Junyu and Ding, Errui and Wang, Jingdong and Li, Guanbin}, title = {Decoupled Pseudo-labeling for Semi-Supervised Monocular 3D Object Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16923-16932} }
Temporally Consistent Unbalanced Optimal Transport for Unsupervised Action Segmentation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xu_2024_CVPR, author = {Xu, Ming and Gould, Stephen}, title = {Temporally Consistent Unbalanced Optimal Transport for Unsupervised Action Segmentation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14618-14627} }
Learning Transferable Negative Prompts for Out-of-Distribution Detection-
[pdf]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Tianqi and Pang, Guansong and Bai, Xiao and Miao, Wenjun and Zheng, Jin}, title = {Learning Transferable Negative Prompts for Out-of-Distribution Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17584-17594} }
Holistic Features are almost Sufficient for Text-to-Video Retrieval-
[pdf]
[bibtex]@InProceedings{Tian_2024_CVPR, author = {Tian, Kaibin and Zhao, Ruixiang and Xin, Zijie and Lan, Bangxiang and Li, Xirong}, title = {Holistic Features are almost Sufficient for Text-to-Video Retrieval}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17138-17147} }
Uncertainty-aware Action Decoupling Transformer for Action Anticipation-
[pdf]
[supp]
[bibtex]@InProceedings{Guo_2024_CVPR, author = {Guo, Hongji and Agarwal, Nakul and Lo, Shao-Yuan and Lee, Kwonjoon and Ji, Qiang}, title = {Uncertainty-aware Action Decoupling Transformer for Action Anticipation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18644-18654} }
One-Prompt to Segment All Medical Images-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wu_2024_CVPR, author = {Wu, Junde and Xu, Min}, title = {One-Prompt to Segment All Medical Images}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11302-11312} }
GROUNDHOG: Grounding Large Language Models to Holistic Segmentation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Yichi and Ma, Ziqiao and Gao, Xiaofeng and Shakiah, Suhaila and Gao, Qiaozi and Chai, Joyce}, title = {GROUNDHOG: Grounding Large Language Models to Holistic Segmentation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14227-14238} }
Omni-SMoLA: Boosting Generalist Multimodal Models with Soft Mixture of Low-rank Experts-
[pdf]
[supp]
[bibtex]@InProceedings{Wu_2024_CVPR, author = {Wu, Jialin and Hu, Xia and Wang, Yaqing and Pang, Bo and Soricut, Radu}, title = {Omni-SMoLA: Boosting Generalist Multimodal Models with Soft Mixture of Low-rank Experts}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14205-14215} }
SeMoLi: What Moves Together Belongs Together-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Seidenschwarz_2024_CVPR, author = {Seidenschwarz, Jenny and Osep, Aljosa and Ferroni, Francesco and Lucey, Simon and Leal-Taixe, Laura}, title = {SeMoLi: What Moves Together Belongs Together}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14685-14694} }
Context-Guided Spatio-Temporal Video Grounding-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Gu_2024_CVPR, author = {Gu, Xin and Fan, Heng and Huang, Yan and Luo, Tiejian and Zhang, Libo}, title = {Context-Guided Spatio-Temporal Video Grounding}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18330-18339} }
Explaining the Implicit Neural Canvas: Connecting Pixels to Neurons by Tracing their Contributions-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Padmanabhan_2024_CVPR, author = {Padmanabhan, Namitha and Gwilliam, Matthew and Kumar, Pulkit and Maiya, Shishira R and Ehrlich, Max and Shrivastava, Abhinav}, title = {Explaining the Implicit Neural Canvas: Connecting Pixels to Neurons by Tracing their Contributions}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10957-10967} }
Adapting to Length Shift: FlexiLength Network for Trajectory Prediction-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xu_2024_CVPR, author = {Xu, Yi and Fu, Yun}, title = {Adapting to Length Shift: FlexiLength Network for Trajectory Prediction}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15226-15237} }
WorDepth: Variational Language Prior for Monocular Depth Estimation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zeng_2024_CVPR, author = {Zeng, Ziyao and Wang, Daniel and Yang, Fengyu and Park, Hyoungseob and Soatto, Stefano and Lao, Dong and Wong, Alex}, title = {WorDepth: Variational Language Prior for Monocular Depth Estimation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9708-9719} }
A Unified Framework for Microscopy Defocus Deblur with Multi-Pyramid Transformer and Contrastive Learning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Yuelin and Zheng, Pengyu and Yan, Wanquan and Fang, Chengyu and Cheng, Shing Shin}, title = {A Unified Framework for Microscopy Defocus Deblur with Multi-Pyramid Transformer and Contrastive Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11125-11136} }
Frozen Feature Augmentation for Few-Shot Image Classification-
[pdf]
[supp]
[bibtex]@InProceedings{Bar_2024_CVPR, author = {B{\"a}r, Andreas and Houlsby, Neil and Dehghani, Mostafa and Kumar, Manoj}, title = {Frozen Feature Augmentation for Few-Shot Image Classification}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16046-16057} }
Incorporating Geo-Diverse Knowledge into Prompting for Increased Geographical Robustness in Object Recognition-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Buettner_2024_CVPR, author = {Buettner, Kyle and Malakouti, Sina and Li, Xiang Lorraine and Kovashka, Adriana}, title = {Incorporating Geo-Diverse Knowledge into Prompting for Increased Geographical Robustness in Object Recognition}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13515-13524} }
PIN: Positional Insert Unlocks Object Localisation Abilities in VLMs-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Dorkenwald_2024_CVPR, author = {Dorkenwald, Michael and Barazani, Nimrod and Snoek, Cees G. M. and Asano, Yuki M.}, title = {PIN: Positional Insert Unlocks Object Localisation Abilities in VLMs}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13548-13558} }
UniGarmentManip: A Unified Framework for Category-Level Garment Manipulation via Dense Visual Correspondence-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wu_2024_CVPR, author = {Wu, Ruihai and Lu, Haoran and Wang, Yiyan and Wang, Yubo and Dong, Hao}, title = {UniGarmentManip: A Unified Framework for Category-Level Garment Manipulation via Dense Visual Correspondence}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16340-16350} }
Multi-Attribute Interactions Matter for 3D Visual Grounding-
[pdf]
[bibtex]@InProceedings{Xu_2024_CVPR, author = {Xu, Can and Han, Yuehui and Xu, Rui and Hui, Le and Xie, Jin and Yang, Jian}, title = {Multi-Attribute Interactions Matter for 3D Visual Grounding}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17253-17262} }
SCINeRF: Neural Radiance Fields from a Snapshot Compressive Image-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Yunhao and Wang, Xiaodong and Wang, Ping and Yuan, Xin and Liu, Peidong}, title = {SCINeRF: Neural Radiance Fields from a Snapshot Compressive Image}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10542-10552} }
Improved Visual Grounding through Self-Consistent Explanations-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{He_2024_CVPR, author = {He, Ruozhen and Cascante-Bonilla, Paola and Yang, Ziyan and Berg, Alexander C. and Ordonez, Vicente}, title = {Improved Visual Grounding through Self-Consistent Explanations}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13095-13105} }
DifFlow3D: Toward Robust Uncertainty-Aware Scene Flow Estimation with Iterative Diffusion-Based Refinement-
[pdf]
[supp]
[bibtex]@InProceedings{Liu_2024_CVPR, author = {Liu, Jiuming and Wang, Guangming and Ye, Weicai and Jiang, Chaokang and Han, Jinru and Liu, Zhe and Zhang, Guofeng and Du, Dalong and Wang, Hesheng}, title = {DifFlow3D: Toward Robust Uncertainty-Aware Scene Flow Estimation with Iterative Diffusion-Based Refinement}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15109-15119} }
FlashEval: Towards Fast and Accurate Evaluation of Text-to-image Diffusion Generative Models-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhao_2024_CVPR, author = {Zhao, Lin and Zhao, Tianchen and Lin, Zinan and Ning, Xuefei and Dai, Guohao and Yang, Huazhong and Wang, Yu}, title = {FlashEval: Towards Fast and Accurate Evaluation of Text-to-image Diffusion Generative Models}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16122-16131} }
View From Above: Orthogonal-View aware Cross-view Localization-
[pdf]
[supp]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Shan and Nguyen, Chuong and Liu, Jiawei and Zhang, Yanhao and Muthu, Sundaram and Maken, Fahira Afzal and Zhang, Kaihao and Li, Hongdong}, title = {View From Above: Orthogonal-View aware Cross-view Localization}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14843-14852} }
PeVL: Pose-Enhanced Vision-Language Model for Fine-Grained Human Action Recognition-
[pdf]
[supp]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Haosong and Leong, Mei Chee and Li, Liyuan and Lin, Weisi}, title = {PeVL: Pose-Enhanced Vision-Language Model for Fine-Grained Human Action Recognition}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18857-18867} }
DeepCache: Accelerating Diffusion Models for Free-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ma_2024_CVPR, author = {Ma, Xinyin and Fang, Gongfan and Wang, Xinchao}, title = {DeepCache: Accelerating Diffusion Models for Free}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15762-15772} }
Learning Correlation Structures for Vision Transformers-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Kim_2024_CVPR, author = {Kim, Manjin and Seo, Paul Hongsuck and Schmid, Cordelia and Cho, Minsu}, title = {Learning Correlation Structures for Vision Transformers}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18941-18951} }
PrPSeg: Universal Proposition Learning for Panoramic Renal Pathology Segmentation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Deng_2024_CVPR, author = {Deng, Ruining and Liu, Quan and Cui, Can and Yao, Tianyuan and Yue, Jialin and Xiong, Juming and Yu, Lining and Wu, Yifei and Yin, Mengmeng and Wang, Yu and Zhao, Shilin and Tang, Yucheng and Yang, Haichun and Huo, Yuankai}, title = {PrPSeg: Universal Proposition Learning for Panoramic Renal Pathology Segmentation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11736-11746} }
Weakly-Supervised Audio-Visual Video Parsing with Prototype-based Pseudo-Labeling-
[pdf]
[supp]
[bibtex]@InProceedings{Rachavarapu_2024_CVPR, author = {Rachavarapu, Kranthi Kumar and Ramakrishnan, Kalyan and Rajagopalan, A. N.}, title = {Weakly-Supervised Audio-Visual Video Parsing with Prototype-based Pseudo-Labeling}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18952-18962} }
Intraoperative 2D/3D Image Registration via Differentiable X-ray Rendering-
[pdf]
[supp]
[bibtex]@InProceedings{Gopalakrishnan_2024_CVPR, author = {Gopalakrishnan, Vivek and Dey, Neel and Golland, Polina}, title = {Intraoperative 2D/3D Image Registration via Differentiable X-ray Rendering}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11662-11672} }
MICap: A Unified Model for Identity-Aware Movie Descriptions-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Raajesh_2024_CVPR, author = {Raajesh, Haran and Desanur, Naveen Reddy and Khan, Zeeshan and Tapaswi, Makarand}, title = {MICap: A Unified Model for Identity-Aware Movie Descriptions}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14011-14021} }
MonoDiff: Monocular 3D Object Detection and Pose Estimation with Diffusion Models-
[pdf]
[supp]
[bibtex]@InProceedings{Ranasinghe_2024_CVPR, author = {Ranasinghe, Yasiru and Hegde, Deepti and Patel, Vishal M.}, title = {MonoDiff: Monocular 3D Object Detection and Pose Estimation with Diffusion Models}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10659-10670} }
An Upload-Efficient Scheme for Transferring Knowledge From a Server-Side Pre-trained Generator to Clients in Heterogeneous Federated Learning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Jianqing and Liu, Yang and Hua, Yang and Cao, Jian}, title = {An Upload-Efficient Scheme for Transferring Knowledge From a Server-Side Pre-trained Generator to Clients in Heterogeneous Federated Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12109-12119} }
Instance-aware Exploration-Verification-Exploitation for Instance ImageGoal Navigation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Lei_2024_CVPR, author = {Lei, Xiaohan and Wang, Min and Zhou, Wengang and Li, Li and Li, Houqiang}, title = {Instance-aware Exploration-Verification-Exploitation for Instance ImageGoal Navigation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16329-16339} }
One-2-3-45++: Fast Single Image to 3D Objects with Consistent Multi-View Generation and 3D Diffusion-
[pdf]
[supp]
[bibtex]@InProceedings{Liu_2024_CVPR, author = {Liu, Minghua and Shi, Ruoxi and Chen, Linghao and Zhang, Zhuoyang and Xu, Chao and Wei, Xinyue and Chen, Hansheng and Zeng, Chong and Gu, Jiayuan and Su, Hao}, title = {One-2-3-45++: Fast Single Image to 3D Objects with Consistent Multi-View Generation and 3D Diffusion}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10072-10083} }
Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language Models with Creative Humor Generation-
[pdf]
[supp]
[bibtex]@InProceedings{Zhong_2024_CVPR, author = {Zhong, Shanshan and Huang, Zhongzhan and Gao, Shanghua and Wen, Wushao and Lin, Liang and Zitnik, Marinka and Zhou, Pan}, title = {Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language Models with Creative Humor Generation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13246-13257} }
SceneFun3D: Fine-Grained Functionality and Affordance Understanding in 3D Scenes-
[pdf]
[supp]
[bibtex]@InProceedings{Delitzas_2024_CVPR, author = {Delitzas, Alexandros and Takmaz, Ayca and Tombari, Federico and Sumner, Robert and Pollefeys, Marc and Engelmann, Francis}, title = {SceneFun3D: Fine-Grained Functionality and Affordance Understanding in 3D Scenes}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14531-14542} }
Enhanced Motion-Text Alignment for Image-to-Video Transfer Learning-
[pdf]
[supp]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Wei and Wan, Chaoqun and Liu, Tongliang and Tian, Xinmei and Shen, Xu and Ye, Jieping}, title = {Enhanced Motion-Text Alignment for Image-to-Video Transfer Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18504-18515} }
UV-IDM: Identity-Conditioned Latent Diffusion Model for Face UV-Texture Generation-
[pdf]
[supp]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Hong and Feng, Yutang and Xue, Song and Liu, Xuhui and Zeng, Bohan and Li, Shanglin and Liu, Boyu and Liu, Jianzhuang and Han, Shumin and Zhang, Baochang}, title = {UV-IDM: Identity-Conditioned Latent Diffusion Model for Face UV-Texture Generation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10585-10595} }
A Pedestrian is Worth One Prompt: Towards Language Guidance Person Re-Identification-
[pdf]
[supp]
[bibtex]@InProceedings{Yang_2024_CVPR, author = {Yang, Zexian and Wu, Dayan and Wu, Chenming and Lin, Zheng and Gu, Jingzi and Wang, Weiping}, title = {A Pedestrian is Worth One Prompt: Towards Language Guidance Person Re-Identification}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17343-17353} }
NetTrack: Tracking Highly Dynamic Objects with a Net-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zheng_2024_CVPR, author = {Zheng, Guangze and Lin, Shijie and Zuo, Haobo and Fu, Changhong and Pan, Jia}, title = {NetTrack: Tracking Highly Dynamic Objects with a Net}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {19145-19155} }
Grounded Question-Answering in Long Egocentric Videos-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Di_2024_CVPR, author = {Di, Shangzhe and Xie, Weidi}, title = {Grounded Question-Answering in Long Egocentric Videos}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12934-12943} }
HPNet: Dynamic Trajectory Forecasting with Historical Prediction Attention-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Tang_2024_CVPR, author = {Tang, Xiaolong and Kan, Meina and Shan, Shiguang and Ji, Zhilong and Bai, Jinfeng and Chen, Xilin}, title = {HPNet: Dynamic Trajectory Forecasting with Historical Prediction Attention}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15261-15270} }
SI-MIL: Taming Deep MIL for Self-Interpretability in Gigapixel Histopathology-
[pdf]
[supp]
[bibtex]@InProceedings{Kapse_2024_CVPR, author = {Kapse, Saarthak and Pati, Pushpak and Das, Srijan and Zhang, Jingwei and Chen, Chao and Vakalopoulou, Maria and Saltz, Joel and Samaras, Dimitris and Gupta, Rajarsi R. and Prasanna, Prateek}, title = {SI-MIL: Taming Deep MIL for Self-Interpretability in Gigapixel Histopathology}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11226-11237} }
LayoutFormer: Hierarchical Text Detection Towards Scene Text Understanding-
[pdf]
[bibtex]@InProceedings{Liang_2024_CVPR, author = {Liang, Min and Ma, Jia-Wei and Zhu, Xiaobin and Qin, Jingyan and Yin, Xu-Cheng}, title = {LayoutFormer: Hierarchical Text Detection Towards Scene Text Understanding}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15665-15674} }
GLOW: Global Layout Aware Attacks on Object Detection-
[pdf]
[supp]
[bibtex]@InProceedings{Bao_2024_CVPR, author = {Bao, Jun and Liu, Buyu and Ren, Kui and Yu, Jun}, title = {GLOW: Global Layout Aware Attacks on Object Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12057-12066} }
SIRA: Scalable Inter-frame Relation and Association for Radar Perception-
[pdf]
[supp]
[bibtex]@InProceedings{Yataka_2024_CVPR, author = {Yataka, Ryoma and Wang, Pu and Boufounos, Petros and Takahashi, Ryuhei}, title = {SIRA: Scalable Inter-frame Relation and Association for Radar Perception}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15024-15034} }
VOODOO 3D: Volumetric Portrait Disentanglement For One-Shot 3D Head Reenactment-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Tran_2024_CVPR, author = {Tran, Phong and Zakharov, Egor and Ho, Long-Nhat and Tran, Anh Tuan and Hu, Liwen and Li, Hao}, title = {VOODOO 3D: Volumetric Portrait Disentanglement For One-Shot 3D Head Reenactment}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10336-10348} }
Visual Fact Checker: Enabling High-Fidelity Detailed Caption Generation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ge_2024_CVPR, author = {Ge, Yunhao and Zeng, Xiaohui and Huffman, Jacob Samuel and Lin, Tsung-Yi and Liu, Ming-Yu and Cui, Yin}, title = {Visual Fact Checker: Enabling High-Fidelity Detailed Caption Generation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14033-14042} }
Communication-Efficient Collaborative Perception via Information Filling with Codebook-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Hu_2024_CVPR, author = {Hu, Yue and Peng, Juntong and Liu, Sifei and Ge, Junhao and Liu, Si and Chen, Siheng}, title = {Communication-Efficient Collaborative Perception via Information Filling with Codebook}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15481-15490} }
MVD-Fusion: Single-view 3D via Depth-consistent Multi-view Generation-
[pdf]
[supp]
[bibtex]@InProceedings{Hu_2024_CVPR, author = {Hu, Hanzhe and Zhou, Zhizhuo and Jampani, Varun and Tulsiani, Shubham}, title = {MVD-Fusion: Single-view 3D via Depth-consistent Multi-view Generation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9698-9707} }
Effective Video Mirror Detection with Inconsistent Motion Cues-
[pdf]
[supp]
[bibtex]@InProceedings{Warren_2024_CVPR, author = {Warren, Alex and Xu, Ke and Lin, Jiaying and Tam, Gary K. L. and Lau, Rynson W. H.}, title = {Effective Video Mirror Detection with Inconsistent Motion Cues}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17244-17252} }
DiffLoc: Diffusion Model for Outdoor LiDAR Localization-
[pdf]
[supp]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Wen and Yang, Yuyang and Yu, Shangshu and Hu, Guosheng and Wen, Chenglu and Cheng, Ming and Wang, Cheng}, title = {DiffLoc: Diffusion Model for Outdoor LiDAR Localization}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15045-15054} }
On Scaling Up a Multilingual Vision and Language Model-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Chen_2024_CVPR, author = {Chen, Xi and Djolonga, Josip and Padlewski, Piotr and Mustafa, Basil and Changpinyo, Soravit and Wu, Jialin and Ruiz, Carlos Riquelme and Goodman, Sebastian and Wang, Xiao and Tay, Yi and Shakeri, Siamak and Dehghani, Mostafa and Salz, Daniel and Lucic, Mario and Tschannen, Michael and Nagrani, Arsha and Hu, Hexiang and Joshi, Mandar and Pang, Bo and Montgomery, Ceslee and Pietrzyk, Paulina and Ritter, Marvin and Piergiovanni, AJ and Minderer, Matthias and Pavetic, Filip and Waters, Austin and Li, Gang and Alabdulmohsin, Ibrahim and Beyer, Lucas and Amelot, Julien and Lee, Kenton and Steiner, Andreas Peter and Li, Yang and Keysers, Daniel and Arnab, Anurag and Xu, Yuanzhong and Rong, Keran and Kolesnikov, Alexander and Seyedhosseini, Mojtaba and Angelova, Anelia and Zhai, Xiaohua and Houlsby, Neil and Soricut, Radu}, title = {On Scaling Up a Multilingual Vision and Language Model}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14432-14444} }
Day-Night Cross-domain Vehicle Re-identification-
[pdf]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Hongchao and Chen, Jingong and Zheng, Aihua and Wu, Yong and Luo, Yonglong}, title = {Day-Night Cross-domain Vehicle Re-identification}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12626-12635} }
Holodeck: Language Guided Generation of 3D Embodied AI Environments-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Yang_2024_CVPR, author = {Yang, Yue and Sun, Fan-Yun and Weihs, Luca and VanderBilt, Eli and Herrasti, Alvaro and Han, Winson and Wu, Jiajun and Haber, Nick and Krishna, Ranjay and Liu, Lingjie and Callison-Burch, Chris and Yatskar, Mark and Kembhavi, Aniruddha and Clark, Christopher}, title = {Holodeck: Language Guided Generation of 3D Embodied AI Environments}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16227-16237} }
Distilled Datamodel with Reverse Gradient Matching-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ye_2024_CVPR, author = {Ye, Jingwen and Yu, Ruonan and Liu, Songhua and Wang, Xinchao}, title = {Distilled Datamodel with Reverse Gradient Matching}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11954-11963} }
Pseudo Label Refinery for Unsupervised Domain Adaptation on Cross-dataset 3D Object Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Zhanwei and Chen, Minghao and Xiao, Shuai and Peng, Liang and Li, Hengjia and Lin, Binbin and Li, Ping and Wang, Wenxiao and Wu, Boxi and Cai, Deng}, title = {Pseudo Label Refinery for Unsupervised Domain Adaptation on Cross-dataset 3D Object Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15291-15300} }
Reconstructing Hands in 3D with Transformers-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Pavlakos_2024_CVPR, author = {Pavlakos, Georgios and Shan, Dandan and Radosavovic, Ilija and Kanazawa, Angjoo and Fouhey, David and Malik, Jitendra}, title = {Reconstructing Hands in 3D with Transformers}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9826-9836} }
PELA: Learning Parameter-Efficient Models with Low-Rank Approximation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Guo_2024_CVPR, author = {Guo, Yangyang and Wang, Guangzhi and Kankanhalli, Mohan}, title = {PELA: Learning Parameter-Efficient Models with Low-Rank Approximation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15699-15709} }
Auto-Train-Once: Controller Network Guided Automatic Network Pruning from Scratch-
[pdf]
[supp]
[bibtex]@InProceedings{Wu_2024_CVPR, author = {Wu, Xidong and Gao, Shangqian and Zhang, Zeyu and Li, Zhenzhen and Bao, Runxue and Zhang, Yanfu and Wang, Xiaoqian and Huang, Heng}, title = {Auto-Train-Once: Controller Network Guided Automatic Network Pruning from Scratch}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16163-16173} }
Constructing and Exploring Intermediate Domains in Mixed Domain Semi-supervised Medical Image Segmentation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ma_2024_CVPR, author = {Ma, Qinghe and Zhang, Jian and Qi, Lei and Yu, Qian and Shi, Yinghuan and Gao, Yang}, title = {Constructing and Exploring Intermediate Domains in Mixed Domain Semi-supervised Medical Image Segmentation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11642-11651} }
From Isolated Islands to Pangea: Unifying Semantic Space for Human Action Understanding-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Yong-Lu and Wu, Xiaoqian and Liu, Xinpeng and Wang, Zehao and Dou, Yiming and Ji, Yikun and Zhang, Junyi and Li, Yixing and Lu, Xudong and Tan, Jingru and Lu, Cewu}, title = {From Isolated Islands to Pangea: Unifying Semantic Space for Human Action Understanding}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16582-16592} }
Bootstrapping Autonomous Driving Radars with Self-Supervised Learning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Hao_2024_CVPR, author = {Hao, Yiduo and Madani, Sohrab and Guan, Junfeng and Alloulah, Mohammed and Gupta, Saurabh and Hassanieh, Haitham}, title = {Bootstrapping Autonomous Driving Radars with Self-Supervised Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15012-15023} }
Weakly Supervised Monocular 3D Detection with a Single-View Image-
[pdf]
[arXiv]
[bibtex]@InProceedings{Jiang_2024_CVPR, author = {Jiang, Xueying and Jin, Sheng and Lu, Lewei and Zhang, Xiaoqin and Lu, Shijian}, title = {Weakly Supervised Monocular 3D Detection with a Single-View Image}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10508-10518} }
Blind Image Quality Assessment Based on Geometric Order Learning-
[pdf]
[supp]
[bibtex]@InProceedings{Shin_2024_CVPR, author = {Shin, Nyeong-Ho and Lee, Seon-Ho and Kim, Chang-Su}, title = {Blind Image Quality Assessment Based on Geometric Order Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12799-12808} }
Generalizable Whole Slide Image Classification with Fine-Grained Visual-Semantic Interaction-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Hao and Chen, Ying and Chen, Yifei and Yu, Rongshan and Yang, Wenxian and Wang, Liansheng and Ding, Bowen and Han, Yuchen}, title = {Generalizable Whole Slide Image Classification with Fine-Grained Visual-Semantic Interaction}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11398-11407} }
Generalizing 6-DoF Grasp Detection via Domain Prior Knowledge-
[pdf]
[supp]
[bibtex]@InProceedings{Ma_2024_CVPR, author = {Ma, Haoxiang and Shi, Modi and Gao, Boyang and Huang, Di}, title = {Generalizing 6-DoF Grasp Detection via Domain Prior Knowledge}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18102-18111} }
RadSimReal: Bridging the Gap Between Synthetic and Real Data in Radar Object Detection With Simulation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Bialer_2024_CVPR, author = {Bialer, Oded and Haitman, Yuval}, title = {RadSimReal: Bridging the Gap Between Synthetic and Real Data in Radar Object Detection With Simulation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15407-15416} }
3DSFLabelling: Boosting 3D Scene Flow Estimation by Pseudo Auto-labelling-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Jiang_2024_CVPR, author = {Jiang, Chaokang and Wang, Guangming and Liu, Jiuming and Wang, Hesheng and Ma, Zhuang and Liu, Zhenqiang and Liang, Zhujin and Shan, Yi and Du, Dalong}, title = {3DSFLabelling: Boosting 3D Scene Flow Estimation by Pseudo Auto-labelling}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15173-15183} }
Question Aware Vision Transformer for Multimodal Reasoning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ganz_2024_CVPR, author = {Ganz, Roy and Kittenplon, Yair and Aberdam, Aviad and Ben Avraham, Elad and Nuriel, Oren and Mazor, Shai and Litman, Ron}, title = {Question Aware Vision Transformer for Multimodal Reasoning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13861-13871} }
OST: Refining Text Knowledge with Optimal Spatio-Temporal Descriptor for General Video Recognition-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Chen_2024_CVPR, author = {Chen, Tongjia and Yu, Hongshan and Yang, Zhengeng and Li, Zechuan and Sun, Wei and Chen, Chen}, title = {OST: Refining Text Knowledge with Optimal Spatio-Temporal Descriptor for General Video Recognition}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18888-18898} }
Habitat Synthetic Scenes Dataset (HSSD-200): An Analysis of 3D Scene Scale and Realism Tradeoffs for ObjectGoal Navigation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Khanna_2024_CVPR, author = {Khanna, Mukul and Mao, Yongsen and Jiang, Hanxiao and Haresh, Sanjay and Shacklett, Brennan and Batra, Dhruv and Clegg, Alexander and Undersander, Eric and Chang, Angel X. and Savva, Manolis}, title = {Habitat Synthetic Scenes Dataset (HSSD-200): An Analysis of 3D Scene Scale and Realism Tradeoffs for ObjectGoal Navigation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16384-16393} }
NViST: In the Wild New View Synthesis from a Single Image with Transformers-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Jang_2024_CVPR, author = {Jang, Wonbong and Agapito, Lourdes}, title = {NViST: In the Wild New View Synthesis from a Single Image with Transformers}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10181-10193} }
Step Differences in Instructional Video-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Nagarajan_2024_CVPR, author = {Nagarajan, Tushar and Torresani, Lorenzo}, title = {Step Differences in Instructional Video}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18740-18750} }
Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Yang_2024_CVPR, author = {Yang, Lihe and Kang, Bingyi and Huang, Zilong and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang}, title = {Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10371-10381} }
MPOD123: One Image to 3D Content Generation Using Mask-enhanced Progressive Outline-to-Detail Optimization-
[pdf]
[bibtex]@InProceedings{Xu_2024_CVPR, author = {Xu, Jimin and Wang, Tianbao and Jin, Tao and Zhang, Shengyu and Fu, Dongjie and Wang, Zhe and Lyu, Jiangjing and Lv, Chengfei and Niu, Chaoyue and Yu, Zhou and Zhao, Zhou and Wu, Fei}, title = {MPOD123: One Image to 3D Content Generation Using Mask-enhanced Progressive Outline-to-Detail Optimization}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10682-10692} }
UnionFormer: Unified-Learning Transformer with Multi-View Representation for Image Manipulation Detection and Localization-
[pdf]
[supp]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Shuaibo and Ma, Wei and Guo, Jianwei and Xu, Shibiao and Li, Benchong and Zhang, Xiaopeng}, title = {UnionFormer: Unified-Learning Transformer with Multi-View Representation for Image Manipulation Detection and Localization}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12523-12533} }
Situational Awareness Matters in 3D Vision Language Reasoning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Man_2024_CVPR, author = {Man, Yunze and Gui, Liang-Yan and Wang, Yu-Xiong}, title = {Situational Awareness Matters in 3D Vision Language Reasoning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13678-13688} }
RCBEVDet: Radar-camera Fusion in Bird's Eye View for 3D Object Detection-
[pdf]
[bibtex]@InProceedings{Lin_2024_CVPR, author = {Lin, Zhiwei and Liu, Zhe and Xia, Zhongyu and Wang, Xinhao and Wang, Yongtao and Qi, Shengxiang and Dong, Yang and Dong, Nan and Zhang, Le and Zhu, Ce}, title = {RCBEVDet: Radar-camera Fusion in Bird's Eye View for 3D Object Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14928-14937} }
Adaptive Softassign via Hadamard-Equipped Sinkhorn-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Shen_2024_CVPR, author = {Shen, Binrui and Niu, Qiang and Zhu, Shengxin}, title = {Adaptive Softassign via Hadamard-Equipped Sinkhorn}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17638-17647} }
Re-thinking Data Availability Attacks Against Deep Neural Networks-
[pdf]
[supp]
[bibtex]@InProceedings{Fang_2024_CVPR, author = {Fang, Bin and Li, Bo and Wu, Shuang and Ding, Shouhong and Yi, Ran and Ma, Lizhuang}, title = {Re-thinking Data Availability Attacks Against Deep Neural Networks}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12215-12224} }
SHiNe: Semantic Hierarchy Nexus for Open-vocabulary Object Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Liu_2024_CVPR, author = {Liu, Mingxuan and Hayes, Tyler L. and Ricci, Elisa and Csurka, Gabriela and Volpi, Riccardo}, title = {SHiNe: Semantic Hierarchy Nexus for Open-vocabulary Object Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16634-16644} }
Ranking Distillation for Open-Ended Video Question Answering with Insufficient Labels-
[pdf]
[arXiv]
[bibtex]@InProceedings{Liang_2024_CVPR, author = {Liang, Tianming and Tan, Chaolei and Xia, Beihao and Zheng, Wei-Shi and Hu, Jian-Fang}, title = {Ranking Distillation for Open-Ended Video Question Answering with Insufficient Labels}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13161-13170} }
Depth-Aware Concealed Crop Detection in Dense Agricultural Scenes-
[pdf]
[supp]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Liqiong and Yang, Jinyu and Zhang, Yanfu and Wang, Fangyi and Zheng, Feng}, title = {Depth-Aware Concealed Crop Detection in Dense Agricultural Scenes}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17201-17211} }
Solving the Catastrophic Forgetting Problem in Generalized Category Discovery-
[pdf]
[supp]
[bibtex]@InProceedings{Cao_2024_CVPR, author = {Cao, Xinzi and Zheng, Xiawu and Wang, Guanhong and Yu, Weijiang and Shen, Yunhang and Li, Ke and Lu, Yutong and Tian, Yonghong}, title = {Solving the Catastrophic Forgetting Problem in Generalized Category Discovery}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16880-16889} }
Data-Efficient Unsupervised Interpolation Without Any Intermediate Frame for 4D Medical Images-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Kim_2024_CVPR, author = {Kim, JungEun and Yoon, Hangyul and Park, Geondo and Kim, Kyungsu and Yang, Eunho}, title = {Data-Efficient Unsupervised Interpolation Without Any Intermediate Frame for 4D Medical Images}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11353-11364} }
Learning the 3D Fauna of the Web-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Zizhang and Litvak, Dor and Li, Ruining and Zhang, Yunzhi and Jakab, Tomas and Rupprecht, Christian and Wu, Shangzhe and Vedaldi, Andrea and Wu, Jiajun}, title = {Learning the 3D Fauna of the Web}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9752-9762} }
LISA: Reasoning Segmentation via Large Language Model-
[pdf]
[arXiv]
[bibtex]@InProceedings{Lai_2024_CVPR, author = {Lai, Xin and Tian, Zhuotao and Chen, Yukang and Li, Yanwei and Yuan, Yuhui and Liu, Shu and Jia, Jiaya}, title = {LISA: Reasoning Segmentation via Large Language Model}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9579-9589} }
Bridging the Gap: A Unified Video Comprehension Framework for Moment Retrieval and Highlight Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xiao_2024_CVPR, author = {Xiao, Yicheng and Luo, Zhuoyan and Liu, Yong and Ma, Yue and Bian, Hengwei and Ji, Yatai and Yang, Yujiu and Li, Xiu}, title = {Bridging the Gap: A Unified Video Comprehension Framework for Moment Retrieval and Highlight Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18709-18719} }
MuseChat: A Conversational Music Recommendation System for Videos-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Dong_2024_CVPR, author = {Dong, Zhikang and Liu, Xiulong and Chen, Bin and Polak, Pawel and Zhang, Peng}, title = {MuseChat: A Conversational Music Recommendation System for Videos}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12775-12785} }
Device-Wise Federated Network Pruning-
[pdf]
[supp]
[bibtex]@InProceedings{Gao_2024_CVPR, author = {Gao, Shangqian and Li, Junyi and Zhang, Zeyu and Zhang, Yanfu and Cai, Weidong and Huang, Heng}, title = {Device-Wise Federated Network Pruning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12342-12352} }
MoReVQA: Exploring Modular Reasoning Models for Video Question Answering-
[pdf]
[arXiv]
[bibtex]@InProceedings{Min_2024_CVPR, author = {Min, Juhong and Buch, Shyamal and Nagrani, Arsha and Cho, Minsu and Schmid, Cordelia}, title = {MoReVQA: Exploring Modular Reasoning Models for Video Question Answering}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13235-13245} }
Low-Rank Rescaled Vision Transformer Fine-Tuning: A Residual Design Approach-
[pdf]
[supp]
[bibtex]@InProceedings{Dong_2024_CVPR, author = {Dong, Wei and Zhang, Xing and Chen, Bihui and Yan, Dawei and Lin, Zhijun and Yan, Qingsen and Wang, Peng and Yang, Yang}, title = {Low-Rank Rescaled Vision Transformer Fine-Tuning: A Residual Design Approach}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16101-16110} }
Distribution-aware Knowledge Prototyping for Non-exemplar Lifelong Person Re-identification-
[pdf]
[supp]
[bibtex]@InProceedings{Xu_2024_CVPR, author = {Xu, Kunlun and Zou, Xu and Peng, Yuxin and Zhou, Jiahuan}, title = {Distribution-aware Knowledge Prototyping for Non-exemplar Lifelong Person Re-identification}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16604-16613} }
Generating Enhanced Negatives for Training Language-Based Object Detectors-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhao_2024_CVPR, author = {Zhao, Shiyu and Zhao, Long and G, Vijay Kumar B and Suh, Yumin and Metaxas, Dimitris N. and Chandraker, Manmohan and Schulter, Samuel}, title = {Generating Enhanced Negatives for Training Language-Based Object Detectors}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13592-13602} }
FedAS: Bridging Inconsistency in Personalized Federated Learning-
[pdf]
[bibtex]@InProceedings{Yang_2024_CVPR, author = {Yang, Xiyuan and Huang, Wenke and Ye, Mang}, title = {FedAS: Bridging Inconsistency in Personalized Federated Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11986-11995} }
MoST: Multi-Modality Scene Tokenization for Motion Prediction-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Mu_2024_CVPR, author = {Mu, Norman and Ji, Jingwei and Yang, Zhenpei and Harada, Nate and Tang, Haotian and Chen, Kan and Qi, Charles R. and Ge, Runzhou and Goel, Kratarth and Yang, Zoey and Ettinger, Scott and Al-Rfou, Rami and Anguelov, Dragomir and Zhou, Yin}, title = {MoST: Multi-Modality Scene Tokenization for Motion Prediction}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14988-14999} }
PIGEON: Predicting Image Geolocations-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Haas_2024_CVPR, author = {Haas, Lukas and Skreta, Michal and Alberti, Silas and Finn, Chelsea}, title = {PIGEON: Predicting Image Geolocations}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12893-12902} }
Flow-Guided Online Stereo Rectification for Wide Baseline Stereo-
[pdf]
[supp]
[bibtex]@InProceedings{Kumar_2024_CVPR, author = {Kumar, Anush and Mannan, Fahim and Jafari, Omid Hosseini and Li, Shile and Heide, Felix}, title = {Flow-Guided Online Stereo Rectification for Wide Baseline Stereo}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15375-15385} }
Driving Everywhere with Large Language Model Policy Adaptation-
[pdf]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Boyi and Wang, Yue and Mao, Jiageng and Ivanovic, Boris and Veer, Sushant and Leung, Karen and Pavone, Marco}, title = {Driving Everywhere with Large Language Model Policy Adaptation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14948-14957} }
Koala: Key Frame-Conditioned Long Video-LLM-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Tan_2024_CVPR, author = {Tan, Reuben and Sun, Ximeng and Hu, Ping and Wang, Jui-hsien and Deilamsalehy, Hanieh and Plummer, Bryan A. and Russell, Bryan and Saenko, Kate}, title = {Koala: Key Frame-Conditioned Long Video-LLM}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13581-13591} }
HallusionBench: An Advanced Diagnostic Suite for Entangled Language Hallucination and Visual Illusion in Large Vision-Language Models-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Guan_2024_CVPR, author = {Guan, Tianrui and Liu, Fuxiao and Wu, Xiyang and Xian, Ruiqi and Li, Zongxia and Liu, Xiaoyu and Wang, Xijun and Chen, Lichang and Huang, Furong and Yacoob, Yaser and Manocha, Dinesh and Zhou, Tianyi}, title = {HallusionBench: An Advanced Diagnostic Suite for Entangled Language Hallucination and Visual Illusion in Large Vision-Language Models}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14375-14385} }
ID-like Prompt Learning for Few-Shot Out-of-Distribution Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Bai_2024_CVPR, author = {Bai, Yichen and Han, Zongbo and Cao, Bing and Jiang, Xiaoheng and Hu, Qinghua and Zhang, Changqing}, title = {ID-like Prompt Learning for Few-Shot Out-of-Distribution Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17480-17489} }
Jack of All Tasks Master of Many: Designing General-Purpose Coarse-to-Fine Vision-Language Model-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Pramanick_2024_CVPR, author = {Pramanick, Shraman and Han, Guangxing and Hou, Rui and Nag, Sayan and Lim, Ser-Nam and Ballas, Nicolas and Wang, Qifan and Chellappa, Rama and Almahairi, Amjad}, title = {Jack of All Tasks Master of Many: Designing General-Purpose Coarse-to-Fine Vision-Language Model}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14076-14088} }
SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation System-
[pdf]
[arXiv]
[bibtex]@InProceedings{Fan_2024_CVPR, author = {Fan, Yunfei and Zhao, Tianyu and Wang, Guidong}, title = {SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation System}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17964-17973} }
ViP-LLaVA: Making Large Multimodal Models Understand Arbitrary Visual Prompts-
[pdf]
[supp]
[bibtex]@InProceedings{Cai_2024_CVPR, author = {Cai, Mu and Liu, Haotian and Mustikovela, Siva Karthik and Meyer, Gregory P. and Chai, Yuning and Park, Dennis and Lee, Yong Jae}, title = {ViP-LLaVA: Making Large Multimodal Models Understand Arbitrary Visual Prompts}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12914-12923} }
OVER-NAV: Elevating Iterative Vision-and-Language Navigation with Open-Vocabulary Detection and StructurEd Representation-
[pdf]
[supp]
[bibtex]@InProceedings{Zhao_2024_CVPR, author = {Zhao, Ganlong and Li, Guanbin and Chen, Weikai and Yu, Yizhou}, title = {OVER-NAV: Elevating Iterative Vision-and-Language Navigation with Open-Vocabulary Detection and StructurEd Representation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16296-16306} }
All Rivers Run to the Sea: Private Learning with Asymmetric Flows-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Niu_2024_CVPR, author = {Niu, Yue and Ali, Ramy E. and Prakash, Saurav and Avestimehr, Salman}, title = {All Rivers Run to the Sea: Private Learning with Asymmetric Flows}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12353-12362} }
HandBooster: Boosting 3D Hand-Mesh Reconstruction by Conditional Synthesis and Sampling of Hand-Object Interactions-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xu_2024_CVPR, author = {Xu, Hao and Li, Haipeng and Wang, Yinqiao and Liu, Shuaicheng and Fu, Chi-Wing}, title = {HandBooster: Boosting 3D Hand-Mesh Reconstruction by Conditional Synthesis and Sampling of Hand-Object Interactions}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10159-10169} }
A-Teacher: Asymmetric Network for 3D Semi-Supervised Object Detection-
[pdf]
[supp]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Hanshi and Zhang, Zhipeng and Gao, Jin and Hu, Weiming}, title = {A-Teacher: Asymmetric Network for 3D Semi-Supervised Object Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14978-14987} }
Visual Objectification in Films: Towards a New AI Task for Video Interpretation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Tores_2024_CVPR, author = {Tores, Julie and Sassatelli, Lucile and Wu, Hui-Yin and Bergman, Clement and Andolfi, L\'ea and Ecrement, Victor and Precioso, Fr\'ed\'eric and Devars, Thierry and Guaresi, Magali and Julliard, Virginie and Lecossais, Sarah}, title = {Visual Objectification in Films: Towards a New AI Task for Video Interpretation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10864-10874} }
BiTT: Bi-directional Texture Reconstruction of Interacting Two Hands from a Single Image-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Kim_2024_CVPR, author = {Kim, Minje and Kim, Tae-Kyun}, title = {BiTT: Bi-directional Texture Reconstruction of Interacting Two Hands from a Single Image}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10726-10735} }
Learning to Localize Objects Improves Spatial Reasoning in Visual-LLMs-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ranasinghe_2024_CVPR, author = {Ranasinghe, Kanchana and Shukla, Satya Narayan and Poursaeed, Omid and Ryoo, Michael S. and Lin, Tsung-Yu}, title = {Learning to Localize Objects Improves Spatial Reasoning in Visual-LLMs}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12977-12987} }
Self-Distilled Masked Auto-Encoders are Efficient Video Anomaly Detectors-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ristea_2024_CVPR, author = {Ristea, Nicolae-C{\u{a}}t{\u{a}}lin and Croitoru, Florinel-Alin and Ionescu, Radu Tudor and Popescu, Marius and Khan, Fahad Shahbaz and Shah, Mubarak}, title = {Self-Distilled Masked Auto-Encoders are Efficient Video Anomaly Detectors}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15984-15995} }
Distilling Vision-Language Models on Millions of Videos-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhao_2024_CVPR, author = {Zhao, Yue and Zhao, Long and Zhou, Xingyi and Wu, Jialin and Chu, Chun-Te and Miao, Hui and Schroff, Florian and Adam, Hartwig and Liu, Ting and Gong, Boqing and Krahenbuhl, Philipp and Yuan, Liangzhe}, title = {Distilling Vision-Language Models on Millions of Videos}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13106-13116} }
Generalized Predictive Model for Autonomous Driving-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Yang_2024_CVPR, author = {Yang, Jiazhi and Gao, Shenyuan and Qiu, Yihang and Chen, Li and Li, Tianyu and Dai, Bo and Chitta, Kashyap and Wu, Penghao and Zeng, Jia and Luo, Ping and Zhang, Jun and Geiger, Andreas and Qiao, Yu and Li, Hongyang}, title = {Generalized Predictive Model for Autonomous Driving}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14662-14672} }
FACT: Frame-Action Cross-Attention Temporal Modeling for Efficient Action Segmentation-
[pdf]
[supp]
[bibtex]@InProceedings{Lu_2024_CVPR, author = {Lu, Zijia and Elhamifar, Ehsan}, title = {FACT: Frame-Action Cross-Attention Temporal Modeling for Efficient Action Segmentation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18175-18185} }
Test-Time Zero-Shot Temporal Action Localization-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Liberatori_2024_CVPR, author = {Liberatori, Benedetta and Conti, Alessandro and Rota, Paolo and Wang, Yiming and Ricci, Elisa}, title = {Test-Time Zero-Shot Temporal Action Localization}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18720-18729} }
AM-RADIO: Agglomerative Vision Foundation Model Reduce All Domains Into One-
[pdf]
[supp]
[bibtex]@InProceedings{Ranzinger_2024_CVPR, author = {Ranzinger, Mike and Heinrich, Greg and Kautz, Jan and Molchanov, Pavlo}, title = {AM-RADIO: Agglomerative Vision Foundation Model Reduce All Domains Into One}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12490-12500} }
FastMAC: Stochastic Spectral Sampling of Correspondence Graph-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Yifei and Zhao, Hao and Li, Hongyang and Chen, Siheng}, title = {FastMAC: Stochastic Spectral Sampling of Correspondence Graph}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17857-17867} }
FedSOL: Stabilized Orthogonal Learning with Proximal Restrictions in Federated Learning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Lee_2024_CVPR, author = {Lee, Gihun and Jeong, Minchan and Kim, Sangmook and Oh, Jaehoon and Yun, Se-Young}, title = {FedSOL: Stabilized Orthogonal Learning with Proximal Restrictions in Federated Learning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12512-12522} }
A Category Agnostic Model for Visual Rearrangment-
[pdf]
[bibtex]@InProceedings{Liu_2024_CVPR, author = {Liu, Yuyi and Song, Xinhang and Li, Weijie and Wang, Xiaohan and Jiang, Shuqiang}, title = {A Category Agnostic Model for Visual Rearrangment}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16457-16466} }
Representing Part-Whole Hierarchies in Foundation Models by Learning Localizability Composability and Decomposability from Anatomy via Self Supervision-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Taher_2024_CVPR, author = {Taher, Mohammad Reza Hosseinzadeh and Gotway, Michael B. and Liang, Jianming}, title = {Representing Part-Whole Hierarchies in Foundation Models by Learning Localizability Composability and Decomposability from Anatomy via Self Supervision}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11269-11281} }
Efficient Test-Time Adaptation of Vision-Language Models-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Karmanov_2024_CVPR, author = {Karmanov, Adilbek and Guan, Dayan and Lu, Shijian and El Saddik, Abdulmotaleb and Xing, Eric}, title = {Efficient Test-Time Adaptation of Vision-Language Models}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14162-14171} }
Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Tong_2024_CVPR, author = {Tong, Shengbang and Liu, Zhuang and Zhai, Yuexiang and Ma, Yi and LeCun, Yann and Xie, Saining}, title = {Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9568-9578} }
Mind Marginal Non-Crack Regions: Clustering-Inspired Representation Learning for Crack Segmentation-
[pdf]
[bibtex]@InProceedings{Chen_2024_CVPR, author = {Chen, Zhuangzhuang and Lai, Zhuonan and Chen, Jie and Li, Jianqiang}, title = {Mind Marginal Non-Crack Regions: Clustering-Inspired Representation Learning for Crack Segmentation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {12698-12708} }
RegionGPT: Towards Region Understanding Vision Language Model-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Guo_2024_CVPR, author = {Guo, Qiushan and De Mello, Shalini and Yin, Hongxu and Byeon, Wonmin and Cheung, Ka Chun and Yu, Yizhou and Luo, Ping and Liu, Sifei}, title = {RegionGPT: Towards Region Understanding Vision Language Model}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13796-13806} }
Error Detection in Egocentric Procedural Task Videos-
[pdf]
[supp]
[bibtex]@InProceedings{Lee_2024_CVPR, author = {Lee, Shih-Po and Lu, Zijia and Zhang, Zekun and Hoai, Minh and Elhamifar, Ehsan}, title = {Error Detection in Egocentric Procedural Task Videos}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18655-18666} }
Uncertainty-Guided Never-Ending Learning to Drive-
[pdf]
[bibtex]@InProceedings{Lai_2024_CVPR, author = {Lai, Lei and Ohn-Bar, Eshed and Arora, Sanjay and Yi, John Seon Keun}, title = {Uncertainty-Guided Never-Ending Learning to Drive}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15088-15098} }
FakeInversion: Learning to Detect Images from Unseen Text-to-Image Models by Inverting Stable Diffusion-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Cazenavette_2024_CVPR, author = {Cazenavette, George and Sud, Avneesh and Leung, Thomas and Usman, Ben}, title = {FakeInversion: Learning to Detect Images from Unseen Text-to-Image Models by Inverting Stable Diffusion}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10759-10769} }
Attribute-Guided Pedestrian Retrieval: Bridging Person Re-ID with Internal Attribute Variability-
[pdf]
[supp]
[bibtex]@InProceedings{Huang_2024_CVPR, author = {Huang, Yan and Zhang, Zhang and Wu, Qiang and Zhong, Yi and Wang, Liang}, title = {Attribute-Guided Pedestrian Retrieval: Bridging Person Re-ID with Internal Attribute Variability}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17689-17699} }
Text Is MASS: Modeling as Stochastic Embedding for Text-Video Retrieval-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Jiamian and Sun, Guohao and Wang, Pichao and Liu, Dongfang and Dianat, Sohail and Rabbani, Majid and Rao, Raghuveer and Tao, Zhiqiang}, title = {Text Is MASS: Modeling as Stochastic Embedding for Text-Video Retrieval}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16551-16560} }
Know Your Neighbors: Improving Single-View Reconstruction via Spatial Vision-Language Reasoning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Rui and Fischer, Tobias and Segu, Mattia and Pollefeys, Marc and Van Gool, Luc and Tombari, Federico}, title = {Know Your Neighbors: Improving Single-View Reconstruction via Spatial Vision-Language Reasoning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9848-9858} }
Preserving Fairness Generalization in Deepfake Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Lin_2024_CVPR, author = {Lin, Li and He, Xinan and Ju, Yan and Wang, Xin and Ding, Feng and Hu, Shu}, title = {Preserving Fairness Generalization in Deepfake Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16815-16825} }
Structure-Aware Sparse-View X-ray 3D Reconstruction-
[pdf]
[bibtex]@InProceedings{Cai_2024_CVPR, author = {Cai, Yuanhao and Wang, Jiahao and Yuille, Alan and Zhou, Zongwei and Wang, Angtian}, title = {Structure-Aware Sparse-View X-ray 3D Reconstruction}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11174-11183} }
Dexterous Grasp Transformer-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xu_2024_CVPR, author = {Xu, Guo-Hao and Wei, Yi-Lin and Zheng, Dian and Wu, Xiao-Ming and Zheng, Wei-Shi}, title = {Dexterous Grasp Transformer}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {17933-17942} }
EgoThink: Evaluating First-Person Perspective Thinking Capability of Vision-Language Models-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Cheng_2024_CVPR, author = {Cheng, Sijie and Guo, Zhicheng and Wu, Jingwen and Fang, Kechen and Li, Peng and Liu, Huaping and Liu, Yang}, title = {EgoThink: Evaluating First-Person Perspective Thinking Capability of Vision-Language Models}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {14291-14302} }
Hearing Anything Anywhere-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Mason Long and Sawata, Ryosuke and Clarke, Samuel and Gao, Ruohan and Wu, Shangzhe and Wu, Jiajun}, title = {Hearing Anything Anywhere}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11790-11799} }
PatchFusion: An End-to-End Tile-Based Framework for High-Resolution Monocular Metric Depth Estimation-
[pdf]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Zhenyu and Bhat, Shariq Farooq and Wonka, Peter}, title = {PatchFusion: An End-to-End Tile-Based Framework for High-Resolution Monocular Metric Depth Estimation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10016-10025} }
Retrieval-Augmented Egocentric Video Captioning-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xu_2024_CVPR, author = {Xu, Jilan and Huang, Yifei and Hou, Junlin and Chen, Guo and Zhang, Yuejie and Feng, Rui and Xie, Weidi}, title = {Retrieval-Augmented Egocentric Video Captioning}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13525-13536} }
SkillDiffuser: Interpretable Hierarchical Planning via Skill Abstractions in Diffusion-Based Task Execution-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Liang_2024_CVPR, author = {Liang, Zhixuan and Mu, Yao and Ma, Hengbo and Tomizuka, Masayoshi and Ding, Mingyu and Luo, Ping}, title = {SkillDiffuser: Interpretable Hierarchical Planning via Skill Abstractions in Diffusion-Based Task Execution}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16467-16476} }
TE-TAD: Towards Full End-to-End Temporal Action Detection via Time-Aligned Coordinate Expression-
[pdf]
[supp]
[bibtex]@InProceedings{Kim_2024_CVPR, author = {Kim, Ho-Joong and Hong, Jung-Ho and Kong, Heejo and Lee, Seong-Whan}, title = {TE-TAD: Towards Full End-to-End Temporal Action Detection via Time-Aligned Coordinate Expression}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18837-18846} }
PointBeV: A Sparse Approach for BeV Predictions-
[pdf]
[supp]
[bibtex]@InProceedings{Chambon_2024_CVPR, author = {Chambon, Loick and Zablocki, Eloi and Chen, Micka\"el and Bartoccioni, Florent and P\'erez, Patrick and Cord, Matthieu}, title = {PointBeV: A Sparse Approach for BeV Predictions}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15195-15204} }
From-Ground-To-Objects: Coarse-to-Fine Self-supervised Monocular Depth Estimation of Dynamic Objects with Ground Contact Prior-
[pdf]
[supp]
[bibtex]@InProceedings{Moon_2024_CVPR, author = {Moon, Jaeho and Bello, Juan Luis Gonzalez and Kwon, Byeongjun and Kim, Munchurl}, title = {From-Ground-To-Objects: Coarse-to-Fine Self-supervised Monocular Depth Estimation of Dynamic Objects with Ground Contact Prior}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10519-10529} }
SRTube: Video-Language Pre-Training with Action-Centric Video Tube Features and Semantic Role Labeling-
[pdf]
[supp]
[bibtex]@InProceedings{Lee_2024_CVPR, author = {Lee, Ju-Hee and Kang, Je-Won}, title = {SRTube: Video-Language Pre-Training with Action-Centric Video Tube Features and Semantic Role Labeling}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13689-13699} }
Prompt Highlighter: Interactive Control for Multi-Modal LLMs-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Yuechen and Qian, Shengju and Peng, Bohao and Liu, Shu and Jia, Jiaya}, title = {Prompt Highlighter: Interactive Control for Multi-Modal LLMs}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13215-13224} }
Continual Learning for Motion Prediction Model via Meta-Representation Learning and Optimal Memory Buffer Retention Strategy-
[pdf]
[bibtex]@InProceedings{Kang_2024_CVPR, author = {Kang, DaeJun and Kum, Dongsuk and Kim, Sanmin}, title = {Continual Learning for Motion Prediction Model via Meta-Representation Learning and Optimal Memory Buffer Retention Strategy}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15438-15448} }
EditGuard: Versatile Image Watermarking for Tamper Localization and Copyright Protection-
[pdf]
[arXiv]
[bibtex]@InProceedings{Zhang_2024_CVPR, author = {Zhang, Xuanyu and Li, Runyi and Yu, Jiwen and Xu, Youmin and Li, Weiqi and Zhang, Jian}, title = {EditGuard: Versatile Image Watermarking for Tamper Localization and Copyright Protection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11964-11974} }
FairRAG: Fair Human Generation via Fair Retrieval Augmentation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Shrestha_2024_CVPR, author = {Shrestha, Robik and Zou, Yang and Chen, Qiuyu and Li, Zhiheng and Xie, Yusheng and Deng, Siqi}, title = {FairRAG: Fair Human Generation via Fair Retrieval Augmentation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11996-12005} }
Template Free Reconstruction of Human-object Interaction with Procedural Interaction Generation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Xie_2024_CVPR, author = {Xie, Xianghui and Bhatnagar, Bharat Lal and Lenssen, Jan Eric and Pons-Moll, Gerard}, title = {Template Free Reconstruction of Human-object Interaction with Procedural Interaction Generation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10003-10015} }
Open-Vocabulary Video Anomaly Detection-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wu_2024_CVPR, author = {Wu, Peng and Zhou, Xuerong and Pang, Guansong and Sun, Yujia and Liu, Jing and Wang, Peng and Zhang, Yanning}, title = {Open-Vocabulary Video Anomaly Detection}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18297-18307} }
ODM: A Text-Image Further Alignment Pre-training Approach for Scene Text Detection and Spotting-
[pdf]
[arXiv]
[bibtex]@InProceedings{Duan_2024_CVPR, author = {Duan, Chen and Fu, Pei and Guo, Shan and Jiang, Qianyi and Wei, Xiaoming}, title = {ODM: A Text-Image Further Alignment Pre-training Approach for Scene Text Detection and Spotting}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15587-15597} }
Epistemic Uncertainty Quantification For Pre-Trained Neural Networks-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wang_2024_CVPR, author = {Wang, Hanjing and Ji, Qiang}, title = {Epistemic Uncertainty Quantification For Pre-Trained Neural Networks}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11052-11061} }
Diffusion-ES: Gradient-free Planning with Diffusion for Autonomous and Instruction-guided Driving-
[pdf]
[supp]
[bibtex]@InProceedings{Yang_2024_CVPR, author = {Yang, Brian and Su, Huangyuan and Gkanatsios, Nikolaos and Ke, Tsung-Wei and Jain, Ayush and Schneider, Jeff and Fragkiadaki, Katerina}, title = {Diffusion-ES: Gradient-free Planning with Diffusion for Autonomous and Instruction-guided Driving}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {15342-15353} }
MRC-Net: 6-DoF Pose Estimation with MultiScale Residual Correlation-
[pdf]
[supp]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Yuelong and Mao, Yafei and Bala, Raja and Hadap, Sunil}, title = {MRC-Net: 6-DoF Pose Estimation with MultiScale Residual Correlation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10476-10486} }
MonoCD: Monocular 3D Object Detection with Complementary Depths-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Yan_2024_CVPR, author = {Yan, Longfei and Yan, Pei and Xiong, Shengzhou and Xiang, Xuanyu and Tan, Yihua}, title = {MonoCD: Monocular 3D Object Detection with Complementary Depths}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {10248-10257} }
Consistent3D: Towards Consistent High-Fidelity Text-to-3D Generation with Deterministic Sampling Prior-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Wu_2024_CVPR, author = {Wu, Zike and Zhou, Pan and Yi, Xuanyu and Yuan, Xiaoding and Zhang, Hanwang}, title = {Consistent3D: Towards Consistent High-Fidelity Text-to-3D Generation with Deterministic Sampling Prior}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {9892-9902} }
ManipLLM: Embodied Multimodal Large Language Model for Object-Centric Robotic Manipulation-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Li_2024_CVPR, author = {Li, Xiaoqi and Zhang, Mingxu and Geng, Yiran and Geng, Haoran and Long, Yuxing and Shen, Yan and Zhang, Renrui and Liu, Jiaming and Dong, Hao}, title = {ManipLLM: Embodied Multimodal Large Language Model for Object-Centric Robotic Manipulation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {18061-18070} }
GLaMM: Pixel Grounding Large Multimodal Model-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Rasheed_2024_CVPR, author = {Rasheed, Hanoona and Maaz, Muhammad and Shaji, Sahal and Shaker, Abdelrahman and Khan, Salman and Cholakkal, Hisham and Anwer, Rao M. and Xing, Eric and Yang, Ming-Hsuan and Khan, Fahad S.}, title = {GLaMM: Pixel Grounding Large Multimodal Model}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {13009-13018} }
Incremental Residual Concept Bottleneck Models-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Shang_2024_CVPR, author = {Shang, Chenming and Zhou, Shiji and Zhang, Hengyuan and Ni, Xinzhe and Yang, Yujiu and Wang, Yuwang}, title = {Incremental Residual Concept Bottleneck Models}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {11030-11040} }
SPOC: Imitating Shortest Paths in Simulation Enables Effective Navigation and Manipulation in the Real World-
[pdf]
[supp]
[arXiv]
[bibtex]@InProceedings{Ehsani_2024_CVPR, author = {Ehsani, Kiana and Gupta, Tanmay and Hendrix, Rose and Salvador, Jordi and Weihs, Luca and Zeng, Kuo-Hao and Singh, Kunal Pratap and Kim, Yejin and Han, Winson and Herrasti, Alvaro and Krishna, Ranjay and Schwenk, Dustin and VanderBilt, Eli and Kembhavi, Aniruddha}, title = {SPOC: Imitating Shortest Paths in Simulation Enables Effective Navigation and Manipulation in the Real World}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2024}, pages = {16238-16250} }
LoCoNet: Long-Short Context Network for Active Speaker Detection