Efficient Large Vision Models


EfficientViT-SAM: Accelerated Segment Anything Model Without Performance Loss
Zhuoyang Zhang,
Han Cai,
Song Han
[pdf]
[bibtex]
@InProceedings{Zhang_2024_CVPR,
  author    = {Zhang, Zhuoyang and Cai, Han and Han, Song},
  title     = {{EfficientViT-SAM}: Accelerated {Segment Anything Model} Without Performance Loss},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {7859--7863},
}

On Speculative Decoding for Multimodal Large Language Models
Mukul Gagrani,
Raghavv Goel,
Wonseok Jeon,
Junyoung Park,
Mingu Lee,
Christopher Lott
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Gagrani_2024_CVPR,
  author    = {Gagrani, Mukul and Goel, Raghavv and Jeon, Wonseok and Park, Junyoung and Lee, Mingu and Lott, Christopher},
  title     = {On Speculative Decoding for Multimodal Large Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {8285--8289},
}

QAttn: Efficient GPU Kernels for Mixed-precision Vision Transformers
Piotr Kluska,
Adrián Castelló,
Florian Scheidegger,
A. Cristiano I. Malossi,
Enrique S. Quintana-Ortí
[pdf]
[bibtex]
@InProceedings{Kluska_2024_CVPR,
  author    = {Kluska, Piotr and Castell{\'o}, Adri{\'a}n and Scheidegger, Florian and Malossi, A. Cristiano I. and Quintana-Ort{\'\i}, Enrique S.},
  title     = {{QAttn}: Efficient {GPU} Kernels for Mixed-precision Vision Transformers},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {3648--3657},
}

SimFreeze: Adaptively Freeze Vision Transformer Encoders with Token Similarity
Tianyi Shen,
Chonghan Lee,
Vijaykrishnan Narayanan
[pdf]
[bibtex]
@InProceedings{Shen_2024_CVPR,
  author    = {Shen, Tianyi and Lee, Chonghan and Narayanan, Vijaykrishnan},
  title     = {{SimFreeze}: Adaptively Freeze Vision Transformer Encoders with Token Similarity},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {8266--8270},
}

Lowering PyTorch's Memory Consumption for Selective Differentiation
Samarth Bhatia,
Felix Dangel
[pdf] [supp]
[bibtex]
@InProceedings{Bhatia_2024_CVPR,
  author    = {Bhatia, Samarth and Dangel, Felix},
  title     = {Lowering {PyTorch}'s Memory Consumption for Selective Differentiation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {8260--8265},
}

SAM-CLIP: Merging Vision Foundation Models Towards Semantic and Spatial Understanding
Haoxiang Wang,
Pavan Kumar Anasosalu Vasu,
Fartash Faghri,
Raviteja Vemulapalli,
Mehrdad Farajtabar,
Sachin Mehta,
Mohammad Rastegari,
Oncel Tuzel,
Hadi Pouransari
[pdf] [supp]
[bibtex]
@InProceedings{Wang_2024_CVPR,
  author    = {Wang, Haoxiang and Vasu, Pavan Kumar Anasosalu and Faghri, Fartash and Vemulapalli, Raviteja and Farajtabar, Mehrdad and Mehta, Sachin and Rastegari, Mohammad and Tuzel, Oncel and Pouransari, Hadi},
  title     = {{SAM-CLIP}: Merging Vision Foundation Models Towards Semantic and Spatial Understanding},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {3635--3647},
}

Adapting the Segment Anything Model During Usage in Novel Situations
Robin Schön,
Julian Lorenz,
Katja Ludwig,
Rainer Lienhart
[pdf]
[bibtex]
@InProceedings{Schon_2024_CVPR,
  author    = {Sch{\"o}n, Robin and Lorenz, Julian and Ludwig, Katja and Lienhart, Rainer},
  title     = {Adapting the {Segment Anything Model} During Usage in Novel Situations},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {3616--3626},
}

PMAFusion: Projection-Based Multi-Modal Alignment for 3D Semantic Occupancy Prediction
Shiyao Li,
Wenming Yang,
Qingmin Liao
[pdf]
[bibtex]
@InProceedings{Li_2024_CVPR,
  author    = {Li, Shiyao and Yang, Wenming and Liao, Qingmin},
  title     = {{PMAFusion}: Projection-Based Multi-Modal Alignment for {3D} Semantic Occupancy Prediction},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {3627--3634},
}

Adaptive Memory Replay for Continual Learning
James Seale Smith,
Lazar Valkov,
Shaunak Halbe,
Vyshnavi Gutta,
Rogerio Feris,
Zsolt Kira,
Leonid Karlinsky
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Smith_2024_CVPR,
  author    = {Smith, James Seale and Valkov, Lazar and Halbe, Shaunak and Gutta, Vyshnavi and Feris, Rogerio and Kira, Zsolt and Karlinsky, Leonid},
  title     = {Adaptive Memory Replay for Continual Learning},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {3605--3615},
}

Efficient Transformer Adaptation with Soft Token Merging
Xin Yuan,
Hongliang Fei,
Jinoo Baek
[pdf] [supp]
[bibtex]
@InProceedings{Yuan_2024_CVPR,
  author    = {Yuan, Xin and Fei, Hongliang and Baek, Jinoo},
  title     = {Efficient Transformer Adaptation with Soft Token Merging},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {3658--3668},
}

Opportunities for Post-Training Dynamic Layer Sparsity in Large Vision and Language Models
Jordan Dotzel,
Carly Jiang,
Mohamed Abdelfattah,
Zhiru Zhang
[pdf] [supp]
[bibtex]
@InProceedings{Dotzel_2024_CVPR,
  author    = {Dotzel, Jordan and Jiang, Carly and Abdelfattah, Mohamed and Zhang, Zhiru},
  title     = {Opportunities for Post-Training Dynamic Layer Sparsity in Large Vision and Language Models},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {8280--8284},
}

HaLViT: Half of the Weights are Enough
Onur Can Koyun,
Behçet Uğur Töreyin
[pdf] [supp]
[bibtex]
@InProceedings{Koyun_2024_CVPR,
  author    = {Koyun, Onur Can and T{\"o}reyin, Beh{\c{c}}et U{\u{g}}ur},
  title     = {{HaLViT}: Half of the Weights are Enough},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {3669--3678},
}

Parameter Efficient Fine-tuning of Self-supervised ViTs without Catastrophic Forgetting
Reza Akbarian Bafghi,
Nidhin Harilal,
Claire Monteleoni,
Maziar Raissi
[pdf] [arXiv]
[bibtex]
@InProceedings{Bafghi_2024_CVPR,
  author    = {Bafghi, Reza Akbarian and Harilal, Nidhin and Monteleoni, Claire and Raissi, Maziar},
  title     = {Parameter Efficient Fine-tuning of Self-supervised {ViTs} without Catastrophic Forgetting},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {3679--3684},
}

Unleash the Potential of CLIP for Video Highlight Detection
Donghoon Han,
Seunghyeon Seo,
Eunhwan Park,
Seong-Uk Nam,
Nojun Kwak
[pdf] [arXiv]
[bibtex]
@InProceedings{Han_2024_CVPR,
  author    = {Han, Donghoon and Seo, Seunghyeon and Park, Eunhwan and Nam, Seong-Uk and Kwak, Nojun},
  title     = {Unleash the Potential of {CLIP} for Video Highlight Detection},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {8275--8279},
}

Layered Diffusion Model for One-Shot High Resolution Text-to-Image Synthesis
Emaad Khwaja,
Abdullah Rashwan,
Ting Chen,
Oliver Wang,
Suraj Kothawade,
Yeqing Li
[pdf] [supp] [arXiv]
[bibtex]
@InProceedings{Khwaja_2024_CVPR,
  author    = {Khwaja, Emaad and Rashwan, Abdullah and Chen, Ting and Wang, Oliver and Kothawade, Suraj and Li, Yeqing},
  title     = {Layered Diffusion Model for One-Shot High Resolution Text-to-Image Synthesis},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR}) Workshops},
  month     = jun,
  year      = {2024},
  pages     = {8271--8274},
}