Efficient Large Vision Models
EfficientViT-SAM: Accelerated Segment Anything Model Without Performance Loss
[pdf]
[bibtex]
@InProceedings{Zhang_2024_CVPR,
  author    = {Zhang, Zhuoyang and Cai, Han and Han, Song},
  title     = {EfficientViT-SAM: Accelerated Segment Anything Model Without Performance Loss},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {7859-7863}
}
On Speculative Decoding for Multimodal Large Language Models
[pdf]
[supp]
[arXiv]
[bibtex]
@InProceedings{Gagrani_2024_CVPR,
  author    = {Gagrani, Mukul and Goel, Raghavv and Jeon, Wonseok and Park, Junyoung and Lee, Mingu and Lott, Christopher},
  title     = {On Speculative Decoding for Multimodal Large Language Models},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {8285-8289}
}
QAttn: Efficient GPU Kernels for Mixed-precision Vision Transformers
[pdf]
[bibtex]
@InProceedings{Kluska_2024_CVPR,
  author    = {Kluska, Piotr and Castell\'o, Adri\'an and Scheidegger, Florian and Malossi, A. Cristiano I. and Quintana-Ort{\'\i}, Enrique S.},
  title     = {QAttn: Efficient GPU Kernels for Mixed-precision Vision Transformers},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {3648-3657}
}
SimFreeze: Adaptively Freeze Vision Transformer Encoders with Token Similarity
[pdf]
[bibtex]
@InProceedings{Shen_2024_CVPR,
  author    = {Shen, Tianyi and Lee, Chonghan and Narayanan, Vijaykrishnan},
  title     = {SimFreeze: Adaptively Freeze Vision Transformer Encoders with Token Similarity},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {8266-8270}
}
Lowering PyTorch's Memory Consumption for Selective Differentiation
[pdf]
[supp]
[bibtex]
@InProceedings{Bhatia_2024_CVPR,
  author    = {Bhatia, Samarth and Dangel, Felix},
  title     = {Lowering PyTorch's Memory Consumption for Selective Differentiation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {8260-8265}
}
SAM-CLIP: Merging Vision Foundation Models Towards Semantic and Spatial Understanding
[pdf]
[supp]
[bibtex]
@InProceedings{Wang_2024_CVPR,
  author    = {Wang, Haoxiang and Vasu, Pavan Kumar Anasosalu and Faghri, Fartash and Vemulapalli, Raviteja and Farajtabar, Mehrdad and Mehta, Sachin and Rastegari, Mohammad and Tuzel, Oncel and Pouransari, Hadi},
  title     = {SAM-CLIP: Merging Vision Foundation Models Towards Semantic and Spatial Understanding},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {3635-3647}
}
Adapting the Segment Anything Model During Usage in Novel Situations
[pdf]
[bibtex]
@InProceedings{Schon_2024_CVPR,
  author    = {Sch\"on, Robin and Lorenz, Julian and Ludwig, Katja and Lienhart, Rainer},
  title     = {Adapting the Segment Anything Model During Usage in Novel Situations},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {3616-3626}
}
PMAFusion: Projection-Based Multi-Modal Alignment for 3D Semantic Occupancy Prediction
[pdf]
[bibtex]
@InProceedings{Li_2024_CVPR,
  author    = {Li, Shiyao and Yang, Wenming and Liao, Qingmin},
  title     = {PMAFusion: Projection-Based Multi-Modal Alignment for 3D Semantic Occupancy Prediction},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {3627-3634}
}
Adaptive Memory Replay for Continual Learning
[pdf]
[supp]
[arXiv]
[bibtex]
@InProceedings{Smith_2024_CVPR,
  author    = {Smith, James Seale and Valkov, Lazar and Halbe, Shaunak and Gutta, Vyshnavi and Feris, Rogerio and Kira, Zsolt and Karlinsky, Leonid},
  title     = {Adaptive Memory Replay for Continual Learning},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {3605-3615}
}
Efficient Transformer Adaptation with Soft Token Merging
[pdf]
[supp]
[bibtex]
@InProceedings{Yuan_2024_CVPR,
  author    = {Yuan, Xin and Fei, Hongliang and Baek, Jinoo},
  title     = {Efficient Transformer Adaptation with Soft Token Merging},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {3658-3668}
}
Opportunities for Post-Training Dynamic Layer Sparsity in Large Vision and Language Models
[pdf]
[supp]
[bibtex]
@InProceedings{Dotzel_2024_CVPR,
  author    = {Dotzel, Jordan and Jiang, Carly and Abdelfattah, Mohamed and Zhang, Zhiru},
  title     = {Opportunities for Post-Training Dynamic Layer Sparsity in Large Vision and Language Models},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {8280-8284}
}
HaLViT: Half of the Weights are Enough
[pdf]
[supp]
[bibtex]
@InProceedings{Koyun_2024_CVPR,
  author    = {Koyun, Onur Can and T\"oreyin, Beh\c{c}et U\u{g}ur},
  title     = {HaLViT: Half of the Weights are Enough},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {3669-3678}
}
Parameter Efficient Fine-tuning of Self-supervised ViTs without Catastrophic Forgetting
[pdf]
[arXiv]
[bibtex]
@InProceedings{Bafghi_2024_CVPR,
  author    = {Bafghi, Reza Akbarian and Harilal, Nidhin and Monteleoni, Claire and Raissi, Maziar},
  title     = {Parameter Efficient Fine-tuning of Self-supervised ViTs without Catastrophic Forgetting},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {3679-3684}
}
Unleash the Potential of CLIP for Video Highlight Detection
[pdf]
[arXiv]
[bibtex]
@InProceedings{Han_2024_CVPR,
  author    = {Han, Donghoon and Seo, Seunghyeon and Park, Eunhwan and Nam, Seong-Uk and Kwak, Nojun},
  title     = {Unleash the Potential of CLIP for Video Highlight Detection},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {8275-8279}
}
Layered Diffusion Model for One-Shot High Resolution Text-to-Image Synthesis
[pdf]
[supp]
[arXiv]
[bibtex]
@InProceedings{Khwaja_2024_CVPR,
  author    = {Khwaja, Emaad and Rashwan, Abdullah and Chen, Ting and Wang, Oliver and Kothawade, Suraj and Li, Yeqing},
  title     = {Layered Diffusion Model for One-Shot High Resolution Text-to-Image Synthesis},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = {June},
  year      = {2024},
  pages     = {8271-8274}
}