import os
from .clip_encoder import CLIPVisionTower
from llava.model.multimodal_encoder.donut_unichart_encoder import DonutEncoder

def build_modal_tower(tower_config, modal="image", **kwargs):
    """Build the encoder tower for the requested modality.

    Args:
        tower_config: Config object. For ``modal="image"`` the tower name is
            read from ``mm_vision_tower``, falling back to ``vision_tower``
            when that attribute is absent. For ``modal="chart"`` it is read
            from ``mm_chart_tower``, falling back to ``chart_tower``.
        modal: Which tower to build; ``"image"`` or ``"chart"``.
        **kwargs: Forwarded unchanged to the tower constructor.

    Returns:
        A ``CLIPVisionTower`` for ``"image"`` or a ``DonutEncoder`` for
        ``"chart"``.

    Raises:
        ValueError: If ``modal`` is not a supported modality, or if the image
            tower name is missing, or is neither an existing filesystem path
            nor prefixed with ``"openai"`` / ``"laion"``.
    """
    if modal == "image":
        vision_tower = getattr(
            tower_config,
            'mm_vision_tower',
            getattr(tower_config, 'vision_tower', None),
        )
        # A missing name must surface as the same ValueError as any other
        # unrecognised tower, not as a TypeError from os.path.exists(None).
        if vision_tower is None:
            raise ValueError(f'Unknown vision tower: {vision_tower}')
        if os.path.exists(vision_tower) or vision_tower.startswith(("openai", "laion")):
            return CLIPVisionTower(vision_tower, args=tower_config, **kwargs)
        raise ValueError(f'Unknown vision tower: {vision_tower}')

    if modal == "chart":
        # NOTE(review): chart_tower may be None here; whether DonutEncoder
        # accepts that is not visible from this file -- confirm.
        chart_tower = getattr(
            tower_config,
            'mm_chart_tower',
            getattr(tower_config, 'chart_tower', None),
        )
        return DonutEncoder(chart_tower, args=tower_config, **kwargs)

    # Fail loudly instead of implicitly returning None for an unsupported
    # modality, which would only blow up later at the call site.
    raise ValueError(f'Unknown modal: {modal}')
        
def build_vision_tower(vision_tower_cfg, **kwargs):
    """Build the vision tower named in ``vision_tower_cfg``.

    Args:
        vision_tower_cfg: Config object. The tower name is read from
            ``mm_vision_tower``, falling back to ``vision_tower`` when that
            attribute is absent.
        **kwargs: Forwarded unchanged to ``CLIPVisionTower``.

    Returns:
        A ``CLIPVisionTower`` constructed from the resolved name.

    Raises:
        ValueError: If the tower name is missing, or is neither an existing
            filesystem path nor prefixed with ``"openai"`` / ``"laion"``.
    """
    vision_tower = getattr(
        vision_tower_cfg,
        'mm_vision_tower',
        getattr(vision_tower_cfg, 'vision_tower', None),
    )
    # A missing name must surface as the same ValueError as any other
    # unrecognised tower, not as a TypeError from os.path.exists(None).
    if vision_tower is None:
        raise ValueError(f'Unknown vision tower: {vision_tower}')
    if os.path.exists(vision_tower) or vision_tower.startswith(("openai", "laion")):
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)

    raise ValueError(f'Unknown vision tower: {vision_tower}')
