## Install guide

- Docker image: `nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04`

### Package install
```sh
    bash install.sh
```
## Scripts

### Training

- pre-train (stage 1)
    ```sh
        mkdir -p [OUT_PATH]
        deepspeed llava/train/train_mem.py \
            --model_name_or_path lmsys/vicuna-7b-v1.5 \
            --version plain_v2 \
            --train_data_path [TRAIN_DATA_PATH] \
            --chart_folder [CHART_FOLDER] \
            --chart_tower ahmed-masry/unichart-base-960 \
            --mm_chart_projector_type mlp2x_gelu \
            --mm_chart_select_layer -2 \
            --tune_mm_mlp_adapter True \
            --mm_use_im_start_end False \
            --mm_use_im_patch_token False \
            --bf16 True \
            --output_dir [OUT_PATH] \
            --num_train_epochs 1 \
            --per_device_train_batch_size 8 \
            --per_device_eval_batch_size 4 \
            --gradient_accumulation_steps 4 \
            --evaluation_strategy "no" \
            --save_strategy "steps" \
            --save_steps 24000 \
            --save_total_limit 1 \
            --learning_rate 1e-3 \
            --weight_decay 0. \
            --warmup_ratio 0.03 \
            --lr_scheduler_type "cosine" \
            --logging_steps 1 \
            --tf32 True \
            --model_max_length 2048 \
            --gradient_checkpointing True \
            --dataloader_num_workers 4 \
            --lazy_preprocess True
    ```

- fine-tuning (stage 2 or 3)
    ```sh
        deepspeed llava/train/train_mem.py \
            --model_name_or_path lmsys/vicuna-7b-v1.5 \
            --version v1 \
            --train_data_path [TRAIN_DATA_PATH] \
            --image_folder [IMAGE_FOLDER] \
            --load_wo_init_chart True \
            --load_wo_init_vision True \
            --vision_tower openai/clip-vit-large-patch14-336 \
            --mm_projector_type mlp2x_gelu \
            --mm_vision_select_layer -2 \
            --pretrain_mm_mlp_adapter [PRETRAINED_IMAGE_PROJECTION_LAYER_PATH] \
            --chart_folder [CHART_FOLDER] \
            --chart_tower ahmed-masry/unichart-base-960 \
            --mm_chart_projector_type mlp2x_gelu \
            --pretrain_chart_mm_mlp_adapter [PRETRAINED_CHART_PROJECTION_LAYER_PATH] \
            --bf16 True \
            --output_dir [OUT_PATH] \
            --num_train_epochs 1 \
            --save_strategy "steps" \
            --save_steps 1000 \
            --save_total_limit 1 \
            --per_device_train_batch_size 8 \
            --per_device_eval_batch_size 4 \
            --gradient_accumulation_steps 2 \
            --evaluation_strategy "no" \
            --learning_rate 2e-5 \
            --weight_decay 0. \
            --warmup_ratio 0.03 \
            --lr_scheduler_type "cosine" \
            --logging_steps 1 \
            --tf32 True \
            --model_max_length 2048 \
            --gradient_checkpointing True \
            --dataloader_num_workers 4 \
            --lazy_preprocess True 
    ```


### Inference

- Example (ICT-QA)

    ```sh
    CUDA_VISIBLE_DEVICES=0 python inference.py \
        --model-path ./checkpoints/vicuna-7b-v1.5-v4-v6_1-ict-qa-interleaved_wo_aug-3epoch-2k \
        --image-folder ../../data/ICT-QA_data/image/ \
        --chart-folder ../../data/ICT-QA_data/image/ \
        --question-file ../../data/ICT-QA_data/ict-qa_test.json \
        --answers-file ./test_answers.jsonl \
        --temperature 0 \
        --conv-mode vicuna_v1
    ```


