def run(paths, args):
    """Evaluate fine-tuned FER+ classifiers with LAC, APS and RAPS prediction sets.

    For every architecture in ``model_names`` the function loads the
    checkpoint ``<dataset_dir>/models_refined/<name>.pth``, computes ECE and
    accuracy on the test split, calibrates three conformal predictors on the
    calibration split (``new_val.csv``), builds prediction sets on the test
    split (``new_test.csv``) and records coverage, size-stratified coverage,
    mean set width and two Spearman correlations of the set size: against the
    number of emotion classes that received at least one annotator vote, and
    against the per-image conditional entropy returned by ``calibrate``.

    Args:
        paths: Mapping with the keys ``"dataset_dir"`` and ``"results_dir"``.
            Both values must be ``pathlib.Path``-like (``/`` joining is used
            on both; ``.mkdir``, ``.parent`` and ``.name`` on the latter).
        args: Not used by this function; kept for the caller's interface.

    Side effects:
        * Calls ``train_ferplus_models()`` before anything else.
        * After each model, writes the cumulative result tables to
          ``<results_dir.parent>/<results_dir.name>_<model>/``.
        * At the end, writes ``prediction_sets_ferplus.pkl`` and the final
          tables to ``results_dir``. The pickle holds a list:
          ``[y_true, groundtruth_length, <LAC, APS, RAPS sets per model...>]``.

    Raises:
        KeyError: If a test image has no row in ``fer2013new.csv``.
    """
    import os, csv, pickle, importlib
    import numpy as np
    import pandas as pd
    import torch, torch.nn as nn
    from torch.utils.data import Dataset, DataLoader
    from torchvision import models
    from PIL import Image
    from sklearn.model_selection import StratifiedShuffleSplit
    from scipy.stats import spearmanr
    import albumentations as A
    from albumentations.pytorch import ToTensorV2

    from src import calibrate

    # NOTE(review): presumably this produces the checkpoints loaded further
    # down; it runs unconditionally on every call -- confirm that is intended.
    from src.train_ferplus import train_ferplus_models
    train_ferplus_models()

    length = 224
    transform_train = A.Compose([
        A.Resize(length, length),
        A.HorizontalFlip(p=0.5),
        A.Rotate(p=0.5, limit=(-30, 30)),
        A.RandomBrightnessContrast(p=0.5, brightness_limit=(-0.1, 0.1),
                                   contrast_limit=(-0.1, 0.1), brightness_by_max=False),
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ToTensorV2(),
    ])
    transform_test = A.Compose([
        A.Resize(length, length),
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ToTensorV2(),
    ])

    class FERImageFolder(Dataset):
        """Dataset over a table with ``image_id`` and ``label`` columns.

        ``csv_file`` is either a path (``str`` or ``os.PathLike``) that is read
        with ``pd.read_csv``, or an already loaded DataFrame used as-is.
        Images are opened from ``root_dir/<image_id>`` and converted to RGB.
        """

        def __init__(self, csv_file, root_dir, transform=None):
            if isinstance(csv_file, (str, os.PathLike)):
                self.data = pd.read_csv(csv_file)
            else:
                self.data = csv_file
            self.root_dir = root_dir
            self.transform = transform

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            row = self.data.iloc[idx]
            img_name = os.path.join(self.root_dir, row['image_id'])
            # The context manager closes the file handle deterministically;
            # convert() returns an independent, fully loaded image.
            with Image.open(img_name) as handle:
                image = handle.convert("RGB")
            label = int(row['label'])
            if self.transform:
                image = self.transform(image=np.array(image))['image']
            return image, label

    base_path = paths["dataset_dir"] / "fer-pytorch" / "fer_pytorch" / "dataset"
    train_csv = pd.read_csv(base_path / "new_train.csv")
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
    train_idx, val_idx = next(splitter.split(train_csv["image_id"], train_csv["label"]))
    train_split = train_csv.iloc[train_idx].reset_index(drop=True)
    val_split = train_csv.iloc[val_idx].reset_index(drop=True)

    train_dataset = FERImageFolder(train_split, str(base_path / "data/FER2013Train"), transform_train)
    val_dataset   = FERImageFolder(val_split,   str(base_path / "data/FER2013Train"), transform_test)
    calib_dataset = FERImageFolder(str(base_path / "new_val.csv"),  str(base_path / "data/FER2013Valid"), transform_test)
    test_dataset  = FERImageFolder(str(base_path / "new_test.csv"), str(base_path / "data/FER2013Test"),  transform_test)

    # Only calib_loader and test_loader are consumed below.
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader   = DataLoader(val_dataset, batch_size=64, shuffle=False)
    calib_loader = DataLoader(calib_dataset, batch_size=64, shuffle=False)
    test_loader  = DataLoader(test_dataset, batch_size=64, shuffle=False)

    # Number of distinct labels in the training split. Read from the label
    # column (with the same int cast __getitem__ applies) instead of iterating
    # the dataset, which would decode and augment every training image.
    num_classes = int(train_split["label"].astype(int).nunique())

    # Per test image: how many emotion classes received at least one vote.
    emotion_columns = ['neutral', 'happiness', 'surprise', 'sadness', 'anger', 'disgust', 'fear']
    multiple_annotation = pd.read_csv(paths["dataset_dir"] / "fer2013new.csv")
    new_test = test_dataset.data
    filtered = multiple_annotation[multiple_annotation['Image name'].isin(new_test['image_id'])]
    filtered_deduped = filtered.drop_duplicates(subset='Image name', keep='first')
    # Re-order the annotation rows to the order of new_test.csv: the prediction
    # sets follow test_loader order (shuffle=False) and spearmanr pairs values
    # by position. .loc raises KeyError if a test image has no annotation row.
    selected = (
        filtered_deduped
        .set_index('Image name')
        .loc[new_test['image_id'].tolist(), emotion_columns]
        .to_numpy()
    )
    groundtruth_length = (selected > 0).sum(axis=1)

    # Test labels in loader order, taken from the table so no image is decoded.
    # int64 matches what the default collate produces for Python int labels.
    y_true = new_test['label'].astype(np.int64).to_numpy()
    model_names = ['resnet18','resnet34','resnet50','vgg16','vgg19','densenet121','densenet161','mobilenet_v2']

    def adapt_model_for_fer(model, name, num_classes):
        """Replace the final linear layer of ``model`` with a ``num_classes``-way head.

        The layer to swap is chosen by architecture family via ``name``; names
        outside the handled families leave the model untouched. The (mutated)
        model is returned.
        """
        if name in ['resnet18','resnet34','resnet50']:
            model.fc = nn.Linear(model.fc.in_features, num_classes)
        elif name in ['vgg16','vgg19']:
            model.classifier[6] = nn.Linear(model.classifier[6].in_features, num_classes)
        elif name in ['densenet121','densenet161']:
            model.classifier = nn.Linear(model.classifier.in_features, num_classes)
        elif name == 'mobilenet_v2':
            if hasattr(model, "classifier") and hasattr(model.classifier, "__iter__"):
                # Container classifier: swap the last nn.Linear found in it.
                for i in reversed(range(len(model.classifier))):
                    if isinstance(model.classifier[i], nn.Linear):
                        in_features = model.classifier[i].in_features
                        model.classifier[i] = nn.Linear(in_features, num_classes)
                        break
            else:
                model.classifier = nn.Linear(model.classifier.in_features, num_classes)
        return model

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Running log of every compute_metrics call (including p-values).
    sets_eval = {'cc': [], 'ssc': [], 'width_average': [], 'spearman_corr': [], 'p_value': []}

    def compute_metrics(prediction_sets):
        """Score ``prediction_sets`` against ``y_true`` and the annotation counts.

        Returns ``(cc, ssc, width_average, spearman_corr)`` as produced by the
        ``calibrate`` coverage / size-stratified-coverage / average-width
        helpers and by ``spearmanr(groundtruth_length, set sizes)``. The same
        values plus the p-value are appended to ``sets_eval``.
        """
        cc = calibrate.compute_coverage(prediction_sets, y_true)
        ssc = calibrate.compute_size_stratified_coverage(prediction_sets, y_true)
        width_average = calibrate.compute_average_width(prediction_sets)
        prediction_set_length = [len(i) for i in prediction_sets]
        spearman_corr, p_value = spearmanr(groundtruth_length, prediction_set_length)
        sets_eval['cc'].append(cc); sets_eval['ssc'].append(ssc)
        sets_eval['width_average'].append(width_average)
        sets_eval['spearman_corr'].append(spearman_corr)
        sets_eval['p_value'].append(p_value)
        return cc, ssc, width_average, spearman_corr

    def cp_cond_entropy_corr(cond_entropy, prediction_sets):
        """Return the Spearman correlation between ``cond_entropy`` and set sizes."""
        prediction_set_length = [len(i) for i in prediction_sets]
        spearman_corr, p_value = spearmanr(cond_entropy, prediction_set_length)
        return spearman_corr

    # One entry per model is appended to every list in these tables.
    performance_table = {'acc': [], 'lac_cc': [], 'lac_ssc': [], 'lac_mean_width': [],
                         'aps_cc': [], 'aps_ssc': [], 'aps_mean_width': [],
                         'raps_cc': [], 'raps_ssc': [], 'raps_mean_width': []}
    correlation_table = {'lac_spearman': [], 'aps_spearman': [], 'raps_spearman': []}
    ece_table = []
    correlation_table_entropy = {'lac': [], 'aps': [], 'raps': []}
    prediction_sets_holder = []
    # Passed to calibrate.determine_threshold; presumably the target
    # miscoverage level -- confirm against that helper.
    alpha = 0.05

    base_results = paths["results_dir"]
    base_results.mkdir(parents=True, exist_ok=True)

    def _dict_to_csv(d, out_name, directory):
        """Write dict-of-lists ``d`` as CSV: keys as header, one row per list index."""
        rows = zip(*d.values())
        with open(directory / out_name, "w", newline="") as csvfile:
            w = csv.writer(csvfile); w.writerow(list(d.keys())); w.writerows(rows)

    def _save_all(directory):
        """Dump the current state of all result tables into ``directory``."""
        with open(directory / "ece_table_ferplus.csv", "w", newline="") as f:
            csv.writer(f).writerows([[item] for item in ece_table])
        _dict_to_csv(performance_table, "performance_table_ferplus.csv", directory)
        _dict_to_csv(correlation_table, "correlation_table_ferplus.csv", directory)
        _dict_to_csv(correlation_table_entropy, "correlation_table_entropy_ferplus.csv", directory)

    for name in model_names:
        # per-model results folder: <results_dir name>_<modelname>
        results_dir = base_results.parent / f"{base_results.name}_{name}"
        results_dir.mkdir(parents=True, exist_ok=True)

        # pretrained=False: the strict load_state_dict below overwrites every
        # parameter and buffer, so fetching ImageNet weights first is wasted.
        model_fn = getattr(models, name)
        model = model_fn(pretrained=False)
        model = adapt_model_for_fer(model, name, num_classes)
        weights_path = paths["dataset_dir"] / "models_refined" / f"{name}.pth"
        # NOTE(review): torch.load unpickles the file; only use checkpoints
        # from a trusted source.
        model.load_state_dict(torch.load(weights_path, map_location="cpu"))
        model = model.to(device).eval()

        ece = calibrate.compute_ece(model, test_loader, device)
        ece_table.append(ece)

        # --- LAC ---
        cal_nonconformity_scores, _ = calibrate.compute_nonconformity_scores(model, calib_loader, device)
        threshold = calibrate.determine_threshold(cal_nonconformity_scores, alpha)
        prediction_sets = calibrate.predict_with_lac(model, test_loader, threshold, device)
        prediction_sets_holder.append(prediction_sets)

        # Computed once per model and reused for all three methods.
        cond_entropy = calibrate.compute_conditional_entropy_per_image(model, test_loader, device)
        correlation_table_entropy['lac'].append(cp_cond_entropy_corr(cond_entropy, prediction_sets))

        acc = calibrate.compute_accuracy(model, test_loader, device)
        performance_table['acc'].append(acc)
        cc, ssc, width_average, spearman_corr = compute_metrics(prediction_sets)
        performance_table['lac_cc'].append(cc); performance_table['lac_ssc'].append(ssc)
        performance_table['lac_mean_width'].append(width_average)
        correlation_table['lac_spearman'].append(spearman_corr)

        # --- APS ---
        aps_scores = calibrate.compute_aps_scores(model, calib_loader, device)
        threshold = calibrate.determine_threshold(aps_scores, alpha)
        aps_prediction_sets = calibrate.predict_with_aps(model, test_loader, threshold, device)
        prediction_sets_holder.append(aps_prediction_sets)
        correlation_table_entropy['aps'].append(cp_cond_entropy_corr(cond_entropy, aps_prediction_sets))
        cc, ssc, width_average, spearman_corr = compute_metrics(aps_prediction_sets)
        performance_table['aps_cc'].append(cc); performance_table['aps_ssc'].append(ssc)
        performance_table['aps_mean_width'].append(width_average)
        correlation_table['aps_spearman'].append(spearman_corr)

        # --- RAPS ---
        raps_scores = calibrate.compute_raps_scores(
            model, calib_loader, device,
            lambda_reg=0.05, k_reg=5, beta=1.0,
            calibration_fraction=0.9, random_state=42
        )
        threshold = calibrate.determine_threshold(raps_scores, alpha)
        # NOTE(review): predict_with_raps is not given lambda_reg / k_reg;
        # verify its defaults match the values used for calibration above.
        raps_sets = calibrate.predict_with_raps(model, test_loader, threshold, device)
        prediction_sets_holder.append(raps_sets)
        correlation_table_entropy['raps'].append(cp_cond_entropy_corr(cond_entropy, raps_sets))
        cc, ssc, width_average, spearman_corr = compute_metrics(raps_sets)
        performance_table['raps_cc'].append(cc); performance_table['raps_ssc'].append(ssc)
        performance_table['raps_mean_width'].append(width_average)
        correlation_table['raps_spearman'].append(spearman_corr)

        # save partial results after each model into the model-specific folder
        _save_all(results_dir)
        del model

    # Prefix the pickled list with the labels and annotation counts so the
    # file is self-describing: [y_true, groundtruth_length, sets...].
    prediction_sets_holder.insert(0, y_true)
    prediction_sets_holder.insert(1, groundtruth_length)
    with open(base_results / "prediction_sets_ferplus.pkl", "wb") as f:
        pickle.dump(prediction_sets_holder, f)

    _save_all(base_results)
