def run(paths, args):
    """Evaluate LAC/APS/RAPS prediction sets on ImageNet-ReaL and save the results.

    Loads the ``imagenet2012_real`` validation split through TFDS, keeps only
    the examples whose ``real_label`` is non-empty, and splits them per
    ``original_label`` class: the first 20 kept examples of each class form the
    calibration set, the rest form the test set. For every torchvision model in
    ``model_names`` the helpers in ``src.calibrate`` are used to compute ECE,
    accuracy, and coverage / size-stratified coverage / average width of the
    LAC, APS and RAPS prediction sets. Spearman correlations of the
    prediction-set sizes with (a) the number of ReaL labels per image and
    (b) the per-image conditional entropy are recorded as well.

    Args:
        paths: Mapping providing ``"dataset_dir"`` (passed to TFDS as
            ``data_dir``) and ``"results_dir"`` (must behave like a
            ``pathlib.Path``: ``mkdir``, ``parent``, ``name`` and ``/`` are
            used on it).
        args: Not used by this function.

    Side effects:
        * Creates ``results_dir`` and, next to it, one sibling directory
          ``<results_dir.name>_<model name>`` per model.
        * Writes the ECE / performance / correlation CSV tables into each
          per-model directory after that model finishes (the tables are
          cumulative: they contain one row per model processed so far) and
          once more into ``results_dir`` at the end.
        * Pickles ``[y_true, groundtruth_length, <prediction sets...>]`` to
          ``results_dir / "prediction_sets_reassessed-imagenet.pkl"``; the
          prediction sets are ordered LAC, APS, RAPS for each model in turn.
    """
    import csv
    import pickle
    import numpy as np
    import tensorflow_datasets as tfds
    import torch
    from torch.utils.data import Dataset, DataLoader, Subset
    from torchvision import transforms, models
    from collections import defaultdict
    from scipy.stats import spearmanr

    from src import calibrate

    data = tfds.load(
        'imagenet2012_real', split='validation', download=True,
        data_dir=str(paths["dataset_dir"]), try_gcs=False, shuffle_files=False
    )

    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_std = [0.229, 0.224, 0.225]
    # NOTE(review): Resize runs on a tensor here (after ToTensor), not on a PIL
    # image. torchvision documents that tensor and PIL resizing can differ in
    # antialiasing depending on the version -- confirm this matches the
    # preprocessing expected by the pretrained weights.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
    ])

    class ImageNetRealDataset(Dataset):
        """In-memory dataset of the TFDS examples that have at least one ReaL label.

        Every retained example dict (including its image array) is kept in
        ``self.examples``; ``real_labels`` and ``file_names`` are parallel
        lists with the same indexing.
        """

        def __init__(self, tf_data):
            self.examples, self.real_labels, self.file_names = [], [], []
            for ex in tf_data.as_numpy_iterator():
                # Examples with an empty ``real_label`` are dropped entirely.
                if len(ex['real_label']) > 0:
                    self.examples.append(ex)
                    self.real_labels.append(ex['real_label'])
                    self.file_names.append(ex['file_name'])

        def __len__(self):
            return len(self.examples)

        def __getitem__(self, idx):
            """Return ``(transformed image, original_label as a long tensor)``."""
            ex = self.examples[idx]
            image = transform(ex['image'])
            label = torch.tensor(ex['original_label'], dtype=torch.long)
            return image, label

    dataset = ImageNetRealDataset(data)

    # Group dataset positions by their original ImageNet label.
    class_indices = defaultdict(list)
    for idx, ex in enumerate(dataset.examples):
        class_indices[ex['original_label']].append(idx)

    # First `calib_per_class` kept examples of each class calibrate; the rest test.
    calib_per_class = 20
    calib_indices, test_indices = [], []
    for idx_list in class_indices.values():
        calib_indices.extend(idx_list[:calib_per_class])
        test_indices.extend(idx_list[calib_per_class:])

    calib_dataset = Subset(dataset, calib_indices)
    test_dataset = Subset(dataset, test_indices)
    calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Labels are read straight from the stored examples instead of iterating
    # the test loader: the loader is unshuffled, so the order is identical, and
    # this avoids transforming every test image just to collect its label.
    y_true = np.asarray(
        [dataset.examples[i]['original_label'] for i in test_indices],
        dtype=np.int64,
    )
    # Number of ReaL labels per test image, in test-set order.
    groundtruth_length = [len(dataset.real_labels[i]) for i in test_indices]

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_names = ['resnet18', 'resnet34', 'resnet50', 'vgg16', 'vgg19',
                   'densenet121', 'densenet161', 'mobilenet_v2']

    def get_model(name, device):
        """Build the torchvision model ``name`` with pretrained weights, in eval mode on ``device``."""
        model_fn = getattr(models, name)
        # NOTE(review): ``pretrained=True`` is the legacy torchvision argument;
        # it is kept so the loaded weights stay the same as before. Newer
        # torchvision releases prefer ``weights=...`` -- confirm before changing.
        return model_fn(pretrained=True).to(device).eval()

    def compute_metrics(prediction_sets):
        """Return ``(coverage, size-stratified coverage, average width, spearman)``.

        The Spearman correlation is between the number of ReaL labels per test
        image and the size of the corresponding prediction set.
        """
        cc = calibrate.compute_coverage(prediction_sets, y_true)
        ssc = calibrate.compute_size_stratified_coverage(prediction_sets, y_true)
        width_average = calibrate.compute_average_width(prediction_sets)
        prediction_set_length = [len(i) for i in prediction_sets]
        spearman_corr, _ = spearmanr(groundtruth_length, prediction_set_length)
        return cc, ssc, width_average, spearman_corr

    def cp_cond_entropy_corr(cond_entropy, prediction_sets):
        """Return the Spearman correlation between ``cond_entropy`` and prediction-set sizes."""
        prediction_set_length = [len(i) for i in prediction_sets]
        spearman_corr, _ = spearmanr(cond_entropy, prediction_set_length)
        return spearman_corr

    prediction_sets_holder = []
    performance_table = {'acc': [], 'lac_cc': [], 'lac_ssc': [], 'lac_mean_width': [],
                         'aps_cc': [], 'aps_ssc': [], 'aps_mean_width': [],
                         'raps_cc': [], 'raps_ssc': [], 'raps_mean_width': []}
    correlation_table = {'lac_spearman': [], 'aps_spearman': [], 'raps_spearman': []}
    ece_table = []
    correlation_table_entropy = {'lac': [], 'aps': [], 'raps': []}
    # Passed to calibrate.determine_threshold (presumably the target
    # miscoverage level -- verify against src.calibrate).
    alpha = 0.1

    base_results = paths["results_dir"]
    base_results.mkdir(parents=True, exist_ok=True)

    def _dict_to_csv(d, out_name, directory):
        """Write the dict of equal-length columns ``d`` as a CSV with a header row."""
        rows = zip(*d.values())
        with open(directory / out_name, "w", newline="") as csvfile:
            w = csv.writer(csvfile)
            w.writerow(list(d.keys()))
            w.writerows(rows)

    def _save_all(directory):
        """Write the current (cumulative) result tables into ``directory``."""
        with open(directory / "ece_table_reassessed-imagenet.csv", "w", newline="") as f:
            csv.writer(f).writerows([[item] for item in ece_table])
        _dict_to_csv(performance_table, "performance_table_reassessed-imagenet.csv", directory)
        _dict_to_csv(correlation_table, "correlation_table_reassessed-imagenet.csv", directory)
        _dict_to_csv(correlation_table_entropy, "correlation_table_entropy_reassessed-imagenet.csv", directory)

    def _record(method, prediction_sets, cond_entropy):
        """Store ``prediction_sets`` and append their metrics under the ``method`` prefix."""
        prediction_sets_holder.append(prediction_sets)
        correlation_table_entropy[method].append(
            cp_cond_entropy_corr(cond_entropy, prediction_sets)
        )
        cc, ssc, width_average, spearman_corr = compute_metrics(prediction_sets)
        performance_table[f'{method}_cc'].append(cc)
        performance_table[f'{method}_ssc'].append(ssc)
        performance_table[f'{method}_mean_width'].append(width_average)
        correlation_table[f'{method}_spearman'].append(spearman_corr)

    for name in model_names:
        # Per-model results folder: a sibling of results_dir named
        # <results_dir name>_<model name>.
        results_dir = base_results.parent / f"{base_results.name}_{name}"
        results_dir.mkdir(parents=True, exist_ok=True)

        model = get_model(name, device)
        ece_table.append(calibrate.compute_ece(model, test_loader, device))

        # --- LAC ---
        cal_nonconformity_scores, _ = calibrate.compute_nonconformity_scores(model, calib_loader, device)
        threshold = calibrate.determine_threshold(cal_nonconformity_scores, alpha)
        lac_sets = calibrate.predict_with_lac(model, test_loader, threshold, device)

        # Conditional entropy and accuracy are computed once per model and
        # shared by all three methods.
        cond_entropy = calibrate.compute_conditional_entropy_per_image(model, test_loader, device)
        performance_table['acc'].append(calibrate.compute_accuracy(model, test_loader, device))
        _record('lac', lac_sets, cond_entropy)

        # --- APS ---
        aps_scores = calibrate.compute_aps_scores(model, calib_loader, device)
        threshold = calibrate.determine_threshold(aps_scores, alpha)
        aps_sets = calibrate.predict_with_aps(model, test_loader, threshold, device)
        _record('aps', aps_sets, cond_entropy)

        # --- RAPS ---
        raps_scores = calibrate.compute_raps_scores(
            model, calib_loader, device,
            lambda_reg=0.0001, k_reg=5, beta=1.0,
            calibration_fraction=0.9, random_state=42
        )
        threshold = calibrate.determine_threshold(raps_scores, alpha)
        raps_sets = calibrate.predict_with_raps(model, test_loader, threshold, device)
        _record('raps', raps_sets, cond_entropy)

        # Checkpoint the cumulative tables so a later failure keeps earlier results.
        _save_all(results_dir)
        del model

    # Pickle layout: [y_true, groundtruth_length, then LAC/APS/RAPS sets per model].
    prediction_sets_holder.insert(0, y_true)
    prediction_sets_holder.insert(1, groundtruth_length)
    with open(base_results / "prediction_sets_reassessed-imagenet.pkl", "wb") as f:
        pickle.dump(prediction_sets_holder, f)

    _save_all(base_results)
