def run(paths, args):
    """Train each CIFAR-10 architecture and write its conformal-prediction tables.

    The CIFAR-10 train and test splits are downloaded into
    ``paths["dataset_dir"]`` and both dataset objects are pickled there with
    ``torch.save``. The CIFAR-10H annotation counts are read from
    ``cifar10h-counts.npy`` in the same directory. A stratified sample of
    1000 test images is held out for calibration; the remaining test images
    are used for evaluation.

    For every architecture name in the hard-coded model list, a model is
    obtained from ``train_cifar10.train_and_return_model`` and evaluated with
    the LAC, APS and RAPS routines of ``src.calibrate``. Four CSV files are
    written to ``paths["results_dir"] / <model name>`` as soon as that model
    is finished, so an interrupted run keeps the models already completed.

    Args:
        paths: Mapping with the keys ``"dataset_dir"`` and ``"results_dir"``.
            Both values must support ``.mkdir(parents=..., exist_ok=...)`` and
            the ``/`` operator (presumably ``pathlib.Path`` -- confirm against
            the caller).
        args: Accepted for interface compatibility; not used by this function.

    Raises:
        ValueError: If the CIFAR-10H count array does not have exactly one
            row per CIFAR-10 test image.
    """
    import csv
    import numpy as np
    import matplotlib.pyplot as plt
    import torch
    from torch.utils.data import DataLoader, Subset
    from torchvision import transforms
    from torchvision.datasets import CIFAR10
    from scipy.stats import spearmanr
    from sklearn.model_selection import StratifiedShuffleSplit
    from src import train_cifar10
    from src import calibrate

    plt.rc("text", usetex=False)
    plt.rc("font", family="serif")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2471, 0.2435, 0.2616)),
    ])

    cifar_root = paths["dataset_dir"]
    cifar_root.mkdir(parents=True, exist_ok=True)

    dataset = CIFAR10(root=str(cifar_root), train=True, download=True, transform=transform)
    cifar10_test = CIFAR10(root=str(cifar_root), train=False, download=True, transform=transform)

    torch.save(dataset, str(cifar_root / "cifar10_train.pt"))
    torch.save(cifar10_test, str(cifar_root / "cifar10_test.pt"))

    counts = np.load(str(cifar_root / "cifar10h-counts.npy"))

    # Read the labels straight from the dataset; iterating the dataset would
    # decode and transform every image only to throw the pixels away.
    labels = list(cifar10_test.targets)
    if len(counts) != len(labels):
        raise ValueError(
            f"cifar10h-counts.npy has {len(counts)} rows but the CIFAR-10 test "
            f"set has {len(labels)} images; the two must be aligned row by row"
        )

    splitter = StratifiedShuffleSplit(n_splits=1, train_size=1000, random_state=42)
    calib_idx, test_idx = next(splitter.split(X=labels, y=labels))

    calib_loader = DataLoader(Subset(cifar10_test, calib_idx), batch_size=120, shuffle=False)
    test_loader = DataLoader(Subset(cifar10_test, test_idx), batch_size=120, shuffle=False)

    # Per evaluation image: how many classes received at least one count.
    # Assumes `counts` is laid out as (image, class) -- TODO confirm.
    groundtruth_length = (counts[test_idx] > 0).sum(axis=1)
    # test_loader is not shuffled, so indexing by test_idx yields the labels in
    # exactly the order the loader produces them. int64 matches the dtype the
    # DataLoader's default collation gives integer targets.
    y_true = np.asarray(labels, dtype=np.int64)[test_idx]

    alpha = 0.05

    results_dir = paths["results_dir"]
    results_dir.mkdir(parents=True, exist_ok=True)

    def new_tables():
        """Return empty result tables for a single model."""
        return {
            'ece': [],
            'performance': {'acc': [], 'lac_cc': [], 'lac_ssc': [], 'lac_mean_width': [],
                            'aps_cc': [], 'aps_ssc': [], 'aps_mean_width': [],
                            'raps_cc': [], 'raps_ssc': [], 'raps_mean_width': []},
            'correlation': {'lac_spearman': [], 'aps_spearman': [], 'raps_spearman': []},
            'entropy': {'lac': [], 'aps': [], 'raps': []},
        }

    def record(tables, method, prediction_sets, cond_entropy):
        """Append the metrics of one method's prediction sets to `tables`."""
        set_sizes = [len(s) for s in prediction_sets]

        entropy_corr, _ = spearmanr(cond_entropy, set_sizes)
        tables['entropy'][method].append(entropy_corr)

        performance = tables['performance']
        performance[f'{method}_cc'].append(
            calibrate.compute_coverage(prediction_sets, y_true))
        performance[f'{method}_ssc'].append(
            calibrate.compute_size_stratified_coverage(prediction_sets, y_true))
        performance[f'{method}_mean_width'].append(
            calibrate.compute_average_width(prediction_sets))

        size_corr, _ = spearmanr(groundtruth_length, set_sizes)
        tables['correlation'][f'{method}_spearman'].append(size_corr)

    def dict_to_csv(d, out_dir, out_name):
        """Write a dict of equal-length columns as a CSV file with a header row."""
        with open(out_dir / out_name, "w", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(list(d.keys()))
            writer.writerows(zip(*d.values()))

    def save_tables(tables, out_dir):
        """Write all result tables of one model into `out_dir`."""
        with open(out_dir / "ece_table_cifar-10h.csv", "w", newline="") as f:
            csv.writer(f).writerows([[item] for item in tables['ece']])
        dict_to_csv(tables['performance'], out_dir, "performance_table_cifar-10h.csv")
        dict_to_csv(tables['correlation'], out_dir, "correlation_table_cifar-10h.csv")
        dict_to_csv(tables['entropy'], out_dir, "correlation_table_entropy_cifar-10h.csv")

    model_names = ['resnet18', 'resnet34', 'resnet50', 'vgg16_bn', 'vgg19_bn',
                   'densenet121', 'densenet161', 'mobilenet_v2']

    for name in model_names:
        tables = new_tables()

        # Every model gets its own sub-directory below the base results dir.
        model_dir = results_dir / name
        model_dir.mkdir(parents=True, exist_ok=True)

        model = train_cifar10.train_and_return_model(name).to(device).eval()

        tables['ece'].append(calibrate.compute_ece(model, test_loader, device))

        # LAC
        cal_nonconformity_scores, _ = calibrate.compute_nonconformity_scores(
            model, calib_loader, device)
        threshold = calibrate.determine_threshold(cal_nonconformity_scores, alpha)
        lac_sets = calibrate.predict_with_lac(model, test_loader, threshold, device)

        # Computed once per model and reused for all three methods.
        cond_entropy = calibrate.compute_conditional_entropy_per_image(
            model, test_loader, device)

        tables['performance']['acc'].append(
            calibrate.compute_accuracy(model, test_loader, device))

        record(tables, 'lac', lac_sets, cond_entropy)

        # APS
        aps_scores = calibrate.compute_aps_scores(model, calib_loader, device)
        threshold = calibrate.determine_threshold(aps_scores, alpha)
        aps_sets = calibrate.predict_with_aps(model, test_loader, threshold, device)
        record(tables, 'aps', aps_sets, cond_entropy)

        # RAPS
        raps_scores = calibrate.compute_raps_scores(
            model, calib_loader, device,
            lambda_reg=0.05, k_reg=5, beta=1.0,
            calibration_fraction=0.9, random_state=42
        )
        threshold = calibrate.determine_threshold(raps_scores, alpha)
        raps_sets = calibrate.predict_with_raps(model, test_loader, threshold, device)
        record(tables, 'raps', raps_sets, cond_entropy)

        # Persist immediately so a failure on a later model cannot lose this one.
        save_tables(tables, model_dir)
        del model
