Source code for verona.evaluation.stattests.hierarchical

import tempfile
from typing import List

import numpy as np
import pandas as pd
import scipy.stats as stats
from cmdstanpy import CmdStanModel, install_cmdstan, cmdstan_path

from verona.evaluation.stattests.stan_codes import STAN_CODE



[docs]
class BayesianHierarchicalResults:
    def __init__(self, approximated, global_wins, posterior_distribution, per_dataset, global_sign, raw_results):
        """
        A class to store the results of the Bayesian hierarchical test.

        Args:
            global_wins (pd.DataFrame): A DataFrame containing the global winning probabilities for each condition
                (left, right, rope).
            posterior_distribution (pd.DataFrame): A DataFrame with the posterior distribution probabilities
                (left, rope, right).
            per_dataset (pd.DataFrame): A DataFrame with per-dataset statistics.
            global_sign (pd.DataFrame): A DataFrame containing the global sign probabilities (positive, negative).
            raw_results (pd.DataFrame): A DataFrame containing the raw results from the sampling.
        """

        self.global_wins = global_wins
        self.approximated = approximated
        self.posterior_distribution = posterior_distribution
        self.per_dataset = per_dataset
        self.global_sign = global_sign
        self.raw_results = raw_results




[docs]
class HierarchicalBayesianTest:


[docs]
    @staticmethod
    def pt_scaled(q, df, mean=0, sd=1):
        """
        This function emulates the function pt.scaled from the "metRology" R package. That function computes the
        cumulative distribution function of a T-Student for a given number of degrees of freedom (df). This computation
        is scaled and shifted by mean, and sd, respectively.

        Args:
            q: quantile
            df: degrees of freedom
            mean: scale (mean). Default is ``0``.
            sd: shift (standard deviation). Default is ``1``.

        Returns:
            Cumulative distribution function shifted and scaled.
        """
        return stats.t.cdf((q - mean) / sd, df)


    def __init__(self, x_result: pd.DataFrame, y_result: pd.DataFrame, approaches: List[str], datasets: List[str]):
        """
        Args:
            x_result (pd.DataFrame): A 2D dataframe containing the performance metrics obtained by the first algorithm. Each row in
                this matrix represents a unique dataset, and each column corresponds to the result of an individual fold in
                a k-fold cross-validation for that dataset.
            y_result (pd.DataFrame): A 2D dataframe containing the performance metrics obtained by the second algorithm. The structure
                is the same as `x_result`, where rows correspond to datasets and columns to individual k-fold
                cross-validation results.
            approaches (List[str]): A list containing the names of the two algorithms being compared. The first element in the list
                should correspond to the algorithm associated with `x_result`, and the second element should correspond to
                the algorithm associated with `y_result`.
            datasets (List[str]): A list containing the names of the datasets used in the model. The order should match the row
                order in `x_result` and `y_result`.
        """

        assert len(approaches) == 2, "The number of names of the approaches is not 2"
        assert len(datasets) > 0, "The number of datasets using for comparing the approaches must be greater than 0"

        self.x_result = pd.DataFrame(x_result)
        self.y_result = pd.DataFrame(y_result)
        self.approaches = approaches
        self.datasets = datasets

        assert self.x_result.shape[0] == self.y_result.shape[0], "The number of rows of both matrices do not match"
        assert self.x_result.shape[1] == self.y_result.shape[1], "The number of columns of both matrices do not match"
        assert self.x_result.shape[0] == len(self.datasets)

        try:
            cmdstan_path()
        except:
            install_cmdstan()


[docs]
    def run(self, rope=[-1,1], rho=0.2, n_chains=4, num_samples=300000, std_upper=1000, alpha_lower=0.5, alpha_upper=5,
            beta_lower=0.05, beta_upper=0.15, d0_lower=None, d0_upper=None) -> BayesianHierarchicalResults:
        """
        Executes a Bayesian hierarchical model tailored for comparing the performance of two machine learning algorithms
        across multiple datasets based on cross-validation results.

        This model employs a two-level hierarchical structure to account for both dataset-specific variations and global
        trends in the performance differences between the two algorithms. Specifically, it treats the performance
        metrics from each dataset as arising from a dataset-specific distribution, which in turn is governed by global
        hyperparameters.

        The model can work directly on a variety of metrics obtained through cross-validation, thereby providing a
        comprehensive statistical insight into the comparative evaluation. This is a significant advantage over
        frequentist methods which may not fully capture the uncertainty in the metrics.

        This implementation is ported from the Bayesian hierarchical model implemented in [2], which provides posterior
        probabilities for a richer interpretation of the comparison.


        Args:
            rope (List, optional): Region of Practical Equivalence, defines the interval within which performance
                differences are considered "irrelevant" or "insignificant". Default is ``[-1, 1]``.
            rho (float, optional): A hyperparameter representing the correlation factor across datasets. A higher
                value indicates stronger correlation between datasets in terms of algorithm performance.
                Default is ``0.2``.
            n_chains (int, optional): The number of Markov Chains to be used in the simulation. Half of the simulations
                are used for warm-up. Default is ``4``.
            num_samples (int, optional): The total number of samples (per chain) used for estimating the posterior
                distribution. Default is ``300000``.
            std_upper (int, optional): A scaling factor that sets the upper bounds for the hyperparameters sigma_i
                and sigma_0, which represent dataset-specific and global variability, respectively.
                Default is ``1000``.
            alpha_lower (float, optional): Lower bound for the uniform prior of the alpha hyperparameter, which
                models the global variance. Default is ``0.5``.
            alpha_upper (float, optional): Upper bound for the uniform prior of the alpha hyperparameter.
                Default is ``5``.
            beta_lower (float, optional): Lower bound for the uniform prior of the beta hyperparameter, which models
                dataset-specific variances. Default is ``0.05``.
            beta_upper (float, optional): Upper bound for the uniform prior of the beta hyperparameter.
                Default is ``0.15``.
            d0_lower (float, optional): Lower bound for the prior distribution of mu_0, the grand mean of performance
                differences. If not provided, the smallest observed difference is used as the lower bound.
            d0_upper (float, optional): Upper bound for the prior distribution of mu_0. If not provided, the largest
                observed difference is used as the upper bound.

        Note:
            The results includes the typical information relative to the three areas of the posterior density (left,
            right and rope probabilities), both global and per dataset (in the additional information). Also, the
            simulation results are included.

            As for the prior parameters, they are set to the default values indicated in [1,2], except for the bound for
            the prior distribution of mu_0, which are set to the maximum and minimum values observed in the sample. You
            should not modify them unless you know what you are doing.

            [1] A. Benavoli, G. Corani, J. Demsar, M. Zaffalon (2017) Time for a Change: a Tutorial for Comparing
            Multiple Classifiers Through Bayesian Analysis. \emph{Journal of Machine Learning Research}, 18, 1-36.
            [2] Borja Calvo and Guzmán Santafé (2016) scmamp: Statistical Comparison of Multiple Algorithms in Multiple
            Problems. *The R Journal*, 8(1), 248-256. [DOI: 10.32614/RJ-2016-017](https://doi.org/10.32614/RJ-2016-017)

        Returns:
            BayesianHierarchicalResults
                An object containing the following attributes:
                    - `approximated` : Boolean value that indicates whether the posterior distribution is approximated (True in this case).
                    - `global_wins`: DataFrame containing the global winning probabilities for each condition (left, right, rope).
                    - `posterior_distribution`: DataFrame with the posterior distribution probabilities (left, rope, right).
                    - `per_dataset`: DataFrame with per-dataset statistics.
                    - `global_sign`: DataFrame containing the global sign probabilities (positive, negative).
                    - `raw_results`: DataFrame containing the raw results from the sampling.

        Examples:
            >>> x_data = pd.DataFrame([[75.3, 78.3, 60.4], [68.5, 77.5, 76.9], [77.9, 74.5, 80.9], [90, 90, 90]])
            >>> y_data = pd.DataFrame([[74.3, 75.3, 61.4], [65.5, 70.5, 80.9], [79.9, 76.2, 81.9], [90, 90, 90]])
            >>> results = HierarchicalBayesianTest(x_data, y_data, approaches=["approach 1", "approach 2"],
            datasets=["d1", "d2", "d3", "d4"]).run([-1, 1])
            >>> print("Global wins: ", results.global_wins)
            Global wins:     left (approach 1 < approach 2)  rope (approach 1 = approach 2)  right (approach 1 > approach 2)
                                    0.809272                             0.0                         0.190728
            >>> print("Per dataset: ", results.per_dataset.iloc[:, 1:4])
            Per dataset:      left (approach 1 < approach 2)  rope (approach 1 = approach 2)  right (approach 1 > approach 2)
            d1                        0.207137                        0.503125                         0.289738
            d2                        0.279955                        0.419848                         0.300197
            d3                        0.619778                        0.337055                         0.043167
            d4                        0.039968                        0.937468                         0.022563
        """
        return self._run_stan(x_matrix=self.x_result, y_matrix=self.y_result, rope=rope, rho=rho,
                              n_chains=n_chains, stan_samples=num_samples, std_upper=std_upper,
                              alpha_lower=alpha_lower, alpha_upper=alpha_upper, beta_lower=beta_lower,
                              beta_upper=beta_upper, d0_lower=d0_lower, d0_upper=d0_upper)


    def _run_stan(self, x_matrix: pd.DataFrame, y_matrix: pd.DataFrame, rope: List, rho=0.2, n_chains=4,
                  stan_samples=1000, std_upper=1000, alpha_lower=0.5, alpha_upper=5, beta_lower=0.05, beta_upper=0.15,
                  d0_lower=None, d0_upper=None, seed=42) -> BayesianHierarchicalResults:
        # TODO: refactor this method since it follows the structure of the original code, but it is not very pythonic

        assert len(rope) == 2, "The number of elements of rope must be 2"
        assert rope[0] < rope[1], "rope[0] must be less than rope[1]"

        stan_code = STAN_CODE.HIERARCHICAL_TEST

        num_samples = x_matrix.shape[1]
        num_datasets = x_matrix.shape[0]

        experiment_results = x_matrix - y_matrix

        dataset_sds = experiment_results.std(axis=1)
        mean_dataset_sd = dataset_sds.mean()
        scale_factor = mean_dataset_sd  # TODO: ???

        # Scale the cross-validation results and the rope
        sample_matrix = experiment_results / mean_dataset_sd
        rope[0] = rope[0] / mean_dataset_sd
        rope[1] = rope[1] / mean_dataset_sd

        # scmamp: In case there is any dataset with 0 variance, add a small value to avoid problems
        # taking care to not alter the mean value
        # me: Taking care of not altering the mean value means that if the number of samples is odd, one of the
        # elements of the sample will NOT be altered (the one in the middle). Therefore, adding and subtracting
        # the same noise in different positions of the array (as long as the noise is within the rope, to not alter
        # the final statistical test results) keeps the mean value of the sample unchanged.
        for id, sd in enumerate(dataset_sds):
            if sd == 0:
                noise = np.random.uniform(rope[0], rope[1], size=int(num_samples / 2))
                sample_matrix.iloc[id, :int((num_samples / 2))] += noise
                # The +1 add takes care of the case of an odd list of samples, and does it work for even lists too.
                sample_matrix.iloc[id, int(((num_samples + 1) / 2)):] -= noise

        dataset_sds = sample_matrix.std(axis=1)
        mean_dataset_sd = dataset_sds.mean()

        if num_samples == 1:
            dataset_mean_sd = sample_matrix.std()
        else:
            dataset_mean_sd = sample_matrix.mean(axis=1).std()

        if d0_lower is None:
            d0_lower = sample_matrix.abs().to_numpy().max() * -1

        if d0_upper is None:
            d0_upper = sample_matrix.abs().to_numpy().max()

        stan_data = {
            "deltaLow": d0_lower,
            "deltaHi": d0_upper,
            "stdLow": 0,
            "stdHi": mean_dataset_sd * std_upper,
            "std0Low": 0,
            "std0Hi": dataset_mean_sd * std_upper,
            "Nsamples": num_samples,
            "q": num_datasets,
            "x": np.array(sample_matrix),
            "rho": rho,
            "upperAlpha": alpha_upper,
            "lowerAlpha": alpha_lower,
            "upperBeta": beta_upper,
            "lowerBeta": beta_lower
        }

        #posterior = stan.build(stan_code, data=stan_data, random_seed=seed)
        #fit = posterior.sample(num_chains=n_chains, num_samples=stan_samples)

        with tempfile.NamedTemporaryFile(suffix='.stan', delete=False) as temp:
            temp.write(stan_code.encode('utf-8'))
            temp_file_name = temp.name  # Save the filename to use later

        model = CmdStanModel(stan_file=temp_file_name)
        fit = model.sample(data=stan_data, chains=n_chains, iter_sampling=int(stan_samples/2),
                           iter_warmup=int(stan_samples/2), seed=42)

        results = fit.draws_pd()

        # The cols are named like: "diff.1.3", but we get diff as name.
        # iterate over the columns "templates" and detect them in the dataframe
        # Then, drop those columns from the df.
        cols_to_drop_df = []
        cols_to_drop = ["diff", "diagQuad", "oneOverSigma2", "nuMinusOne", "logLik"]

        for col in results.columns:
            for col_to_drop_template in cols_to_drop:
                if col_to_drop_template in col:
                    cols_to_drop_df.append(col)

        results = results.drop(cols_to_drop_df, axis=1)
        # Get the delta.X columns, that represent each of the datasets.
        delta_cols = [col for col in results.columns if ("delta" in col and col != "delta0")]
        delta_df = results[delta_cols]
        left = delta_df[delta_df < rope[0]].count() / delta_df.count()
        right = delta_df[delta_df > rope[1]].count() / delta_df.count()
        rope = delta_df[(delta_df > rope[0]) & (delta_df < rope[1])].count() / delta_df.count()

        probs_per_dataset = pd.DataFrame({"left": left, "rope": rope, "right": right})

        aux = HierarchicalBayesianTest.pt_scaled(rope[1], df=results["nu"], mean=results["delta0"], sd=results["std0"])
        cum_left = HierarchicalBayesianTest.pt_scaled(rope[0], df=results["nu"], mean=results["delta0"],
                                                      sd=results["std0"])
        cum_rope = aux - cum_left
        cum_right = 1 - aux

        posterior_distribution = pd.DataFrame({"left": cum_left, "rope": cum_rope, "right": cum_right})

        # Get the probabilities according to the counts
        left_wins = (cum_left > cum_right) & (cum_left > cum_rope)
        right_wins = (cum_right > cum_left) & (cum_right > cum_rope)
        rope_wins = (left_wins | right_wins)
        rope_wins = np.array([not x for x in rope_wins])
        prob_left_win = left_wins.mean()
        prob_right_win = right_wins.mean()
        prob_rope_win = rope_wins.mean()

        positive_d0 = results["delta0"] > 0
        prob_positive = positive_d0.mean()
        prob_negative = 1 - prob_positive

        # Get the results ready
        per_dataset = delta_df.mean(axis=0) * scale_factor
        per_dataset = pd.concat([per_dataset, probs_per_dataset], axis=1)
        left_str = "left (" + self.approaches[0] + " < " + self.approaches[1] + ")"
        right_str = "right (" + self.approaches[0] + " > " + self.approaches[1] + ")"
        rope_str = "rope (" + self.approaches[0] + " = " + self.approaches[1] + ")"
        per_dataset.rename(columns={
            per_dataset.columns[0]: "mean_delta",
            "left" : left_str,
            "rope" : rope_str,
            "right" : right_str
        }, inplace=True)
        per_dataset.index = self.datasets

        global_sign = pd.DataFrame({"negative": prob_negative, "positive": prob_positive}, index=[0])
        global_wins = pd.DataFrame({left_str: prob_left_win, rope_str: prob_rope_win, right_str: prob_right_win},
                                   index=[0])

        import os
        os.remove(temp_file_name)

        return BayesianHierarchicalResults(True, global_wins, posterior_distribution, per_dataset, global_sign, results)



if __name__ == "__main__":
    x_data = pd.DataFrame([[75.3, 78.3, 60.4], [68.5, 77.5, 76.9], [77.9, 74.5, 80.9], [90, 90, 90]])
    y_data = pd.DataFrame([[74.3, 75.3, 61.4], [65.5, 70.5, 80.9], [79.9, 76.2, 81.9], [90, 90, 90]])
    results = HierarchicalBayesianTest(x_data, y_data, approaches=["approach 1", "approach 2"],
                                       datasets=["d1", "d2", "d3", "d4"]).run([-1, 1])
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)
    print("Global wins: ", results.global_wins)
    print("Per dataset: ", results.per_dataset.iloc[:, 1:4])