Source code for verona.evaluation.metrics.utils

from typing import Literal, Union

import numpy as np
import pandas as pd

from verona.evaluation.metrics import event, suffix, time


def get_metric_by_prefix_len(
    metric: Literal['accuracy', 'fbeta', 'f1_score', 'precision', 'recall', 'mcc',
                    'brier_loss', 'damerau_levenshtein', 'mae', 'mse'],
    predictions: np.ndarray,
    ground_truths: np.ndarray,
    prefixes: list[pd.DataFrame],
    preds_format: Literal['labels', 'onehot'],
    gt_format: Literal['labels', 'onehot'],
    average: Union[Literal['micro', 'macro', 'weighted'], None] = None,
    beta: Union[float, None] = None,
    eoc: Union[str, int, None] = None,
) -> pd.DataFrame:
    """
    Calculates the value of the specified metric individually for each prefix size.

    Predictions and ground truths are grouped by the length of their prefix
    (``len(prefix)``) and the metric is evaluated once per group. The result is
    a Pandas DataFrame with one column per prefix size, named
    ``'<size>-prefix'`` and ordered by increasing size, holding:
    1- the value of the selected metric for that size,
    2- the number of prefixes with that size.

    Args:
        metric (Literal['accuracy', 'fbeta', 'f1_score', 'precision', 'recall',
            'mcc', 'brier_loss', 'damerau_levenshtein', 'mae', 'mse']):
            Metric to be calculated.
        predictions (np.ndarray): Predictions done by the model, one entry per
            prefix, in the same order as ``prefixes`` and ``ground_truths``.
        ground_truths (np.ndarray): Ground truths, one entry per prefix, in the
            same order as ``prefixes`` and ``predictions``.
        prefixes (list[pd.DataFrame]): Prefixes as Pandas DataFrames, in the
            same order as their respective predictions and ground truths.
        preds_format (Literal['labels', 'onehot']): Format of the predictions.
            ``'labels'`` for labels and ``'onehot'`` for one-hot vectors.
        gt_format (Literal['labels', 'onehot']): Format of the ground truths.
            ``'labels'`` for labels and ``'onehot'`` for one-hot vectors.
        average (Literal['micro', 'macro', 'weighted'], optional): Type of
            averaging to be performed on data. Only needed for ``'fbeta'``,
            ``'f1_score'``, ``'precision'`` and ``'recall'``.
        beta (float, optional): Ratio of recall importance to precision
            importance. Only needed for ``'fbeta'``.
        eoc (Union[str, int], optional): Label of the End-of-Case (EOC), the
            element that signifies the end of the trace/suffix. Only needed
            for ``'damerau_levenshtein'``.

    Returns:
        df_results: Pandas DataFrame where the columns indicate the size of the
        prefix and its two rows indicate: 1- the value of the metric, 2- the
        number of prefixes with that size. Empty when no prefixes are given.

    Raises:
        ValueError: If ``predictions``, ``ground_truths`` and ``prefixes`` do
            not all contain the same number of elements.
    """
    # zip() would silently drop the unmatched tail of the longer inputs, which
    # yields metrics computed on a partial (and possibly misaligned) sample.
    n_prefixes = len(prefixes)
    if len(predictions) != n_prefixes or len(ground_truths) != n_prefixes:
        raise ValueError(
            'predictions, ground_truths and prefixes must have the same length '
            f'(got {len(predictions)}, {len(ground_truths)} and {n_prefixes})'
        )

    # prefix length -> (predictions, ground truths) collected in input order.
    grouped = {}
    for prefix, pred, gt in zip(prefixes, predictions, ground_truths):
        preds, gts = grouped.setdefault(len(prefix), ([], []))
        preds.append(pred)
        gts.append(gt)

    dict_results = {}
    for prefix_len in sorted(grouped):
        preds, gts = grouped[prefix_len]
        result = __apply_metric(metric, np.array(preds), np.array(gts),
                                preds_format, gt_format, average, beta, eoc)
        dict_results[f'{prefix_len}-prefix'] = [result, len(preds)]

    return pd.DataFrame(dict_results)
def __apply_metric(
    metric: Literal['accuracy', 'fbeta', 'f1_score', 'precision', 'recall', 'mcc',
                    'brier_loss', 'damerau_levenshtein', 'mae', 'mse'],
    predictions: np.array,
    ground_truths: np.array,
    preds_format: Literal['labels', 'onehot'],
    gt_format: Literal['labels', 'onehot'],
    average: Literal['micro', 'macro', 'weighted'],
    beta: float,
    eoc: Union[str, int] = None,
) -> float:
    """
    Dispatches to the metric implementation selected by ``metric``.

    Args:
        metric: Name of the metric to compute.
        predictions (np.array): Predictions to evaluate.
        ground_truths (np.array): Ground truths matching ``predictions``.
        preds_format (Literal['labels', 'onehot']): Format of the predictions.
            Not forwarded for ``'brier_loss'``, ``'mae'`` and ``'mse'``.
        gt_format (Literal['labels', 'onehot']): Format of the ground truths.
            Not forwarded for ``'mae'`` and ``'mse'``.
        average (Literal['micro', 'macro', 'weighted']): Averaging mode,
            forwarded only for ``'fbeta'``, ``'f1_score'``, ``'precision'``
            and ``'recall'``.
        beta (float): Forwarded only for ``'fbeta'``.
        eoc (Union[str, int], optional): End-of-Case label, forwarded only for
            ``'damerau_levenshtein'``.

    Returns:
        The value produced by the selected metric function, or ``0.0`` when
        ``metric`` is not one of the supported names.
    """
    if metric == 'accuracy':
        # accuracy, fbeta and f1_score return a 3-tuple; only the first
        # element (the metric value) is kept.
        result, _, _ = event.get_accuracy(predictions, ground_truths, preds_format, gt_format)
    elif metric == 'fbeta':
        result, _, _ = event.get_fbeta(predictions, ground_truths, beta, average,
                                       preds_format, gt_format)
    elif metric == 'f1_score':
        result, _, _ = event.get_f1_score(predictions, ground_truths, average,
                                          preds_format, gt_format)
    elif metric == 'precision':
        result = event.get_precision(predictions, ground_truths, average,
                                     preds_format, gt_format)
    elif metric == 'recall':
        result = event.get_recall(predictions, ground_truths, average,
                                  preds_format, gt_format)
    elif metric == 'mcc':
        result = event.get_mcc(predictions, ground_truths, preds_format, gt_format)
    elif metric == 'brier_loss':
        result = event.get_brier_loss(predictions, ground_truths, gt_format)
    elif metric == 'damerau_levenshtein':
        result = suffix.get_damerau_levenshtein_score(predictions, ground_truths,
                                                      preds_format, gt_format, eoc)
    elif metric == 'mae':
        result = time.get_mae(predictions, ground_truths, reduction='mean')
    elif metric == 'mse':
        # This branch previously re-tested 'mae', which made it unreachable
        # and caused 'mse' to fall through to the 0.0 fallback below.
        result = time.get_mse(predictions, ground_truths, reduction='mean')
    else:
        # Unsupported metric names keep the historical fallback value.
        result = 0.0

    return result