# Source code for verona.evaluation.metrics.suffix

from typing import Literal, Union

import numpy as np

from verona.data.utils import get_labels_from_onehot


def get_damerau_levenshtein_score(predictions: list[np.array], ground_truths: list[np.array],
                                  preds_format: Literal['labels', 'onehot'],
                                  gt_format: Literal['labels', 'onehot'],
                                  eoc: Union[str, int] = None) -> float:
    """
    Calculates the Damerau-Levenshtein score between the predictions and the real values.

    The Damerau-Levenshtein distance represents the number of insertions, deletions,
    substitutions, and transpositions required to change the first sequence into the second.
    In this function, the score is normalized by the size of the longest sequence, and the
    value is obtained by subtracting the normalized distance from 1. The final result is the
    mean of the per-pair scores.

    Args:
        predictions (list[np.array]): List containing the predicted suffixes as NumPy Arrays.
        ground_truths (list[np.array]): List containing the ground truth suffixes as NumPy Arrays.
        preds_format (Literal['labels', 'onehot']): Format of the predictions. If ``'labels'``,
            the predictions array contains the labels of the activities/attributes predicted.
            If ``'onehot'``, the predictions array contains vectors of probabilities, and the
            labels are internally extracted based on the highest value element for the
            metric calculation.
        gt_format (Literal['labels', 'onehot']): Format of the ground truth. If ``'labels'``,
            the ground truth array contains the labels of the correct activities/attributes.
            If ``'onehot'``, the ground truth array contains the one-hot representation of
            the correct values, and the labels are internally extracted for the metric
            calculation.
        eoc (Union[str, int], optional): Label of the End-of-Case (EOC) which is an element
            that signifies the end of the trace/suffix. When given, each sequence is cut
            right before its first occurrence of this label. ``None`` (the default) means
            that the whole sequences are compared. ``0`` is a valid label.

    Returns:
        float: Damerau-Levenshtein score between 0 and 1. A lower value indicates worse
        suffix prediction, whereas a higher value indicates a prediction closer to the
        actual suffix. A pair in which both suffixes are empty scores 1.

    Examples:
        >>> ground_truths = [np.array([0, 1, 2, 3, 4])]
        >>> predictions = [np.array([0, 12, 2])]
        >>> dl_score = suffix.get_damerau_levenshtein_score(predictions, ground_truths, preds_format='labels', gt_format='labels')
        >>> print(dl_score)
        0.4
    """
    if preds_format == 'onehot':
        predictions = get_labels_from_onehot(predictions)
    if gt_format == 'onehot':
        ground_truths = get_labels_from_onehot(ground_truths)

    list_dl_scores = []
    for pred, gt in zip(predictions, ground_truths):
        dl_distance, len_preds, len_gts = __damerau_levenshtein_similarity(pred, gt, eoc)
        longest = max(len_preds, len_gts)
        # Two empty suffixes are identical; scoring them directly also avoids 0 / 0.
        dl_score = 1.0 if longest == 0 else 1 - (dl_distance / longest)
        list_dl_scores.append(dl_score)

    return np.mean(np.array(list_dl_scores)).item()


def _effective_length(sequence: np.array, code_end: Union[str, int, None]) -> int:
    """Return how many leading elements of ``sequence`` precede the first ``code_end``.

    The full size of ``sequence`` is returned when ``code_end`` is ``None`` or does not
    appear in it.
    """
    if code_end is None:
        return sequence.size

    # np.asarray + flatnonzero also copes with a comparison that collapses to a scalar.
    eoc_positions = np.flatnonzero(np.asarray(sequence == code_end))
    if eoc_positions.size:
        return int(eoc_positions[0])
    return sequence.size


def __damerau_levenshtein_similarity(predictions: np.array, ground_truths: np.array,
                                     code_end: Union[str, int]) -> tuple[float, int, int]:
    """
    Computes the edit distance between one predicted suffix and its ground truth.

    Insertions, deletions, substitutions and transpositions of two adjacent elements
    are each counted as one edit.

    Args:
        predictions (np.array): Predicted suffix as an array of labels.
        ground_truths (np.array): Ground truth suffix as an array of labels.
        code_end (Union[str, int]): End-of-Case label, or ``None`` to compare the
            complete arrays. Elements from the first occurrence of this label
            onwards are ignored.

    Returns:
        tuple[float, int, int]: The (non-normalized) distance, the effective length of
        the prediction and the effective length of the ground truth.
    """
    l1 = _effective_length(predictions, code_end)
    l2 = _effective_length(ground_truths, code_end)

    if max(l1, l2) == 0:
        # Nothing to compare: zero edits, and the caller still gets the usual 3-tuple.
        return 0.0, 0, 0

    # Row i / column j hold the distance between the first i ground-truth elements and
    # the first j predicted elements. Only row 0 and column 0 of this initial fill are
    # kept; every other cell is overwritten below.
    matrix = [list(range(i, i + l1 + 1)) for i in range(l2 + 1)]

    for i in range(1, l2 + 1):
        for j in range(1, l1 + 1):
            cost = 0 if predictions[j - 1] == ground_truths[i - 1] else 1
            matrix[i][j] = min(matrix[i - 1][j] + 1,         # Deletion
                               matrix[i][j - 1] + 1,         # Insertion
                               matrix[i - 1][j - 1] + cost)  # Substitution

            # Check for transposition of the last two elements
            if i > 1 and j > 1 and predictions[j - 1] == ground_truths[i - 2] and \
                    predictions[j - 2] == ground_truths[i - 1]:
                matrix[i][j] = min(matrix[i][j], matrix[i - 2][j - 2] + cost)  # Transposition

    distance = float(matrix[l2][l1])
    return distance, l1, l2