Source code for verona.data.statistics

import pandas as pd

from verona.data.utils import DataFrameFields


def get_num_activities(
    dataset: pd.DataFrame,
    activity_id: str = DataFrameFields.ACTIVITY_COLUMN,
) -> int:
    """
    Count the distinct activities present in the dataset.

    Convenience wrapper that delegates to ``get_num_values`` using the
    activity column as the attribute.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        activity_id (str, optional): Name of the activity column in the DataFrame.
            Default is ``DataFrameFields.ACTIVITY_COLUMN``.

    Returns:
        int: Number of distinct values found in the activity column.

    Raises:
        KeyError: If ``activity_id`` is not a column of ``dataset``.

    Examples:
        >>> df = pd.DataFrame({'activity': ['A', 'B', 'A', 'C']})
        >>> print(get_num_activities(df, 'activity'))
        3
    """
    return get_num_values(dataset=dataset, attribute_id=activity_id)
def get_activities_list(
    dataset: pd.DataFrame,
    activity_id: str = DataFrameFields.ACTIVITY_COLUMN,
) -> list:
    """
    List the distinct activities present in the dataset.

    Convenience wrapper that delegates to ``get_values_list`` using the
    activity column as the attribute.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        activity_id (str, optional): Name of the activity column in the DataFrame.
            Default is ``DataFrameFields.ACTIVITY_COLUMN``.

    Returns:
        list: The distinct values of the activity column.

    Raises:
        KeyError: If ``activity_id`` is not a column of ``dataset``.

    Examples:
        >>> df = pd.DataFrame({'activity': ['A', 'B', 'A', 'C']})
        >>> print(get_activities_list(df, 'activity'))
        ['A', 'B', 'C']
    """
    return get_values_list(dataset=dataset, attribute_id=activity_id)
def get_num_values(dataset: pd.DataFrame, attribute_id: str) -> int:
    """
    Count the distinct values of one attribute (column) of the dataset.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        attribute_id (str): Name of the attribute column in the DataFrame.

    Returns:
        int: Number of distinct values in the column. Missing values are not
        counted (``Series.nunique`` is called with its default arguments), and
        an empty column yields ``0``.

    Raises:
        KeyError: If ``attribute_id`` is not a column of ``dataset``.

    Examples:
        >>> df = pd.DataFrame({'attribute': [1, 2, 2, 3]})
        >>> print(get_num_values(df, 'attribute'))
        3
    """
    column = dataset[attribute_id]
    return column.nunique()
def get_values_list(
    dataset: pd.DataFrame,
    attribute_id: str = DataFrameFields.ACTIVITY_COLUMN,
) -> list:
    """
    List the distinct values of one attribute (column) of the dataset.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        attribute_id (str, optional): Name of the attribute column in the DataFrame.
            Default is ``DataFrameFields.ACTIVITY_COLUMN``.

    Returns:
        list: The distinct values of the column, as produced by
        ``Series.unique().tolist()``.

    Raises:
        KeyError: If ``attribute_id`` is not a column of ``dataset``.

    Examples:
        >>> df = pd.DataFrame({'attribute': [1, 2, 2, 3]})
        >>> print(get_values_list(df, 'attribute'))
        [1, 2, 3]
    """
    distinct_values = dataset[attribute_id].unique()
    return distinct_values.tolist()
def get_num_cases(
    dataset: pd.DataFrame,
    case_id: str = DataFrameFields.CASE_COLUMN,
) -> int:
    """
    Count the distinct cases present in the dataset.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.

    Returns:
        int: Number of distinct values in the case identifier column.

    Raises:
        KeyError: If ``case_id`` is not a column of ``dataset``.

    Examples:
        >>> df = pd.DataFrame({'case': [1, 1, 2, 2, 3], 'activity': ['A', 'B', 'A', 'C', 'D']})
        >>> print(get_num_cases(df, 'case'))
        3
    """
    case_column = dataset[case_id]
    return case_column.nunique()
def get_max_len_case(
    dataset: pd.DataFrame,
    case_id: str = DataFrameFields.CASE_COLUMN,
) -> int:
    """
    Return the length (number of events) of the longest case in the dataset.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.

    Returns:
        int: The largest number of rows sharing one case identifier.

    Raises:
        KeyError: If ``case_id`` is not a column of ``dataset``.

    Examples:
        >>> df = pd.DataFrame({'case': [1, 1, 2, 2, 3], 'activity': ['A', 'B', 'A', 'C', 'D']})
        >>> print(get_max_len_case(df, 'case'))
        2
    """
    # One entry per case: how many rows (events) belong to it.
    events_per_case = dataset.groupby(case_id)[case_id].count()
    return events_per_case.max()
def get_min_len_case(
    dataset: pd.DataFrame,
    case_id: str = DataFrameFields.CASE_COLUMN,
) -> int:
    """
    Return the length (number of events) of the shortest case in the dataset.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.

    Returns:
        int: The smallest number of rows sharing one case identifier.

    Raises:
        KeyError: If ``case_id`` is not a column of ``dataset``.

    Examples:
        >>> df = pd.DataFrame({'case': [1, 1, 2, 2, 3], 'activity': ['A', 'B', 'A', 'C', 'D']})
        >>> print(get_min_len_case(df, 'case'))
        1
    """
    # One entry per case: how many rows (events) belong to it.
    events_per_case = dataset.groupby(case_id)[case_id].count()
    return events_per_case.min()
def get_avg_len_case(
    dataset: pd.DataFrame,
    case_id: str = DataFrameFields.CASE_COLUMN,
) -> float:
    """
    Return the mean case length (number of events per case) in the dataset.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.

    Returns:
        float: The arithmetic mean of the per-case row counts.

    Raises:
        KeyError: If ``case_id`` is not a column of ``dataset``.

    Examples:
        >>> df = pd.DataFrame({'case': [1, 1, 2, 2, 3], 'activity': ['A', 'B', 'A', 'C', 'D']})
        >>> print(get_avg_len_case(df, 'case'))
        1.6666666666666667
    """
    # One entry per case: how many rows (events) belong to it.
    events_per_case = dataset.groupby(case_id)[case_id].count()
    return events_per_case.mean()
def get_max_duration_case(
    dataset: pd.DataFrame,
    case_id: str = DataFrameFields.CASE_COLUMN,
    timestamp_id: str = DataFrameFields.TIMESTAMP_COLUMN,
) -> pd.Timedelta:
    """
    Return the longest case duration in the dataset.

    The duration of a case is the timestamp of its last row minus the
    timestamp of its first row, so rows are expected to appear in
    chronological order within each case. The input DataFrame is not
    modified: timestamps are parsed into a separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        timestamp_id (str, optional): Name of the timestamp column in the DataFrame.
            Values must be datetimes or ISO 8601 strings.
            Default is ``DataFrameFields.TIMESTAMP_COLUMN``.

    Returns:
        pd.Timedelta: The maximum temporal duration of a case in the dataset.

    Raises:
        KeyError: If the case identifier or timestamp column does not exist.
        ValueError: If a timestamp cannot be parsed as ISO 8601
            (raised by ``pd.to_datetime``).

    Examples:
        >>> df = pd.DataFrame({
        ...     'case': [1, 1, 2, 2],
        ...     'timestamp': ['2024-01-01T00:00:00', '2024-01-01T01:00:00',
        ...                   '2024-01-01T00:00:00', '2024-01-01T00:30:00'],
        ... })
        >>> print(get_max_duration_case(df, 'case', 'timestamp'))
        0 days 01:00:00
    """
    # Parse into a standalone Series instead of assigning back into `dataset`,
    # so the caller's timestamp column keeps its original dtype.
    timestamps = pd.to_datetime(dataset[timestamp_id], format='ISO8601')
    case_bounds = timestamps.groupby(dataset[case_id]).agg(["first", "last"])
    case_durations = case_bounds["last"] - case_bounds["first"]
    return case_durations.max()
def get_min_duration_case(
    dataset: pd.DataFrame,
    case_id: str = DataFrameFields.CASE_COLUMN,
    timestamp_id: str = DataFrameFields.TIMESTAMP_COLUMN,
) -> pd.Timedelta:
    """
    Return the shortest case duration in the dataset.

    The duration of a case is the timestamp of its last row minus the
    timestamp of its first row, so rows are expected to appear in
    chronological order within each case. The input DataFrame is not
    modified: timestamps are parsed into a separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        timestamp_id (str, optional): Name of the timestamp column in the DataFrame.
            Values must be datetimes or ISO 8601 strings.
            Default is ``DataFrameFields.TIMESTAMP_COLUMN``.

    Returns:
        pd.Timedelta: The minimum temporal duration of a case in the dataset.

    Raises:
        KeyError: If the case identifier or timestamp column does not exist.
        ValueError: If a timestamp cannot be parsed as ISO 8601
            (raised by ``pd.to_datetime``).

    Examples:
        >>> df = pd.DataFrame({
        ...     'case': [1, 1, 2, 2],
        ...     'timestamp': ['2024-01-01T00:00:00', '2024-01-01T01:00:00',
        ...                   '2024-01-01T00:00:00', '2024-01-01T00:30:00'],
        ... })
        >>> print(get_min_duration_case(df, 'case', 'timestamp'))
        0 days 00:30:00
    """
    # Parse into a standalone Series instead of assigning back into `dataset`,
    # so the caller's timestamp column keeps its original dtype.
    timestamps = pd.to_datetime(dataset[timestamp_id], format='ISO8601')
    case_bounds = timestamps.groupby(dataset[case_id]).agg(["first", "last"])
    case_durations = case_bounds["last"] - case_bounds["first"]
    return case_durations.min()
def get_avg_duration_case(
    dataset: pd.DataFrame,
    case_id: str = DataFrameFields.CASE_COLUMN,
    timestamp_id: str = DataFrameFields.TIMESTAMP_COLUMN,
) -> pd.Timedelta:
    """
    Return the mean case duration in the dataset.

    The duration of a case is the timestamp of its last row minus the
    timestamp of its first row, so rows are expected to appear in
    chronological order within each case. The input DataFrame is not
    modified: timestamps are parsed into a separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        timestamp_id (str, optional): Name of the timestamp column in the DataFrame.
            Values must be datetimes or ISO 8601 strings.
            Default is ``DataFrameFields.TIMESTAMP_COLUMN``.

    Returns:
        pd.Timedelta: The average temporal duration of a case in the dataset.

    Raises:
        KeyError: If the case identifier or timestamp column does not exist.
        ValueError: If a timestamp cannot be parsed as ISO 8601
            (raised by ``pd.to_datetime``).

    Examples:
        >>> df = pd.DataFrame({
        ...     'case': [1, 1, 2, 2],
        ...     'timestamp': ['2024-01-01T00:00:00', '2024-01-01T01:00:00',
        ...                   '2024-01-01T00:00:00', '2024-01-01T00:30:00'],
        ... })
        >>> print(get_avg_duration_case(df, 'case', 'timestamp'))
        0 days 00:45:00
    """
    # Parse into a standalone Series instead of assigning back into `dataset`,
    # so the caller's timestamp column keeps its original dtype.
    timestamps = pd.to_datetime(dataset[timestamp_id], format='ISO8601')
    case_bounds = timestamps.groupby(dataset[case_id]).agg(["first", "last"])
    case_durations = case_bounds["last"] - case_bounds["first"]
    return case_durations.mean()
def get_max_duration_event(
    dataset: pd.DataFrame,
    case_id: str = DataFrameFields.CASE_COLUMN,
    timestamp_id: str = DataFrameFields.TIMESTAMP_COLUMN,
) -> pd.Timedelta:
    """
    Return the longest event duration in the dataset.

    An event's duration is the difference between its timestamp and the
    timestamp of the previous row of the same case; the first row of each
    case has no duration and is ignored. Rows are expected to appear in
    chronological order within each case. The input DataFrame is not
    modified: timestamps are parsed into a separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        timestamp_id (str, optional): Name of the timestamp column in the DataFrame.
            Values must be datetimes or ISO 8601 strings.
            Default is ``DataFrameFields.TIMESTAMP_COLUMN``.

    Returns:
        pd.Timedelta: The maximum temporal duration of an event in the dataset.

    Raises:
        KeyError: If the case identifier or timestamp column does not exist.
        ValueError: If a timestamp cannot be parsed as ISO 8601
            (raised by ``pd.to_datetime``).

    Examples:
        >>> df = pd.DataFrame({
        ...     'case': [1, 1, 2, 2],
        ...     'timestamp': ['2024-01-01T00:00:00', '2024-01-01T01:00:00',
        ...                   '2024-01-01T00:00:00', '2024-01-01T00:30:00'],
        ... })
        >>> print(get_max_duration_event(df, 'case', 'timestamp'))
        0 days 01:00:00
    """
    # Parse into a standalone Series instead of assigning back into `dataset`,
    # so the caller's timestamp column keeps its original dtype.
    timestamps = pd.to_datetime(dataset[timestamp_id], format='ISO8601')
    # Per-case consecutive differences; the first event of each case yields NaT.
    event_durations = timestamps.groupby(dataset[case_id]).diff()
    return event_durations.max()
def get_min_duration_event(
    dataset: pd.DataFrame,
    case_id: str = DataFrameFields.CASE_COLUMN,
    timestamp_id: str = DataFrameFields.TIMESTAMP_COLUMN,
) -> pd.Timedelta:
    """
    Return the shortest event duration in the dataset.

    An event's duration is the difference between its timestamp and the
    timestamp of the previous row of the same case; the first row of each
    case has no duration and is ignored. Rows are expected to appear in
    chronological order within each case. The input DataFrame is not
    modified: timestamps are parsed into a separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        timestamp_id (str, optional): Name of the timestamp column in the DataFrame.
            Values must be datetimes or ISO 8601 strings.
            Default is ``DataFrameFields.TIMESTAMP_COLUMN``.

    Returns:
        pd.Timedelta: The minimum temporal duration of an event in the dataset.

    Raises:
        KeyError: If the case identifier or timestamp column does not exist.
        ValueError: If a timestamp cannot be parsed as ISO 8601
            (raised by ``pd.to_datetime``).

    Examples:
        >>> df = pd.DataFrame({
        ...     'case': [1, 1, 2, 2],
        ...     'timestamp': ['2024-01-01T00:00:00', '2024-01-01T01:00:00',
        ...                   '2024-01-01T00:00:00', '2024-01-01T00:30:00'],
        ... })
        >>> print(get_min_duration_event(df, 'case', 'timestamp'))
        0 days 00:30:00
    """
    # Parse into a standalone Series instead of assigning back into `dataset`,
    # so the caller's timestamp column keeps its original dtype.
    timestamps = pd.to_datetime(dataset[timestamp_id], format='ISO8601')
    # Per-case consecutive differences; the first event of each case yields NaT.
    event_durations = timestamps.groupby(dataset[case_id]).diff()
    return event_durations.min()
def get_avg_duration_event(
    dataset: pd.DataFrame,
    case_id: str = DataFrameFields.CASE_COLUMN,
    timestamp_id: str = DataFrameFields.TIMESTAMP_COLUMN,
) -> pd.Timedelta:
    """
    Return the mean event duration in the dataset.

    An event's duration is the difference between its timestamp and the
    timestamp of the previous row of the same case; the first row of each
    case has no duration and is ignored. Rows are expected to appear in
    chronological order within each case. The input DataFrame is not
    modified: timestamps are parsed into a separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        timestamp_id (str, optional): Name of the timestamp column in the DataFrame.
            Values must be datetimes or ISO 8601 strings.
            Default is ``DataFrameFields.TIMESTAMP_COLUMN``.

    Returns:
        pd.Timedelta: The average temporal duration of an event in the dataset.

    Raises:
        KeyError: If the case identifier or timestamp column does not exist.
        ValueError: If a timestamp cannot be parsed as ISO 8601
            (raised by ``pd.to_datetime``).

    Examples:
        >>> df = pd.DataFrame({
        ...     'case': [1, 1, 2, 2],
        ...     'timestamp': ['2024-01-01T00:00:00', '2024-01-01T01:00:00',
        ...                   '2024-01-01T00:00:00', '2024-01-01T00:30:00'],
        ... })
        >>> print(get_avg_duration_event(df, 'case', 'timestamp'))
        0 days 00:45:00
    """
    # Parse into a standalone Series instead of assigning back into `dataset`,
    # so the caller's timestamp column keeps its original dtype.
    timestamps = pd.to_datetime(dataset[timestamp_id], format='ISO8601')
    # Per-case consecutive differences; the first event of each case yields NaT.
    event_durations = timestamps.groupby(dataset[case_id]).diff()
    return event_durations.mean()
def get_num_variants(
    dataset: pd.DataFrame,
    case_id: str = DataFrameFields.CASE_COLUMN,
    activity_id: str = DataFrameFields.ACTIVITY_COLUMN,
) -> int:
    """
    Return the number of variants (distinct activity sequences) in the dataset.

    Each case is reduced to the string obtained by joining its activities, in
    row order, with ``"->"``; the number of distinct such strings is returned.
    The input DataFrame is not modified: the string conversion of the activity
    column is done on a separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        activity_id (str, optional): Name of the activity column in the DataFrame.
            Default is ``DataFrameFields.ACTIVITY_COLUMN``.

    Returns:
        int: The number of variants (cases with different sequences of activities).

    Raises:
        KeyError: If the case identifier or activity column does not exist.

    Examples:
        >>> df = pd.DataFrame({'case': [1, 1, 2, 2, 3], 'activity': ['A', 'B', 'A', 'C', 'D']})
        >>> print(get_num_variants(df, 'case', 'activity'))
        3
    """
    # Cast on a standalone Series instead of assigning back into `dataset`,
    # so the caller's activity column keeps its original dtype.
    activity_labels = dataset[activity_id].astype(str)
    variant_per_case = activity_labels.groupby(dataset[case_id]).agg("->".join)
    return variant_per_case.nunique()
def get_count_variants(
    dataset: pd.DataFrame,
    case_id: str = DataFrameFields.CASE_COLUMN,
    activity_id: str = DataFrameFields.ACTIVITY_COLUMN,
) -> dict:
    """
    Return how many cases follow each variant in the dataset.

    Each case is reduced to the string obtained by joining its activities, in
    row order, with ``"->"``; the occurrences of each such string are counted.
    The input DataFrame is not modified: the string conversion of the activity
    column is done on a separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        activity_id (str, optional): Name of the activity column in the DataFrame.
            Default is ``DataFrameFields.ACTIVITY_COLUMN``.

    Returns:
        dict: Dictionary where the keys are the variants (``"->"``-joined
        activity sequences) and the values are the number of cases following
        each variant.

    Raises:
        KeyError: If the case identifier or activity column does not exist.

    Examples:
        >>> df = pd.DataFrame({'case': [1, 1, 2, 2, 3], 'activity': ['A', 'B', 'A', 'C', 'D']})
        >>> get_count_variants(df, 'case', 'activity') == {'A->B': 1, 'A->C': 1, 'D': 1}
        True
    """
    # Cast on a standalone Series instead of assigning back into `dataset`,
    # so the caller's activity column keeps its original dtype.
    activity_labels = dataset[activity_id].astype(str)
    variant_per_case = activity_labels.groupby(dataset[case_id]).agg("->".join)
    return variant_per_case.value_counts().to_dict()