import pandas as pd
from verona.data.utils import DataFrameFields
[docs]
def get_num_activities(dataset: pd.DataFrame,
                       activity_id: str = DataFrameFields.ACTIVITY_COLUMN) -> int:
    """
    Count the distinct activities recorded in the dataset.

    Thin convenience wrapper: the work is delegated to ``get_num_values``
    with the activity column as the attribute to inspect.

    Parameters:
        dataset (pd.DataFrame): Event log to analyze.
        activity_id (str, optional): Name of the activity column.
            Default is ``DataFrameFields.ACTIVITY_COLUMN``.

    Returns:
        int: Number of distinct values found in the activity column.

    Raises:
        KeyError: If ``activity_id`` is not a column of ``dataset`` (raised by
            the column lookup; no explicit validation is performed).

    Examples:
        >>> df = pd.DataFrame({'activity': ['A', 'B', 'A', 'C']})
        >>> print(get_num_activities(df, 'activity'))
        3
    """
    num_activities = get_num_values(dataset, attribute_id=activity_id)
    return num_activities
[docs]
def get_activities_list(dataset: pd.DataFrame,
                        activity_id: str = DataFrameFields.ACTIVITY_COLUMN) -> list:
    """
    List the distinct activities recorded in the dataset.

    Thin convenience wrapper: the work is delegated to ``get_values_list``
    with the activity column as the attribute to inspect.

    Parameters:
        dataset (pd.DataFrame): Event log to analyze.
        activity_id (str, optional): Name of the activity column.
            Default is ``DataFrameFields.ACTIVITY_COLUMN``.

    Returns:
        list: The distinct values of the activity column.

    Raises:
        KeyError: If ``activity_id`` is not a column of ``dataset`` (raised by
            the column lookup; no explicit validation is performed).

    Examples:
        >>> df = pd.DataFrame({'activity': ['A', 'B', 'A', 'C']})
        >>> print(get_activities_list(df, 'activity'))
        ['A', 'B', 'C']
    """
    activities = get_values_list(dataset, attribute_id=activity_id)
    return activities
[docs]
def get_num_values(dataset: pd.DataFrame,
                   attribute_id: str) -> int:
    """
    Count the distinct values of one attribute (column) of the dataset.

    Parameters:
        dataset (pd.DataFrame): Event log to analyze.
        attribute_id (str): Name of the column whose values are counted.

    Returns:
        int: Number of distinct values in the column. Missing values (NaN)
        are not counted, which is the default of ``Series.nunique``; an empty
        column therefore yields ``0``.

    Raises:
        KeyError: If ``attribute_id`` is not a column of ``dataset`` (raised by
            the column lookup; no explicit validation is performed).

    Examples:
        >>> df = pd.DataFrame({'attribute': [1, 2, 2, 3]})
        >>> print(get_num_values(df, 'attribute'))
        3
    """
    column = dataset[attribute_id]
    return column.nunique()
[docs]
def get_values_list(dataset: pd.DataFrame,
                    attribute_id: str = DataFrameFields.ACTIVITY_COLUMN) -> list:
    """
    List the distinct values of one attribute (column) of the dataset.

    Parameters:
        dataset (pd.DataFrame): Event log to analyze.
        attribute_id (str, optional): Name of the column to inspect.
            Default is ``DataFrameFields.ACTIVITY_COLUMN``.

    Returns:
        list: The distinct values of the column, in order of first appearance
        (the order produced by ``Series.unique``).

    Raises:
        KeyError: If ``attribute_id`` is not a column of ``dataset`` (raised by
            the column lookup; no explicit validation is performed).

    Examples:
        >>> df = pd.DataFrame({'attribute': [1, 2, 2, 3]})
        >>> print(get_values_list(df, 'attribute'))
        [1, 2, 3]
    """
    distinct_values = dataset[attribute_id].unique()
    return distinct_values.tolist()
[docs]
def get_num_cases(dataset: pd.DataFrame,
                  case_id: str = DataFrameFields.CASE_COLUMN) -> int:
    """
    Count the distinct cases present in the dataset.

    Parameters:
        dataset (pd.DataFrame): Event log to analyze.
        case_id (str, optional): Name of the case identifier column.
            Default is ``DataFrameFields.CASE_COLUMN``.

    Returns:
        int: Number of distinct case identifiers. Missing identifiers (NaN)
        are not counted, which is the default of ``Series.nunique``.

    Raises:
        KeyError: If ``case_id`` is not a column of ``dataset`` (raised by the
            column lookup; no explicit validation is performed).

    Examples:
        >>> df = pd.DataFrame({'case': [1, 1, 2, 2, 3], 'activity': ['A', 'B', 'A', 'C', 'D']})
        >>> print(get_num_cases(df, 'case'))
        3
    """
    case_column = dataset[case_id]
    return case_column.nunique()
[docs]
def get_max_len_case(dataset: pd.DataFrame,
                     case_id: str = DataFrameFields.CASE_COLUMN) -> int:
    """
    Compute the length (number of events) of the longest case.

    Rows are grouped by case identifier and the rows of each group are
    counted; the largest count is returned.

    Parameters:
        dataset (pd.DataFrame): Event log to analyze.
        case_id (str, optional): Name of the case identifier column.
            Default is ``DataFrameFields.CASE_COLUMN``.

    Returns:
        int: The largest number of rows sharing a single case identifier.

    Raises:
        KeyError: If ``case_id`` is not a column of ``dataset`` (raised by the
            group-by; no explicit validation is performed).

    Examples:
        >>> df = pd.DataFrame({'case': [1, 1, 2, 2, 3], 'activity': ['A', 'B', 'A', 'C', 'D']})
        >>> print(get_max_len_case(df, 'case'))
        2
    """
    events_per_case = dataset.groupby(case_id)[case_id].count()
    return events_per_case.max()
[docs]
def get_min_len_case(dataset: pd.DataFrame,
                     case_id: str = DataFrameFields.CASE_COLUMN) -> int:
    """
    Compute the length (number of events) of the shortest case.

    Rows are grouped by case identifier and the rows of each group are
    counted; the smallest count is returned.

    Parameters:
        dataset (pd.DataFrame): Event log to analyze.
        case_id (str, optional): Name of the case identifier column.
            Default is ``DataFrameFields.CASE_COLUMN``.

    Returns:
        int: The smallest number of rows sharing a single case identifier.

    Raises:
        KeyError: If ``case_id`` is not a column of ``dataset`` (raised by the
            group-by; no explicit validation is performed).

    Examples:
        >>> df = pd.DataFrame({'case': [1, 1, 2, 2, 3], 'activity': ['A', 'B', 'A', 'C', 'D']})
        >>> print(get_min_len_case(df, 'case'))
        1
    """
    events_per_case = dataset.groupby(case_id)[case_id].count()
    return events_per_case.min()
[docs]
def get_avg_len_case(dataset: pd.DataFrame,
                     case_id: str = DataFrameFields.CASE_COLUMN) -> float:
    """
    Compute the mean case length (number of events per case).

    Rows are grouped by case identifier and the rows of each group are
    counted; the arithmetic mean of those counts is returned.

    Parameters:
        dataset (pd.DataFrame): Event log to analyze.
        case_id (str, optional): Name of the case identifier column.
            Default is ``DataFrameFields.CASE_COLUMN``.

    Returns:
        float: The mean number of rows per case identifier.

    Raises:
        KeyError: If ``case_id`` is not a column of ``dataset`` (raised by the
            group-by; no explicit validation is performed).

    Examples:
        >>> df = pd.DataFrame({'case': [1, 1, 2, 2, 3], 'activity': ['A', 'B', 'A', 'C', 'D']})
        >>> print(get_avg_len_case(df, 'case'))
        1.6666666666666667
    """
    events_per_case = dataset.groupby(case_id)[case_id].count()
    return events_per_case.mean()
[docs]
def get_max_duration_case(dataset: pd.DataFrame, case_id: str = DataFrameFields.CASE_COLUMN,
                          timestamp_id: str = DataFrameFields.TIMESTAMP_COLUMN) -> pd.Timedelta:
    """
    Returns the maximum case temporal duration in the dataset.

    The duration of a case is the timestamp of its last row minus the
    timestamp of its first row, in the row order of ``dataset``; rows are
    therefore expected to be chronologically ordered within each case.
    The input DataFrame is not modified: timestamps are parsed into a
    separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        timestamp_id (str, optional): Name of the timestamp column in the DataFrame.
            Values must be datetimes or ISO 8601 strings.
            Default is ``DataFrameFields.TIMESTAMP_COLUMN``.

    Returns:
        pd.Timedelta: The maximum temporal duration of a case in the dataset.

    Raises:
        KeyError: If the case identifier or timestamp column does not exist.
        ValueError: If a timestamp cannot be parsed as ISO 8601
            (raised by ``pd.to_datetime``).

    Examples:
        >>> df = pd.read_csv('../../BPI_Challenge_2012_A.csv')
        >>> max_dur_case = get_max_duration_case(df, 'CaseID', 'Timestamp')
        >>> print(max_dur_case)
        91 days 10:55:36.161000
    """
    # Parse into a detached Series instead of assigning back into `dataset`,
    # so the caller's timestamp column keeps its original dtype and values.
    timestamps = pd.to_datetime(dataset[timestamp_id], format='ISO8601')
    case_bounds = timestamps.groupby(dataset[case_id]).agg(["first", "last"])
    return (case_bounds["last"] - case_bounds["first"]).max()
[docs]
def get_min_duration_case(dataset: pd.DataFrame, case_id: str = DataFrameFields.CASE_COLUMN,
                          timestamp_id: str = DataFrameFields.TIMESTAMP_COLUMN) -> pd.Timedelta:
    """
    Returns the minimum case temporal duration in the dataset.

    The duration of a case is the timestamp of its last row minus the
    timestamp of its first row, in the row order of ``dataset``; rows are
    therefore expected to be chronologically ordered within each case.
    The input DataFrame is not modified: timestamps are parsed into a
    separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        timestamp_id (str, optional): Name of the timestamp column in the DataFrame.
            Values must be datetimes or ISO 8601 strings.
            Default is ``DataFrameFields.TIMESTAMP_COLUMN``.

    Returns:
        pd.Timedelta: The minimum temporal duration of a case in the dataset.

    Raises:
        KeyError: If the case identifier or timestamp column does not exist.
        ValueError: If a timestamp cannot be parsed as ISO 8601
            (raised by ``pd.to_datetime``).

    Examples:
        >>> df = pd.read_csv('../../BPI_Challenge_2012_A.csv')
        >>> min_dur_case = get_min_duration_case(df, 'CaseID', 'Timestamp')
        >>> print(min_dur_case)
        0 days 00:00:01.855000
    """
    # Parse into a detached Series instead of assigning back into `dataset`,
    # so the caller's timestamp column keeps its original dtype and values.
    timestamps = pd.to_datetime(dataset[timestamp_id], format='ISO8601')
    case_bounds = timestamps.groupby(dataset[case_id]).agg(["first", "last"])
    return (case_bounds["last"] - case_bounds["first"]).min()
[docs]
def get_avg_duration_case(dataset: pd.DataFrame, case_id: str = DataFrameFields.CASE_COLUMN,
                          timestamp_id: str = DataFrameFields.TIMESTAMP_COLUMN) -> pd.Timedelta:
    """
    Returns the average case temporal duration in the dataset.

    The duration of a case is the timestamp of its last row minus the
    timestamp of its first row, in the row order of ``dataset``; rows are
    therefore expected to be chronologically ordered within each case.
    The input DataFrame is not modified: timestamps are parsed into a
    separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        timestamp_id (str, optional): Name of the timestamp column in the DataFrame.
            Values must be datetimes or ISO 8601 strings.
            Default is ``DataFrameFields.TIMESTAMP_COLUMN``.

    Returns:
        pd.Timedelta: The average temporal duration of a case in the dataset.

    Raises:
        KeyError: If the case identifier or timestamp column does not exist.
        ValueError: If a timestamp cannot be parsed as ISO 8601
            (raised by ``pd.to_datetime``).

    Examples:
        >>> df = pd.read_csv('../../BPI_Challenge_2012_A.csv')
        >>> avg_dur_case = get_avg_duration_case(df, 'CaseID', 'Timestamp')
        >>> print(avg_dur_case)
        8 days 01:55:14.860649805
    """
    # Parse into a detached Series instead of assigning back into `dataset`,
    # so the caller's timestamp column keeps its original dtype and values.
    timestamps = pd.to_datetime(dataset[timestamp_id], format='ISO8601')
    case_bounds = timestamps.groupby(dataset[case_id]).agg(["first", "last"])
    return (case_bounds["last"] - case_bounds["first"]).mean()
[docs]
def get_max_duration_event(dataset: pd.DataFrame, case_id: str = DataFrameFields.CASE_COLUMN,
                           timestamp_id: str = DataFrameFields.TIMESTAMP_COLUMN) -> pd.Timedelta:
    """
    Returns the maximum event temporal duration in the dataset.

    The duration of an event is the difference between its timestamp and the
    timestamp of the previous row of the same case. The first row of each
    case has no predecessor and is ignored by the aggregation.
    The input DataFrame is not modified: timestamps are parsed into a
    separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        timestamp_id (str, optional): Name of the timestamp column in the DataFrame.
            Values must be datetimes or ISO 8601 strings.
            Default is ``DataFrameFields.TIMESTAMP_COLUMN``.

    Returns:
        pd.Timedelta: The maximum temporal duration of an event in the dataset.

    Raises:
        KeyError: If the case identifier or timestamp column does not exist.
        ValueError: If a timestamp cannot be parsed as ISO 8601
            (raised by ``pd.to_datetime``).

    Examples:
        >>> df = pd.read_csv('../../BPI_Challenge_2012_A.csv')
        >>> max_dur_event = get_max_duration_event(df, 'CaseID', 'Timestamp')
        >>> print(max_dur_event)
        89 days 13:10:06.164000
    """
    # Parse into a detached Series instead of assigning back into `dataset`,
    # so the caller's timestamp column keeps its original dtype and values.
    timestamps = pd.to_datetime(dataset[timestamp_id], format='ISO8601')
    # Per-case difference between consecutive rows; NaT for each case's first row.
    event_durations = timestamps.groupby(dataset[case_id]).diff()
    return event_durations.max()
[docs]
def get_min_duration_event(dataset: pd.DataFrame, case_id: str = DataFrameFields.CASE_COLUMN,
                           timestamp_id: str = DataFrameFields.TIMESTAMP_COLUMN) -> pd.Timedelta:
    """
    Returns the minimum event temporal duration in the dataset.

    The duration of an event is the difference between its timestamp and the
    timestamp of the previous row of the same case. The first row of each
    case has no predecessor and is ignored by the aggregation.
    The input DataFrame is not modified: timestamps are parsed into a
    separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        timestamp_id (str, optional): Name of the timestamp column in the DataFrame.
            Values must be datetimes or ISO 8601 strings.
            Default is ``DataFrameFields.TIMESTAMP_COLUMN``.

    Returns:
        pd.Timedelta: The minimum temporal duration of an event in the dataset.

    Raises:
        KeyError: If the case identifier or timestamp column does not exist.
        ValueError: If a timestamp cannot be parsed as ISO 8601
            (raised by ``pd.to_datetime``).

    Examples:
        >>> df = pd.read_csv('../../BPI_Challenge_2012_A.csv')
        >>> min_dur_event = get_min_duration_event(df, 'CaseID', 'Timestamp')
        >>> print(min_dur_event)
        0 days 00:00:00
    """
    # Parse into a detached Series instead of assigning back into `dataset`,
    # so the caller's timestamp column keeps its original dtype and values.
    timestamps = pd.to_datetime(dataset[timestamp_id], format='ISO8601')
    # Per-case difference between consecutive rows; NaT for each case's first row.
    event_durations = timestamps.groupby(dataset[case_id]).diff()
    return event_durations.min()
[docs]
def get_avg_duration_event(dataset: pd.DataFrame, case_id: str = DataFrameFields.CASE_COLUMN,
                           timestamp_id: str = DataFrameFields.TIMESTAMP_COLUMN) -> pd.Timedelta:
    """
    Returns the average event temporal duration in the dataset.

    The duration of an event is the difference between its timestamp and the
    timestamp of the previous row of the same case. The first row of each
    case has no predecessor and is ignored by the aggregation.
    The input DataFrame is not modified: timestamps are parsed into a
    separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        timestamp_id (str, optional): Name of the timestamp column in the DataFrame.
            Values must be datetimes or ISO 8601 strings.
            Default is ``DataFrameFields.TIMESTAMP_COLUMN``.

    Returns:
        pd.Timedelta: The average temporal duration of an event in the dataset.

    Raises:
        KeyError: If the case identifier or timestamp column does not exist.
        ValueError: If a timestamp cannot be parsed as ISO 8601
            (raised by ``pd.to_datetime``).

    Examples:
        >>> df = pd.read_csv('../../BPI_Challenge_2012_A.csv')
        >>> avg_dur_event = get_avg_duration_event(df, 'CaseID', 'Timestamp')
        >>> print(avg_dur_event)
        2 days 05:08:06.570523093
    """
    # Parse into a detached Series instead of assigning back into `dataset`,
    # so the caller's timestamp column keeps its original dtype and values.
    timestamps = pd.to_datetime(dataset[timestamp_id], format='ISO8601')
    # Per-case difference between consecutive rows; NaT for each case's first row.
    event_durations = timestamps.groupby(dataset[case_id]).diff()
    return event_durations.mean()
[docs]
def get_num_variants(dataset: pd.DataFrame, case_id: str = DataFrameFields.CASE_COLUMN,
                     activity_id: str = DataFrameFields.ACTIVITY_COLUMN) -> int:
    """
    Returns the number of variants (distinct sequences of activities) in the dataset.

    The variant of a case is built by converting its activities to strings and
    joining them with ``"->"`` in the row order of ``dataset``.
    The input DataFrame is not modified: the string conversion is done on a
    separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        activity_id (str, optional): Name of the activity column in the DataFrame.
            Default is ``DataFrameFields.ACTIVITY_COLUMN``.

    Returns:
        int: The number of variants (cases with different sequences of activities).

    Raises:
        KeyError: If the case identifier or activity column does not exist.

    Examples:
        >>> df = pd.DataFrame({'case': [1, 1, 2, 2, 3], 'activity': ['A', 'B', 'A', 'C', 'D']})
        >>> num_variants = get_num_variants(df, 'case', 'activity')
        >>> print(num_variants)
        3
    """
    # Stringify a detached copy of the column rather than overwriting it in
    # `dataset`, so the caller's activity column keeps its original dtype.
    activities = dataset[activity_id].astype(str)
    variant_per_case = activities.groupby(dataset[case_id]).agg("->".join)
    return variant_per_case.nunique()
[docs]
def get_count_variants(dataset: pd.DataFrame, case_id: str = DataFrameFields.CASE_COLUMN,
                       activity_id: str = DataFrameFields.ACTIVITY_COLUMN) -> dict:
    """
    Returns the number of times each variant appears in the dataset.

    The variant of a case is built by converting its activities to strings and
    joining them with ``"->"`` in the row order of ``dataset``.
    The input DataFrame is not modified: the string conversion is done on a
    separate Series.

    Parameters:
        dataset (pd.DataFrame): DataFrame containing the dataset to be analyzed.
        case_id (str, optional): Name of the case identifier column in the DataFrame.
            Default is ``DataFrameFields.CASE_COLUMN``.
        activity_id (str, optional): Name of the activity column in the DataFrame.
            Default is ``DataFrameFields.ACTIVITY_COLUMN``.

    Returns:
        dict: Dictionary where the keys are the variants (``"->"``-joined activity
        strings) and the values are the number of cases following each variant.

    Raises:
        KeyError: If the case identifier or activity column does not exist.

    Examples:
        >>> df = pd.DataFrame({'case': [1, 1, 2, 2, 3], 'activity': ['A', 'B', 'A', 'C', 'D']})
        >>> count_variants = get_count_variants(df, 'case', 'activity')
        >>> print(count_variants)
        {'A->B': 1, 'A->C': 1, 'D': 1}
    """
    # Stringify a detached copy of the column rather than overwriting it in
    # `dataset`, so the caller's activity column keeps its original dtype.
    activities = dataset[activity_id].astype(str)
    variant_per_case = activities.groupby(dataset[case_id]).agg("->".join)
    return variant_per_case.value_counts().to_dict()