import warnings
from datetime import timedelta, timezone
from typing import TYPE_CHECKING, Tuple, Union
import numpy as np
import pandas as pd
from databallpy.data_parsers.tracking_data_parsers.utils import (
_adjust_start_end_frames,
_get_gametime,
_insert_missing_rows,
)
from databallpy.schemas import EventData, TrackingData
from databallpy.utils.constants import MISSING_INT
if TYPE_CHECKING:
from kloppy.domain import EventDataset, TrackingDataset
def _remove_utc(ts: pd.Timestamp) -> pd.Timestamp:
"""Remove timezone information from a timestamp or convert to UTC then remove timezone.
If the timestamp has no timezone info, it's assumed to be UTC and timezone info is added
then removed. If it has timezone info, it's converted to UTC then the timezone info is removed.
This ensures consistent handling of timestamps regardless of their initial timezone state.
Args:
ts (pd.Timestamp): timestamp to process
Returns:
pd.Timestamp: timezone-naive timestamp in UTC
"""
if ts is None:
return
return (
ts.replace(tzinfo=timezone.utc)
if ts.tzinfo is None
else ts.astimezone(timezone.utc).replace(tzinfo=None)
)
def _convert_datetime(
kloppy_timestamp: timedelta,
date: pd.Timestamp,
period_id: int = None,
verbose: bool = False,
) -> pd.Timestamp:
"""Convert a kloppy relative timestamp to an absolute datetime.
Kloppy timestamps are relative to the start of each period. This function converts them to
absolute timestamps by adding the game date and a period-based offset. The offset accounts
for the duration of previous periods (60 min for period 2, 105 min for period 3, 120 min
for period 4). Note that time between periods is not included in the offset.
Args:
kloppy_timestamp (timedelta): relative timestamp from kloppy (time since period start)
date (pd.Timestamp): game date to use as base for absolute timestamp
period_id (int, optional): period identifier (1, 2, 3, or 4). If None, no offset is applied.
This is used when the date has already been adjusted for the period. Defaults to None.
verbose (bool, optional): whether to print warnings. Defaults to False
Returns:
pd.Timestamp: absolute timestamp combining game date, kloppy timestamp, and period offset.
Returns None if kloppy_timestamp is None. Falls back to Unix epoch ('1975-01-01') if date is None
"""
# Note: this disregards the time in between periods
timestamp_offset = (
(
pd.Timedelta(minutes=60)
if period_id == 2
else pd.Timedelta(minutes=105)
if period_id == 3
else pd.Timedelta(minutes=120)
if period_id == 4
else pd.Timedelta(0)
)
if period_id is not None
else pd.Timedelta(0)
)
if kloppy_timestamp is None:
return None
date = _remove_utc(date)
if date is not None:
return kloppy_timestamp + date + timestamp_offset
else:
if verbose:
warnings.warn(
"Game date is None, using Unix epoch ('1975-01-01') as fall back date."
)
return kloppy_timestamp + pd.Timestamp("1975-01-01") + timestamp_offset
def players_from_kloppy(
dataset: Union["EventDataset", "TrackingDataset"],
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
Function to get all information on players from a given kloppy dataset
Args:
dataset (kloppy.domain.TrackingDataset, kloppy.domain.EventDataset): Kloppy event or tracking dataset
Returns:
Tuple[pd.DataFrame, pd.DataFrame]: All home and away player information in two DataFrames
"""
from kloppy.domain import Ground
home_players, away_players = [], []
for player in dataset.metadata.teams[0].players + dataset.metadata.teams[1].players:
p = {
"id": player.player_id,
"full_name": player.name,
"shirt_num": player.jersey_no,
"position": (
player.starting_position.position_group.value[0]
.lower()
.replace("attacker", "forward")
.replace("unknown", "unspecified")
)
if player.starting_position is not None
else "unspecified",
"start_frame": MISSING_INT,
"end_frame": MISSING_INT,
"starter": player.starting if player.starting is not None else False,
}
if player.team.ground == Ground.HOME:
home_players.append(p)
else:
away_players.append(p)
return pd.DataFrame(home_players), pd.DataFrame(away_players)
def periods_from_kloppy(
event_dataset: "EventDataset" = None, tracking_dataset: "TrackingDataset" = None
) -> pd.DataFrame:
"""
Function to get all information on period start and end times from kloppy datasets
Args:
tracking_dataset (kloppy.domain.TrackingDataset, optional): A Kloppy tracking dataset.
Defaults to None.
event_dataset (kloppy.domain.EventDataset, optional): A Kloppy event dataset.
Defaults to None.
Returns:
periods (pd.DataFrame) All information about the periods
"""
uses_tracking_data = False
uses_event_data = False
if tracking_dataset is not None:
uses_tracking_data = True
if event_dataset is not None:
uses_event_data = True
if event_dataset is None and tracking_dataset is None:
raise ValueError(
"At least one of event_dataset or tracking_dataset must be provided."
)
if event_dataset is not None and tracking_dataset is not None:
if len(event_dataset.metadata.periods) != len(tracking_dataset.metadata.periods):
min_periods = min(
len(event_dataset.metadata.periods),
len(tracking_dataset.metadata.periods),
)
event_dataset.metadata.periods = event_dataset.metadata.periods[
0:min_periods
]
tracking_dataset.metadata.periods = tracking_dataset.metadata.periods[
0:min_periods
]
warnings.warn(
f"Number of periods in event and tracking dataset do not match. Using the minimum ({min_periods}) periods from both datasets."
)
game_date = (
tracking_dataset.metadata.date
if uses_tracking_data
else event_dataset.metadata.date
)
periods = []
# the Game.periods object must always have 5 enties
for i in range(1, 6):
period_info = {"period_id": i}
period_records_td = (
tracking_dataset.filter(lambda frame: frame.period.id == i)
if uses_tracking_data
else None
)
period_records_ed = (
event_dataset.filter(lambda frame: frame.period.id == i)
if uses_event_data
else None
)
if uses_tracking_data:
if len(period_records_td.records) == 0:
start_frame = end_frame = MISSING_INT
start_timestamp_td = end_timestamp_td = None
else:
period_td = tracking_dataset.metadata.periods[i - 1]
if (
isinstance(period_td.start_timestamp, timedelta)
or period_td.start_timestamp is None
):
start_timestamp_td = _convert_datetime(
period_records_td[0].timestamp, game_date, period_id=i
)
else:
start_timestamp_td = (
_remove_utc(period_td.start_timestamp)
if uses_tracking_data
else None
)
if (
isinstance(period_td.end_timestamp, timedelta)
or period_td.end_timestamp is None
):
end_timestamp_td = _convert_datetime(
period_records_td[-1].timestamp, game_date, period_id=i
)
else:
end_timestamp_td = (
_remove_utc(period_td.end_timestamp)
if uses_tracking_data
else None
)
start_frame = period_records_td[0].frame_id
end_frame = period_records_td[-1].frame_id
period_info.update(
{
"start_frame": start_frame,
"end_frame": end_frame,
"start_timestamp_td": start_timestamp_td,
"end_timestamp_td": end_timestamp_td,
}
)
if uses_event_data:
if (i - 1) >= len(event_dataset.metadata.periods):
start_timestamp_ed = end_timestamp_ed = None
else:
period_ed = event_dataset.metadata.periods[i - 1]
if (
isinstance(period_ed.start_timestamp, timedelta)
or period_ed.start_timestamp is None
):
start_timestamp_ed = _convert_datetime(
period_records_ed[0].timestamp, game_date, period_id=i
)
else:
start_timestamp_ed = (
_remove_utc(period_ed.start_timestamp)
if uses_event_data
else None
)
if (
isinstance(period_ed.end_timestamp, timedelta)
or period_ed.end_timestamp is None
):
end_timestamp_ed = _convert_datetime(
period_records_ed[-1].timestamp, game_date, period_id=i
)
else:
end_timestamp_ed = (
_remove_utc(period_ed.end_timestamp) if uses_event_data else None
)
period_info.update(
{
"start_timestamp_ed": start_timestamp_ed,
"end_timestamp_ed": end_timestamp_ed,
}
)
periods.append(period_info)
return pd.DataFrame(periods)
[docs]
def convert_kloppy_tracking_dataset(
tracking_dataset: "TrackingDataset", periods: pd.DataFrame
) -> TrackingData:
"""
Function to get all information of a game given kloppy dataset(s)
Args:
tracking_dataset (kloppy.domain.TrackingDataset, optional): a Kloppy tracking dataset.
periods (pd.DataFrame): DataFrame containing information about the periods
Returns:
TrackingData: All tracking data in a TrackingData object
"""
home_team, away_team = tracking_dataset.metadata.teams
player_columns = {}
for player in home_team.players + away_team.players:
player_columns.update(
{f"{player.player_id}_x": f"{player.team.ground}_{player.jersey_no}_x"}
)
player_columns.update(
{f"{player.player_id}_y": f"{player.team.ground}_{player.jersey_no}_y"}
)
team_id_to_side = {home_team.team_id: "home", away_team.team_id: "away"}
tracking_dataframe = (
tracking_dataset.to_df(
"frame_id",
"period_id",
"timestamp",
"ball_state",
"ball_owning_team_id",
"ball_z",
"*_x",
"*_y",
engine="pandas",
)
.assign(
timestamp=lambda x: x.apply(
lambda row: _convert_datetime(
row["timestamp"],
periods[periods["period_id"] == row["period_id"]][
"start_timestamp_td"
].iloc[0],
period_id=None,
verbose=False,
),
axis=1,
),
team_possession=lambda x: x["ball_owning_team_id"].map(team_id_to_side),
)
.rename(
columns={
"frame_id": "frame",
"ball_state": "ball_status",
"timestamp": "datetime",
}
| player_columns
)
.drop(columns=["ball_owning_team_id"])
)
class SimplifiedMetada:
def __init__(self, periods, frame_rate):
self.periods_frames = periods
self.frame_rate = frame_rate
simplified_metadata = SimplifiedMetada(periods, tracking_dataset.metadata.frame_rate)
tracking_dataframe = _insert_missing_rows(
tracking_dataframe.reset_index(drop=True), "frame"
)
tracking_dataframe, simplified_metadata = _adjust_start_end_frames(
tracking_dataframe, simplified_metadata
)
tracking_dataframe["gametime_td"] = _get_gametime(
tracking_dataframe["frame"], tracking_dataframe["period_id"], simplified_metadata
)
return TrackingData(
tracking_dataframe,
provider=tracking_dataset.metadata.provider.value,
frame_rate=tracking_dataset.metadata.frame_rate,
)
[docs]
def convert_kloppy_event_dataset(
event_dataset: "EventDataset", periods: pd.DataFrame
) -> EventData:
"""
Function to get all information of a game given kloppy dataset(s)
Args:
event_dataset (kloppy.domain.EventDataset, optional): A Kloppy event dataset.
periods (pd.DataFrame): DataFrame containing information about the periods
Returns:
EventData: All event data in an EventData object
"""
from kloppy.domain import (
CarryResult,
DuelResult,
EventType,
InterceptionResult,
PassResult,
ShotResult,
TakeOnResult,
)
is_successful = [
ShotResult.GOAL,
ShotResult.OWN_GOAL,
PassResult.COMPLETE,
TakeOnResult.COMPLETE,
CarryResult.COMPLETE,
DuelResult.WON,
InterceptionResult.SUCCESS,
]
event_map = {
EventType.PASS.value: "pass",
EventType.SHOT.value: "shot",
EventType.CARRY.value: "dribble",
EventType.TAKE_ON.value: "dribble",
}
home_team, away_team = event_dataset.metadata.teams
players = home_team.players + away_team.players
player_id_to_name = {player.player_id: player.name for player in players}
event_dataframe = (
event_dataset.to_df(
"period_id",
"event_id",
"timestamp",
"player_id",
"team_id",
"coordinates_x",
"coordinates_y",
"event_type",
"result",
is_successful=lambda event: None
if event.result is None
else True
if event.result in is_successful
else False,
minutes=lambda event: (int(event.timestamp.total_seconds()) % 3600 // 60)
+ (45 if event.period.id == 2 else 15 if event.period.id in [3, 4] else 0),
seconds=lambda event: float(event.timestamp.total_seconds()) % 60,
engine="pandas",
)
.sort_values(by=["period_id", "timestamp"], ascending=True)
.reset_index(drop=True)
.reset_index()
.assign(
timestamp=lambda x: x.apply(
lambda row: _convert_datetime(
row["timestamp"],
periods[periods["period_id"] == row["period_id"]][
"start_timestamp_ed"
].iloc[0],
period_id=None,
verbose=False,
),
axis=1,
),
databallpy_event=lambda x: np.where(
x["result"] == ShotResult.OWN_GOAL,
"own_goal",
x["event_type"].map(event_map),
),
player_name=lambda x: x["player_id"].map(player_id_to_name).astype(str),
is_successful=lambda x: x["is_successful"].astype(pd.BooleanDtype()),
)
.rename(
columns={
"frame_id": "frame",
"ball_state": "ball_status",
"ball_owning_team_id": "team_possession",
"timestamp": "datetime",
"coordinates_x": "start_x",
"coordinates_y": "start_y",
"event_id": "original_event_id",
"index": "event_id",
"event_type": "original_event",
}
)
.drop("result", axis=1)
)
# Because Pandera (when nullable=False) expects 'datetime' to have type datetime64[ns] and not datetime64[ns, UTC], remove UTC component
event_dataframe["datetime"] = event_dataframe["datetime"].dt.tz_localize(None)
return EventData(event_dataframe, provider=event_dataset.metadata.provider.value)