import datetime as dt
import html
import io
import json
import os
import re
import chardet
import numpy as np
import pandas as pd
import requests
from databallpy.data_parsers import Metadata
from databallpy.data_parsers.event_data_parsers.utils import (
_normalize_playing_direction_events,
)
from databallpy.data_parsers.metrica_metadata_parser import (
_get_metadata,
_get_td_channels,
_update_metadata,
)
from databallpy.events import DribbleEvent, PassEvent, ShotEvent, TackleEvent
from databallpy.utils.constants import MISSING_INT
from databallpy.utils.logging import logging_wrapper
from databallpy.utils.utils import _to_float, _to_int
# Maps the lower-cased Metrica event name (where a challenge with a TACKLE
# subtype has been renamed to "tackle" by _get_event_data) to the databallpy
# event name. Event names that are not a key here end up with None in the
# "databallpy_event" column.
metrica_databallpy_map = {
"pass": "pass",
"carry": "dribble",
"shot": "shot",
"tackle": "tackle",
}
[docs]
@logging_wrapper(__file__)
def load_metrica_event_data(
    event_data_loc: str, metadata_loc: str
) -> tuple[pd.DataFrame, Metadata, dict]:
    """Load Metrica event data together with its metadata.

    Args:
        event_data_loc (str): location of the event data .json file, or the raw
            json content itself (recognised by the presence of a "{").
        metadata_loc (str): location of the metadata .xml file. It is handed to
            the metadata parser unchanged.

    Raises:
        TypeError: if event_data_loc is not a str.
        FileNotFoundError: if event_data_loc is a file location and either
            metadata_loc or event_data_loc does not exist.

    Returns:
        tuple[pd.DataFrame, Metadata, dict]: the event data, the metadata and
            the databallpy events.
    """
    if not isinstance(event_data_loc, str):
        raise TypeError(
            f"event_data_loc must be a str, not a {type(event_data_loc)}"
        )

    # A "{" means event_data_loc is the json content itself rather than a
    # path, in which case there is nothing to look up on disk.
    if "{" not in event_data_loc:
        if not os.path.exists(metadata_loc):
            raise FileNotFoundError(f"Could not find {metadata_loc}")
        # Fail fast instead of parsing all the metadata before noticing that
        # the event file is missing.
        if not os.path.exists(event_data_loc):
            raise FileNotFoundError(f"Could not find {event_data_loc}")

    metadata = _get_metadata(metadata_loc, is_tracking_data=False, is_event_data=True)
    td_channels = _get_td_channels(metadata_loc, metadata)
    metadata = _update_metadata(td_channels, metadata)

    event_data = _get_event_data(event_data_loc)

    # rescale the event locations, metrica data is scaled between 0 and 1.
    pitch_length = metadata.pitch_dimensions[0]
    pitch_width = metadata.pitch_dimensions[1]
    for col in [x for x in event_data.columns if "_x" in x]:
        event_data[col] = event_data[col] * pitch_length - pitch_length / 2.0
    for col in [x for x in event_data.columns if "_y" in x]:
        event_data[col] = event_data[col] * pitch_width - pitch_width / 2.0

    # add datetime based on frame numbers, relative to the start of period 1
    is_first_period = metadata.periods_frames["period_id"] == 1
    first_frame = metadata.periods_frames.loc[is_first_period, "start_frame"].iloc[0]
    start_time = metadata.periods_frames.loc[
        is_first_period, "start_datetime_ed"
    ].iloc[0]
    frame_rate = metadata.frame_rate

    # no idea about time zone since we have no real data, so just assume utc.
    # The conversion does not depend on the event, so do it only once.
    start_datetime = pd.to_datetime(start_time, utc=True)
    event_data["datetime"] = [
        start_datetime
        + dt.timedelta(milliseconds=(frame - first_frame) / frame_rate * 1000)
        for frame in event_data["td_frame"]
    ]

    event_data = _normalize_playing_direction_events(
        event_data, metadata.home_team_id, metadata.away_team_id
    )
    databallpy_events = _get_databallpy_events(
        event_data,
        metadata.pitch_dimensions,
        metadata.home_team_id,
        pd.concat([metadata.home_players, metadata.away_players], ignore_index=True),
    )
    return event_data, metadata, databallpy_events
[docs]
@logging_wrapper(__file__)
def load_metrica_open_event_data() -> tuple[pd.DataFrame, Metadata, dict]:
    """Download and load the open event data of Metrica (Sample_Game_3).

    Raises:
        requests.HTTPError: if one of the downloads returns an error status.
        requests.Timeout: if the server does not answer in time.

    Returns:
        tuple[pd.DataFrame, Metadata, dict]: event data and metadata of the
            game, and the databallpy events.
    """
    base_link = (
        "https://raw.githubusercontent.com/metrica-sports/sample-data"
        "/master/data/Sample_Game_3/"
    )
    ed_link = base_link + "Sample_Game_3_events.json"
    metadata_link = base_link + "Sample_Game_3_metadata.xml"

    # Without a timeout requests can block forever on a stalled connection.
    timeout_seconds = 30

    ed_response = requests.get(ed_link, timeout=timeout_seconds)
    # Without this check an error page would be passed on as if it were the
    # data and be mistaken for a file location further down.
    ed_response.raise_for_status()

    metadata_response = requests.get(metadata_link, timeout=timeout_seconds)
    metadata_response.raise_for_status()

    return load_metrica_event_data(ed_response.text, metadata_response.text)
@logging_wrapper(__file__)
def _get_event_data(event_data_loc: str | io.StringIO) -> pd.DataFrame:
    """Parse Metrica event data into a DataFrame with one row per event.

    The outcome (is_successful) of shots and tackles is read from their
    subtypes (GOAL / WON). The outcome of a pass or carry is derived from the
    event that follows it; it stays missing when there is no following event.
    All other events have no outcome.

    Args:
        event_data_loc (str | io.StringIO): location of the event data .json
            file, the raw json content as a str (recognised by the presence of
            a "{"), or a file-like object holding the raw json content.

    Returns:
        pd.DataFrame: event data
    """
    if not isinstance(event_data_loc, str):
        # file-like object, e.g. io.StringIO
        raw_data = event_data_loc.read()
    elif "{" in event_data_loc:
        raw_data = event_data_loc
    else:
        # The encoding is unknown up front: detect it on the raw bytes first.
        with open(event_data_loc, "rb") as file:
            encoding = chardet.detect(file.read())["encoding"]
        with open(event_data_loc, "r", encoding=encoding) as file:
            raw_data = file.read()

    # Drop anything that looks like a markup tag and decode html entities
    # before handing the text to the json parser.
    events_dict = json.loads(html.unescape(re.sub(r"<[^>]+>", "", raw_data.strip())))

    result_dict = {
        "event_id": [],
        "databallpy_event": [],
        "period_id": [],
        "minutes": [],
        "seconds": [],
        "player_id": [],
        "player_name": [],
        "team_id": [],
        "is_successful": [],
        "start_x": [],
        "start_y": [],
        "to_player_id": [],
        "to_player_name": [],
        "end_x": [],
        "end_y": [],
        "original_event_id": [],
        "original_event": [],
        "event_type_id": [],
        "td_frame": [],
    }

    check_outcome_last_event = False
    in_possession_events = ["pass", "carry", "recovery", "shot"]
    out_of_possession_events = ["fault received", "ball out", "ball lost"]
    for i_event, event in enumerate(events_dict["data"]):
        # A missing or null "subtypes" entry simply means: no subtypes.
        subtypes = event.get("subtypes") or []
        team_id = event["team"]["id"]

        event_name = event["type"]["name"].lower()
        if event_name == "challenge" and _is_in_subtypes(subtypes, "TACKLE"):
            event_name = "tackle"

        result_dict["event_id"].append(i_event)
        result_dict["original_event_id"].append(event["index"])
        result_dict["event_type_id"].append(event["type"]["id"])
        result_dict["original_event"].append(event_name)
        result_dict["period_id"].append(event["period"])
        result_dict["minutes"].append(_to_int((event["start"]["time"] // 60)))
        result_dict["seconds"].append(_to_float(event["start"]["time"] % 60))
        # the first character of the id is dropped before the int conversion
        result_dict["player_id"].append(_to_int(event["from"]["id"][1:]))
        result_dict["player_name"].append(event["from"]["name"])

        # Set the outcome of the previous pass or dribble/carry event. At this
        # point the last entries of "team_id" and "is_successful" still belong
        # to that previous event: the current one is appended further down.
        if check_outcome_last_event:
            previous_team_id = result_dict["team_id"][-1]
            possession_lost = (
                event_name in out_of_possession_events
                and previous_team_id == team_id
            ) or (
                event_name in in_possession_events and previous_team_id != team_id
            )
            result_dict["is_successful"][-1] = not possession_lost
            check_outcome_last_event = False

        # shots and tackles carry their own outcome in the subtypes
        if event_name in ["shot", "tackle"]:
            outcome = _is_in_subtypes(subtypes, "GOAL") or _is_in_subtypes(
                subtypes, "WON"
            )
        else:
            outcome = None
        result_dict["is_successful"].append(outcome)

        # Check if outcome needs to be set based on next event
        if event_name in ["pass", "carry"]:
            check_outcome_last_event = True

        result_dict["team_id"].append(team_id)
        result_dict["start_x"].append(_to_float(event["start"]["x"]))
        result_dict["start_y"].append(_to_float(event["start"]["y"]))
        if event["to"] is not None:
            result_dict["to_player_id"].append(_to_int(event["to"]["id"][1:]))
            result_dict["to_player_name"].append(event["to"]["name"])
        else:
            result_dict["to_player_id"].append(MISSING_INT)
            result_dict["to_player_name"].append(None)
        result_dict["end_x"].append(_to_float(event["end"]["x"]))
        result_dict["end_y"].append(_to_float(event["end"]["y"]))
        result_dict["td_frame"].append(event["start"]["frame"])

    # placeholder of the right length, keeps the column at its position
    result_dict["databallpy_event"] = [None] * len(result_dict["event_id"])
    events = pd.DataFrame(result_dict)
    events["databallpy_event"] = (
        events["original_event"].map(metrica_databallpy_map).replace([np.nan], [None])
    )
    events["is_successful"] = events["is_successful"].astype("boolean")
    return events
def _is_in_subtypes(subtypes: list[dict] | dict, name: str) -> bool:
"""Function to search for a name in the subtypes
Args:
subtypes (list[dict] | dict): list of subtypes
name (str): name to search for
Returns:
bool: True if the name is in the subtypes, False otherwise
"""
result = False
if isinstance(subtypes, list):
for sub in subtypes:
if sub["name"] == name:
result = True
break
else:
if subtypes["name"] == name:
result = True
return result
@logging_wrapper(__file__)
def _get_databallpy_events(
    event_data: pd.DataFrame,
    pitch_dimensions: tuple[float, float],
    home_team_id: int,
    all_players: pd.DataFrame,
) -> dict:
    """Create the databallpy event objects for the rows of the event data.

    Args:
        event_data (pd.DataFrame): event data
        pitch_dimensions (tuple): dimensions of the pitch
        home_team_id (int): id of the home team
        all_players (pd.DataFrame): metadata of all the players

    Returns:
        dict: dictionary with the keys "shot_events", "pass_events",
            "dribble_events" and "other_events" (the tackles). Every value is
            a dictionary from event id to event object.
    """
    # (key in the result, value in the "databallpy_event" column, row parser)
    event_builders = (
        ("shot_events", "shot", _get_shot_event),
        ("pass_events", "pass", _get_pass_event),
        ("dribble_events", "dribble", _get_dribble_event),
        ("other_events", "tackle", _get_tackle_event),
    )

    databallpy_events = {}
    for result_key, event_name, build_event in event_builders:
        is_event = event_data["databallpy_event"] == event_name
        if not is_event.any():
            # no rows of this kind: nothing to apply the parser to
            databallpy_events[result_key] = {}
            continue

        parsed_events = event_data[is_event].apply(
            build_event,
            pitch_dimensions=pitch_dimensions,
            home_team_id=home_team_id,
            players=all_players,
            axis=1,
        )
        databallpy_events[result_key] = {
            parsed.event_id: parsed for parsed in parsed_events
        }
    return databallpy_events
def _get_shot_event(
    row: pd.Series,
    pitch_dimensions: tuple[float, float],
    home_team_id: int,
    players: pd.DataFrame,
) -> ShotEvent:
    """Build a ShotEvent from a single shot row of the metrica event data.

    Args:
        row (pd.Series): row of the metrica event data with a shot event
        pitch_dimensions (tuple): dimensions of the pitch
        home_team_id (int): id of the home team
        players (pd.DataFrame): metadata of the players, with at least the
            columns "id" and "shirt_num"

    Returns:
        ShotEvent: ShotEvent object
    """
    side = "home" if row.team_id == home_team_id else "away"
    # shirt number of the first player entry with the id of the shooter
    shirt_numbers = players.loc[players["id"] == row.player_id, "shirt_num"]
    shirt_number = shirt_numbers.iloc[0]
    is_goal = bool(row.is_successful)

    return ShotEvent(
        event_id=row.event_id,
        period_id=row.period_id,
        minutes=row.minutes,
        seconds=row.seconds,
        datetime=row.datetime,
        start_x=row.start_x,
        start_y=row.start_y,
        team_id=row.team_id,
        team_side=side,
        pitch_size=pitch_dimensions,
        player_id=row.player_id,
        jersey=shirt_number,
        outcome=is_goal,
        related_event_id=MISSING_INT,
        body_part="unspecified",
        possession_type="unspecified",
        set_piece="unspecified",
        _xt=np.nan,
        outcome_str="goal" if is_goal else "miss",
    )
def _get_pass_event(
    row: pd.Series,
    pitch_dimensions: tuple[float, float],
    home_team_id: int,
    players: pd.DataFrame,
) -> PassEvent:
    """Function to return a PassEvent object from a row of the metrica
    event data.

    Args:
        row (pd.Series): row of the metrica event data with a pass event
        pitch_dimensions (tuple): dimensions of the pitch
        home_team_id (int): id of the home team
        players (pd.DataFrame): metadata of the players, with at least the
            columns "id" and "shirt_num"

    Returns:
        PassEvent: PassEvent object
    """
    # The outcome of a pass can be missing. bool() raises on a missing value,
    # so the missing case is handled once, for outcome and outcome_str alike.
    # NOTE(review): assumes PassEvent accepts outcome=None together with
    # outcome_str="not_specified" - confirm against the PassEvent validation.
    if pd.isnull(row.is_successful):
        outcome = None
        outcome_str = "not_specified"
    else:
        outcome = bool(row.is_successful)
        outcome_str = "successful" if outcome else "unsuccessful"

    return PassEvent(
        event_id=row.event_id,
        period_id=row.period_id,
        minutes=row.minutes,
        seconds=row.seconds,
        datetime=row.datetime,
        start_x=row.start_x,
        start_y=row.start_y,
        team_id=row.team_id,
        team_side="home" if row.team_id == home_team_id else "away",
        pitch_size=pitch_dimensions,
        player_id=row.player_id,
        # shirt number of the first player entry with the id of the passer
        jersey=players.loc[players["id"] == row.player_id, "shirt_num"].iloc[0],
        outcome=outcome,
        related_event_id=MISSING_INT,
        body_part="unspecified",
        possession_type="unspecified",
        set_piece="unspecified",
        _xt=np.nan,
        outcome_str=outcome_str,
        end_x=row.end_x,
        end_y=row.end_y,
        pass_type="unspecified",
    )
def _get_dribble_event(
    row: pd.Series,
    pitch_dimensions: tuple[float, float],
    home_team_id: int,
    players: pd.DataFrame,
) -> DribbleEvent:
    """Function to return a DribbleEvent object from a row of the metrica
    event data.

    Args:
        row (pd.Series): row of the metrica event data with a dribble event
        pitch_dimensions (tuple): dimensions of the pitch
        home_team_id (int): id of the home team
        players (pd.DataFrame): metadata of the players, with at least the
            columns "id" and "shirt_num"

    Returns:
        DribbleEvent: DribbleEvent object
    """
    # The outcome of a carry is derived from the event that follows it, so it
    # is missing for a carry without a following event. bool() raises on a
    # missing value, hence the explicit check.
    # NOTE(review): assumes DribbleEvent accepts outcome=None - confirm
    # against the DribbleEvent validation.
    if pd.isnull(row.is_successful):
        outcome = None
    else:
        outcome = bool(row.is_successful)

    return DribbleEvent(
        event_id=row.event_id,
        period_id=row.period_id,
        minutes=row.minutes,
        seconds=row.seconds,
        datetime=row.datetime,
        start_x=row.start_x,
        start_y=row.start_y,
        team_id=row.team_id,
        team_side="home" if row.team_id == home_team_id else "away",
        pitch_size=pitch_dimensions,
        player_id=row.player_id,
        # shirt number of the first player entry with the id of the dribbler
        jersey=players.loc[players["id"] == row.player_id, "shirt_num"].iloc[0],
        outcome=outcome,
        related_event_id=MISSING_INT,
        body_part="unspecified",
        possession_type="unspecified",
        set_piece="unspecified",
        _xt=np.nan,
        duel_type="unspecified",
        with_opponent=False,
    )
def _get_tackle_event(
    row: pd.Series,
    pitch_dimensions: tuple[float, float],
    home_team_id: int,
    players: pd.DataFrame,
) -> TackleEvent:
    """Build a TackleEvent from a single tackle row of the metrica event data.

    Args:
        row (pd.Series): row of the metrica event data with a tackle event
        pitch_dimensions (tuple): dimensions of the pitch
        home_team_id (int): id of the home team
        players (pd.DataFrame): metadata of the players, with at least the
            columns "id" and "shirt_num"

    Returns:
        TackleEvent: TackleEvent object
    """
    side = "home" if row.team_id == home_team_id else "away"
    # shirt number of the first player entry with the id of the tackler
    shirt_numbers = players.loc[players["id"] == row.player_id, "shirt_num"]
    shirt_number = shirt_numbers.iloc[0]
    tackle_won = bool(row.is_successful)

    return TackleEvent(
        event_id=row.event_id,
        period_id=row.period_id,
        minutes=row.minutes,
        seconds=row.seconds,
        datetime=row.datetime,
        start_x=row.start_x,
        start_y=row.start_y,
        team_id=row.team_id,
        team_side=side,
        pitch_size=pitch_dimensions,
        player_id=row.player_id,
        jersey=shirt_number,
        outcome=tackle_won,
        related_event_id=MISSING_INT,
    )