Source code for databallpy.data_parsers.event_data_parsers.sportec_parser

import os
import xml.etree.ElementTree as ET
from pathlib import Path

import chardet
import numpy as np
import pandas as pd
import requests

from databallpy.data_parsers.metadata import Metadata
from databallpy.data_parsers.sportec_metadata_parser import (
    _get_sportec_metadata,
    _get_sportec_open_data_url,
)
from databallpy.events import DribbleEvent, PassEvent, ShotEvent
from databallpy.utils.constants import MISSING_INT
from databallpy.utils.logging import logging_wrapper

# Sportec set-piece tag -> databallpy set-piece label. In this module it is
# looked up with the tag of an Event's first child; tags that are not listed
# fall back to "no_set_piece" at the call site.
SPORTEC_SET_PIECES_MAP = {
    "ThrowIn": "throw_in",
    "GoalKick": "goal_kick",
    "FreeKick": "free_kick",
    "Penalty": "penalty",
    "CornerKick": "corner_kick",
    "KickOff": "kick_off",
}
# Sportec on-ball tag -> databallpy event type. Within this module only the
# keys are used (they feed ALL_SPORTEC_EVENTS below).
SPORTEC_ON_BALL_EVENTS_MAP = {
    "ShotAtGoal": "shot",
    "Play": "pass",
    "Pass": "pass",
    "Cross": "pass",
}
# Tag of the first child of a ShotAtGoal element -> databallpy shot outcome
# string. Indexed directly (no default), so an unlisted tag raises KeyError.
SPORTEC_SHOT_OUTCOMES = {
    "SavedShot": "miss_on_target",
    "BlockedShot": "blocked",
    "SuccessfulShot": "goal",
    "ShotWide": "miss_off_target",
    "ShotWoodWork": "miss_hit_post",
    "OtherShot": "unspecified",
}
# Value of the "TypeOfShot" attribute -> databallpy body part. Unlisted
# values fall back to "unspecified" at the call site.
SPORTEC_BODY_PARTS = {
    "head": "head",
    "leftLeg": "left_foot",
    "rightLeg": "right_foot",
}
# Value of the "AssistAction" attribute of a shot -> databallpy possession
# type. Unlisted values fall back to "unspecified" at the call site.
SPORTEC_ASSISTS = {
    "freeKick": "free_kick",
    "shot": "rebound",
    "header": "open_play",
    "otherPassFromOpenPlay": "open_play",
    "throwIn": "throw_in",
    "cornerKick": "corner_kick",
    "longPassFromOpenPlay": "open_play",
    "crossFromOpenPlay": "open_play",
}

# Tags that are recognised as the "main" action of an Event but that do not
# map one-to-one onto a databallpy event type.
SPORTEC_UNCLEAR_EVENTS = [
    "TacklingGame",
    "OtherBallAction",
]
# Every tag that is accepted when searching the descendants of an Event for
# its main action element.
ALL_SPORTEC_EVENTS = SPORTEC_UNCLEAR_EVENTS + list(SPORTEC_ON_BALL_EVENTS_MAP.keys())


@logging_wrapper(__file__)
def load_sportec_event_data(
    event_data_loc: str, metadata_loc: str
) -> tuple[pd.DataFrame, Metadata, dict[str, dict]]:
    """Load Sportec/DFL event data together with its metadata.

    The period start/end datetimes in the metadata are localised to
    "Europe/Berlin" and then overwritten for the first two periods with the
    datetimes of the first and last event of each half.

    Args:
        event_data_loc (str): location of the event data xml
        metadata_loc (str): location of the metadata xml

    Raises:
        FileNotFoundError: propagated from the underlying readers when the
            event data (and presumably the metadata) location does not exist.

    Returns:
        tuple[pd.DataFrame, Metadata, dict[str, dict]]: The event data, the
        event metadata, and the databallpy events dictionary.
    """
    metadata = _get_sportec_metadata(metadata_loc, only_event_data=True)
    event_data, databallpy_events = _get_sportec_event_data(event_data_loc, metadata)

    periods = metadata.periods_frames
    for column in ("start_datetime_ed", "end_datetime_ed"):
        periods[column] = pd.to_datetime(periods[column]).dt.tz_localize(
            "Europe/Berlin"
        )

    # The event stream is the source of truth for when each half started/ended.
    event_times = event_data["datetime"]
    first_half_times = event_data.loc[event_data["period_id"] == 1, "datetime"]
    second_half_times = event_data.loc[event_data["period_id"] == 2, "datetime"]

    periods.loc[0, "start_datetime_ed"] = event_times.iloc[0]
    periods.loc[0, "end_datetime_ed"] = first_half_times.iloc[-1]
    periods.loc[1, "end_datetime_ed"] = event_times.iloc[-1]
    periods.loc[1, "start_datetime_ed"] = second_half_times.iloc[0]

    return event_data, metadata, databallpy_events


def _download_sportec_file(url: str, target: Path, timeout: float = 30.0) -> None:
    """Download ``url`` and store the response body at ``target``.

    Args:
        url (str): the url to download
        target (Path): the file to write the downloaded bytes to
        timeout (float, optional): connect/read timeout in seconds passed to
            ``requests.get``. Defaults to 30.0.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Check the status *before* creating the file: an error page written to the
    # cache would otherwise be picked up as valid data on every later call.
    response.raise_for_status()
    with open(target, "wb") as f:
        f.write(response.content)


@logging_wrapper(__file__)
def load_sportec_open_event_data(
    game_id: str, cache_path: Path
) -> tuple[pd.DataFrame, Metadata, dict[str, dict]]:
    """Function to (down)load an open game from Sportec/Tracab.

    The metadata and event data xml files are downloaded into ``cache_path``
    unless they are already present there, and are then parsed with
    ``load_sportec_event_data``.

    Args:
        game_id (str): The id of the open game
        cache_path (Path): path to cache files. A plain string is accepted too.

    Raises:
        requests.HTTPError: if one of the files cannot be downloaded. Nothing
            is written to the cache in that case.

    Returns:
        tuple[pd.DataFrame, Metadata, dict[str, dict]]: The event data, the
        event metadata, and the databallpy events dictionary.

    Reference:
        Bassek, M., Weber, H., Rein, R., & Memmert, D. (2024). An integrated
        dataset of synchronized spatiotemporal and event data in elite soccer.
    """
    # Resolved up front (as before) so an unknown game_id is reported even
    # when the files are already cached.
    metadata_url = _get_sportec_open_data_url(game_id, "metadata")

    cache_path = Path(cache_path)
    os.makedirs(cache_path, exist_ok=True)

    metadata_file = cache_path / "metadata.xml"
    event_data_file = cache_path / "event_data.xml"

    if not metadata_file.is_file():
        _download_sportec_file(metadata_url, metadata_file)
    if not event_data_file.is_file():
        _download_sportec_file(
            _get_sportec_open_data_url(game_id, "event_data"), event_data_file
        )

    return load_sportec_event_data(
        os.path.join(cache_path, "event_data.xml"),
        os.path.join(cache_path, "metadata.xml"),
    )


@logging_wrapper(__file__)
def _get_sportec_event_data(
    event_data_loc: str, metadata: Metadata
) -> tuple[pd.DataFrame, dict[str, dict]]:
    """Function to get the event data and the databallpy events for
    sportec/DFL based data.

    Side effect: ``metadata.periods_changed_playing_direction`` is set to a
    one-element list holding the period whose coordinates are mirrored.

    Args:
        event_data_loc (str): location of the event data xml file
        metadata (Metadata): Metadata object of the game

    Returns:
        tuple[pd.DataFrame, dict[str, dict]]: the event data and the databallpy
        events
    """
    with open(event_data_loc, "rb") as f:
        encoding = chardet.detect(f.read())["encoding"]
    with open(event_data_loc, "r", encoding=encoding) as file:
        lines = file.read()
    root = ET.fromstring(lines)
    # Only events that carry a position are parsed.
    all_events = root.findall(".//Event[@X-Position]")
    n_events = len(all_events)

    result_dict = {
        "event_id": [MISSING_INT] * n_events,
        "databallpy_event": [None] * n_events,
        "period_id": [MISSING_INT] * n_events,
        "minutes": [MISSING_INT] * n_events,
        "seconds": [np.nan] * n_events,
        "player_id": [None] * n_events,
        "team_id": [None] * n_events,
        "is_successful": [None] * n_events,
        "start_x": [np.nan] * n_events,
        "start_y": [np.nan] * n_events,
        "datetime": ["NaT"] * n_events,
        "original_event_id": [MISSING_INT] * n_events,
        "original_event": [None] * n_events,
    }

    pitch_center, period_start_times, swap_half = _initialize_search_variables(
        root, metadata.home_team_id
    )
    metadata.periods_changed_playing_direction = [swap_half]

    databallpy_events = {
        "pass_events": {},
        "shot_events": {},
        "dribble_events": {},
        "other_events": {},
    }

    for idx, event_element in enumerate(all_events):
        kwargs = {}
        kwargs["set_piece"] = SPORTEC_SET_PIECES_MAP.get(
            next(iter(event_element)).tag, "no_set_piece"
        )
        kwargs["datetime"] = pd.to_datetime(event_element.get("EventTime")).tz_convert(
            "Europe/Berlin"
        )
        # 0 for the first half, 1 once the second-half kick-off has happened.
        dt_idx = int(kwargs["datetime"] >= period_start_times[1])
        kwargs["period_id"] = dt_idx + 1
        time_diff_s = (
            kwargs["datetime"].timestamp() - period_start_times[dt_idx].timestamp()
        )
        kwargs["minutes"] = int((45 * dt_idx) + time_diff_s // 60)
        kwargs["seconds"] = time_diff_s % 60
        kwargs["event_id"] = idx
        kwargs["original_event_id"] = int(event_element.get("EventId"))
        # Coordinates are expressed relative to the pitch center ...
        kwargs["start_x"] = float(event_element.get("X-Position")) - pitch_center[0]
        kwargs["start_y"] = float(event_element.get("Y-Position")) - pitch_center[1]
        # ... and mirrored in the period where the playing direction is swapped.
        if kwargs["period_id"] == swap_half:
            kwargs["start_x"] *= -1
            kwargs["start_y"] *= -1

        # The main action is the first descendant with a known tag; when there
        # is none, fall back to the first child of the Event element.
        action = next(
            (
                e
                for e in event_element.iter()
                if e is not event_element and e.tag in ALL_SPORTEC_EVENTS
            ),
            next(iter(event_element)),
        )
        kwargs["original_event"] = action.tag
        kwargs["player_id"] = action.get("Player", action.get("Winner"))
        kwargs["team_id"] = action.get("Team", action.get("WinnerTeam"))

        if action.tag == "ShotAtGoal":
            kwargs, shot_event = _handle_shot_event(action, metadata, kwargs)
            databallpy_events["shot_events"][shot_event.event_id] = shot_event
        elif action.tag == "Play":
            kwargs, pass_event = _handle_play_event(action, metadata, kwargs)
            databallpy_events["pass_events"][pass_event.event_id] = pass_event
        elif action.tag == "TacklingGame":
            kwargs, dbp_event = _handle_tackling_game_event(action, metadata, kwargs)
            if isinstance(dbp_event, DribbleEvent):
                databallpy_events["dribble_events"][dbp_event.event_id] = dbp_event

        if "outcome" in kwargs:
            kwargs["is_successful"] = kwargs.pop("outcome")

        # Only the keys that are columns of the event data frame are stored.
        for key, column in result_dict.items():
            if key in kwargs:
                column[idx] = kwargs[key]

    event_data = pd.DataFrame(result_dict)
    event_data = event_data.sort_values("datetime").reset_index(drop=True)

    all_players = pd.concat([metadata.home_players, metadata.away_players])[
        ["full_name", "id"]
    ]
    has_player = event_data["player_id"].notna()
    # Start from empty names: events without a player must not get a name.
    # (Previously the row index was used as data, which yielded "0", "1", ...)
    player_name_series = pd.Series(
        [None] * len(event_data), index=event_data.index, dtype=object
    )
    player_name_series[has_player] = event_data.loc[has_player, "player_id"].apply(
        lambda x: all_players.loc[all_players["id"] == x, "full_name"].iloc[0]
    )
    event_data.insert(6, "player_name", player_name_series)
    event_data["is_successful"] = event_data["is_successful"].astype("boolean")

    return event_data, databallpy_events


def _find_kick_off_event(root: ET.Element, game_section: str) -> ET.Element | None:
    """Return the first Event that has a KickOff child for ``game_section``."""
    return next(
        (
            e
            for e in root.iter("Event")
            if e.find(f'KickOff[@GameSection="{game_section}"]') is not None
        ),
        None,
    )


def _initialize_search_variables(
    root: ET.Element, home_team_id: str
) -> tuple[list[float], pd.DatetimeIndex, int]:
    """Function to get the base variables for the event data. The function
    calculates the center of the pitch, the start datetimes of the first and
    second half (needed later for calculating the minutes/seconds and period of
    the game), and the period in which the coordinates have to be mirrored.

    Args:
        root (ET.Element): The root element of the event data XML
        home_team_id (str): The id of the home team

    Raises:
        ValueError: if the kick-off event of the first or second half is not
            present in the event data.

    Returns:
        tuple[list[float], pd.DatetimeIndex, int]: the x,y location of the
        center of the pitch (the position of the first-half kick-off), the
        start datetimes of the first and second half, and the period (1 or 2)
        in which the coordinates are mirrored.
    """
    if not isinstance(root, ET.Element):
        root = ET.fromstring(str(root))

    first_half_kick_off = _find_kick_off_event(root, "firstHalf")
    second_half_kick_off = _find_kick_off_event(root, "secondHalf")
    if first_half_kick_off is None or second_half_kick_off is None:
        missing = "firstHalf" if first_half_kick_off is None else "secondHalf"
        raise ValueError(
            f"Could not find the KickOff event of the {missing} in the sportec "
            "event data, which is needed to determine the pitch center and the "
            "period start times."
        )

    pitch_center = [
        float(first_half_kick_off.get("X-Position")),
        float(first_half_kick_off.get("Y-Position")),
    ]
    period_start_times = pd.to_datetime(
        [first_half_kick_off.get("EventTime"), second_half_kick_off.get("EventTime")]
    )
    # Mirror the half in which the home team is registered as "TeamRight" at
    # the first-half kick-off; otherwise mirror the second half.
    swap_period = (
        1 if first_half_kick_off.find("KickOff").get("TeamRight") == home_team_id else 2
    )
    return pitch_center, period_start_times, swap_period


def _handle_tackling_game_event(
    event: ET.Element, metadata: Metadata, kwargs_dict: dict
) -> tuple[dict, DribbleEvent | None]:
    """Function to handle tackling game events. Only dribbles are now considered
    since it is not clear when a tackle was performed.

    Args:
        event (ET.Element): The TacklingGame event
        metadata (Metadata): The metadata of the event data
        kwargs_dict (dict): The kwargs for event_data and databallpy events.
            Updated in place.

    Returns:
        tuple[dict, DribbleEvent | None]: The updated kwargs for the event data,
        and the dribble event or None if it was not a dribble event
    """
    if not isinstance(event, ET.Element):
        event = ET.fromstring(str(event))

    kwargs_dict["original_event"] = event.get("WinnerResult", event.tag)
    if event.get("WinnerResult") != "dribbledAround":
        return kwargs_dict, None

    kwargs_dict = _get_base_on_ball_event_kwargs(metadata, kwargs_dict)
    kwargs_dict["outcome"] = event.get("DribbleEvaluation") == "successful"
    kwargs_dict["body_part"] = "foot"
    kwargs_dict["possession_type"] = "open_play"
    kwargs_dict["duel_type"] = "unspecified"
    kwargs_dict["with_opponent"] = True
    kwargs_dict["related_event_id"] = None
    kwargs_dict["databallpy_event"] = "dribble"

    # These keys only belong in the event data frame, not in the event object.
    temp_exclude = ["original_event", "original_event_id", "databallpy_event"]
    dribble_event = DribbleEvent(
        **{k: v for k, v in kwargs_dict.items() if k not in temp_exclude} | {"_xt": -1}
    )
    return kwargs_dict, dribble_event


def _handle_shot_event(
    event: ET.Element, metadata: Metadata, kwargs_dict: dict
) -> tuple[dict, ShotEvent]:
    """Function to handle ShotAtGoal events from sportec

    Args:
        event (ET.Element): The ShotAtGoal event
        metadata (Metadata): The metadata of the event data
        kwargs_dict (dict): The kwargs for event_data and databallpy events.
            Updated in place.

    Raises:
        KeyError: if the first child of the event is not a known shot outcome.

    Returns:
        tuple[dict, ShotEvent]: The updated kwargs for the event data, and the
        databallpy shot event
    """
    if not isinstance(event, ET.Element):
        event = ET.fromstring(str(event))

    kwargs_dict = _get_base_on_ball_event_kwargs(metadata, kwargs_dict)
    # The first child of a ShotAtGoal element encodes the outcome of the shot.
    _first_child_tag = next(iter(event)).tag
    kwargs_dict["original_event"] = _first_child_tag
    kwargs_dict["databallpy_event"] = "shot"
    kwargs_dict["related_event_id"] = None
    kwargs_dict["body_part"] = SPORTEC_BODY_PARTS.get(
        event.get("TypeOfShot"), "unspecified"
    )
    kwargs_dict["possession_type"] = SPORTEC_ASSISTS.get(
        event.get("AssistAction"), "unspecified"
    )
    kwargs_dict["outcome"] = SPORTEC_SHOT_OUTCOMES[_first_child_tag] == "goal"
    kwargs_dict["outcome_str"] = SPORTEC_SHOT_OUTCOMES[_first_child_tag]

    # These keys only belong in the event data frame, not in the event object.
    temp_exclude = ["original_event", "original_event_id", "databallpy_event"]
    shot_event = ShotEvent(
        **{k: v for k, v in kwargs_dict.items() if k not in temp_exclude} | {"_xt": -1}
    )
    return kwargs_dict, shot_event


def _handle_play_event(
    event: ET.Element, metadata: Metadata, kwargs_dict: dict
) -> tuple[dict, PassEvent]:
    """Function to handle Play events from sportec

    Args:
        event (ET.Element): The Play event
        metadata (Metadata): The metadata of the event data
        kwargs_dict (dict): The kwargs for event_data and databallpy events.
            Updated in place.

    Returns:
        tuple[dict, PassEvent]: The updated kwargs for the event data, and the
        databallpy pass event
    """
    if not isinstance(event, ET.Element):
        event = ET.fromstring(str(event))

    kwargs_dict = _get_base_on_ball_event_kwargs(metadata, kwargs_dict)
    _first_child_tag = next(iter(event)).tag
    kwargs_dict["original_event"] = _first_child_tag
    kwargs_dict["databallpy_event"] = "pass"
    kwargs_dict["outcome"] = event.get("Evaluation") == "successfullyCompleted"
    kwargs_dict["related_event_id"] = None
    kwargs_dict["body_part"] = "unspecified"
    kwargs_dict["possession_type"] = (
        "open_play" if event.get("FromOpenPlay") == "true" else "unspecified"
    )
    kwargs_dict["outcome_str"] = "unspecified"
    # No end location is read from the Play element, so it is left as missing.
    kwargs_dict["end_x"] = np.nan
    kwargs_dict["end_y"] = np.nan
    kwargs_dict["pass_type"] = "cross" if _first_child_tag == "Cross" else "unspecified"
    kwargs_dict["receiver_player_id"] = event.get("Recipient", None)

    # These keys only belong in the event data frame, not in the event object.
    temp_exclude = ["original_event", "original_event_id", "databallpy_event"]
    pass_event = PassEvent(
        **{k: v for k, v in kwargs_dict.items() if k not in temp_exclude} | {"_xt": -1}
    )
    return kwargs_dict, pass_event


def _get_base_on_ball_event_kwargs(metadata: Metadata, kwargs_dict: dict) -> dict:
    """Function to get the base on ball event info: "team_side", "pitch_size",
    and "jersey"

    Args:
        metadata (Metadata): metadata of the event data
        kwargs_dict (dict): The kwargs for event_data and databallpy events.
            Needs the keys "team_id" and "player_id"; updated in place.

    Raises:
        IndexError: if the player id is not found in the players of the team.

    Returns:
        dict: The updated kwargs for event_data and databallpy events
    """
    kwargs_dict["team_side"] = (
        "home" if kwargs_dict["team_id"] == metadata.home_team_id else "away"
    )
    kwargs_dict["pitch_size"] = metadata.pitch_dimensions
    players = (
        metadata.home_players
        if kwargs_dict["team_side"] == "home"
        else metadata.away_players
    )
    kwargs_dict["jersey"] = players.loc[
        players["id"] == kwargs_dict["player_id"], "shirt_num"
    ].iloc[0]
    return kwargs_dict