Source code for databallpy.data_parsers.event_data_parsers.statsbomb_parser

import json
import os

import numpy as np
import pandas as pd

from databallpy.data_parsers.metadata import Metadata
from databallpy.events import DribbleEvent, PassEvent, ShotEvent
from databallpy.utils.constants import MISSING_INT
from databallpy.utils.logging import create_logger

LOGGER = create_logger(__name__)

# Lookup from the StatsBomb "type" name of a shot or pass (event["shot"]["type"]["name"]
# / event["pass"]["type"]["name"]) to the value stored in the `possession_type` field of
# the databallpy events. Types without a dedicated value are mapped to "unspecified".
POSSESSION_TYPE_MAPPING = {
    "Corner": "corner_kick",
    "Free Kick": "free_kick",
    "Open Play": "open_play",
    "Penalty": "penalty",
    "Kick Off": "kick_off",
    "Recovery": "unspecified",
    "Throw-in": "throw_in",
    "Interception": "unspecified",
    "Goal Kick": "goal_kick",
}

# Lookup from the same StatsBomb "type" names to the value stored in the `set_piece`
# field of the databallpy events. Types that are not a set piece ("Open Play",
# "Recovery", "Interception") are mapped to "no_set_piece".
SET_PIECE_TYPE_MAPPING = {
    "Corner": "corner_kick",
    "Free Kick": "free_kick",
    "Open Play": "no_set_piece",
    "Penalty": "penalty",
    "Kick Off": "kick_off",
    "Throw-in": "throw_in",
    "Goal Kick": "goal_kick",
    "Recovery": "no_set_piece",
    "Interception": "no_set_piece",
}

# Lookup from the StatsBomb body part name (event[...]["body_part"]["name"]) to the
# value stored in the `body_part` field of the databallpy events. Everything that is
# not the head or a foot is collapsed into "other".
BODY_PART_MAPPING = {
    "Head": "head",
    "Left Foot": "left_foot",
    "Right Foot": "right_foot",
    "Keeper Arm": "other",
    "Drop Kick": "other",
    "No Touch": "other",
    "Other": "other",
}


def load_statsbomb_event_data(
    events_loc: str,
    match_loc: str,
    lineup_loc: str,
    pitch_dimensions: tuple = (105.0, 68.0),
) -> tuple[pd.DataFrame, Metadata, dict]:
    """Load the metadata and the event data of a single StatsBomb game.

    The StatsBomb coordinates (given on a 120 x 80 grid) are rescaled to
    `pitch_dimensions` and shifted so that (0, 0) is the center of the pitch.
    The coordinates are furthermore mirrored per team so that the home team is
    represented as playing from left to right for the entire game, and the away
    team as playing from right to left.

    Args:
        events_loc (str): location of the events.json file.
        match_loc (str): location of the match.json file.
        lineup_loc (str): location of the lineup.json file.
        pitch_dimensions (tuple, optional): the length and width of the pitch
            that the StatsBomb 120 x 80 grid is rescaled to.
            Defaults to (105.0, 68.0).

    Raises:
        ValueError: if pitch_dimensions is not a tuple or list of length 2.

    Returns:
        Tuple[pd.DataFrame, Metadata, dict]: the event data of the game, the
        metadata, and the databallpy_events.
    """
    LOGGER.info(f"Loading Statsbomb event data: events_loc: {events_loc}")

    # Validate the three file locations in the order they are passed in.
    locations = (
        ("events_loc", events_loc),
        ("match_loc", match_loc),
        ("lineup_loc", lineup_loc),
    )
    for str_type, loc in locations:
        _check_input_values(loc=loc, str_type=str_type)

    has_valid_dimensions = (
        isinstance(pitch_dimensions, (tuple, list)) and len(pitch_dimensions) == 2
    )
    if not has_valid_dimensions:
        message = (
            f"Invalid pitch_dimensions: {pitch_dimensions}. Must be a tuple of length 2."
        )
        LOGGER.error(message)
        raise ValueError(message)

    # The metadata is needed first: the event parsing reads the team ids and
    # period start times from it and fills in the formations.
    initial_metadata = _load_metadata(match_loc, lineup_loc, pitch_dimensions)
    LOGGER.info("Successfully loaded Statsbomb metadata.")

    event_data, databallpy_events, metadata = _load_event_data(
        events_loc, initial_metadata, pitch_dimensions
    )
    LOGGER.info("Successfully loaded Statsbomb event data and databallpy events.")

    return event_data, metadata, databallpy_events
def _check_input_values(loc: str, str_type: str) -> None:
    """Validate one of the file locations passed to load_statsbomb_event_data.

    Args:
        loc (str): location of the file.
        str_type (str): name of the argument that is checked, used in the
            error messages. Should be one of: events_loc, match_loc or
            lineup_loc.

    Raises:
        TypeError: if loc is not a string.
        ValueError: if loc does not end with ".json".
        FileNotFoundError: if loc does not exist.

    Returns:
        None
    """
    if not isinstance(loc, str):
        message = f"{str_type} should be a string, not a {type(loc)}"
        LOGGER.error(message)
        raise TypeError(message)
    if not loc.endswith(".json"):
        message = f"{str_type} should by of .json format, not {loc.split('.')[-1]}"
        LOGGER.error(message)
        raise ValueError(message)
    if not os.path.exists(loc):
        message = f"File {loc} does not exist."
        LOGGER.error(message)
        raise FileNotFoundError(message)


def _load_metadata(match_loc: str, lineup_loc: str, pitch_dimensions: tuple) -> Metadata:
    """Load the metadata from the match.json and lineup.json files.

    Args:
        match_loc (str): location of the match.json file. The file is expected
            to hold a list of which the first item describes the game.
        lineup_loc (str): location of the lineup.json file. The file is
            expected to hold one entry per team.
        pitch_dimensions (tuple): the length and width of the pitch, stored
            as-is in the metadata.

    Returns:
        Metadata: all metadata information of the current game. The formations
        are left empty here; they are filled in by _load_event_data.
    """
    with open(match_loc, "r", encoding="utf-8") as f:
        match_json = json.load(f)[0]
    with open(lineup_loc, "r", encoding="utf-8") as f:
        lineup_json = json.load(f)

    # The lineup file has one entry per team; find out which one is the home team.
    home_team_id = match_json["home_team"]["home_team_id"]
    home_index = 0 if lineup_json[0]["team_id"] == home_team_id else 1
    away_index = 1 - home_index

    home_players = _get_player_info(lineup_json[home_index]["lineup"])
    away_players = _get_player_info(lineup_json[away_index]["lineup"])

    game_start = pd.to_datetime(
        match_json["match_date"] + " " + match_json["kick_off"], utc=True
    )
    # Only the kick-off time is read from the match file, so the period
    # boundaries are nominal values: first half from kick-off to +45 minutes,
    # second half from +60 to +105 minutes. Periods 3-5 are left as NaT.
    periods = {
        "period_id": [1, 2, 3, 4, 5],
        "start_datetime_ed": [
            game_start,
            game_start + pd.to_timedelta(60, unit="minutes"),
            pd.NaT,
            pd.NaT,
            pd.NaT,
        ],
        "end_datetime_ed": [
            game_start + pd.to_timedelta(45, unit="minutes"),
            game_start + pd.to_timedelta(105, unit="minutes"),
            pd.NaT,
            pd.NaT,
            pd.NaT,
        ],
    }

    metadata = Metadata(
        game_id=match_json["match_id"],
        pitch_dimensions=pitch_dimensions,
        periods_frames=pd.DataFrame(periods),
        frame_rate=MISSING_INT,
        home_team_id=home_team_id,
        home_team_name=match_json["home_team"]["home_team_name"],
        home_players=home_players,
        home_score=match_json["home_score"],
        home_formation="",
        away_team_id=match_json["away_team"]["away_team_id"],
        away_team_name=match_json["away_team"]["away_team_name"],
        away_players=away_players,
        away_score=match_json["away_score"],
        away_formation="",
        country=match_json["competition"]["country_name"],
    )
    return metadata


def _get_player_info(players_data: list) -> pd.DataFrame:
    """Collect the information of all players of one team in a pd.DataFrame.

    Args:
        players_data (list): for every player a dictionary with info about
            the player.

    Returns:
        pd.DataFrame: one row per player with the columns id, full_name,
        formation_place, position, starter and shirt_num. Players without any
        entry in "positions" keep the defaults (MISSING_INT formation place,
        "unspecified" position, not a starter).
    """
    n = len(players_data)
    result_dict = {
        "id": [MISSING_INT] * n,
        "full_name": [""] * n,
        "formation_place": [MISSING_INT] * n,
        "position": ["unspecified"] * n,
        "starter": [False] * n,
        "shirt_num": [MISSING_INT] * n,
    }

    # Group the position ids into the four position names used in this file.
    positions = {
        "goalkeeper": [1],
        "defender": [2, 3, 4, 5, 6, 7, 8, 9],
        "midfielder": [10, 11, 12, 13, 14, 15, 16, 18, 19, 20],
        "forward": [17, 21, 22, 23, 24, 25],
    }
    position_id_map = {i: position for position, ids in positions.items() for i in ids}

    for idx, player in enumerate(players_data):
        if player["positions"]:
            first_position = player["positions"][0]
            # A first position that starts at "00:00" marks the player as a starter.
            if first_position["from"] == "00:00":
                result_dict["starter"][idx] = True
            result_dict["formation_place"][idx] = first_position["position_id"]
            result_dict["position"][idx] = position_id_map[first_position["position_id"]]
        result_dict["id"][idx] = player["player_id"]
        result_dict["full_name"][idx] = player["player_name"]
        result_dict["shirt_num"][idx] = player["jersey_number"]

    return pd.DataFrame(result_dict)


def _load_event_data(
    events_loc: str, metadata: Metadata, pitch_dimensions: tuple
) -> tuple[pd.DataFrame, dict, Metadata]:
    """Load the event data of a specific game.

    The StatsBomb coordinates (given on a 120 x 80 grid) are rescaled to
    `pitch_dimensions` and shifted so that (0, 0) is the center of the pitch.
    Afterwards the x coordinates of the away team and the y coordinates of the
    home team are mirrored, so that the home team is represented as playing
    from left to right for the entire game, and the away team as playing from
    right to left.

    Args:
        events_loc (str): location of the events.json file
        metadata (Metadata): metadata of the game. The home and away formation
            are updated in place.
        pitch_dimensions (tuple): the length and width of the pitch that the
            coordinates are rescaled to.

    Returns:
        Tuple[pd.DataFrame, dict, Metadata]: the event data of the game, the
        databallpy_events (a dict with the keys "shot_events", "pass_events"
        and "dribble_events"), and the updated metadata
    """
    with open(events_loc, "r", encoding="utf-8") as f:
        events_json = json.load(f)

    # Events of type 35 carry the tactics, including the formation, of a team.
    formations = {
        event["team"]["id"]: str(event["tactics"]["formation"])
        for event in events_json
        if event["type"]["id"] == 35
    }
    # Keep the formation already stored in the metadata when a team has no
    # formation event, instead of failing on the lookup.
    metadata.home_formation = formations.get(
        metadata.home_team_id, metadata.home_formation
    )
    metadata.away_formation = formations.get(
        metadata.away_team_id, metadata.away_formation
    )

    # Administrative event types that do not end up in the event data.
    events_to_exclude = {
        5: "Camera On",
        18: "Half Start",
        19: "Substitution",
        26: "Player On",
        27: "Player Off",
        34: "Half End",
        35: "formations",
        36: "Tactical Shift",
    }
    included_events = [
        event for event in events_json if event["type"]["id"] not in events_to_exclude
    ]
    n = len(included_events)

    event_data = {
        "event_id": list(range(0, n)),
        "databallpy_event": [None] * n,
        "period_id": [MISSING_INT] * n,
        "minutes": [MISSING_INT] * n,
        "seconds": [np.nan] * n,
        "player_id": [MISSING_INT] * n,
        "player_name": [None] * n,
        "team_id": [MISSING_INT] * n,
        "is_successful": [None] * n,
        "start_x": [np.nan] * n,
        "start_y": [np.nan] * n,
        "datetime": [pd.NaT] * n,
        "original_event": [None] * n,
        "original_event_id": [None] * n,
        "original_outcome": [None] * n,
        "team_name": [None] * n,
    }
    databallpy_mapping = {
        "Pass": "pass",
        "Shot": "shot",
        "Dribble": "dribble",
    }
    shot_events = {}
    pass_events = {}
    dribble_events = {}

    # StatsBomb locations are given on a 120 x 80 grid.
    x_multiplier = pitch_dimensions[0] / 120.0
    y_multiplier = pitch_dimensions[1] / 80.0
    half_length = pitch_dimensions[0] / 2
    half_width = pitch_dimensions[1] / 2

    # Arguments shared by the helpers that create the databallpy events.
    helper_kwargs = {
        "pitch_dimensions": pitch_dimensions,
        "periods": metadata.periods_frames,
        "away_team_id": metadata.away_team_id,
        "x_multiplier": x_multiplier,
        "y_multiplier": y_multiplier,
    }

    for idx, event in enumerate(included_events):
        has_player = "player" in event
        has_team = "team" in event
        type_name = event["type"]["name"]

        event_data["original_event_id"][idx] = event["id"]
        event_data["period_id"][idx] = event.get("period", MISSING_INT)
        event_data["minutes"][idx] = event.get("minute", MISSING_INT)
        event_data["seconds"][idx] = float(event.get("second", np.nan))
        event_data["player_id"][idx] = event["player"]["id"] if has_player else MISSING_INT
        event_data["player_name"][idx] = event["player"]["name"] if has_player else None
        event_data["team_id"][idx] = event["team"]["id"] if has_team else MISSING_INT
        event_data["team_name"][idx] = event["team"]["name"] if has_team else None

        if "location" in event:
            event_data["start_x"][idx] = event["location"][0] * x_multiplier - half_length
            event_data["start_y"][idx] = event["location"][1] * y_multiplier - half_width

        # The datetime is the nominal start of the period plus the event clock.
        # NOTE(review): if StatsBomb's `minute` is the cumulative match clock
        # (e.g. 45 at the start of the second half), adding it to the period
        # start double-counts the first half for period 2 - confirm against
        # the StatsBomb event specification.
        event_data["datetime"][idx] = pd.to_datetime(
            metadata.periods_frames["start_datetime_ed"][event["period"] - 1]
            + pd.to_timedelta(event["minute"] * 60 + event["second"], unit="seconds")
        )
        event_data["original_event"][idx] = type_name.lower().replace("*", "")

        # Name of the key that holds the type specific details of the event,
        # e.g. "Ball Receipt*" -> "ball_receipt".
        # NOTE(review): "50/50" becomes "50-50" here; verify that this matches
        # the key used in the StatsBomb json.
        event_type_object = (
            type_name.lower()
            .replace(" ", "_")
            .replace("*", "")
            .replace("-", "_")
            .replace("/", "-")
        )

        databallpy_event = databallpy_mapping.get(type_name)
        if databallpy_event is not None:
            event_data["databallpy_event"][idx] = databallpy_event
            if databallpy_event == "shot":
                shot_events[idx] = _get_shot_event(event=event, id=idx, **helper_kwargs)
                event_data["is_successful"][idx] = (
                    event[event_type_object]["outcome"]["name"] == "Goal"
                )
            elif databallpy_event == "pass":
                pass_events[idx] = _get_pass_event(event=event, id=idx, **helper_kwargs)
                # A pass without an outcome entry counts as successful.
                event_data["is_successful"][idx] = (
                    event[event_type_object].get("outcome") is None
                )
            elif databallpy_event == "dribble":
                dribble_events[idx] = _get_dribble_event(
                    event=event, id=idx, **helper_kwargs
                )
                event_data["is_successful"][idx] = (
                    event[event_type_object]["outcome"]["name"] == "Complete"
                )

        if event_type_object in event and "outcome" in event[event_type_object]:
            event_data["original_outcome"][idx] = event[event_type_object]["outcome"][
                "name"
            ]

    event_data = pd.DataFrame(event_data)
    event_data["is_successful"] = event_data["is_successful"].astype("boolean")

    # Mirror the x axis for the away team and the y axis for the home team.
    event_data.loc[event_data["team_id"] == metadata.away_team_id, ["start_x"]] *= -1
    event_data.loc[event_data["team_id"] == metadata.home_team_id, ["start_y"]] *= -1

    id_jersey_map = (
        pd.concat([metadata.home_players, metadata.away_players])[["id", "shirt_num"]]
        .set_index("id")
        .to_dict()["shirt_num"]
    )

    # Apply the same mirroring to the databallpy events and fill in the jersey
    # numbers, which are not available while the events are created.
    for event in {**shot_events, **pass_events, **dribble_events}.values():
        if event.team_side == "away":
            event.start_x *= -1
            if isinstance(event, PassEvent):
                event.end_x *= -1
        else:
            event.start_y *= -1
            if isinstance(event, PassEvent):
                event.end_y *= -1
        event.jersey = id_jersey_map[event.player_id]

    return (
        event_data,
        {
            "shot_events": shot_events,
            "pass_events": pass_events,
            "dribble_events": dribble_events,
        },
        metadata,
    )


def _get_shot_event(
    event: dict,
    id: int,
    pitch_dimensions: tuple,
    periods: pd.DataFrame,
    away_team_id: int,
    x_multiplier: float,
    y_multiplier: float,
) -> ShotEvent:
    """Create the databallpy ShotEvent for a StatsBomb shot event.

    Args:
        event (dict): the shot event.
        id (int): the id of the event.
        pitch_dimensions (tuple): pitch dimensions in x and y direction.
        periods (pd.Dataframe): metadata.periods_frames dataframe
        away_team_id (int): id of away team
        x_multiplier (float): The value to multiply the x locations with to
            rescale them to the pitch dimensions. E.g. 105/120 = 0.875.
        y_multiplier (float): The value to multiply the y locations with to
            rescale them to the pitch dimensions. E.g. 68/80 = 0.85.

    Returns:
        ShotEvent: the shot event
    """
    shot_outcome_mapping = {
        "Blocked": "blocked",
        "Goal": "goal",
        "Off T": "miss_off_target",
        "Post": "miss_hit_post",
        "Saved": "miss",
        "Wayward": "miss",
        "Saved Off T": "miss",
        "Saved To Post": "miss_hit_post",
    }
    shot_info = event["shot"]
    outcome_name = shot_info["outcome"]["name"]
    shot_type_name = shot_info["type"]["name"]

    close_to_ball_event_info = _get_close_to_ball_event_info(
        event, id, pitch_dimensions, away_team_id, periods, x_multiplier, y_multiplier
    )

    return ShotEvent(
        **close_to_ball_event_info,
        # Not every shot has related events; fall back to None like the pass
        # and dribble events do instead of failing on the lookup.
        related_event_id=event.get("related_events"),
        outcome=outcome_name == "Goal",
        body_part=BODY_PART_MAPPING[shot_info["body_part"]["name"]],
        possession_type=POSSESSION_TYPE_MAPPING[shot_type_name],
        set_piece=SET_PIECE_TYPE_MAPPING[shot_type_name],
        _xt=-1.0,
        outcome_str=shot_outcome_mapping[outcome_name],
    )


def _get_pass_event(
    event: dict,
    id: int,
    pitch_dimensions: tuple,
    periods: pd.DataFrame,
    away_team_id: int,
    x_multiplier: float,
    y_multiplier: float,
) -> PassEvent:
    """Create the databallpy PassEvent for a StatsBomb pass event.

    Args:
        event (dict): the pass event.
        id (int): the id of the event.
        pitch_dimensions (tuple): pitch dimensions in x and y direction.
        periods (pd.Dataframe): metadata.periods_frames dataframe
        away_team_id (int): id of away team
        x_multiplier (float): The value to multiply the x locations with to
            rescale them to the pitch dimensions. E.g. 105/120 = 0.875.
        y_multiplier (float): The value to multiply the y locations with to
            rescale them to the pitch dimensions. E.g. 68/80 = 0.85.

    Returns:
        PassEvent: the pass event
    """
    pass_type_mapping = {
        "Inswinging": "unspecified",
        "Outswinging": "unspecified",
        "Straight": "unspecified",
        "Through Ball": "through_ball",
    }
    pass_info = event["pass"]

    # All of these details are optional in the pass event; start from the
    # defaults and overwrite what is available.
    pass_type = "unspecified"
    possession_type = "unspecified"
    set_piece = "unspecified"
    body_part = "unspecified"
    receiver_player_id = MISSING_INT

    if "technique" in pass_info:
        pass_type = pass_type_mapping[pass_info["technique"]["name"]]
    if "type" in pass_info:
        possession_type = POSSESSION_TYPE_MAPPING[pass_info["type"]["name"]]
        set_piece = SET_PIECE_TYPE_MAPPING[pass_info["type"]["name"]]
    if "body_part" in pass_info:
        body_part = BODY_PART_MAPPING[pass_info["body_part"]["name"]]
    if "recipient" in pass_info:
        receiver_player_id = pass_info["recipient"]["id"]

    close_to_ball_event_info = _get_close_to_ball_event_info(
        event, id, pitch_dimensions, away_team_id, periods, x_multiplier, y_multiplier
    )

    return PassEvent(
        **close_to_ball_event_info,
        related_event_id=event.get("related_events"),
        end_x=pass_info["end_location"][0] * x_multiplier - (pitch_dimensions[0] / 2),
        end_y=pass_info["end_location"][1] * y_multiplier - (pitch_dimensions[1] / 2),
        # Same rule as the `is_successful` column in _load_event_data: a pass
        # without an outcome entry counts as successful.
        outcome=pass_info.get("outcome") is None,
        body_part=body_part,
        possession_type=possession_type,
        set_piece=set_piece,
        receiver_player_id=receiver_player_id,
        _xt=-1.0,
        outcome_str="unspecified",
        pass_type=pass_type,
    )


def _get_dribble_event(
    event: dict,
    id: int,
    pitch_dimensions: tuple,
    periods: pd.DataFrame,
    away_team_id: int,
    x_multiplier: float,
    y_multiplier: float,
) -> DribbleEvent:
    """Create the databallpy DribbleEvent for a StatsBomb dribble event.

    Args:
        event (dict): the dribble event.
        id (int): the id of the event.
        pitch_dimensions (tuple): pitch dimensions in x and y direction.
        periods (pd.Dataframe): metadata.periods_frames dataframe
        away_team_id (int): id of away team
        x_multiplier (float): The value to multiply the x locations with to
            rescale them to the pitch dimensions. E.g. 105/120 = 0.875.
        y_multiplier (float): The value to multiply the y locations with to
            rescale them to the pitch dimensions. E.g. 68/80 = 0.85.

    Returns:
        DribbleEvent: the dribble event
    """
    close_to_ball_event_info = _get_close_to_ball_event_info(
        event, id, pitch_dimensions, away_team_id, periods, x_multiplier, y_multiplier
    )

    return DribbleEvent(
        **close_to_ball_event_info,
        related_event_id=event.get("related_events"),
        outcome=event["dribble"]["outcome"]["name"] == "Complete",
        _xt=-1.0,
        body_part="unspecified",
        possession_type="unspecified",
        set_piece="unspecified",
        duel_type="unspecified",
        with_opponent=None,
    )


def _get_close_to_ball_event_info(
    event: dict,
    id: int,
    pitch_dimensions: tuple,
    away_team_id: int,
    periods: pd.DataFrame,
    x_multiplier: float,
    y_multiplier: float,
) -> dict:
    """Collect the fields that the shot, pass and dribble events have in common.

    The result is unpacked as keyword arguments into ShotEvent, PassEvent and
    DribbleEvent.

    Args:
        event (dict): event
        id (int): index
        pitch_dimensions (tuple): pitch dimensions in x and y direction.
        away_team_id (int): id of the away team
        periods (pd.DataFrame): metadata.periods_frames dataframe
        x_multiplier (float): The value to multiply the x locations with to
            rescale them to the pitch dimensions. E.g. 105/120 = 0.875.
        y_multiplier (float): The value to multiply the y locations with to
            rescale them to the pitch dimensions. E.g. 68/80 = 0.85.

    Returns:
        dict: dictionary with the base event data: start_x, start_y, event_id,
        period_id, minutes, seconds, datetime, team_id, team_side, pitch_size,
        player_id and jersey. The jersey is set to MISSING_INT here.
    """
    team_id = event["team"]["id"]
    # Nominal start of the period plus the event clock, the same computation
    # as for the `datetime` column in _load_event_data.
    event_datetime = pd.to_datetime(
        periods["start_datetime_ed"][event["period"] - 1]
        + pd.to_timedelta(event["minute"] * 60 + event["second"], unit="seconds")
    )

    return {
        "start_x": event["location"][0] * x_multiplier - (pitch_dimensions[0] / 2),
        "start_y": event["location"][1] * y_multiplier - (pitch_dimensions[1] / 2),
        "event_id": id,
        "period_id": event["period"],
        "minutes": event["minute"],
        "seconds": float(event["second"]),
        "datetime": event_datetime,
        "team_id": team_id,
        "team_side": "away" if team_id == away_team_id else "home",
        "pitch_size": pitch_dimensions,
        "player_id": event["player"]["id"],
        "jersey": MISSING_INT,
    }