# Source code for databallpy.data_parsers.event_data_parsers.instat_parser

import datetime as dt
import json

import chardet
import numpy as np
import pandas as pd

from databallpy.data_parsers import Metadata
from databallpy.utils.constants import DATABALLPY_POSITIONS, MISSING_INT
from databallpy.utils.logging import logging_wrapper
from databallpy.utils.tz_modification import utc_to_local_datetime

# Maps an InStat "action_name" to [databallpy event type, is_successful].
# The first element ends up in the "databallpy_event" column and the second in
# the "is_successful" column of the parsed event data. A success value of None
# means the outcome cannot be derived from the action name alone.
INSTAT_DATABALLPY_MAP = {
    "Attacking pass accurate": ["pass", True],
    "Attacking pass inaccurate": ["pass", False],
    "Unsuccessful dribbling": ["dribble", False],
    "Successful dribbling": ["dribble", True],
    "Dribbling": ["dribble", None],
    "Inaccurate key pass": ["pass", False],
    # An inaccurate cross is an unsuccessful pass, like every other
    # "inaccurate" action in this table.
    "Crosses inaccurate": ["pass", False],
    "Blocked shot": ["shot", False],
    "Shots blocked": ["shot", False],
    "Wide shot": ["shot", False],
    "Accurate crossing from set piece with a shot": ["pass", True],
    # Only "Goal" counts as a successful shot in this table.
    "Shot on target": ["shot", False],
    "Crosses accurate": ["pass", True],
    "Pass into offside": ["pass", False],
    "Accurate key pass": ["pass", True],
    "Shot blocked by field player": ["shot", False],
    "Inaccurate set-piece cross": ["pass", False],
    "Accurate crossing from set piece": ["pass", True],
    "Key assist": ["pass", True],
    "Goal": ["shot", True],
    "Accurate crossing from set piece with a goal": ["pass", True],
}

@logging_wrapper(__file__)
def load_instat_event_data(
    event_data_loc: str, metadata_loc: str
) -> tuple[pd.DataFrame, Metadata, dict]:
    """Load the event data and the metadata of a single InStat game.

    The metadata is first read from the metadata file and then completed with
    the player and formation information found in the event data file. The
    event coordinates are re-centred on the first pass or shot of the game
    (taken as the kick-off), and the coordinates of the away team are mirrored
    through the origin. The pitch dimensions stored on the metadata are derived
    from that same kick-off position.

    Args:
        event_data_loc (str): location of the event_data.json file
        metadata_loc (str): location of the metadata.json file

    Raises:
        TypeError: if event_data_loc or metadata_loc is not a string
        ValueError: if event_data_loc or metadata_loc does not end in ".json"

    Returns:
        tuple[pd.DataFrame, Metadata, dict]: the event data of the game, the
            metadata of the game, and a dictionary that this parser always
            returns empty.
    """
    if not isinstance(event_data_loc, str):
        raise TypeError(
            f"event_data_loc should be a string, not a {type(event_data_loc)}"
        )
    if not isinstance(metadata_loc, str):
        raise TypeError(f"metadata_loc should be a string, not a {type(metadata_loc)}")
    if not event_data_loc.endswith(".json"):
        raise ValueError("instat event file should by of .json format")
    if not metadata_loc.endswith(".json"):
        raise ValueError("instat event metadata file should be of .json format")

    metadata = _load_metadata(metadata_loc=metadata_loc)
    metadata = _update_metadata(metadata=metadata, event_data_loc=event_data_loc)
    event_data, pitch_dimensions = _load_event_data(
        event_data_loc=event_data_loc, metadata=metadata
    )
    metadata.pitch_dimensions = pitch_dimensions

    # NOTE(review): the third element is always an empty dict here; presumably
    # it keeps the return shape in line with other event data parsers - confirm.
    return event_data, metadata, {}


def _read_json(file_loc: str) -> dict:
    """Read a JSON file after detecting its text encoding with chardet.

    Args:
        file_loc (str): location of the .json file

    Returns:
        dict: the parsed content of the file
    """
    with open(file_loc, "rb") as f:
        encoding = chardet.detect(f.read())["encoding"]
    with open(file_loc, "r", encoding=encoding) as f:
        return json.loads(f.read())


@logging_wrapper(__file__)
def _load_metadata(metadata_loc: str) -> Metadata:
    """Function to load the data from the metadata.json file

    Only the start and end of the first two periods are filled in, using fixed
    offsets from the kick-off time (45 minute halves, second half starting one
    hour after kick-off). The remaining periods stay NaT. Players, formations
    and pitch dimensions are left empty here.

    Args:
        metadata_loc (str): location of the metadata.json file

    Returns:
        Metadata: metadata of the game
    """
    metadata_json = _read_json(metadata_loc)
    game_info = metadata_json["data"]["match_info"][0]
    # The country is the part of the tournament name before the first ".".
    country = game_info["tournament_name"].split(".")[0]

    periods = {
        "period_id": [1, 2, 3, 4, 5],
        "start_datetime_ed": [pd.to_datetime("NaT", utc=True)] * 5,
        "end_datetime_ed": [pd.to_datetime("NaT", utc=True)] * 5,
    }

    # The instat times need to be shifted back by 3 hours to get to utc time.
    # The reason is not documented.
    # NOTE(review): presumably the feed is in a UTC+3 timezone - confirm.
    kick_off = pd.to_datetime(game_info["match_date"], utc=True) - dt.timedelta(
        hours=3
    )
    periods["start_datetime_ed"][0] = kick_off
    periods["end_datetime_ed"][0] = kick_off + dt.timedelta(minutes=45)
    periods["start_datetime_ed"][1] = kick_off + dt.timedelta(hours=1)
    periods["end_datetime_ed"][1] = kick_off + dt.timedelta(minutes=45, hours=1)
    periods = pd.DataFrame(periods)

    # set time to local time
    periods["start_datetime_ed"] = utc_to_local_datetime(
        periods["start_datetime_ed"], country
    )
    periods["end_datetime_ed"] = utc_to_local_datetime(
        periods["end_datetime_ed"], country
    )

    home_score, away_score = game_info["score"].split(":")[:2]

    metadata = Metadata(
        game_id=int(game_info["id"]),
        pitch_dimensions=[np.nan, np.nan],
        periods_frames=periods,
        frame_rate=np.nan,
        home_team_id=int(game_info["team1_id"]),
        home_team_name=str(game_info["team1_name"]),
        home_players=pd.DataFrame(columns=["id", "full_name", "shirt_num"]),
        home_score=int(home_score),
        home_formation="",
        away_team_id=int(game_info["team2_id"]),
        away_team_name=str(game_info["team2_name"]),
        away_players=pd.DataFrame(columns=["id", "full_name", "shirt_num"]),
        away_score=int(away_score),
        away_formation="",
        country=country,
    )
    return metadata


def _update_metadata(metadata: Metadata, event_data_loc: str) -> Metadata:
    """This function updates the metadata with the information in the
    event_data.json file

    Player information is taken from events whose action_id starts with "16"
    and formations from events whose action_id starts with "15". For each
    team, only the first formation encountered is kept.

    Args:
        metadata (Metadata): metadata loaded from the metadata.json file
        event_data_loc (str): location of the event_data.json file

    Returns:
        Metadata: updated metadata of the game (the same object that was
            passed in, modified in place)
    """
    events = _read_json(event_data_loc)["data"]["row"]

    players_dict = {
        "id": [],
        "full_name": [],
        "position": [],
        "starter": [],
        "shirt_num": [],
        "team_id": [],
    }
    seen_player_ids = set()
    home_formation = ""
    away_formation = ""

    for event in events:
        # events starting with 16 contain player metadata
        if event["action_id"].startswith("16"):
            player_id = int(event["player_id"])
            # Only the first occurrence of a player is registered.
            if player_id not in seen_player_ids:
                seen_player_ids.add(player_id)
                position_name = str(event["position_name"])
                players_dict["id"].append(player_id)
                players_dict["full_name"].append(str(event["player_name"]))
                players_dict["position"].append(_parse_instat_position(position_name))
                players_dict["starter"].append(position_name != "Substitute player")
                players_dict["shirt_num"].append(int(event["number"]))
                players_dict["team_id"].append(int(event["team_id"]))

        # events starting with 15 contain formations
        if event["action_id"].startswith("15"):
            team_id = int(event["team_id"])
            formation = str(event["action_name"]).replace("-", "")
            if team_id == metadata.home_team_id and home_formation == "":
                home_formation = formation
            if team_id == metadata.away_team_id and away_formation == "":
                away_formation = formation

    df_players = pd.DataFrame(players_dict)

    home_players = df_players[df_players["team_id"] == metadata.home_team_id]
    home_players = (
        home_players.sort_values("starter", ascending=False)
        .drop("team_id", axis=1)
        .reset_index(drop=True)
    )
    away_players = df_players[df_players["team_id"] == metadata.away_team_id]
    away_players = (
        away_players.sort_values("starter", ascending=False)
        .drop("team_id", axis=1)
        .reset_index(drop=True)
    )

    metadata.home_formation = home_formation
    metadata.away_formation = away_formation
    metadata.home_players = home_players
    metadata.away_players = away_players
    return metadata


def _parse_instat_position(position: str) -> str:
    """Translate an InStat position name to a databallpy position.

    Args:
        position (str): position name as found in the InStat event data

    Returns:
        str: the first entry of DATABALLPY_POSITIONS that occurs in the
            lower-cased position name, or "unspecified" if none does
    """
    lowered_position = position.lower()
    for pos in DATABALLPY_POSITIONS:
        if pos in lowered_position:
            return pos
    return "unspecified"


@logging_wrapper(__file__)
def _load_event_data(
    event_data_loc: str, metadata: Metadata
) -> tuple[pd.DataFrame, list[float]]:
    """Function to load the event_data.json file, the events of the game.
    Note: this function does ignore qualifiers for now

    Events whose action_id starts with "15" or "16" (formations and player
    metadata) are skipped. All coordinates are shifted so that the start
    location of the first pass or shot becomes (0, 0), and the coordinates of
    the away team are multiplied by -1.

    Args:
        event_data_loc (str): location of the event_data.json file
        metadata(Metadata): metadata of the game

    Raises:
        KeyError: if the event data contains no pass or shot event, since the
            kick-off location can then not be determined

    Returns:
        tuple[pd.DataFrame, list[float]]: event data of the game and the pitch
            dimensions as [2 * x, 2 * y] of the kick-off location
    """
    events = _read_json(event_data_loc)["data"]["row"]

    # The key order here determines the column order of the resulting frame.
    result_dict = {
        "event_id": [],
        "databallpy_event": [],
        "period_id": [],
        "minutes": [],
        "seconds": [],
        "player_id": [],
        "team_id": [],
        "is_successful": [],
        "start_x": [],
        "start_y": [],
        "end_x": [],
        "end_y": [],
        "datetime": [],
        "original_event_id": [],
        "original_event": [],
        "event_type_id": [],
    }

    # Period start times are fixed offsets from the start of the first period;
    # they are not read from the event data.
    first_period_start = metadata.periods_frames.loc[0, "start_datetime_ed"]
    start_time_period = {
        1: first_period_start,
        2: first_period_start + dt.timedelta(minutes=60),
        3: first_period_start + dt.timedelta(minutes=110),
        4: first_period_start + dt.timedelta(minutes=130),
        5: first_period_start + dt.timedelta(minutes=150),
    }

    for i_event, event in enumerate(events):
        if event["action_id"].startswith(("16", "15")):
            continue

        period_id = int(event["half"])
        second = float(event["second"])

        # event_id is the index in the raw event list, so it has gaps where
        # metadata events were skipped.
        result_dict["event_id"].append(i_event)
        result_dict["original_event_id"].append(int(event["id"]))
        result_dict["event_type_id"].append(int(event["action_id"]))
        result_dict["original_event"].append(str(event["action_name"]))
        result_dict["period_id"].append(period_id)
        result_dict["minutes"].append(second // 60)
        result_dict["seconds"].append(second % 60)
        result_dict["player_id"].append(
            int(event["player_id"]) if "player_id" in event else MISSING_INT
        )
        result_dict["team_id"].append(
            int(event["team_id"]) if "team_id" in event else MISSING_INT
        )

        if "pos_x" in event:
            result_dict["start_x"].append(float(event["pos_x"]))
            result_dict["start_y"].append(float(event["pos_y"]))
        else:
            result_dict["start_x"].append(np.nan)
            result_dict["start_y"].append(np.nan)

        if "pos_dest_x" in event:
            result_dict["end_x"].append(float(event["pos_dest_x"]))
            result_dict["end_y"].append(float(event["pos_dest_y"]))
        else:
            result_dict["end_x"].append(np.nan)
            result_dict["end_y"].append(np.nan)

        result_dict["datetime"].append(
            start_time_period[period_id] + dt.timedelta(milliseconds=second * 1000)
        )

    # Placeholders so all columns have equal length; both are overwritten below.
    n_events = len(result_dict["original_event_id"])
    result_dict["is_successful"] = [None] * n_events
    result_dict["databallpy_event"] = [None] * n_events

    event_data = pd.DataFrame(result_dict)
    event_data["databallpy_event"] = event_data["original_event"].apply(
        lambda x: INSTAT_DATABALLPY_MAP.get(x, [None])[0]
    )
    event_data["is_successful"] = (
        event_data["original_event"]
        .apply(lambda x: INSTAT_DATABALLPY_MAP.get(x, [None, None])[1])
        .astype("boolean")
    )

    # The first pass or shot is taken to be the kick-off, i.e. the centre of
    # the pitch. NOTE(review): doubling it for the pitch dimensions assumes the
    # raw coordinates have their origin in a corner of the pitch - confirm.
    potential_kick_off_events = ["pass", "shot"]
    x_start, y_start = (
        event_data[event_data["databallpy_event"].isin(potential_kick_off_events)]
        .reset_index()
        .loc[0, ["start_x", "start_y"]]
    )
    event_data["start_x"] -= x_start
    event_data["start_y"] -= y_start
    event_data["end_x"] -= x_start
    event_data["end_y"] -= y_start
    pitch_dimensions = [2 * x_start, 2 * y_start]

    # Away players take precedence if an id were to occur in both teams.
    id_full_name_dict = {
        **dict(zip(metadata.home_players["id"], metadata.home_players["full_name"])),
        **dict(zip(metadata.away_players["id"], metadata.away_players["full_name"])),
    }
    event_data["player_name"] = (
        event_data["player_id"].map(id_full_name_dict).replace({np.nan: None})
    )

    away_mask = event_data["team_id"] == metadata.away_team_id
    event_data.loc[away_mask, ["start_x", "start_y", "end_x", "end_y"]] *= -1
    event_data["minutes"] = event_data["minutes"].round().astype(int)

    return event_data, pitch_dimensions