import xml.etree.ElementTree as ET
import chardet
import numpy as np
import pandas as pd
from tqdm import tqdm
from databallpy.data_parsers import Metadata
from databallpy.data_parsers.tracking_data_parsers.utils import (
_add_ball_data_to_dict,
_add_datetime,
_add_periods_to_tracking_data,
_add_player_tracking_data_to_dict,
_get_gametime,
_insert_missing_rows,
_normalize_playing_direction_tracking,
)
from databallpy.utils.constants import MISSING_INT
from databallpy.utils.logging import logging_wrapper
from databallpy.utils.tz_modification import utc_to_local_datetime
from databallpy.utils.utils import _to_float, _to_int
[docs]
@logging_wrapper(__file__)
def load_inmotio_tracking_data(
    tracking_data_loc: str, metadata_loc: str, verbose: bool = True
) -> tuple[pd.DataFrame, Metadata]:
    """Load Inmotio tracking data together with the metadata of the game.

    The metadata is parsed first; it supplies the channel order, the pitch
    dimensions and the period frames needed to read and trim the tracking
    data. After trimming, datetime, period id and game time columns are added
    and the playing direction is normalized.

    Args:
        tracking_data_loc (str): location of the tracking data .txt file
        metadata_loc (str): location of the metadata .xml file
        verbose (bool, optional): whether to print information about the
            progress in the terminal. Defaults to True.

    Raises:
        TypeError: if tracking_data_loc is not a string

    Returns:
        tuple[pd.DataFrame, Metadata]: tracking data and metadata of the game
    """
    if not isinstance(tracking_data_loc, str):
        raise TypeError(
            f"tracking_data_loc must be a str, not a {type(tracking_data_loc)}"
        )

    metadata = _get_metadata(metadata_loc)
    channel_order = _get_td_channels(metadata_loc, metadata)
    raw_tracking_data = _get_tracking_data(
        tracking_data_loc, channel_order, metadata.pitch_dimensions, verbose
    )

    # Only periods with a positive start frame take part in finding the first
    # frame (presumably the remaining slots are unused periods - verify
    # against MISSING_INT).
    start_frames = metadata.periods_frames["start_frame"]
    first_frame = start_frames[start_frames > 0].min()
    last_frame = metadata.periods_frames["end_frame"].max()

    # Keep only the frames between the first period start and last period end.
    within_game = (raw_tracking_data["frame"] >= first_frame) & (
        raw_tracking_data["frame"] <= last_frame
    )
    tracking_data = raw_tracking_data[within_game].reset_index(drop=True)

    first_period_start = metadata.periods_frames["start_datetime_td"].iloc[0]
    tracking_data["datetime"] = _add_datetime(
        tracking_data["frame"], metadata.frame_rate, first_period_start
    )

    normalized = _normalize_playing_direction_tracking(
        tracking_data, metadata.periods_frames
    )
    tracking_data, changed_periods = normalized
    metadata.periods_changed_playing_direction = changed_periods

    tracking_data["period_id"] = _add_periods_to_tracking_data(
        tracking_data["frame"], metadata.periods_frames
    )
    tracking_data["gametime_td"] = _get_gametime(
        tracking_data["frame"], tracking_data["period_id"], metadata
    )
    return tracking_data, metadata
@logging_wrapper(__file__)
def _get_tracking_data(
    tracking_data_loc: str,
    td_channels: list,
    pitch_dimensions: list,
    verbose: bool = True,
) -> pd.DataFrame:
    """Function to load in inmotio format tracking_data

    Every line of the raw file is expected to hold four ":"-separated parts:
    the frame number, the home players, the away players and the ball info.
    Players are separated by ";" and every player is a "y,x" pair. The ball
    info holds five ","-separated values of which y, x, z and the ball status
    are used.

    Args:
        tracking_data_loc (str): location of the tracking .txt file
        td_channels (list): the order of which players are referred
            to in the raw tracking data, as "<team>_<shirt_num>" strings
        pitch_dimensions (list): x and y dimensions of the pitch in meters
        verbose (bool, optional): whether to print information about the
            progress in the terminal. Defaults to True.

    Returns:
        pd.DataFrame: tracking data of the game in a pd dataframe
    """
    if verbose:
        print(f"Reading in {tracking_data_loc}", end="")
    # The context manager closes the handle even when reading raises.
    with open(tracking_data_loc, "r") as file:
        lines = file.readlines()
    if verbose:
        print(" - Completed")

    size_lines = len(lines)
    data = {
        "frame": [np.nan] * size_lines,
        "ball_x": [np.nan] * size_lines,
        "ball_y": [np.nan] * size_lines,
        "ball_z": [np.nan] * size_lines,
        "ball_status": [None] * size_lines,
        "team_possession": [None] * size_lines,
    }

    # Split the channel names once instead of twice per player on every line.
    channel_parts = [channel.split("_") for channel in td_channels]

    if verbose:
        lines = tqdm(
            lines, desc="Writing lines to dataframe", unit=" lines", leave=False
        )

    for idx, line in enumerate(lines):
        frame, players_home, players_away, ball_info = line.split(":")
        data["frame"][idx] = _to_int(frame)

        players = (players_home + ";" + players_away).split(";")
        for i, player in enumerate(players):
            # The raw file stores the y value before the x value.
            y, x = player.split(",")
            team = channel_parts[i][0]
            shirt_num = channel_parts[i][1]
            data = _add_player_tracking_data_to_dict(
                team, shirt_num, _to_float(x), _to_float(y), data, idx
            )

        # The fourth ball value is not used.
        y, x, z, _, ball_status = ball_info.replace("\n", "").split(",")
        data = _add_ball_data_to_dict(
            _to_float(x), _to_float(y), _to_float(z), None, ball_status, data, idx
        )

    df = pd.DataFrame(data)
    df["ball_status"] = ["alive" if x == "1" else "dead" for x in df["ball_status"]]

    # Shift x values by half of the pitch length and mirror them.
    for col in [x for x in df.columns if "_x" in x]:
        df[col] = df[col] - (pitch_dimensions[0] / 2)
        df[col] *= -1
    # Shift y values by half of the pitch width.
    for col in [x for x in df.columns if "_y" in x]:
        df[col] = df[col] - (pitch_dimensions[1] / 2)

    df = _insert_missing_rows(df, "frame")
    return df
def _get_td_channels(metadata_loc: str, metadata: Metadata) -> list:
    """Determine the order in which players appear in the raw tracking data.

    Every PlayerChannel element of the metadata file is mapped to a
    "<team>_<shirt_num>" string. Channels whose id suffix contains "y" are
    skipped, so every player is listed only once.

    Args:
        metadata_loc (str): location of the metadata
        metadata (Metadata): the Metadata of the game

    Returns:
        list: "<team>_<shirt_num>" strings in the order in which players are
            referred to in the raw tracking data
    """
    # Detect the encoding from the raw bytes before reading the file as text.
    with open(metadata_loc, "rb") as raw_file:
        detected_encoding = chardet.detect(raw_file.read())["encoding"]
    with open(metadata_loc, "r", encoding=detected_encoding) as text_file:
        root = ET.fromstring(text_file.read())

    channels = []
    for channel in root.findall(".//PlayerChannel"):
        id_parts = channel.get("id").split("_")
        # The first two characters of the id prefix are not part of the
        # numeric player id.
        player_id = int(id_parts[0][2:])
        axis = id_parts[1]
        if "y" in axis:
            continue

        is_home = metadata.home_players["id"] == player_id
        is_away = metadata.away_players["id"] == player_id
        # Players that are not found in the home team are treated as away.
        if is_home.any():
            team, roster, mask = "home", metadata.home_players, is_home
        else:
            team, roster, mask = "away", metadata.away_players, is_away
        shirt_num = roster.loc[mask, "shirt_num"].values[0]
        channels.append(f"{team}_{shirt_num}")
    return channels
@logging_wrapper(__file__)
def _get_metadata(metadata_loc: str) -> Metadata:
    """Parse the metadata .xml file of a game into a Metadata object.

    Reads the start/end frames and datetimes of up to five periods, the
    players, names, ids and scores of both teams, the pitch dimensions and
    the frame rate.

    Args:
        metadata_loc (str): Location of the metadata .xml file

    Returns:
        Metadata: all information of the game
    """
    # Detect the encoding from the raw bytes before reading the file as text.
    with open(metadata_loc, "rb") as raw_file:
        detected_encoding = chardet.detect(raw_file.read())["encoding"]
    with open(metadata_loc, "r", encoding=detected_encoding) as text_file:
        root = ET.fromstring(text_file.read())

    n_periods = 5
    periods_dict = {
        "period_id": [1, 2, 3, 4, 5],
        "start_frame": [MISSING_INT] * n_periods,
        "end_frame": [MISSING_INT] * n_periods,
        "start_datetime_td": [pd.to_datetime("NaT")] * n_periods,
        "end_datetime_td": [pd.to_datetime("NaT")] * n_periods,
    }

    period_idx = 0
    for session in root.findall(".//Session"):
        session_type = session.find("SessionType")
        if session_type is None or session_type.text != "Period":
            continue
        for param in session.findall(".//ProviderParameter"):
            name_el = param.find("Name")
            value_el = param.find("Value")
            # Only the frame parameters describe the period boundaries.
            if name_el is None or "Frame" not in name_el.text:
                continue
            frame_text = value_el.text
            if "Start" in name_el.text:
                frame_key, datetime_key, tag = (
                    "start_frame",
                    "start_datetime_td",
                    "Start",
                )
            else:
                frame_key, datetime_key, tag = "end_frame", "end_datetime_td", "End"
            periods_dict[frame_key][period_idx] = _to_int(frame_text)
            boundary_el = session.find(tag)
            periods_dict[datetime_key][period_idx] = pd.to_datetime(
                boundary_el.text, utc=True
            )
        # Advance once per "Period" session so the start and end values of
        # one period share the same slot.
        period_idx += 1

    periods_frames = pd.DataFrame(periods_dict)

    competition = root.find(".//Competition").text.split(",")[0]
    # set to the right timezone
    for datetime_col in ("start_datetime_td", "end_datetime_td"):
        periods_frames[datetime_col] = utc_to_local_datetime(
            periods_frames[datetime_col], competition
        )

    def _team_info(team_el):
        # Id, name and player table of a single Team element.
        team_id = team_el.get("id")
        team_name = team_el.find("Name").text
        player_elements = root.findall(f'.//Player[@teamId="{team_id}"]')
        return team_id, team_name, _get_player_data(player_elements)

    def _read_score(tag):
        # Score stored under `tag`, or MISSING_INT when the tag is absent.
        score_el = root.find(tag)
        return MISSING_INT if score_el is None else int(score_el.text)

    teams_el = root.find(".//Teams")

    # The first Team element is the home team, the second the away team.
    home_team_id, home_team_name, home_players = _team_info(
        teams_el.findall("Team")[0]
    )
    home_score = _read_score(".//LocalTeamScore")

    away_team_id, away_team_name, away_players = _team_info(
        teams_el.findall("Team")[1]
    )
    away_score = _read_score(".//VisitingTeamScore")

    game_id = int(root.find(".//Session").get("id"))
    pitch_length = float(root.find(".//MatchParameters/FieldSize/Length").text)
    pitch_width = float(root.find(".//MatchParameters/FieldSize/Width").text)
    frame_rate = int(root.find(".//FrameRate").text)

    return Metadata(
        game_id=game_id,
        pitch_dimensions=[pitch_length, pitch_width],
        periods_frames=periods_frames,
        frame_rate=frame_rate,
        home_team_id=home_team_id,
        home_team_name=home_team_name,
        home_players=home_players,
        home_score=home_score,
        home_formation="",
        away_team_id=away_team_id,
        away_team_name=away_team_name,
        away_players=away_players,
        away_score=away_score,
        away_formation="",
        country="",
    )
def _get_player_data(team: list) -> pd.DataFrame:
    """Build a dataframe with one row per player of a team.

    Entries that are not ET.Element objects are converted by parsing their
    string representation as XML. The first three "Value" elements found in a
    player are read as player type, start frame and end frame, in that order.

    Args:
        team (list): containing ET.Element objects for all players of a team

    Returns:
        pd.DataFrame: contains all player information for a team, with the
            columns id, full_name, shirt_num, start_frame, end_frame, starter
            and position
    """
    elements = [
        entry if isinstance(entry, ET.Element) else ET.fromstring(str(entry))
        for entry in team
    ]

    columns = (
        "id",
        "full_name",
        "shirt_num",
        "player_type",
        "start_frame",
        "end_frame",
    )
    collected = {column: [] for column in columns}
    for element in elements:
        # The first two characters of the id attribute are not numeric.
        player_id = int(element.get("id")[2:])
        full_name = element.find("Name").text
        shirt_num = int(element.find("ShirtNumber").text)
        value_texts = [value.text for value in element.findall(".//Value")]
        record = (
            player_id,
            full_name,
            shirt_num,
            value_texts[0],
            int(value_texts[1]),
            int(value_texts[2]),
        )
        for column, entry in zip(columns, record):
            collected[column].append(entry)

    df = pd.DataFrame(collected)
    # The player type is only needed to derive the two columns below.
    player_type = df.pop("player_type")
    df["starter"] = player_type != "Substitute"
    df["position"] = np.where(
        player_type == "Goalkeeper", "goalkeeper", "unspecified"
    )
    return df