import datetime as dt
import json
import os
import xml.etree.ElementTree as ET
from pathlib import Path
import chardet
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
from databallpy.data_parsers import Metadata
from databallpy.data_parsers.sportec_metadata_parser import (
_get_sportec_metadata,
_get_sportec_open_data_url,
)
from databallpy.data_parsers.tracking_data_parsers.utils import (
_add_ball_data_to_dict,
_add_datetime,
_add_periods_to_tracking_data,
_add_player_tracking_data_to_dict,
_adjust_start_end_frames,
_get_gametime,
_insert_missing_rows,
_normalize_playing_direction_tracking,
)
from databallpy.utils.constants import MISSING_INT
from databallpy.utils.logging import logging_wrapper
from databallpy.utils.tz_modification import localize_datetime
[docs]
@logging_wrapper(__file__)
def load_tracab_tracking_data(
    tracab_loc: str, metadata_loc: str, verbose: bool = True
) -> tuple[pd.DataFrame, Metadata]:
    """Load tracking data and metadata stored in the tracab format.

    Args:
        tracab_loc (str): location of the tracking data file (.dat, .txt or .xml)
        metadata_loc (str): location of the metadata file
        verbose (bool): whether to print progress of loading in the terminal,
            defaults to True

    Returns:
        tuple[pd.DataFrame, Metadata]: the tracking data and metadata class

    Raises:
        ValueError: if ``tracab_loc`` does not end with .txt, .dat or .xml
    """
    metadata = _get_metadata(metadata_loc)

    if tracab_loc.endswith((".dat", ".txt")):
        tracking_data = _get_tracking_data_txt(tracab_loc, verbose)
        # The text format carries no timestamps; they are derived from the
        # frame numbers, the frame rate and the start of the first period.
        first_period_start = metadata.periods_frames["start_datetime_td"].iloc[0]
        tracking_data["datetime"] = _add_datetime(
            tracking_data["frame"], metadata.frame_rate, first_period_start
        )
    elif tracab_loc.endswith(".xml"):
        # For the xml format the period frames and frame rate are read from
        # the tracking data itself and overwrite the metadata values.
        xml_result = _get_tracking_data_xml(
            tracab_loc, metadata.home_players, metadata.away_players, verbose
        )
        tracking_data, periods_frames, frame_rate = xml_result
        metadata.periods_frames = periods_frames
        metadata.frame_rate = int(frame_rate)
        tracking_data = _insert_missing_rows(
            tracking_data.reset_index(drop=True), "frame"
        )
    else:
        raise ValueError(
            "Tracab tracking data should be either .txt, .dat, or .xml format."
        )

    # period_id is placed just before the last column
    insert_position = len(tracking_data.columns) - 1
    period_ids = _add_periods_to_tracking_data(
        tracking_data["frame"], metadata.periods_frames
    )
    tracking_data.insert(insert_position, "period_id", period_ids)

    tracking_data, metadata = _adjust_start_end_frames(tracking_data, metadata)
    tracking_data["gametime_td"] = _get_gametime(
        tracking_data["frame"], tracking_data["period_id"], metadata
    )

    normalized = _normalize_playing_direction_tracking(
        tracking_data, metadata.periods_frames
    )
    tracking_data, changed_periods = normalized
    metadata.periods_changed_playing_direction = changed_periods
    return tracking_data, metadata
[docs]
@logging_wrapper(__file__)
def load_sportec_open_tracking_data(
    game_id: str, verbose: bool, cache_path: Path
) -> tuple[pd.DataFrame, Metadata]:
    """Load the tracking data from the sportec open data platform

    The metadata and tracking data files are downloaded into ``cache_path``
    (as ``metadata_temp.xml`` and ``tracking_data_temp.xml``) and then parsed
    with :func:`load_tracab_tracking_data`.

    Args:
        game_id (str): The id of the game
        verbose (bool): Whether to print info about the loading of the data.
        cache_path (Path): path to cache files. A plain string is accepted too.

    Returns:
        tuple[pd.DataFrame, Metadata]: the tracking data and metadata class

    Raises:
        requests.HTTPError: if one of the downloads returns an error status.

    Reference:
        Bassek, M., Weber, H., Rein, R., & Memmert,D. (2024). An integrated
        dataset of synchronized spatiotemporal and event data in elite soccer.
    """
    cache_dir = Path(cache_path)
    os.makedirs(cache_dir, exist_ok=True)
    metadata_file = cache_dir / "metadata_temp.xml"
    tracking_file = cache_dir / "tracking_data_temp.xml"

    metadata_response = requests.get(_get_sportec_open_data_url(game_id, "metadata"))
    # Fail here instead of caching an HTTP error page as if it were the XML.
    metadata_response.raise_for_status()
    with open(metadata_file, "wb") as f:
        f.write(metadata_response.content)

    if verbose:
        print("Downloading open tracking data...", end="\r")

    # The context manager closes the session (and its connections) even when
    # the download fails half-way.
    with requests.Session() as session:
        response = session.get(
            _get_sportec_open_data_url(game_id, "tracking_data"), stream=True
        )
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))
        with (
            open(tracking_file, "wb") as file,
            tqdm(
                desc="Downloading",
                total=total_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                disable=not verbose,
            ) as bar,
        ):
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    file.write(chunk)
                    bar.update(len(chunk))

    # Only report completion when progress output was requested.
    if verbose:
        print("Done!", end="\r")

    return load_tracab_tracking_data(
        str(tracking_file),
        str(metadata_file),
        verbose=verbose,
    )
@logging_wrapper(__file__)
def _get_tracking_data_xml(
    tracab_loc: str,
    home_players: pd.DataFrame,
    away_players: pd.DataFrame,
    verbose: bool,
) -> tuple[pd.DataFrame, pd.DataFrame, float]:
    """Read xml tracking data (FrameSet/Frame elements) into a pd.DataFrame

    The file is parsed twice. The first pass only looks at the ball FrameSets
    (``TeamId == "BALL"``) to collect all frame numbers, the period start/end
    information and the frame rate. The second pass fills in the positions of
    the ball and of every player that is present in ``home_players`` or
    ``away_players``.

    Args:
        tracab_loc (str): location of the xml tracking data file
        home_players (pd.DataFrame): home players, needs "id" and "shirt_num"
        away_players (pd.DataFrame): away players, needs "id" and "shirt_num"
        verbose (bool): whether to print info / a progress bar in the terminal

    Returns:
        tuple[pd.DataFrame, pd.DataFrame, float]: the tracking data, the
        periods frames, and the frame rate as computed by
        ``process_game_section`` (None when no first half ball FrameSet exists).
    """
    if verbose:
        print(f"Reading in {tracab_loc}", end="")

    frames_df = pd.DataFrame(
        {
            "period_id": [1, 2, 3, 4, 5],
            "start_frame": [MISSING_INT] * 5,
            "end_frame": [MISSING_INT] * 5,
            "start_datetime_td": ["NaT"] * 5,
            "end_datetime_td": ["NaT"] * 5,
        }
    )
    frames_df["start_datetime_td"] = pd.to_datetime(frames_df["start_datetime_td"])
    frames_df["end_datetime_td"] = pd.to_datetime(frames_df["end_datetime_td"])

    # First pass: find the frame sets of the ball to initialize the frames.
    context = ET.iterparse(tracab_loc, events=("start", "end"))
    next(context)  # consume the start event of the root element
    frame_values = []
    # (game section, frame number) -> row index in the output. Looking rows up
    # by frame number keeps every value aligned with the "frame" column, also
    # when frame numbers contain gaps or do not start at a fixed offset.
    row_of_frame = {}
    n_elements = 0
    n_frames_first_half = None
    frame_rate = None
    for event, elem in context:
        n_elements += 1
        if not (event == "end" and elem.tag == "FrameSet"):
            continue
        if elem.get("TeamId") == "BALL":
            frames = elem.findall("Frame")
            game_section = elem.get("GameSection")
            for frame in frames:
                frame_number = int(frame.get("N"))
                row_of_frame[(game_section, frame_number)] = len(frame_values)
                frame_values.append(frame_number)
            frame_rate, n_frames_first_half = process_game_section(
                frames, game_section, frames_df, frame_rate, n_frames_first_half
            )
        # The FrameSet is fully processed; drop its children to limit memory.
        elem.clear()

    size_lines = len(frame_values)
    data = {
        "frame": frame_values,
        "ball_x": [np.nan] * size_lines,
        "ball_y": [np.nan] * size_lines,
        "ball_z": [np.nan] * size_lines,
        "ball_status": [None] * size_lines,
        "team_possession": [None] * size_lines,
        "datetime": ["NaT"] * size_lines,
    }

    def _shirt_num_by_id(players: pd.DataFrame) -> dict:
        # Built once instead of per FrameSet; the first occurrence of an id
        # wins, matching a ``.iloc[0]`` lookup on the players frame.
        lookup = {}
        for person_id, shirt_num in zip(
            players["id"].to_list(), players["shirt_num"].to_list()
        ):
            lookup.setdefault(person_id, shirt_num)
        return lookup

    home_shirt_nums = _shirt_num_by_id(home_players)
    away_shirt_nums = _shirt_num_by_id(away_players)

    # Second pass: fill in the positions for the ball and all known players.
    context = ET.iterparse(tracab_loc, events=("start", "end"))
    next(context)  # consume the start event of the root element
    if verbose:
        context = tqdm(context, total=n_elements)
    for event, elem in context:
        if not (event == "end" and elem.tag == "FrameSet"):
            continue

        player_id = elem.get("PersonId")
        if elem.get("TeamId") == "BALL":
            column_id = "ball"
        elif player_id in home_shirt_nums:
            column_id = "home_" + str(home_shirt_nums[player_id])
        elif player_id in away_shirt_nums:
            column_id = "away_" + str(away_shirt_nums[player_id])
        else:
            # Not the ball and not a player from the metadata (presumably a
            # referee or unregistered person): skip it so its positions do
            # not overwrite the ball columns.
            column_id = None

        if column_id is not None:
            if column_id + "_x" not in data:
                data[f"{column_id}_x"] = [np.nan] * size_lines
                data[f"{column_id}_y"] = [np.nan] * size_lines

            game_section = elem.get("GameSection")
            for frame in elem.findall("Frame"):
                i = row_of_frame.get((game_section, int(frame.get("N"))))
                if i is None:
                    # No ball frame (and thus no output row) for this frame.
                    continue
                data[f"{column_id}_x"][i] = float(frame.get("X"))
                data[f"{column_id}_y"][i] = float(frame.get("Y"))
                if frame.get("Z") is not None:  # ball
                    data[f"{column_id}_z"][i] = float(frame.get("Z"))
                    data[f"{column_id}_status"][i] = (
                        "alive" if frame.get("BallStatus") == "1" else "dead"
                    )
                    data["team_possession"][i] = (
                        "home" if int(frame.get("BallPossession")) == 1 else "away"
                    )
                    data["datetime"][i] = frame.get("T")

        elem.clear()

    df = pd.DataFrame(data)
    df["datetime"] = pd.to_datetime(df["datetime"], utc=True).dt.tz_convert(
        "Europe/Berlin"
    )
    # process_game_section stores naive UTC timestamps; make them tz-aware.
    frames_df["start_datetime_td"] = (
        frames_df["start_datetime_td"]
        .dt.tz_localize("UTC")
        .dt.tz_convert("Europe/Berlin")
    )
    frames_df["end_datetime_td"] = (
        frames_df["end_datetime_td"].dt.tz_localize("UTC").dt.tz_convert("Europe/Berlin")
    )
    return df, frames_df, frame_rate
def process_game_section(
    frames, game_section, frames_df, frame_rate=None, n_frames_first_half=None
):
    """Store the start/end frame and datetime of a game section in frames_df

    Args:
        frames (list): the Frame elements of a ball FrameSet; every frame needs
            to support ``.get("N")`` (frame number) and ``.get("T")``
            (timezone aware timestamp).
        game_section (str): "firstHalf" updates the first row of ``frames_df``,
            any other value is treated as the second half (second row).
        frames_df (pd.DataFrame): periods frames, updated in place.
        frame_rate (float, optional): frame rate known so far.
        n_frames_first_half (int, optional): number of first half frames known
            so far.

    Returns:
        tuple: ``(frame_rate, n_frames_first_half)``. Both are only updated for
        the first half. The frame rate is derived from the first two frames
        and is left unchanged when it can not be computed (fewer than two
        frames, or no time between them).
    """
    if not frames:
        # An empty FrameSet carries no information; leave everything as is.
        return frame_rate, n_frames_first_half

    is_first_half = game_section == "firstHalf"
    row = 0 if is_first_half else 1

    first_frame, last_frame = frames[0], frames[-1]
    # Timestamps are stored as naive UTC values.
    start_datetime = pd.to_datetime(first_frame.get("T")).tz_convert(None)
    end_datetime = pd.to_datetime(last_frame.get("T")).tz_convert(None)

    frames_df.loc[row, "start_frame"] = int(first_frame.get("N"))
    frames_df.loc[row, "start_datetime_td"] = start_datetime
    frames_df.loc[row, "end_frame"] = int(last_frame.get("N"))
    frames_df.loc[row, "end_datetime_td"] = end_datetime

    if is_first_half:
        n_frames_first_half = len(frames)
        if len(frames) > 1:
            second_datetime = pd.to_datetime(frames[1].get("T")).tz_convert(None)
            seconds_per_frame = (second_datetime - start_datetime).total_seconds()
            if seconds_per_frame > 0:
                frame_rate = 1 / seconds_per_frame

    return frame_rate, n_frames_first_half
@logging_wrapper(__file__)
def _get_tracking_data_txt(tracab_loc: str, verbose: bool) -> pd.DataFrame:
    """Read tracab tracking data from a .dat/.txt file into a pd.DataFrame

    Every line is expected to look like ``frame:players:ball:rest`` where the
    players are separated by ";" and the values of a player/ball by ",".

    Args:
        tracab_loc (str): location of the tracking_data.dat file
        verbose (bool): whether to print info in terminal

    Returns:
        pd.DataFrame: contains tracking data
    """
    if verbose:
        print(f"Reading in {tracab_loc}", end="")
    with open(tracab_loc, "r") as file:
        raw_lines = file.readlines()
    if verbose:
        print(" - Completed")

    n_rows = len(raw_lines)
    data = {
        "frame": [np.nan] * n_rows,
        "ball_x": [np.nan] * n_rows,
        "ball_y": [np.nan] * n_rows,
        "ball_z": [np.nan] * n_rows,
        "ball_status": [None] * n_rows,
        "team_possession": [None] * n_rows,
    }
    side_by_team_id = {0: "away", 1: "home"}
    side_by_letter = {"H": "home", "A": "away"}

    line_iterator = raw_lines
    if verbose:
        line_iterator = tqdm(
            raw_lines, desc="Writing lines to dataframe", unit=" lines", leave=False
        )

    for row, line in enumerate(line_iterator):
        frame, players_info, ball_info, _ = line.split(":")
        data["frame"][row] = int(frame)

        # the players string ends with ";", so the last split item is dropped
        for player in players_info.split(";")[:-1]:
            team_id, _, shirt_num, x, y, _ = player.split(",")
            team_id = int(team_id)
            if team_id not in side_by_team_id:  # player is unknown or referee
                continue
            data = _add_player_tracking_data_to_dict(
                side_by_team_id[team_id], shirt_num, x, y, data, row
            )

        ball_fields = ball_info.split(";")[0].split(",")[:6]
        ball_x, ball_y, ball_z, _, possession, status = ball_fields
        data = _add_ball_data_to_dict(
            ball_x,
            ball_y,
            ball_z,
            side_by_letter[possession],
            status.lower(),
            data,
            row,
        )

    df = pd.DataFrame(data)
    is_coordinate = df.columns.str.contains("_x|_y|_z")
    df.loc[:, is_coordinate] = np.round(
        df.loc[:, is_coordinate] / 100, 3
    )  # change cm to m
    return _insert_missing_rows(df, "frame")
@logging_wrapper(__file__)
def _get_metadata(metadata_loc: str) -> Metadata:
    """Function that reads metadata (supports both .json and .xml) file and
    stores it in Metadata class

    Files ending in "json" are parsed as tracab json metadata. Every other file
    is read as xml: a root containing a ``match`` element is parsed as tracab
    xml metadata, a root containing a ``General`` element is handed to the
    sportec metadata parser.

    Args:
        metadata_loc (str): location of the metadata file

    Returns:
        Metadata: class that contains metadata

    Raises:
        ValueError: if the xml file is neither tracab nor sportec metadata
    """
    file_format = metadata_loc.split(".")[-1]
    if file_format == "json":
        with open(metadata_loc, "r") as file:
            content = json.load(file)
        return _get_tracab_metadata_json(content)

    with open(metadata_loc, "rb") as file:
        encoding = chardet.detect(file.read())["encoding"]
    with open(metadata_loc, "r", encoding=encoding) as file:
        lines = file.read()

    # Drop a leading byte order mark before parsing. It shows up either as the
    # real BOM character or, when the file was decoded with a single-byte
    # encoding, as the three characters its UTF-8 bytes (EF BB BF) map to.
    for byte_order_mark in ("\ufeff", "\u00ef\u00bb\u00bf"):
        lines = lines.removeprefix(byte_order_mark)

    root = ET.fromstring(lines)
    if root.find(".//match") is not None:
        return _get_tracab_metadata_xml(root)
    if root.find(".//General") is not None:
        return _get_sportec_metadata(metadata_loc)

    message = "Unknown type of tracab metadata, please open an issue on GitHub."
    raise ValueError(message)
@logging_wrapper(__file__)
def _get_tracab_metadata_json(metadata: dict) -> Metadata:
    """Build the Metadata class from the content of a tracab .json metadata file.

    Args:
        metadata (dict): the parsed json content

    Returns:
        Metadata: class that contains metadata
    """
    game_id = int(metadata["GameID"])
    # pitch sizes are divided by 100 (presumably cm to m)
    pitch_size_x = float(metadata["PitchLongSide"]) / 100
    pitch_size_y = float(metadata["PitchShortSide"]) / 100
    frame_rate = int(metadata["FrameRate"])
    match_date = pd.to_datetime(metadata["Kickoff"][:10])
    frames_per_day = frame_rate * 60 * 60 * 24

    def _frame_to_datetime(frame_number):
        # frame numbers are reduced modulo the number of frames in a day and
        # converted to a time offset on the match date
        frame_in_day = frame_number % frames_per_day
        return match_date + dt.timedelta(
            milliseconds=int((frame_in_day / frame_rate) * 1_000)
        )

    period_ids = []
    start_frames, end_frames = [], []
    start_datetimes, end_datetimes = [], []
    for period_id in range(1, 6):
        period_ids.append(period_id)
        if metadata[f"Phase{period_id}StartFrame"] == 0:
            # a start frame of 0 means the period was not played
            start_frames.append(MISSING_INT)
            end_frames.append(MISSING_INT)
            start_datetimes.append(pd.NaT)
            end_datetimes.append(pd.NaT)
            continue
        period_start = int(metadata[f"Phase{period_id}StartFrame"])
        period_end = int(metadata[f"Phase{period_id}EndFrame"])
        start_frames.append(period_start)
        end_frames.append(period_end)
        start_datetimes.append(_frame_to_datetime(period_start))
        end_datetimes.append(_frame_to_datetime(period_end))

    df_frames = pd.DataFrame(
        {
            "period_id": period_ids,
            "start_frame": start_frames,
            "end_frame": end_frames,
            "start_datetime_td": start_datetimes,
            "end_datetime_td": end_datetimes,
        }
    )
    for column in ("start_datetime_td", "end_datetime_td"):
        df_frames[column] = localize_datetime(df_frames[column], "Netherlands")

    def _collect_players(team):
        # rename the json keys to the keys _get_players_metadata_v1 expects
        return [
            {
                "PlayerId": player["PlayerID"],
                "FirstName": player["FirstName"],
                "LastName": player["LastName"],
                "JerseyNo": player["JerseyNo"],
                "StartFrameCount": player["StartFrameCount"],
                "EndFrameCount": player["EndFrameCount"],
            }
            for player in team["Players"]
        ]

    home_team = metadata["HomeTeam"]
    home_team_id = home_team["TeamID"]
    home_team_name = home_team["LongName"]
    home_players_info = _collect_players(home_team)

    away_team = metadata["AwayTeam"]
    away_team_id = away_team["TeamID"]
    away_team_name = away_team["LongName"]
    away_players_info = _collect_players(away_team)

    df_home_players = _get_players_metadata_v1(home_players_info)
    df_away_players = _get_players_metadata_v1(away_players_info)

    return Metadata(
        game_id=game_id,
        pitch_dimensions=[pitch_size_x, pitch_size_y],
        periods_frames=df_frames,
        frame_rate=frame_rate,
        home_team_id=home_team_id,
        home_team_name=home_team_name,
        home_players=df_home_players,
        home_score=MISSING_INT,
        home_formation="",
        away_team_id=away_team_id,
        away_team_name=away_team_name,
        away_players=df_away_players,
        away_score=MISSING_INT,
        away_formation="",
        country="",
    )
@logging_wrapper(__file__)
def _get_tracab_metadata_xml(root: ET.Element) -> Metadata:
    """Build the Metadata class from tracab xml metadata.

    This version is used in the Netherlands.

    Args:
        root (ET.Element): root element of the parsed metadata xml

    Returns:
        Metadata: class that contains metadata
    """
    match_elem = root.find(".//match")
    game_id = int(match_elem.get("iId"))
    pitch_size_x = float(match_elem.get("fPitchXSizeMeters"))
    pitch_size_y = float(match_elem.get("fPitchYSizeMeters"))
    frame_rate = int(match_elem.get("iFrameRateFps"))
    match_date = pd.to_datetime(match_elem.get("dtDate")[:10])
    frames_per_day = frame_rate * 60 * 60 * 24

    def _frame_to_datetime(frame_number):
        # frame numbers are reduced modulo the number of frames in a day and
        # converted to a time offset on the match date
        frame_in_day = frame_number % frames_per_day
        return match_date + dt.timedelta(
            milliseconds=int((frame_in_day / frame_rate) * 1000)
        )

    period_ids = []
    start_frames, end_frames = [], []
    start_datetimes, end_datetimes = [], []
    for period in root.findall(".//period"):
        period_ids.append(int(period.get("iId")))
        period_start = int(period.get("iStartFrame"))
        period_end = int(period.get("iEndFrame"))
        if period_start == 0:
            # a start frame of 0 means the period was not played
            start_frames.append(MISSING_INT)
            end_frames.append(MISSING_INT)
            start_datetimes.append(pd.to_datetime("NaT"))
            end_datetimes.append(pd.to_datetime("NaT"))
            continue
        start_frames.append(period_start)
        end_frames.append(period_end)
        start_datetimes.append(_frame_to_datetime(period_start))
        end_datetimes.append(_frame_to_datetime(period_end))

    df_frames = pd.DataFrame(
        {
            "period_id": period_ids,
            "start_frame": start_frames,
            "end_frame": end_frames,
            "start_datetime_td": start_datetimes,
            "end_datetime_td": end_datetimes,
        }
    )
    # set to right timezone, tracab has no location/competition info
    # in metadata, so we have to guess
    for column in ("start_datetime_td", "end_datetime_td"):
        df_frames[column] = localize_datetime(df_frames[column], "Netherlands")

    def _parse_team(team_tag):
        # returns the long name, the id and the players frame of one team
        team = root.find(f".//{team_tag}")
        team_name = team.find("LongName").text
        team_id = int(team.find("TeamId").text)
        players_info = [
            {element.tag: element.text for element in player}
            for player in team.findall(".//Player")
        ]
        return team_name, team_id, _get_players_metadata_v1(players_info)

    home_team_name, home_team_id, df_home_players = _parse_team("HomeTeam")
    away_team_name, away_team_id, df_away_players = _parse_team("AwayTeam")

    return Metadata(
        game_id=game_id,
        pitch_dimensions=[pitch_size_x, pitch_size_y],
        periods_frames=df_frames,
        frame_rate=frame_rate,
        home_team_id=home_team_id,
        home_team_name=home_team_name,
        home_players=df_home_players,
        home_score=MISSING_INT,
        home_formation="",
        away_team_id=away_team_id,
        away_team_name=away_team_name,
        away_players=df_away_players,
        away_score=MISSING_INT,
        away_formation="",
        country="",
    )
def _get_players_metadata_v1(players_info: list[dict[str, int | float]]) -> pd.DataFrame:
"""Function that creates a df containing info on all players for a team
Args:
team (list): contains an information dictionary for each player
Returns:
pd.DataFrame: contains all player information for a team
"""
player_dict = {
"id": [],
"full_name": [],
"shirt_num": [],
"start_frame": [],
"end_frame": [],
}
for player in players_info:
player_dict["id"].append(int(player["PlayerId"]))
first_name = player["FirstName"] or ""
full_name = first_name + " " + player["LastName"]
if not first_name:
full_name = full_name.lstrip()
player_dict["full_name"].append(full_name)
player_dict["shirt_num"].append(int(player["JerseyNo"]))
player_dict["start_frame"].append(int(player["StartFrameCount"]))
player_dict["end_frame"].append(int(player["EndFrameCount"]))
df = pd.DataFrame(player_dict)
df["starter"] = df["start_frame"] == df["start_frame"].value_counts().index[0]
df["position"] = "unspecified"
return df