Describe the bug
Building a TTree inside a Python 3 function triggers a segmentation fault. Running the same code outside of a function works fine. I have provided two scripts that should do the same thing: one works (inline) and the other does not (with the function). Am I overlooking something? Thanks for the help!
Expected behavior
Build a TTree, convert it to RDataFrame and save.
To Reproduce
- running with the function (crashes):
def npz_to_root(data: np.lib.npyio.NpzFile, columns_names: dict, id_col: bool = True, id_start: int = 0, out_file: str = None, tree_name: str = "data") -> ROOT.RDataFrame:
    """Convert the contents of a loaded ``.npz`` file into a ROOT RDataFrame.

    Every entry of ``columns_names`` describes one group of columns: the key
    is the npz entry holding the column names and the value is the npz entry
    holding the matching 2D data (one row per event, one column per name).
    A ``TTree`` is filled row by row and then wrapped in an ``RDataFrame``.

    Booleans and integers wider than 32 bits are stored as 32-bit integers,
    and all floating-point values are stored as 32-bit floats.

    Parameters
    ----------
    data : NpzFile
        Npz file loaded with np.load()
    columns_names : dict
        Keys for columns and values for the corresponding keys in the npz file
    id_col : bool, optional
        If the id col should be computed, by default True
    id_start : int, optional
        Where to start counting the id, by default 0
    out_file : str, optional
        Name of the output file, by default None
    tree_name : str, optional
        Name of the tree object in the RDataFrame, by default "data"

    Returns
    -------
    ROOT.RDataFrame
        Converted RDataFrame

    Raises
    ------
    TypeError
        If a column has a dtype that cannot be mapped to a ROOT leaf type.
    ValueError
        If the data arrays of the groups do not all have the same number
        of rows.

    Examples
    --------
    ```python
    import ROOT
    import numpy as np
    with np.load("data.npz") as data:
        rdf = npz_to_root(data, {"features_float": "data_float"})
    ```
    """
    if out_file:
        # dirname is "" for a bare file name, and os.makedirs("") raises.
        out_dir = os.path.dirname(out_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    tree = ROOT.TTree()
    # One single-element buffer per branch; the tree reads from these on Fill().
    data_arrs = {}
    if id_col:
        data_arrs["id"] = np.array([0], dtype=np.int32)
        tree.Branch("id", data_arrs["id"], "id/I")
    evt_counter = id_start

    # Each NpzFile lookup re-reads the array from disk, so load every array
    # exactly once and remember which buffer each data column feeds.
    bindings = []
    total_rows = None
    for names_key, values_key in columns_names.items():
        names = data[names_key]
        values = data[values_key]

        n_rows = len(values)
        if total_rows is None:
            total_rows = n_rows
        elif n_rows != total_rows:
            raise ValueError(
                f"'{values_key}' has {n_rows} rows, expected {total_rows}"
            )

        for icol, col in enumerate(names):
            # Inspect one value to find the column type; fall back to the
            # array dtype when there are no rows to sample.
            if n_rows:
                sample_dtype = np.asarray(values[0][icol]).dtype
            else:
                sample_dtype = values.dtype
            buffer_dtype, leaf_code = _leaf_spec(sample_dtype)

            name = str(col)
            data_arrs[name] = np.array([0], dtype=buffer_dtype)
            tree.Branch(name, data_arrs[name], f"{name}/{leaf_code}")
            bindings.append((data_arrs[name], values, icol))

    for row in tqdm(range(total_rows or 0)):
        for buffer, values, icol in bindings:
            buffer[0] = values[row][icol]
        if id_col:
            data_arrs["id"][0] = evt_counter
            evt_counter += 1
        tree.Fill()

    # NOTE(review): `tree` and its numpy buffers are locals of this function;
    # the returned RDataFrame presumably only references the tree, so confirm
    # that they outlive every use of the returned object.
    rdf = ROOT.RDataFrame(tree)  # CODE CRASHES HERE
    if out_file:
        rdf.Snapshot(tree_name, out_file)
    return rdf


def _leaf_spec(sample_dtype):
    """Return the ``(buffer dtype, ROOT leaf code)`` pair for a NumPy dtype."""
    leaf_codes = {
        np.dtype(np.int8): "B",
        np.dtype(np.uint8): "b",
        np.dtype(np.int16): "S",
        np.dtype(np.uint16): "s",
        np.dtype(np.int32): "I",
        np.dtype(np.uint32): "i",
        np.dtype(np.float32): "F",
    }
    sample_dtype = np.dtype(sample_dtype)
    kind = sample_dtype.kind
    if kind == "b" or (kind in "iu" and sample_dtype.itemsize > 4):
        # Booleans and wide integers are stored as 32-bit signed integers.
        buffer_dtype = np.dtype(np.int32)
    elif kind == "f":
        # All floats share a 32-bit buffer so it always matches the "F" leaf.
        buffer_dtype = np.dtype(np.float32)
    else:
        buffer_dtype = sample_dtype
    try:
        return buffer_dtype, leaf_codes[buffer_dtype]
    except KeyError:
        raise TypeError(
            f"Unsupported column dtype for a TTree branch: {sample_dtype}"
        ) from None
# Each key is the npz entry listing the column names of a group; each value
# is the npz entry holding the data for that group.
columns_names = {
    "features_int": "data_int",
    "features_float": "data_float",
    "features_bool": "data_bool",
}

# Convert the dataset while the npz archive is still open.
with np.load("path/to/dataset.npz") as d:
    rdf = npz_to_root(d, columns_names)
- running inline code (works fine):
# Inline variant of the conversion. `path`, `columns_names`, `id_col`,
# `id_start`, `out_file` and `tree_name` are read here and must already be
# defined by the surrounding script.
with np.load(path) as data:
    if out_file:
        # dirname is "" for a bare file name, and os.makedirs("") raises.
        out_dir = os.path.dirname(out_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    tree = ROOT.TTree()
    # One single-element buffer per branch; the tree reads from these on Fill().
    data_arrs = {}
    if id_col:
        data_arrs["id"] = np.array([0], dtype=np.int32)
        tree.Branch("id", data_arrs["id"], "id/I")
    evt_counter = id_start

    # ROOT leaf codes for the buffer dtypes that can be written as-is.
    leaf_codes = {
        np.dtype(np.int8): "B",
        np.dtype(np.uint8): "b",
        np.dtype(np.int16): "S",
        np.dtype(np.uint16): "s",
        np.dtype(np.int32): "I",
        np.dtype(np.uint32): "i",
        np.dtype(np.float32): "F",
    }

    # Each NpzFile lookup re-reads the array from disk, so load every array
    # exactly once and remember which buffer each data column feeds.
    bindings = []
    total_rows = None
    for names_key, values_key in columns_names.items():
        names = data[names_key]
        values = data[values_key]

        n_rows = len(values)
        if total_rows is None:
            total_rows = n_rows
        elif n_rows != total_rows:
            raise ValueError(
                f"'{values_key}' has {n_rows} rows, expected {total_rows}"
            )

        for icol, col in enumerate(names):
            # Inspect one value to find the column type; fall back to the
            # array dtype when there are no rows to sample.
            if n_rows:
                sample_dtype = np.asarray(values[0][icol]).dtype
            else:
                sample_dtype = values.dtype

            kind = sample_dtype.kind
            if kind == "b" or (kind in "iu" and sample_dtype.itemsize > 4):
                # Booleans and wide integers are stored as 32-bit integers.
                buffer_dtype = np.dtype(np.int32)
            elif kind == "f":
                # All floats share a 32-bit buffer matching the "F" leaf.
                buffer_dtype = np.dtype(np.float32)
            else:
                buffer_dtype = sample_dtype
            if buffer_dtype not in leaf_codes:
                raise TypeError(
                    f"Unsupported column dtype for a TTree branch: {sample_dtype}"
                )

            name = str(col)
            data_arrs[name] = np.array([0], dtype=buffer_dtype)
            tree.Branch(name, data_arrs[name], f"{name}/{leaf_codes[buffer_dtype]}")
            bindings.append((data_arrs[name], values, icol))

    total_rows = total_rows or 0
    for row in tqdm(range(total_rows)):
        for buffer, values, icol in bindings:
            buffer[0] = values[row][icol]
        if id_col:
            data_arrs["id"][0] = evt_counter
            evt_counter += 1
        tree.Fill()

    rdf = ROOT.RDataFrame(tree)
    if out_file:
        rdf.Snapshot(tree_name, out_file)
Additional info
tree.Print() also does not seem to work from inside the function, but calling it in the inline file is fine.
The dataset.npz file is read in the format:
{"features_int":["col_1", "col_2"],
"data_int":[[1, 2], [3, 4], [5, 6], ..., [n, n]],
"features_float":["col_f3", "col_f4"],
"data_float":[[3.14, 2.71], ..., [n, n]],
}
Setup
ROOT v6.30/02
Built for linuxx8664gcc on Dec 20 2023, 18:59:44
From heads/master@tags/v6-30-02
With g++ (GCC) 13.1.0
Binary directory: /cvmfs/sft.cern.ch/lcg/releases/ROOT/6.30.02-fb5be/x86_64-el9-gcc13-opt/bin
<!--
lsetup "views LCG_105 x86_64-el9-gcc13-opt"
-->