For the time being, this is what I ended up doing:
__FINGERPRINT__COLUMNS__ =[
# "MCPV_SIZE",
"ODINTCK",
"eCalTot",
"hCalTot",
"nBackTracks",
"nDownstreamTracks",
"nEcalClusters",
"nFTClusters",
"nLongTracks",
"nMuonCoordsS1",
"nMuonCoordsS2",
"nMuonCoordsS3",
"nMuonCoordsS4",
"nPVs",
"nRich1Hits",
"nRich2Hits",
"nTTracks",
"nUpstreamTracks",
"nVPClusters",
"nVeloTracks"
]
def add_pandas_columns_to_rdf(
    nodeDT : RNode,
    nodeMCDT : RNode,
    matching_column: str = "EventID",
    columns_to_port: List[str] = None,
    default_unmatchval: float = -999.0,
    matching_column_type: str = "uint64_t" # or "unsigned long long" if needed
) -> ROOT.RDataFrame:
    """Port columns from nodeMCDT onto nodeDT, matching rows on `matching_column`.

    nodeMCDT is materialized in memory (AsNumpy) to build one C++
    std::map<key, double> per ported column; each map is then handed to the
    compiled NODE_CREATOR.ColumnFromMappedID helper, which Define()s the new
    column on nodeDT via a LookupProvider functor.

    Parameters
    ----------
    nodeDT : RNode
        Target node that receives the new columns.
    nodeMCDT : RNode
        Source node providing the values to port.
    matching_column : str
        Key column present in both nodes (the event fingerprint).
    columns_to_port : List[str], optional
        Columns to transport. If None or empty, nodeDT is returned unchanged.
    default_unmatchval : float
        Value assigned when the key is absent from the source node.
    matching_column_type : str
        C++ key type of the lookup map; must match the LookupProvider key type.

    Returns
    -------
    The nodeDT node with one Define()d column per entry of columns_to_port.
    """
    if not columns_to_port:
        # Nothing to port (also guards against the None default).
        return nodeDT
    # Materialize only the needed MCDT columns in memory.
    # BUGFIX: originally used the undefined names `ported_columns` and
    # `nodeMCDT_pd` instead of `columns_to_port` / `mcdt_pd`.
    mcdt_pd = nodeMCDT.AsNumpy( columns = [matching_column] + list(columns_to_port))
    logger.info( "Porting columns here")
    match_values = mcdt_pd[matching_column]
    # Duplicate keys mean several MCDT candidates share one event; the first
    # occurrence wins (see the count() guard in the fill loop below).
    if len( set( match_values)) != len(match_values) :
        logger.warning("Multiple candidates detected in MCDecayTree, this number must be very small")
        logger.warning(f"Length columns {len(match_values)} , Length as set = {len( set( match_values))}")
        logger.warning("The first match will be kept")
    for ported_col in columns_to_port :
        logger.info(f"Creating map to transport for {ported_col}")
        # BUGFIX: honor `matching_column_type` (was hard-coded 'unsigned long',
        # which need not match LookupProvider's uint64_t key).
        cpp_map = ROOT.std.map(matching_column_type, 'double')()
        # BUGFIX: the original rebound `ported_col` to the value array,
        # corrupting the dict lookup inside this loop.
        for key, value in zip( match_values, mcdt_pd[ported_col]):
            if cpp_map.count(key) == 0:
                cpp_map[key] = float(value)
            else:
                logger.critical(f"WARNING (ERROR?) Skipping EventID {key}, already there, duplicate found...Debug me further if this happens very often!")
        # Thread-safe Define through the compiled helper (static lookup map).
        # BUGFIX: use `matching_column` and `default_unmatchval` instead of the
        # hard-coded "EventID" / -999.
        nodeDT = ROOT.NODE_CREATOR.ColumnFromMappedID(
            nodeDT,
            cpp_map,
            ROOT.std.string(ported_col),
            ROOT.std.string(matching_column),
            default_unmatchval)
    return nodeDT
where, for the unique event tagger, I use:
#pragma once
#include <ROOT/RDataFrame.hxx>
#include <xxhash.h>
#include <array>
// Functor that condenses a fixed set of per-event counters into a single
// 64-bit hash, used as a synthetic "EventID" to match the same event across
// different trees.
// NOTE(review): the fingerprint is byte-order sensitive — changing the
// argument order, the types, the seed, or the padding changes EVERY hash.
// Keep the argument list in sync with __FINGERPRINT__COLUMNS__ on the
// Python side.
struct EventFingerprint {
// Parameter types mirror the exact branch types of the input tree, so that
// RDataFrame can bind them directly (no proxy/type-mismatch errors).
uint64_t operator()(
unsigned int ODINTCK, // ← unsigned! (matches the branch type)
int eCalTot,
int hCalTot,
int nBackTracks,
int nDownstreamTracks,
int nEcalClusters,
int nFTClusters,
int nLongTracks,
int nMuonCoordsS1,
int nMuonCoordsS2,
int nMuonCoordsS3,
int nMuonCoordsS4,
int nPVs,
int nRich1Hits,
int nRich2Hits,
int nTTracks,
int nUpstreamTracks,
int nVPClusters,
int nVeloTracks
) const {
// Pack all 19 counters into a fixed-order uint32_t buffer and hash it in one
// shot. alignas(8) plus the trailing zero pad to 80 bytes (a multiple of 8)
// keeps the buffer friendly to XXH3's 64-bit reads.
alignas(8) const uint32_t raw_data[20] = {
ODINTCK,
static_cast<uint32_t>(eCalTot),
static_cast<uint32_t>(hCalTot),
static_cast<uint32_t>(nBackTracks),
static_cast<uint32_t>(nDownstreamTracks),
static_cast<uint32_t>(nEcalClusters),
static_cast<uint32_t>(nFTClusters),
static_cast<uint32_t>(nLongTracks),
static_cast<uint32_t>(nMuonCoordsS1),
static_cast<uint32_t>(nMuonCoordsS2),
static_cast<uint32_t>(nMuonCoordsS3),
static_cast<uint32_t>(nMuonCoordsS4),
static_cast<uint32_t>(nPVs),
static_cast<uint32_t>(nRich1Hits),
static_cast<uint32_t>(nRich2Hits),
static_cast<uint32_t>(nTTracks),
static_cast<uint32_t>(nUpstreamTracks),
static_cast<uint32_t>(nVPClusters),
static_cast<uint32_t>(nVeloTracks),
0u // padding to make size multiple of 8 bytes → better performance
};
// Fixed seed: do not change, or previously computed EventIDs won't match.
return XXH3_64bits_withSeed(raw_data, sizeof(raw_data), 0x4c48b4d2025ULL);
}
};
and for the LookupProvider I had to go for a C++ map keyed by uint64_t, to avoid collisions in the event tagging due to the large datasets:
// LookupProvider.h functor class to query std::map< colMatch, VariableFromOtherNode>
#pragma once
#include <map>
#include <ROOT/RVec.hxx>
using namespace ROOT::VecOps;
#include <unordered_map>
/// Functor mapping an event key (uint64_t fingerprint) to a double value.
/// Intended as an RDataFrame Define() callable: returns the value associated
/// with the key, or the configured default when the key is absent.
class LookupProvider {
public:
  LookupProvider() = default;

  /// Only non-default constructor: copies a pre-built lookup map.
  /// @param m   map from event key to the value to port
  /// @param def value returned for keys not present in the map
  // BUGFIX: initializer list now follows declaration order (map, then
  // default_value); the original order triggered -Wreorder.
  LookupProvider(const std::map<uint64_t, double>& m,
                 double def = -999.0) noexcept
      : map(m), default_value(def) {}

  /// Thread-safe lookup (const, no mutation): value for `key`, or default.
  double operator()(uint64_t key) const noexcept {
    // `map` is already const in a const member function; no as_const needed.
    const auto it = map.find(key);
    return it != map.end() ? it->second : default_value;
  }

private:
  std::map<uint64_t, double> map;
  // BUGFIX: in-class initializer — a default-constructed LookupProvider
  // previously left default_value indeterminate (UB on first use).
  double default_value = -999.0;
};
And the function I compile is:
// Defines a new column `col_add` on `node`: for each row, the value of
// `match_name` (the event key) is looked up in `map` via a LookupProvider
// functor; rows with no match receive `default_val`.
// NOTE(review): `map` and the strings are taken by non-const reference —
// presumably to match the out-of-view declaration in the header; the map is
// copied into the LookupProvider, so the caller's map is not retained.
ROOT::RDF::RNode NODE_CREATOR::ColumnFromMappedID(
ROOT::RDF::RNode node,
std::map<uint64_t, double> & map,
std::string & col_add,
std::string & match_name,
double default_val){
// The functor copies the map, so the Define() callable owns its own lookup
// table and stays valid for the lifetime of the computation graph.
LookupProvider VarProvider( map, default_val);
return node.Define( col_add.c_str(),
VarProvider,
{match_name.c_str()} );
}
In essence:
- For nodeOrigin (the source of the data) I create an "EventID" column following the EventFingerprint::operator() prescription.
- For nodeTarget I create an "EventID" column with the same EventFingerprint::operator() prescription.
- I convert nodeOrigin to a dict of numpy arrays using AsNumpy.
- From that I generate a std::map<uint64_t, double> filled for each nodeOrigin variable I want to transport.
- On nodeTarget I call ColumnFromMappedID, which relies on my LookupProvider — nothing more than a functor holding the std::map<uint64_t, double> as a private member — so that I can effectively call nodeTarget.Define("var_ported", VarProvider(the_mapping), "EventID").
I posted this to show the convoluted procedure required to do this, and the need for an intermediate AsNumpy / in-memory creation of a full tree as a lookup table to accomplish such a task.