struct TData { std::string fUserId; std::string fCountry; std::string fZipCode; std::string fOrder; Long64_t fCount; double fPrice; // per year !! int fSYear; int fEYear; }; //______________________________________________________________________________________ auto MakeDataFrame( ROOT::RDF::RNode tdf_csv) { auto calcColumns = [](const std::string &x) { static TPRegexp rl("(\\d{1,2})-Jahreskarte (\\d{4})"); TData d; d.fSYear = 0; d.fEYear = -1; if (x.find("Jahreskarte") != std::string::npos) { const TObjArray *subStrL = rl.MatchS(x); if (subStrL->GetLast()+1 == 3) { const int length = ((TObjString *)subStrL->At(1))->GetString().Atoi(); d.fSYear = ((TObjString *)subStrL->At(2))->GetString().Atoi(); d.fEYear = d.fSYear+length-1; } else { Error("MakeDataFrame","Invalid data entry: %s",x.c_str()); } delete subStrL; } else { std::cout << "line not assigned: " << x << std::endl; } return d; }; // calculate the new columns auto tdf = tdf_csv.Define("data",calcColumns,{"fTicket"}) .Define("fSYear",[](TData &data){return data.fSYear;},{"data"}) .Define("fEYear",[](TData &data){return data.fEYear;},{"data"}); Long64_t count_sum = 0; tdf.Foreach([&count_sum](Long64_t count){count_sum += count;},{"fCount"} ); std::cout << "sum counts in csv: " << count_sum << std::endl; return tdf; } //______________________________________________________________________________________ auto FilterDataFrame( ROOT::RDF::RNode tdf, const std::string &select) { //auto colNames = tdf.GetColumnNames(); //for (auto &&name: colNames) {std::cout << name <("fUserId"); auto country_A = tdf.Take("fCountry"); auto zipCode_A = tdf.Take("fZipCode"); auto order_A = tdf.Take("fOrder"); auto count_A = tdf.Take("fCount"); auto price_A = tdf.Take("fPrice"); auto syear_A = tdf.Take("fSYear"); auto eyear_A = tdf.Take("fEYear"); auto N = userId_A->size(); std::vector indexSort(N); TMath::Sort(Long64_t(N),userId_A->data(),indexSort.data(),kFALSE); // Consolidate the duplicate userId's ; user could have had several different orders // in one year // But only consolidate orders with same SYear and EYear std::vector data; ULong64_t iev = 0; while (iev < N) { TData d; d.fUserId = userId_A->at(indexSort[iev]); d.fCountry = country_A->at(indexSort[iev]); d.fZipCode = zipCode_A->at(indexSort[iev]); d.fOrder = order_A->at(indexSort[iev]); d.fCount = count_A->at(indexSort[iev]); d.fPrice = count_A->at(indexSort[iev])*price_A->at(indexSort[iev]); d.fSYear = syear_A->at(indexSort[iev]); d.fEYear = eyear_A->at(indexSort[iev]); data.push_back(d); ++iev; } ROOT::RDataFrame tdf_empty(N); auto tdf_cons = tdf_empty.DefineSlotEntry("data",[&data](unsigned int /*slot*/,ULong64_t entry) {return data[entry]; } ) .Define("fUserId", [](TData &d) {return d.fUserId; },{"data"}) .Define("fCountry",[](TData &d) {return d.fCountry;},{"data"}) .Define("fZipCode",[](TData &d) {return d.fZipCode;},{"data"}) .Define("fOrder", [](TData &d) {return d.fOrder; },{"data"}) .Define("fCount", [](TData &d) {return d.fCount; },{"data"}) .Define("fPrice", [](TData &d) {return d.fPrice; },{"data"}) .Define("fSYear", [](TData &d) {return d.fSYear; },{"data"}) .Define("fEYear", [](TData &d) {return d.fEYear; },{"data"}); auto tdf_cut = tdf_cons.Filter(select); Long64_t count_sum = 0; tdf_cut.Foreach([&count_sum](Long64_t count){count_sum += count;},{"fCount"} ); std::cout << "sum counts in tdf_cut: " << count_sum << std::endl; return tdf_cut; } //______________________________________________________________________________________ void mytest() { std::string fname_csv = "ticket.txt"; auto d_csv = ROOT::RDF::MakeCsvDataFrame(fname_csv); auto d = MakeDataFrame(d_csv); std::string select = "fCountry==\"CH\"&&fSYear==2016"; auto d_select = FilterDataFrame(d,select); Long64_t count_sum = 0; d_select.Foreach([&count_sum](Long64_t count){count_sum += count;},{"fCount"} ); std::cout << "sum counts in d_select: " << count_sum << std::endl; }