#include #include #include #include #include #include #include #include #include class Reader { public: Reader(const char *InputFileName, const char *OutputFileName = 0, char Delimiter = ',', char QuoteChar = '\'', int NumGuessLines = 5); int ReadFile(); void SetQuoteChar(char QuoteChar); void SetDelimiter(char Delimiter); void SetNumGuessLines(int NumGuessLines) {fNumGuessLines = NumGuessLines;} void SetMissingValues(Float_t MissingValue = -99) {fMissingValue = MissingValue;} void SetNumLinesToRead(int NumLinesToRead = -1) {fNumLinesToRead = NumLinesToRead;} void SetReportCounter(int ReportCounter = 2) {fReportCounter = ReportCounter;} char GetQuoteChar() {return fQuoteChar;} char GetDelimiter() {return fDelimiter;} int GetNumGuessLines() {return fNumGuessLines;} Float_t GetMissingValues() {return fMissingValue;} TString GetMatchPattern() {return fRePattern;} int GetNumLinesToRead() {return fNumLinesToRead;} private: std::string fInputFileName, fOutputFileName; // Delimiter defines how the file is delimited. // The quote character is needed so we don't split at the delimiter // within the quotes. For example, "Hi, and, Hello" should be a single // token in a comma delimited file. char fDelimiter, fQuoteChar; std::vector fVarNames; std::vector fVarTypes; TString fRePattern; // Pattern to read the tokens int fNumGuessLines; // Number of lines to read to guess the var type int fMissingValue; // Value to assign for missing numeric fields int fNumLinesToRead; // How many lines of data do we need to read ? // Counter to set how frequently to report reading progress int fReportCounter; void ReadVariableNamesAndTypes(ifstream & in, TPMERegexp r); void SetRePattern(); }; ////////////////////////////////////////////////////////////////////// Reader::Reader(const char *InputFileName, const char *OutputFileName, char Delimiter, char QuoteChar, int NumGuessLines){ fDelimiter = Delimiter; fQuoteChar = QuoteChar; if (InputFileName) { fInputFileName = InputFileName; } if (OutputFileName) { fOutputFileName = OutputFileName; } else { std::string::size_type loc = fInputFileName.find( '.',0) ; fOutputFileName = fInputFileName.substr(0,loc) + ".root"; std::cout << "No output file name is provided by the user\n"; std::cout << "Using Outfile name = " << fOutputFileName << std::endl; } fNumGuessLines = NumGuessLines; // Set up the regular expression to get tokens // SetRegExp(); SetRePattern(); // The value to be assigned for missing values. SetMissingValues(); // How often to report the reading progress ? SetReportCounter(); } ///////////////////////////////////////////////////////////////////////////// void Reader::SetRePattern() { // Set the reg exp for getting the tokens fRePattern = fDelimiter; fRePattern = fRePattern + "(?=(?:[^" + fQuoteChar + "]*" + fQuoteChar + "[^" + fQuoteChar + "]*" + fQuoteChar + ")*(?![^" + fQuoteChar + "]*" + fQuoteChar + "))"; } //////////////////////////////////////////////////////////////////////////// void Reader::SetDelimiter(char Delimiter){ fDelimiter = Delimiter; // If the delimiter change, the regular expression must change as well SetRePattern(); } ////////////////////////////////////////////////////////////////////////// void Reader::SetQuoteChar(char QuoteChar){ fQuoteChar = QuoteChar; // If the quote character changed, the regular expression must change as well // SetRegExp(); SetRePattern(); } //////////////////////////////////////////////////////////////////////////// int Reader::ReadFile(){ // First, make sure that input file name is provided if (fInputFileName.empty()) { std::cout << "No input file name specified. Exiting\n"; return 0; } // Print information about the output file, delimiter etc. std::cout << "Input will be read from file: " << fInputFileName << std::endl; std::cout << "Output will be written to the file: " << fOutputFileName << std::endl; std::cout << "Delimiter character = " << fDelimiter << std::endl; // Now, open the input file ifstream infile(fInputFileName.c_str()); if (!infile) { std::cout << "Error opening input file " << fInputFileName << std::endl; std::cout << "Exiting\n"; return 0; } else { std::cout << "Opened file " << fInputFileName << " for input " << std::endl; } // Read the variable names and types from the input file TPMERegexp r(fRePattern); cout << "Splitting tokens using Pattern: " << fRePattern << endl; ReadVariableNamesAndTypes(infile, r) ; // Open a file for output TFile *fout = new TFile(fOutputFileName.c_str(),"RECREATE"); if (!fout) { std::cout << "Error opening output file " << fOutputFileName << std::endl; std::cout << "Exiting\n" ; return 0; } fout->cd(); // Book a tree TTree *tree = new TTree("tree"," "); // Vectors to keep the values of variables unsigned int nvars = fVarNames.size(); std::vector fCharValues(nvars); std::vector fFloatValues(nvars); // Assign branch addresses TString BranchDescription; for (unsigned int j = 0; j < nvars; j++) { BranchDescription = fVarNames[j]; if (toupper(fVarTypes[j]) == 'C') { BranchDescription += "/C"; tree->Branch(fVarNames[j].Data(), (void *) fCharValues[j].Data(), BranchDescription.Data()); } else { BranchDescription += "/F"; tree->Branch(fVarNames[j], &fFloatValues[j], BranchDescription.Data()); } } // end of loop over nvalues to create branches // Now loop through the file to fill the tree // Rewind the input stream to begining infile.seekg(0, std::ios::beg); if (!infile) { cout << "Error in infile" << endl; return 0; } TString s1, stemp; // Skip the first, header line. s1.ReadLine(infile); // Start reading the actual data int LineCount = 0; while (s1.ReadLine(infile)) { LineCount++; int num = r.Split(s1,0); for (int i = 0; i < num; i++) { stemp = r[i]; if (toupper(fVarTypes[i]) == 'F') { if (stemp.IsNull() || stemp.IsWhitespace() ) { fFloatValues[i] = fMissingValue; } else { fFloatValues[i] = stemp.Atof(); } } else { fCharValues[i] = stemp; } // std::cout << LineCount << " " << i << " " << stemp << " " << fFloatValues[i] << " " << fCharValues[i] << endl; } if (!infile) break; tree->Fill(); if ( (fNumLinesToRead > 0) && (LineCount == fNumLinesToRead) ) break; if ((LineCount%fReportCounter) == 0) cout << "Read " << LineCount << " lines.\n"; } tree->Write(); fout->Close(); delete fout; return LineCount; } ////////////////////////////////////////////////////////////////////////////// void Reader::ReadVariableNamesAndTypes(ifstream &in, TPMERegexp r) { // Move to the begining of the stream in.seekg(0, std::ios::beg); // Parse the header and store the names and types in a vector TString s1, stemp; int LineCount = 0; while (s1.ReadLine(in)) { LineCount++; int num = r.Split(s1,0); for (int i = 0; i < num; i++) { stemp = r[i]; if (LineCount == 1) { fVarNames.push_back(stemp); fVarTypes.push_back('F'); // To begin with set everything to floating point } else { if ( (!stemp.IsNull()) && (!stemp.IsWhitespace()) && (!stemp.IsFloat()) ) fVarTypes[i] = 'C'; } // cout << r[i] << endl; } if (!in) break; if (LineCount > fNumGuessLines) break; } // std::cout << "Read " << LineCount << " lines " << endl; std::cout << "Found " << fVarNames.size() << " variables " << std::endl; std::cout << "Variable names and types are: " << std::endl; for (unsigned int k = 0; k < fVarNames.size(); k++) { std::cout << fVarNames[k] << " " << fVarTypes[k] << endl; } } //////////////////////////////////////////////////////////////////////////// void ReadData() { string InFile = "SampleData.txt"; TStopwatch StopWatch; StopWatch.Reset(); StopWatch.Start(); Reader FileReader(InFile.c_str()); FileReader.SetDelimiter(','); FileReader.SetQuoteChar('\''); FileReader.ReadFile(); StopWatch.Stop(); cout << "Total CPU Time = " << StopWatch.CpuTime() << " s\n"; cout << "Total Real Time = " << StopWatch.RealTime() << " s\n"; }