[ROOT::Fitter] how to use Huber loss function for linear fitting

Hi,

I’m using the ROOT::Fit::Fitter class to do a linear regression. Here is a minimal example (using PyROOT):

import ROOT
import numpy as np

fitter = ROOT.Fit.Fitter()
func = ROOT.TF1("f1", "pol1", 0, 130)
fitter.SetFunction(ROOT.Math.WrappedMultiTF1(func, func.GetNdim()), True)
fitter.Config().SetMinimizer("Linear")

x = np.array([67.5, 67.5, 47.5, 47.5, 57.5])
x_err = np.array([2.5, 2.5, 2.5, 2.5, 2.5])
y = np.array([107.5,  102.5, -127.5,  117.5,  112.5])
y_err = np.array([2.5, 2.5, 2.5, 2.5, 2.5])
bin_data = ROOT.Fit.BinData(len(x), x, y, x_err, y_err)

fitter.Config().ParSettings(0).SetValue(0.)
fitter.Config().ParSettings(1).SetValue(1.)
is_ok = fitter.Fit(bin_data)
print(f"is ok: {is_ok}")
if is_ok:
    res = fitter.Result()
    print(f"slope: {res.Parameter(1)}")
    print(f"offset: {res.Parameter(0)}")
    print(f"pvalue: {res.Prob()}")

The printout is:

is ok: True
slope: 5.500000000000007
offset: -253.7500000000004
pvalue: 1.4567546900600512e-36

However, the dataset usually contains some outliers (including the dataset in the example above), which makes the final fit result quite bad. With the scikit-learn library, this can be resolved by using Huber regression, which reduces the influence of the outliers. Here is an example:

import numpy as np
from sklearn.linear_model import HuberRegressor, LinearRegression

x = np.array([67.5, 67.5, 47.5, 47.5, 57.5])
x_err = np.array([2.5, 2.5, 2.5, 2.5, 2.5])
y = np.array([107.5,  102.5, -127.5,  117.5,  112.5])
y_err = np.array([2.5, 2.5, 2.5, 2.5, 2.5])

huber_res = HuberRegressor().fit(np.reshape(x, (-1, 1)), y)
print(f"coef: {huber_res.coef_}, offset: {huber_res.intercept_}")

The printout is:

coef: [-0.46186719], offset: 136.75230379098608

Here is a comparison between the two methods (the red line uses the Huber loss function and the blue line uses ROOT’s normal linear fit).

The Huber loss function yields a much better result.

Thus, I would like to know whether I can use ROOT to do Huber regression, as we need to do this in C++ in our project.

Thanks for your attention

Dear Edwin,

Interesting question! We don’t have this loss function in ROOT, but you can easily implement the Huber loss function yourself in C++, mathematically identical to what you have in scikit-learn.
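For reference (a summary of what the scikit-learn documentation gives for HuberRegressor, adapted here to the one-dimensional straight-line case), the objective it minimizes is

\min_{w,\,\sigma > 0}\ \sum_{i=1}^{n} \Big( \sigma + H_{\epsilon}\!\Big(\frac{w\,x_i + \text{intercept} - y_i}{\sigma}\Big)\,\sigma \Big) + \alpha\, w^2 ,
\qquad
H_{\epsilon}(z) = \begin{cases} z^2 & \text{if } |z| < \epsilon \\ 2\epsilon |z| - \epsilon^2 & \text{otherwise,} \end{cases}

where sigma is a jointly fitted scale parameter and alpha is a small regularization strength (scikit-learn’s defaults are epsilon = 1.35 and alpha = 0.0001). The function below implements exactly this sum, except that it drops support for sample weights for simplicity: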

#include <cmath>   // std::abs
#include <cstddef> // std::size_t

// Sum of Huber loss as implemented in scikit-learn.
double huber_sum_loss(std::size_t n_samples, const double *x, const double *y, const double *params, double epsilon=1.35)
{
   double intercept = params[0]; // intercept
   double w = params[1];         // slope
   double sigma = params[2];     // scale parameter (must be > 0)
   double alpha = 0.0001;

   int num_outliers = 0;
   double outlier_loss = 0.;
   double loss_sum = alpha * (w * w); // L2 regularization on the slope (alpha as in scikit-learn's default)
   for (std::size_t i = 0; i < n_samples; ++i) {
      double loss = y[i] - w * x[i] - intercept;

      if (std::abs(loss) > epsilon * sigma) {
         ++num_outliers;
         outlier_loss += 2. * epsilon * std::abs(loss);
      } else {
         loss_sum += loss * loss / sigma;
      }
   }
   loss_sum += n_samples * sigma + outlier_loss - sigma * num_outliers * epsilon * epsilon;
   return loss_sum;
}
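Up to the order in which the terms are accumulated, this is exactly the sum in the formula above with r_i = y_i - w*x_i - intercept: inliers contribute r_i^2 / sigma, outliers contribute 2*epsilon*|r_i| - sigma*epsilon^2, the n_samples * sigma term is added once at the end, and alpha * w^2 is the regularization.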

You can do the numeric minimization with Minuit 2, like in this example macro:

void huber_example()
{
   // Data
   std::vector<double> g_x{47.5, 47.5, 57.5, 67.5, 67.5};
   std::vector<double> g_y{-127.5, 117.5, 112.5, 107.5, 102.5};
   double g_epsilon = 1.35; // Huber epsilon (tunable)

   cout << "Using Huber epsilon = " << g_epsilon << "\n";

   // Create ROOT minimizer (Minuit2/Migrad)
   const unsigned int npar = 3; // a, b, sigma
   ROOT::Math::Minimizer *min = ROOT::Math::Factory::CreateMinimizer("Minuit2", "Migrad");

   // Set tolerances and strategy if you like
   min->SetMaxFunctionCalls(100000); // for safety on hard problems
   min->SetMaxIterations(10000);
   min->SetTolerance(1e-6);

   // Create a Functor for the objective function (wrap the C function)
   auto loss = [&](const double *params) {
      return huber_sum_loss(g_x.size(), g_x.data(), g_y.data(), params, g_epsilon);
   };
   ROOT::Math::Functor functor(loss, npar);
   min->SetFunction(functor);

   // Set initial values and step sizes
   double step[] = {0.1, 0.1, 0.1};
   double variable[] = {0.0, 1.0, 1.0}; // initial guess for a,b,sigma

   min->SetVariable(0, "a", variable[0], step[0]);
   min->SetVariable(1, "b", variable[1], step[1]);
   min->SetVariable(2, "sigma", variable[2], step[2]);

   // Run minimization
   bool ok = min->Minimize();
   if (!ok) {
      cout << "Minimization did not converge (Minimize returned false)\n";
   }

   // Retrieve results
   const double *best = min->X();
   double a_hat = best[0];
   double b_hat = best[1];
   double sigma_hat = best[2];

   // Estimate errors from minimizer (covariance)
   const double *err = min->Errors(); // parameter errors estimate
   double a_err = err[0];
   double b_err = err[1];
   double sigma_err = err[2];

   cout << "Fit results (Huber):\n";
   cout << "  a = " << a_hat << " +/- " << a_err << "\n";
   cout << "  b = " << b_hat << " +/- " << b_err << "\n";
   cout << "  sigma = " << sigma_hat << " +/- " << sigma_err << "\n";
   cout << "Minimum objective (sum Huber loss) = " << min->MinValue() << "\n";

   // For comparison, print OLS fit using simple linear regression (closed form)
   // compute ordinary least squares slope/intercept
   double sx = 0, sy = 0, sxx = 0, sxy = 0;
   for (size_t i = 0; i < g_x.size(); ++i) {
      sx += g_x[i];
      sy += g_y[i];
      sxx += g_x[i] * g_x[i];
      sxy += g_x[i] * g_y[i];
   }
   double n = (double)g_x.size();
   double denom = n * sxx - sx * sx;
   double ols_b = (n * sxy - sx * sy) / denom;
   double ols_a = (sy - ols_b * sx) / n;
   cout << "OLS (least-squares) fit:\n";
   cout << "  a = " << ols_a << "  b = " << ols_b << "\n";

   // Draw data and fitted lines
   TCanvas *c1 = new TCanvas("c1", "Huber regression example", 800, 600);
   c1->cd();

   TGraph *gr = new TGraph((int)g_x.size());
   for (int i = 0; i < (int)g_x.size(); ++i) {
      gr->SetPoint(i, g_x[i], g_y[i]);
   }
   gr->SetMarkerStyle(20);
   gr->SetTitle("Huber regression example; x; y");
   gr->Draw("AP");

   // Draw Huber fitted line
   double xlo = g_x.front();
   double xhi = g_x.back();
   TLine *hline = new TLine(xlo, a_hat + b_hat * xlo, xhi, a_hat + b_hat * xhi);
   hline->SetLineWidth(2);
   hline->SetLineStyle(1);
   hline->Draw("same");

   // Draw OLS fitted line
   TLine *lsline = new TLine(xlo, ols_a + ols_b * xlo, xhi, ols_a + ols_b * xhi);
   lsline->SetLineWidth(2);
   lsline->SetLineStyle(2);
   lsline->Draw("same");

   // Legend
   TLegend *leg = new TLegend(0.55, 0.15, 0.85, 0.30);
   leg->AddEntry(gr, "Data (with outliers)", "p");
   leg->AddEntry(hline, "Huber fit", "l");
   leg->AddEntry(lsline, "OLS fit", "l");
   leg->Draw();

   // Save to file
   c1->SaveAs("huber_fit.png");

   // If you run this as a standalone application (rather than as a ROOT macro), you would
   // need a TApplication and a call to its Run() method to keep the canvas interactive.
   // For batch runs the PNG has already been saved, so we can clean up and exit now.
   delete gr;
   delete hline;
   delete lsline;
   delete leg;
   delete c1;
   delete min;
}
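Assuming you put the loss function and the macro above together in a single file (the name huber_example.C is just a convention here), you can run it in batch mode with root -l -b -q huber_example.C.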

Output:

Using Huber epsilon = 1.35
Fit results (Huber):
  a = 136.752 +/- 8.04162
  b = -0.461867 +/- 0.128753
  sigma = -0.461867 +/- 0.128753
Minimum objective (sum Huber loss) = 672.318
OLS (least-squares) fit:
  a = -253.75  b = 5.5

If it would really help you and your collaborators to have this in ROOT, please feel free to open a GitHub issue with a feature request.

I hope this helps!

Cheers,
Jonas

Hi,

Thanks very much for your solution, and sorry for the late reply. I was trying to use my own loss function based on the definition of the Huber loss, but it didn’t work: the loss value always explodes.

Do you, by any chance, know the reasoning behind the loss function used in scikit-learn? If I use Minuit as the optimizer, any subtle deviation from the scikit-learn loss function results in a divergent fit.

Anyway, I rewrote your code using the latest C++ standard. If someone is interested, here is the code:

Model class (RootModel.hpp):

#pragma once

#include <Math/Factory.h>
#include <Math/Functor.h>
#include <Math/Minimizer.h>

#include <algorithm>  // std::ranges::copy
#include <array>
#include <cmath>      // std::abs
#include <functional> // std::plus
#include <memory>
#include <print>
#include <ranges>
#include <span>
#include <vector>

class RootModel
{
  public:
    explicit RootModel(double epsilon)
        : epsilon_{ epsilon }
        , minimizer_{ ROOT::Math::Factory::CreateMinimizer("Minuit2") }
    {
        minimizer_->SetTolerance(0.01);
        clear();
    }

    static constexpr auto n_pars = 3;

    auto calculate_loss(const double* raw_pars) -> double
    {
        std::ranges::copy(std::span(raw_pars, n_pars), pars_.begin());
        const auto [slope, offset, sigma] = pars_;

        const auto n_data = static_cast<int>(x_vals_.size());
        auto n_outliers = 0;

        auto loss = std::ranges::fold_left(std::views::zip_transform(
                                               [&](auto x_val, auto y_val)
                                               {
                                                   const auto residual = std::abs(y_val - (x_val * slope) - offset);
                                                   const auto thresh = std::abs(sigma * epsilon_);
                                                   if (residual > thresh)
                                                   {
                                                       ++n_outliers;
                                                       return 2. * epsilon_ * residual;
                                                   }
                                                   return residual * residual / sigma;
                                               },
                                               x_vals_,
                                               y_vals_),
                                           (0.0001 * slope * slope),
                                           std::plus{});
        loss += n_data * sigma - sigma * n_outliers * epsilon_ * epsilon_;
        // std::println("loss: {}", loss);
        return loss;
    }

    void train_from_data(const std::vector<double>& x_vals, const std::vector<double>& y_vals)
    {
        // auto loss_functor =
        auto functor = ROOT::Math::Functor{ [&](const double* raw_pars) -> double { return calculate_loss(raw_pars); },
                                            RootModel::n_pars };
        minimizer_->SetFunction(functor);
        x_vals_ = std::span{ x_vals };
        y_vals_ = std::span{ y_vals };
        auto is_ok = minimizer_->Minimize();

        if (not is_ok)
        {
            std::println("minimization failed!");
        }
    }

    void clear()
    {
        minimizer_->SetVariable(0, "slope", 1., .1);
        minimizer_->SetVariable(1, "offset", 0., 1);
        minimizer_->SetVariable(2, "sigma", 1., .1);
    }

    void print()
    {
        const auto pars = std::span(minimizer_->X(), RootModel::n_pars);
        const auto errors = std::span(minimizer_->Errors(), RootModel::n_pars);
        std::println("pars: {}", pars);
        // std::println("errors: {}", errors);
        std::println("iterations: {}", minimizer_->NIterations());
    }
    // void set_x_vals(const std::vector<double>& vals) { x_vals_ = std::span{ vals }; }
    // void set_y_vals(const std::vector<double>& vals) { y_vals_ = std::span{ vals }; }

  private:
    double epsilon_ = 0.;
    std::span<const double> x_vals_;
    std::span<const double> y_vals_;
    std::array<double, n_pars> pars_{};
    std::unique_ptr<ROOT::Math::Minimizer> minimizer_;
};

Then in the main function:

#include "RootModel.hpp"

auto main() -> int
{
    auto root_model = RootModel(1.35);
    auto x_vals = std::vector<double>{ 47.5, 47.5, 57.5, 67.5, 67.5 };
    auto y_vals = std::vector<double>{ -127.5, 117.5, 112.5, 107.5, 102.5 };

    root_model.train_from_data(x_vals, y_vals);
    root_model.print();

    return 0;
}
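Note that this needs a compiler with C++23 support (for <print>, std::ranges::fold_left and std::views::zip_transform). Assuming the main function above is saved in main.cpp next to RootModel.hpp, building it should look something like g++ main.cpp $(root-config --cflags --libs) -std=c++23 -o huber_example, although the exact flags depend on your compiler and ROOT installation.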

Cool, thanks for sharing your code!

Sorry, I don’t know much about why the loss function in scikit-learn is the way it is, other than that it made intuitive sense to me and I was just re-implementing it in C++ one-to-one (except for removing support for weighted data, for simplicity).

OK, I tried it with other data and it doesn’t work really well.

For example:

#include "RootModel.hpp"

auto main() -> int
{
    auto root_model = RootModel(1.35);
    std::vector<double> x_vals{ 127.5, 117.5, 107.5, 97.5, 87.5, 77.5, 67.5, 7.5, 17.5, 27.5, 37.5, 47.5, 57.5 };
    std::vector<double> y_vals{-117.5, -117.5, -117.5, -117.5, -117.5, -117.5, -117.5, -122.5, -122.5, -122.5, -122.5, -117.5, -117.5};

    root_model.train_from_data(x_vals, y_vals);
    root_model.print();

    return 0;
}

This returns:

minimization failed!
iterations: 555
slope: -11470090.053220743 +/- 180.9280206018847
offset: -11961144517.582024 +/- 107134.6956399016
sigma: -9955065116.670218 +/- 0.23270648344321365

While scikit-learn returns:

coef: [0.05061786]
offset: -122.58970827224836
scale: 1.103721910719122
iters: 34

The ROOT minimizer seems very sensitive to the data. When outliers are absent, it produces a worse result even compared to simple linear regression.
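One thing I have not tried yet: the sigma parameter is supposed to stay positive (see the comment in huber_sum_loss above), but it is unconstrained here and runs off to a huge negative value in the failing fit. A possible mitigation, which I have not tested and whose lower bound is just an arbitrary small value, would be to constrain it in clear():

        minimizer_->SetLowerLimitedVariable(2, "sigma", 1., .1, 1e-6); // keep the scale parameter positive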
