#!/usr/bin/env python
# Demonstrate use of variance reduction to exclude poor data.
# Uses the data example from Bevington & Robinson with 1 outlier.
import numpy
from scipy.stats import chi2
from kapteyn import kmpfit
from matplotlib.pyplot import figure, show, rc


def model(p, x):
    # Straight line y = a + b*x
    a, b = p
    return a + b*x


def residuals(p, data):
    # Weighted residuals for kmpfit
    x, y, err = data
    return (y - model(p, x))/err


# Artificial data (without outlier):
# x = numpy.array([2, 4, 6, 8, 10, 12])
# y = numpy.array([3.5, 7.2, 9.5, 17.1, 20.0, 25.5])
# err = numpy.array([0.55, 0.65, 0.74, 0.5, 0.85, 0.6])

# With outlier (25, 15.0):
x = numpy.array([25, 2, 4, 6, 8, 10, 12])
y = numpy.array([15.0, 3.5, 7.2, 9.5, 17.1, 20.0, 25.5])
# err = numpy.array([0.6, 0.55, 0.65, 0.74, 0.5, 0.85, 0.6])
err = numpy.ones(len(y))            # Unit weights

# Do the fit and find the variance reduction
params0 = (1, 1)
fitter = kmpfit.Fitter(residuals=residuals, data=(x, y, err))
fitter.fit(params0=params0)
print("======== Fit results, all data included ==========")
print("Params:                      ", fitter.params)
print("Uncertainties:               ", fitter.xerror)
print("Errors assuming red.chi^2=1: ", fitter.stderr)
print("Iterations:                  ", fitter.niter)
print("Function evaluations:        ", fitter.nfev)
print("dof:                         ", fitter.dof)
print("chi^2, rchi2:                ", fitter.chi2_min, fitter.rchi2_min)
print("Status:                      ", fitter.status)
a, b = fitter.params                # Keep for the 'unfiltered' plot

# Chi-squared goodness-of-fit test for the unfiltered sample
alpha = 0.01
rv = chi2(fitter.dof)
pval = 1 - rv.cdf(fitter.chi2_min)
print("If H0 was correct, then")
print("the probability to find a chi-squared higher than this:", pval)
if pval < alpha:
    print("pval=%g. If we set the threshold to alpha=%f, we REJECT H0." % (pval, alpha))
else:
    print("pval=%g. If we set the threshold to alpha=%f, we ACCEPT H0." % (pval, alpha))

# Variance reduction of the unfiltered sample
N = len(y)
varmod = (y - model(fitter.params, x))**2.0
y_av = y.sum()/N
vardat = (y - y_av)**2.0
vr0 = 100.0*(1 - varmod.sum()/vardat.sum())
print("Unfiltered sample: variance reduction (%):", vr0)

# Leave-one-out loop: exclude one point in each run. The filtered
# arrays have N-1 elements, but we loop over all N points so that
# every point is excluded exactly once.
Nf = N - 1
xf = numpy.zeros(Nf)
yf = numpy.zeros(Nf)
errf = numpy.zeros(Nf)
vr = []
fitter = kmpfit.Fitter(residuals=residuals, data=(xf, yf, errf))
for i in range(N):
    xf[:] = numpy.delete(x, i)      # Delete one point
    yf[:] = numpy.delete(y, i)
    errf[:] = numpy.delete(err, i)
    print("\nWe deleted from the sample: (%g,%g)" % (x[i], y[i]))
    fitter.fit(params0=params0)
    print("chi^2, rchi2:", fitter.chi2_min, fitter.rchi2_min)
    varmod = (yf - model(fitter.params, xf))**2.0
    yf_av = yf.sum()/Nf
    vardat = (yf - yf_av)**2.0
    vr1 = 100.0*(1 - varmod.sum()/vardat.sum())
    # A variance reduction of 100% implies that the model is perfect.
    # A bad model gives much lower values (sometimes even negative).
    print("Variance reduction (%):", vr1)
    print("Improvement: %g%%" % (vr1 - vr0))
    vr.append((vr1, i))

# The exclusion with the highest variance reduction identifies
# the poorest data point.
vr.sort()
print(vr)
i = vr[-1][1]
xf[:] = numpy.delete(x, i)          # Exclude that point and refit
yf[:] = numpy.delete(y, i)
errf[:] = numpy.delete(err, i)
fitter.fit(params0=params0)
print("\nFiltered sample: chi^2, rchi2:", fitter.chi2_min, fitter.rchi2_min)

# Repeat the goodness-of-fit test, now with the dof of the filtered fit
rv = chi2(fitter.dof)
pval = 1 - rv.cdf(fitter.chi2_min)
print("If H0 was correct, then")
print("the probability to find a chi-squared higher than this:", pval)
if pval < alpha:
    print("pval=%g. If we set the threshold to alpha=%f, we REJECT H0." % (pval, alpha))
else:
    print("pval=%g. If we set the threshold to alpha=%f, we ACCEPT H0." % (pval, alpha))
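# ------------------------------------------------------------------
# A minimal sketch (not part of kmpfit or of the original recipe):
# the variance-reduction statistic used above, packaged as a reusable
# helper. The name 'variance_reduction' is our own choice:
#   VR = 100 * (1 - sum((y - y_mod)^2) / sum((y - <y>)^2))
# ------------------------------------------------------------------
def variance_reduction(y, y_mod):
    """Return the variance reduction (%) of model values y_mod w.r.t. data y."""
    y = numpy.asarray(y, dtype='d')
    varmod = ((y - y_mod)**2).sum()       # Residual variance around the model
    vardat = ((y - y.mean())**2).sum()    # Variance of the data around its mean
    return 100.0*(1.0 - varmod/vardat)

# Sanity check: should match the best value printed in the loop above
print("Check, variance reduction (%):", variance_reduction(yf, model(fitter.params, xf)))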
# Prepare plot
fig = figure()
frame = fig.add_subplot(1, 1, 1)
rc('legend', fontsize=8)
frame.set_xlabel("x")
frame.set_ylabel("y")
frame.set_title("Exclude poor data with variance reduction")
frame.set_ylim(0, 1.1*y.max())
frame.set_xlim(0, 1.1*x.max())
frame.errorbar(x, y, yerr=err, fmt='bo', ms=3, label="data")
frame.plot(x, a + b*x, 'g', label="Fit unfiltered data")
frame.plot(x, model(fitter.params, x), 'm', label="Fit filtered data")
# Mark the excluded data point
frame.plot((x[i],), (y[i],), marker="o", ms=20, color="r", alpha=0.3)
leg = frame.legend(loc=2)
show()
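# ------------------------------------------------------------------
# Usage note (a sketch, not from the original example): if more than
# one point is suspect, the same leave-one-out procedure can be
# repeated on the filtered arrays, each time excluding the point whose
# removal gives the largest variance reduction, until the chi-squared
# test no longer rejects H0 at the chosen alpha.
# ------------------------------------------------------------------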