#!/usr/bin/env python
# Demonstrate criterion of Chauvenet to exclude poor data

from numpy.random import normal
from scipy.special import erf, erfc
import numpy
from kapteyn import kmpfit
from matplotlib.pyplot import figure, show, rc

def chauvenet(x, y, mean=None, stdv=None):
   #-----------------------------------------------------------
   # Input:  NumPy arrays x, y that represent measured data
   #         A single value of a mean can be entered or a 
   #         sequence of means with the same length as 
   #         the arrays x and y. In the latter case, the 
   #         mean could be a model with best-fit parameters.
   # Output: It returns a boolean array as filter.
   #         The False values correspond to the array elements
   #         that should be excluded
   # 
   # First standardize the distances to the mean value
   # d = abs(y-mean)/stdv so that this distance is in terms
   # of the standard deviation.  
   # Then the  CDF of the normal distr. is given by 
   # phi = 1/2+1/2*erf(d/sqrt(2))
   # Note that we want the CDF from -inf to -d and from d to +inf.
   # Note also erf(-d) = -erf(d).
   # Then the threshold probability = 1-erf(d/sqrt(2))
   # Note, the complementary error function erfc(d) = 1-erf(d)
   # So the threshold probability pt = erfc(d/sqrt(2))
   # If d becomes bigger, this probability becomes smaller.
   # If this probability (to obtain a deviation from the mean)
   # becomes smaller than 1/(2N) than we reject the data point
   # as valid. In this function we return an array with booleans 
   # to set the accepted values.
   # 
   # use of filter:
   # xf = x[filter]; yf = y[filter]
   # xr = x[~filter]; yr = y[~filter]
   # xf, yf are cleaned versions of x and y and with the valid entries
   # xr, yr are the rejected values from array x and y
   #-----------------------------------------------------------
   if mean is None:
      mean = y.mean()           # Mean of incoming array y
   if stdv is None:
      stdv = y.std()            # Its standard deviation
   N = len(y)                   # Lenght of incoming arrays
   criterion = 1.0/(2*N)        # Chauvenet's criterion
   d = abs(y-mean)/stdv         # Distance of a value to mean in stdv's
   d /= 2.0**0.5                # The left and right tail threshold values
   prob = erfc(d)               # Area normal dist.    
   filter = prob >= criterion   # The 'accept' filter array with booleans
   return filter                # Use boolean array outside this function

def residuals(p, data):
   a, b = p
   x, y = data
   return (y-a-b*x)


def prob( Xlo, Xhi ):
   sq2 = numpy.sqrt(2.0)
   # Left tail and right tail area
   return 1.0 - 0.5 * (erf(Xhi/sq2) - erf(Xlo/sq2))

# Artificial data
N = 100
a0 = 2; b0 = 3
x = numpy.linspace(0.0, 2.0, N)
y = a0 + b0*x + normal(0.0, 1.0, N)  # Mean 0, sigma 1

# Add outlier(s)
y[40] = 14       
y[7] = 12
y[-5] = 0.2

# Prepare plot
fig = figure()
rc('legend', fontsize=8)
frame = fig.add_subplot(1,1,1)
frame.plot(x, y, 'go', label="data")
frame.set_xlabel("x")
frame.set_ylabel("y")
frame.set_title("Exclude poor data with criterion of Chauvenet")

fitter = kmpfit.Fitter(residuals=residuals, data=(x,y))
fitter.fit(params0=(a0,b0))

print("======== Fit results all data included ==========")
print("Params:                      ", fitter.params)
print("Uncertainties:               ", fitter.xerror)
print("Errors assuming red.chi^2=1: ", fitter.stderr)
print("Iterations:                  ", fitter.niter) 
print("Function ev:                 ", fitter.nfev)
print("dof:                         ", fitter.dof)
print("chi^2, rchi2:                ", fitter.chi2_min, fitter.rchi2_min)
print("Status:                      ", fitter.status)


#mean = y.mean() # Use this for data with slope 0
a, b = fitter.params
mean = a + b*x
stdv = (y-mean).std()
criterion = 1.0/(2*N)
for xf, yf, m in zip(x, y, mean):
   d = abs(yf-m)/stdv
   P = prob(-d, d)
   if P < criterion:
      print("Data y=%f has distance %fsigma to mean (%f), prob.=%f" %(yf,d,m,P))
      print("Based on Chauvenet's criterion (prob=%g), we reject data point y=%g"\
      %(criterion,yf))
      frame.plot((xf,), (yf,), 'rx', ms=20)

# The function chauvenet() does it the NumPy way
filter = chauvenet(x, y)
xf = x[filter]; yf = y[filter]
xe = x[~filter]; ye = y[~filter]
print("Excluded by function chauvenet() are:")
print(list(zip(xe, ye)))
fitter = kmpfit.Fitter(residuals=residuals, data=(xf,yf))
fitter.fit(params0=(a0,b0))

print("\n======== Fit results with filtered data ==========")
print("Params:                      ", fitter.params)


frame.set_ylim(0, 1.1*y.max())
frame.plot(x, a+b*x, 'g', label="Fit unfiltered data")
a, b = fitter.params
frame.plot(x, a+b*x, 'b', label="Fit filtered data")
leg = frame.legend(loc=2)
show()