1
"""Module for performing statistical calculations.

(c) 2007-2009 Matt Hilton

U{http://astlib.sourceforge.net}

This module (as you may notice) provides very few statistical routines. It does, however, provide
biweight (robust) estimators of location and scale, as described in Beers et al. 1990 (AJ, 100,
32), in addition to a robust least squares fitting routine that uses the biweight transform.

Some routines may fail if they are passed lists with few items and encounter a 'divide by zero'
error. Where this occurs, the function will return None. An error message will be printed to the
console when this happens if astStats.REPORT_ERRORS=True (the default). Testing whether an
astStats function returns None can be used to handle errors in scripts.

For extensive statistics modules, the Python bindings for GNU R (U{http://rpy.sourceforge.net}), or
SciPy (U{http://www.scipy.org}) are suggested.

"""
21
import math
import sys
24
# When True (the default), routines print an error message to the console
# before returning None on failure.
REPORT_ERRORS=True
26
27
def mean(dataList):
    """Calculates the mean average of a list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: mean average, or 0 if dataList is empty

    """
    count = len(dataList)
    if count == 0:
        return 0

    total = 0
    for item in dataList:
        total = total + float(item)
    return total / float(count)
45
46
def weightedMean(dataList):
    """Calculates the weighted mean average of a two dimensional list (value, weight) of
    numbers.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format [value, weight]
    @rtype: float
    @return: weighted mean average, or 0 if dataList is empty

    @note: Returns None if the weights sum to zero.

    """
    if len(dataList) == 0:
        return 0

    total = 0
    weightSum = 0
    for item in dataList:
        total = total + float(item[0]*item[1])
        weightSum = weightSum + item[1]

    # A zero weight sum would otherwise raise ZeroDivisionError; follow the
    # module convention of reporting the problem and returning None.
    if weightSum == 0:
        if REPORT_ERRORS:
            print("ERROR: astStats.weightedMean() : sum of weights is zero.")
        return None

    return total / weightSum
67
68
def stdev(dataList):
    """Calculates the (sample) standard deviation of a list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: standard deviation, or 0 if dataList has fewer than two items

    """
    count = len(dataList)
    # The sample standard deviation divides by (count - 1); a single item
    # would divide by zero, so it is treated like the empty list.
    if count < 2:
        return 0

    listMean = mean(dataList)
    total = 0
    for item in dataList:
        deviation = float(item - listMean)
        total = total + (deviation*deviation)
    return math.sqrt(total/(float(count) - 1))
87
88
# NOTE(review): the 'def' line is missing from this listing; the name and
# signature follow astLib's published astStats API - confirm.
def rms(dataList):
    """Calculates the root mean square of a list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: root mean square

    """
    dataListSq = [item*item for item in dataList]
    return math.sqrt(mean(dataListSq))
105
106
def weightedStdev(dataList):
    """Calculates the weighted (sample) standard deviation of a list of numbers.

    Rows whose weight is not greater than zero are ignored in the sums.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format [value, weight]
    @rtype: float
    @return: weighted standard deviation

    @note: Returns None if an error occurs (fewer than two items, or no row with a
    positive weight).

    """
    count = len(dataList)
    if count < 2:
        if REPORT_ERRORS:
            print("ERROR: astStats.weightedStdev() : dataList contains < 2 items.")
        return None

    listMean = weightedMean(dataList)
    if listMean is None:
        return None

    total = 0
    wSum = 0
    for item in dataList:
        if item[1] > 0.0:
            scaled = float((item[0]-listMean)/item[1])
            total = total + scaled*scaled
            inverse = float(1.0/item[1])
            wSum = wSum + inverse*inverse

    # No usable rows: the normalisation below would divide by zero.
    if wSum == 0:
        if REPORT_ERRORS:
            print("ERROR: astStats.weightedStdev() : divide by zero error.")
        return None

    nFactor = float(count)/float(count-1)
    return math.sqrt(nFactor*(total/wSum))
135
136
165
166
# NOTE(review): the 'def' line is missing from this listing; the name and
# signature follow astLib's published astStats API - confirm.
def modeEstimate(dataList):
    """Returns an estimate of the mode of a set of values by mode=(3*median)-(2*mean).

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: estimate of mode average

    """
    return (3*median(dataList))-(2*mean(dataList))
179
180
def MAD(dataList):
    """Calculates the Median Absolute Deviation of a list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: median absolute deviation

    """
    # NOTE(review): median() is defined outside this listing; if it uses the
    # same index arithmetic this function originally had, it needs the same
    # correction - confirm.
    listMedian = median(dataList)

    diffModuli = sorted(math.fabs(item-listMedian) for item in dataList)

    # Median of the sorted absolute deviations. The earlier index arithmetic
    # picked the element after the middle one for odd-length lists (and the
    # smallest deviation for three items or two items).
    count = len(diffModuli)
    middle = count // 2
    if count % 2 == 1:
        return diffModuli[middle]
    return (diffModuli[middle-1]+diffModuli[middle])/2.0
216
217
def biweightLocation(dataList, tuningConstant):
    """Calculates the biweight location estimator (like a robust average) of a list of
    numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended.
    @rtype: float
    @return: biweight location

    @note: Returns None if an error occurs.

    """
    C = tuningConstant
    listMedian = median(dataList)
    listMAD = MAD(dataList)
    if listMAD == 0:
        if REPORT_ERRORS:
            print("ERROR: astStats: biweightLocation() : MAD() returned 0.")
        return None

    top = 0
    bottom = 0
    for item in dataList:
        u = (item-listMedian)/(C*listMAD)
        # Only points within the tuning radius contribute.
        if math.fabs(u) <= 1.0:
            u2Term = 1.0-(u*u)
            top = top+((item-listMedian)*u2Term*u2Term)
            bottom = bottom+(u2Term*u2Term)

    # Every contributing point can carry zero weight (|u| == 1), which would
    # make the ratio below divide by zero.
    if bottom == 0:
        if REPORT_ERRORS:
            print("ERROR: astStats: biweightLocation() : divide by zero error.")
        return None

    return listMedian+(top/bottom)
259
260
def biweightScale(dataList, tuningConstant):
    """Calculates the biweight scale estimator (like a robust standard deviation) of a list
    of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 9.0 is recommended.
    @rtype: float
    @return: biweight scale

    @note: Returns None if an error occurs.

    """
    C = tuningConstant
    listMedian = median(dataList)
    listMAD = MAD(dataList)

    # Check the divisor once up front instead of catching ZeroDivisionError
    # per item; float() also avoids integer division on Python 2.
    scale = float(C*listMAD)
    if scale == 0:
        if REPORT_ERRORS:
            print("ERROR: astStats.biweightScale() : divide by zero error.")
        return None

    top = 0
    bottom = 0
    valCount = 0
    for item in dataList:
        u = (item-listMedian)/scale
        # Only points within the tuning radius contribute.
        if math.fabs(u) <= 1.0:
            diffModulus = math.fabs(item-listMedian)
            u2Term = 1.0-(u*u)
            u4Term = math.pow(u2Term, 4)
            top = top+((diffModulus*diffModulus)*u4Term)
            bottom = bottom+(u2Term*(1.0-(5.0*(u*u))))
            valCount = valCount+1

    top = math.sqrt(top)
    bottom = math.fabs(bottom)

    # The denominator sum can cancel to zero; report it rather than raise.
    if bottom == 0:
        if REPORT_ERRORS:
            print("ERROR: astStats.biweightScale() : divide by zero error.")
        return None

    return math.pow(float(valCount), 0.5)*(top/bottom)
310
311
def biweightClipped(dataList, tuningConstant, sigmaCut):
    """Iteratively calculates biweight location and scale, using sigma clipping, for a list
    of values. The calculation is performed on the first column of a multi-dimensional
    list; other columns are ignored.

    @type dataList: list
    @param dataList: input data
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is recommended for
    scale estimates
    @type sigmaCut: float
    @param sigmaCut: sigma clipping to apply
    @rtype: dictionary
    @return: estimate of biweight location, scale, and list of non-clipped data, in the format
    {'biweightLocation', 'biweightScale', 'dataList'}

    @note: Returns None if an error occurs (including when dataList has fewer than 6 items).

    """
    # Value used for the statistics: first column of a row, or the item
    # itself when the input is one dimensional.
    values = []
    for row in dataList:
        if isinstance(row, (list, tuple)):
            values.append(row[0])
        else:
            values.append(row)

    # With too few points the loop below never runs and there would be no
    # estimate to return.
    if len(values) <= 5:
        if REPORT_ERRORS:
            print("ERROR: astStats : biweightClipped() :\n"
                  "dataList contains < 6 items.")
        return None

    cbi = None
    sbi = None
    clippedData = []
    clippedValues = values
    iterations = 0
    while iterations < 11 and len(clippedValues) > 5:

        cbi = biweightLocation(clippedValues, tuningConstant)
        sbi = biweightScale(clippedValues, tuningConstant)

        if cbi is None or sbi is None:
            if REPORT_ERRORS:
                print("ERROR: astStats : biweightClipped() :\n"
                      "divide by zero error.")
            return None

        # Each pass re-selects from the full input, so previously clipped
        # points can re-enter if the estimates move.
        lower = cbi-(sigmaCut*sbi)
        upper = cbi+(sigmaCut*sbi)
        clippedValues = []
        clippedData = []
        for row, value in zip(dataList, values):
            if value > lower and value < upper:
                clippedValues.append(value)
                clippedData.append(row)

        iterations = iterations+1

    return {'biweightLocation': cbi,
            'biweightScale': sbi,
            'dataList': clippedData}
377
378
407
408
def OLSFit(dataList):
    """Performs an ordinary least squares fit on a two dimensional list of numbers.
    Minimum number of data points is 5.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format [x, y]
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the format
    {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs (fewer than 5 points, or all x values equal).

    """
    n = float(len(dataList))
    if n <= 4:
        if REPORT_ERRORS:
            print("ERROR: astStats.OLSFit() : dataList contains < 5 items.")
        return None

    sumX = 0
    sumY = 0
    sumXY = 0
    sumXX = 0
    for item in dataList:
        sumX = sumX+item[0]
        sumY = sumY+item[1]
        sumXY = sumXY+(item[0]*item[1])
        sumXX = sumXX+(item[0]*item[0])

    # Shared denominator of slope, intercept and both errors. It is zero
    # when every x value is identical (a vertical line has no finite slope).
    denominator = (n*sumXX)-(sumX*sumX)
    if denominator <= 0:
        if REPORT_ERRORS:
            print("ERROR: astStats.OLSFit() : divide by zero error.")
        return None

    m = ((n*sumXY)-(sumX*sumY))/denominator
    c = ((sumXX*sumY)-(sumX*sumXY))/denominator

    sumRes = 0
    for item in dataList:
        residual = item[1]-(m*item[0])-c
        sumRes = sumRes+(residual*residual)

    sigma = math.sqrt((1.0/(n-2))*sumRes)

    mSigma = (sigma*math.sqrt(n))/math.sqrt(denominator)
    cSigma = (sigma*math.sqrt(sumXX))/math.sqrt(denominator)

    return {'slope': m,
            'intercept': c,
            'slopeError': mSigma,
            'interceptError': cSigma}
456
457
# NOTE(review): the 'def' line is missing from this listing; the name and the
# default values follow astLib's published astStats API - confirm.
def clippedMeanStdev(dataList, sigmaCut=3.0, maxIterations=10.0):
    """Calculates the clipped mean and stdev of a list of numbers.

    Clipping stops after maxIterations passes or once 4 or fewer points remain.

    @type dataList: list
    @param dataList: input data, one dimensional list of numbers
    @type sigmaCut: float
    @param sigmaCut: clipping in Gaussian sigma to apply
    @type maxIterations: int
    @param maxIterations: maximum number of iterations
    @rtype: dictionary
    @return: format {'clippedMean', 'clippedStdev', 'numPoints'}

    """
    listCopy = list(dataList)

    m = None
    s = None
    iterations = 0
    while iterations < maxIterations and len(listCopy) > 4:

        m = mean(listCopy)
        s = stdev(listCopy)

        # Keep points whose deviation from the mean is within sigmaCut
        # standard deviations. '<=' keeps every point when s is 0.
        listCopy = [value for value in listCopy if abs(value-m) <= sigmaCut*s]

        iterations = iterations+1

    # The loop never ran (too few points or maxIterations <= 0): report the
    # plain statistics of the input instead of failing on unbound names.
    if m is None:
        m = mean(listCopy)
        if len(listCopy) > 1:
            s = stdev(listCopy)
        else:
            s = 0

    return {'clippedMean': m, 'clippedStdev': s, 'numPoints': len(listCopy)}
491
492
def clippedWeightedLSFit(dataList, sigmaCut):
    """Performs a weighted least squares fit on a list of numbers with sigma clipping. Minimum
    number of data points is 5.

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x, y, y weight]
    @type sigmaCut: float
    @param sigmaCut: rows whose absolute residual divided by their third column is not below
    this value are clipped
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the format
    {'slope', 'intercept', 'slopeError', 'interceptError'}, plus 'numDataPoints' (number of
    rows left after the final clip)

    @note: Returns None if an error occurs (including when dataList has fewer than 5 rows).

    """
    clippedValues = list(dataList)

    # With too few rows the loop below never runs and there would be no fit
    # to return.
    if len(clippedValues) <= 4:
        if REPORT_ERRORS:
            print("ERROR: astStats : clippedWeightedLSFit() :\n"
                  "dataList contains < 5 items.")
        return None

    fitResults = None
    iterations = 0
    while iterations < 11 and len(clippedValues) > 4:

        try:
            fitResults = weightedLSFit(clippedValues, "errors")
        except ZeroDivisionError:
            fitResults = None

        # A failed fit is reported as None (or a None slope); test for that
        # before subscripting the result.
        if fitResults is None or fitResults['slope'] is None:
            if REPORT_ERRORS:
                print("ERROR: astStats : clippedWeightedLSFit() :\n"
                      "divide by zero error.")
            return None

        # Each pass re-selects from the full input against the latest fit.
        clippedValues = []
        for row in dataList:
            fit = fitResults['slope']*row[0]+fitResults['intercept']
            res = row[1]-fit
            if abs(res)/row[2] < sigmaCut:
                clippedValues.append(row)

        iterations = iterations+1

    fitResults['numDataPoints'] = len(clippedValues)

    return fitResults
541
542
def weightedLSFit(dataList, weightType):
    """Performs a weighted least squares fit on a three dimensional list of numbers [x, y, y error].

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x, y, y error]
    @type weightType: string
    @param weightType: if "errors", weights are calculated assuming the input data is in the
    format [x, y, error on y]; if "weights", the weights are assumed to be already calculated and
    stored in a fourth column [x, y, error on y, weight] (as used by e.g. L{astStats.biweightLSFit})
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the format
    {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs (too few points in "weights" mode, a zero
    denominator or zero y error, or an unrecognised weightType).

    """
    if weightType == "weights":
        n = float(len(dataList))
        if n <= 4:
            if REPORT_ERRORS:
                print("ERROR: astStats.weightedLSFit() :\n"
                      "dataList contains < 5 items.")
            return None

        sumW = 0
        sumWX = 0
        sumWY = 0
        sumWXY = 0
        sumWXX = 0
        for item in dataList:
            W = item[3]
            sumWX = sumWX+(W*item[0])
            sumWY = sumWY+(W*item[1])
            sumWXY = sumWXY+(W*item[0]*item[1])
            sumWXX = sumWXX+(W*item[0]*item[0])
            sumW = sumW+W

        # float() avoids integer division on Python 2 when all inputs are ints.
        denominator = float((sumW*sumWXX)-(sumWX*sumWX))
        if denominator == 0:
            if REPORT_ERRORS:
                print("ERROR: astStats.weightedLSFit() : divide by zero error.")
            return None

        m = ((sumW*sumWXY)-(sumWX*sumWY))/denominator
        c = ((sumWXX*sumWY)-(sumWX*sumWXY))/denominator

        sumRes = 0
        for item in dataList:
            residual = item[1]-(m*item[0])-c
            sumRes = sumRes+(residual*residual)

        sigma = math.sqrt((1.0/(n-2))*sumRes)

        # The error terms use n (not the weight sum) in their denominator.
        errorDenominator = (n*sumWXX)-(sumWX*sumWX)
        if errorDenominator <= 0.0:
            if REPORT_ERRORS:
                print("ERROR: astStats.weightedLSFit() : divide by zero error.")
            return None

        mSigma = (sigma*math.sqrt(n))/math.sqrt(errorDenominator)
        cSigma = (sigma*math.sqrt(sumWXX))/math.sqrt(errorDenominator)

    elif weightType == "errors":
        sumX = 0
        sumY = 0
        sumXY = 0
        sumXX = 0
        sumSigma = 0
        for item in dataList:
            # Inverse-variance weighting; float() avoids integer division on
            # Python 2 when all inputs are ints.
            variance = float(item[2]*item[2])
            if variance == 0:
                if REPORT_ERRORS:
                    print("ERROR: astStats.weightedLSFit() : divide by zero error.")
                return None
            sumX = sumX+(item[0]/variance)
            sumY = sumY+(item[1]/variance)
            sumXY = sumXY+((item[0]*item[1])/variance)
            sumXX = sumXX+((item[0]*item[0])/variance)
            sumSigma = sumSigma+(1.0/variance)

        # Zero for an empty list or when every x value is identical.
        delta = (sumSigma*sumXX)-(sumX*sumX)
        if delta <= 0:
            if REPORT_ERRORS:
                print("ERROR: astStats.weightedLSFit() : divide by zero error.")
            return None

        m = ((sumSigma*sumXY)-(sumX*sumY))/delta
        c = ((sumXX*sumY)-(sumX*sumXY))/delta
        mSigma = math.sqrt(sumSigma/delta)
        cSigma = math.sqrt(sumXX/delta)

    else:
        # Neither branch ran, so there is no fit to report.
        if REPORT_ERRORS:
            print("ERROR: astStats.weightedLSFit() : "
                  "weightType must be \"errors\" or \"weights\".")
        return None

    return {'slope': m,
            'intercept': c,
            'slopeError': mSigma,
            'interceptError': cSigma}
646
647
# NOTE(review): the 'def' line is missing from this listing. The third
# parameter is named after the variable the body reads (sigmaClipping); the
# None default is an assumption based on the docstring - confirm.
def biweightLSFit(dataList, tuningConstant, sigmaClipping=None):
    """Performs a weighted least squares fit, where the weights used are the biweight
    transforms of the residuals to the previous best fit .i.e. the procedure is iterative,
    and converges very quickly (the number of iterations is fixed at 10). The docstring
    minimum number of data points is 10.

    This seems to give slightly different results to the equivalent R routine, so use at your
    own risk!

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x, y, y weight]
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is recommended for
    scale estimates
    @type sigmaClipping: float
    @param sigmaClipping: sigma clipping to apply (set to None if not required)
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the format
    {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs.

    """
    dataCopy = list(dataList)

    # Start from an ordinary least squares fit.
    results = OLSFit(dataCopy)
    if results is None:
        return None

    for k in range(10):
        m = results['slope']
        c = results['intercept']
        res = [(m*row[0]+c)-row[1] for row in dataCopy]

        # Too few points to refine further; keep the current fit.
        if len(res) <= 5:
            break

        if sigmaClipping is not None:
            absRes = [abs(item) for item in res]
            sigma = stdev(absRes)
            keptData = []
            keptRes = []
            remaining = len(dataCopy)
            for row, residual, absResidual in zip(dataCopy, res, absRes):
                # Never clip below two remaining points.
                if absResidual > (sigmaClipping*sigma) and remaining > 2:
                    remaining = remaining-1
                else:
                    keptData.append(row)
                    keptRes.append(residual)
            dataCopy = keptData
            res = keptRes

        weights = biweightTransform(res, tuningConstant)

        # weightedLSFit(..., "weights") reads the weight from the fourth
        # column, so rows are built as [x, y, y error, weight].
        wData = []
        for row, weight in zip(dataCopy, weights):
            # NOTE(review): biweightTransform() is defined outside this
            # listing; if it returns [value, weight] pairs the weight is
            # presumably the last entry - confirm.
            if isinstance(weight, (list, tuple)):
                weight = weight[-1]
            if len(row) > 2:
                yError = row[2]
            else:
                yError = None
            wData.append([row[0], row[1], yError, weight])

        newResults = weightedLSFit(wData, "weights")
        if newResults is None:
            return None
        results = newResults

    return {'slope': results['slope'],
            'intercept': results['intercept'],
            'slopeError': results['slopeError'],
            'interceptError': results['interceptError']}
724
725
# NOTE(review): the 'def' line is missing from this listing; the name and
# signature follow astLib's published astStats API - confirm.
def cumulativeBinner(data, binMin, binMax, binTotal):
    """Bins the input data cumulatively.

    The value recorded for each bin is the fraction of all items that are greater than
    that bin's lower edge.

    @type data: list
    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax-binMin)/binTotal
    totalItems = len(data)

    coords = []
    for i in range(binTotal):
        lowerEdge = binMin+(i*binStep)
        fraction = 0
        for item in data:
            if item > lowerEdge:
                fraction = fraction+1.0/totalItems
        coords.append([binMin+(float(i+0.5)*binStep), fraction])

    return coords
756
757
def binner(data, binMin, binMax, binTotal):
    """Bins the input data.

    Each bin covers the interval (lower edge, upper edge]: an item equal to a bin's lower
    edge is not counted in that bin, so an item equal to binMin is not counted at all.

    @type data: list
    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax-binMin)/binTotal

    coords = []
    for i in range(binTotal):
        lowerEdge = binMin+(i*binStep)
        upperEdge = binMin+((i+1)*binStep)
        frequency = 0
        for item in data:
            if item > lowerEdge and item <= upperEdge:
                frequency = frequency+1
        coords.append([binMin+(float(i+0.5)*binStep), frequency])

    return coords
788
789
# NOTE(review): the 'def' line is missing from this listing; the name and
# parameter order follow astLib's published astStats API - confirm.
def weightedBinner(data, weights, binMin, binMax, binTotal):
    """Bins the input data, recorded frequency is sum of weights in bin.

    Each bin covers the interval (lower edge, upper edge]: an item equal to a bin's lower
    edge is not counted in that bin.

    @type data: list
    @param data: input data, must be a one dimensional list
    @type weights: list
    @param weights: weight of each item in data, paired with data by position
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax-binMin)/binTotal

    coords = []
    for i in range(binTotal):
        lowerEdge = binMin+(i*binStep)
        upperEdge = binMin+((i+1)*binStep)
        weightSum = 0.0
        for item, weight in zip(data, weights):
            if item > lowerEdge and item <= upperEdge:
                weightSum = weightSum+weight
        coords.append([binMin+(float(i+0.5)*binStep), weightSum])

    return coords
820
821
822