jackknife resampling method. This module also contains a hack for weighted average (warning: the theory is not established yet, at least I have not seen it).master
parent
f0ba6f4068
commit
4e2b2c66c2
1 changed files with 162 additions and 0 deletions
@ -0,0 +1,162 @@ |
|||||||
|
""" |
||||||
|
REFERENCES: |
||||||
|
|
||||||
|
Jackknife and Bootstrap Resampling Methods in Statistical Analysis to Correct for Bias. |
||||||
|
P. Young |
||||||
|
http://young.physics.ucsc.edu/jackboot.pdf |
||||||
|
|
||||||
|
|
||||||
|
Notes on Bootstrapping |
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
""" |
||||||
|
|
||||||
|
import numpy |
||||||
|
|
||||||
|
from numpy import pi, cos |
||||||
|
from numpy.random import normal |
||||||
|
|
||||||
|
def test1_generate_data(ndata=1000): |
||||||
|
""" |
||||||
|
|
||||||
|
""" |
||||||
|
return pi / 3 + normal(size=ndata) |
||||||
|
|
||||||
|
|
||||||
|
def test1(): |
||||||
|
global test1_dset |
||||||
|
test1_dset = test1_generate_data() |
||||||
|
dset = test1_dset |
||||||
|
print "first jackknife routine: jk_generate_datasets -> jk_wstats" |
||||||
|
dset_jk = jk_generate_datasets(dset) |
||||||
|
cos_avg1 = jk_wstats(dset_jk, func=numpy.cos) |
||||||
|
print cos_avg1 |
||||||
|
|
||||||
|
print "second jackknife routine: jk_generate_averages -> jk_stats_aa" |
||||||
|
aa_jk = jk_generate_averages(dset) |
||||||
|
cos_avg2 = jk_stats_aa(aa_jk, func=numpy.cos) |
||||||
|
print cos_avg2 |
||||||
|
|
||||||
|
# the two results above must be identical |
||||||
|
|
||||||
|
|
||||||
|
def test2_generate_data(): |
||||||
|
rootdir = "/home/wirawan/Work/PWQMC-77/expt/qmc/MnO/AFM2/rh.1x1x1/Opium-GFRG/vol10.41/k-0772+3780+2187.run" |
||||||
|
srcfile = rootdir + "/measurements.h5" |
||||||
|
from pyqmc.results.pwqmc_meas import meas_hdf5 |
||||||
|
|
||||||
|
global test2_db |
||||||
|
test2_db = meas_hdf5(srcfile) |
||||||
|
|
||||||
|
|
||||||
|
def jk_select_dataset(a, i): |
||||||
|
"""Selects the i-th dataset for jackknife operation from a |
||||||
|
given dataset 'a'. |
||||||
|
The argument i must be: 0 <= 0 < len(a). |
||||||
|
This is essentially deleting the i-th data point from the |
||||||
|
original dataset. |
||||||
|
""" |
||||||
|
a = numpy.asarray(a) |
||||||
|
N = a.shape[0] |
||||||
|
assert len(a.shape) == 1 |
||||||
|
assert 0 <= i < N |
||||||
|
rslt = numpy.empty(shape=(N-1,), dtype=a.dtype) |
||||||
|
rslt[:i] = a[:i] |
||||||
|
rslt[i:] = a[i+1:] |
||||||
|
return rslt |
||||||
|
|
||||||
|
def jk_generate_datasets(a): |
||||||
|
"""Generates ALL the datasets for jackknife operation from |
||||||
|
the original dataset 'a'. |
||||||
|
For the i-th dataset, this is essentially deleting the |
||||||
|
i-th data point from 'a'. |
||||||
|
""" |
||||||
|
a = numpy.asarray(a) |
||||||
|
N = a.shape[0] |
||||||
|
assert len(a.shape) == 1 |
||||||
|
rslt = numpy.empty(shape=(N,N-1,), dtype=a.dtype) |
||||||
|
for i in xrange(N): |
||||||
|
rslt[i, :i] = a[:i] |
||||||
|
rslt[i, i:] = a[i+1:] |
||||||
|
return rslt |
||||||
|
|
||||||
|
def jk_generate_averages(a, weights=None): |
||||||
|
"""Generates ALL the average samples for jackknife operation |
||||||
|
from the original dataset 'a'. |
||||||
|
For the i-th dataset, this is essentially deleting the |
||||||
|
i-th data point from 'a', then taking the average. |
||||||
|
|
||||||
|
This version does not store N*(N-1) data points; only (N). |
||||||
|
""" |
||||||
|
a = numpy.asarray(a) |
||||||
|
N = a.shape[0] |
||||||
|
assert len(a.shape) == 1 |
||||||
|
aa_jk = numpy.empty(shape=(N,), dtype=a.dtype) |
||||||
|
dset_i = numpy.empty(shape=(N-1,), dtype=a.dtype) |
||||||
|
if weights != None: |
||||||
|
weights_i = numpy.empty(shape=(N-1,), dtype=weights.dtype) |
||||||
|
for i in xrange(N): |
||||||
|
dset_i[:i] = a[:i] |
||||||
|
dset_i[i:] = a[i+1:] |
||||||
|
if weights != None: |
||||||
|
weights_i[:i] = weights[:i] |
||||||
|
weights_i[i:] = weights[i+1:] |
||||||
|
aa_jk[i] = numpy.average(dset_i, weights=weights_i) |
||||||
|
else: |
||||||
|
aa_jk[i] = numpy.mean(dset_i) |
||||||
|
|
||||||
|
return aa_jk |
||||||
|
|
||||||
|
''' |
||||||
|
def jk_stats_old(a_jk, func=None): |
||||||
|
"""a_jk must be in the same format as that produced by |
||||||
|
|
||||||
|
""" |
||||||
|
# get all the jackknived stats. |
||||||
|
if func == None: |
||||||
|
jk_mean = numpy.mean(a_jk, axis=1) |
||||||
|
else: |
||||||
|
jk_mean = numpy.mean(func(a_jk), axis=1) |
||||||
|
''' |
||||||
|
|
||||||
|
def jk_wstats_dsets(a_jk, w_jk=None, func=None): |
||||||
|
"""a_jk and w_jk must be in the same format as that produced by |
||||||
|
jk_generate_datasets. |
||||||
|
|
||||||
|
""" |
||||||
|
# get all the jackknived stats. |
||||||
|
N = len(a_jk) |
||||||
|
# reconstruct full "a" array: |
||||||
|
a = numpy.empty(shape=(N,), dtype=a_jk.dtype) |
||||||
|
a[1:] = a_jk[0] |
||||||
|
a[0] = a_jk[1][0] |
||||||
|
if func == None: |
||||||
|
func = lambda x : x |
||||||
|
aa_jk = numpy.average(a_jk, axis=1, weights=w_jk) |
||||||
|
#print aa_jk |
||||||
|
f_jk = func(aa_jk) |
||||||
|
mean = numpy.mean(f_jk) |
||||||
|
var = numpy.std(f_jk) * numpy.sqrt(N-1) |
||||||
|
mean_unbiased = N * func(a.mean()) - (N-1) * mean |
||||||
|
return (mean, var, mean_unbiased) |
||||||
|
|
||||||
|
|
||||||
|
def jk_stats_aa(aa_jk, func=None, a=None): |
||||||
|
"""Computes the jackknife statistics from the preprocessed |
||||||
|
jackknife averages (aa_jk). |
||||||
|
The input array aa_jk is computed by jk_generate_averages(). |
||||||
|
""" |
||||||
|
# get all the jackknived stats. |
||||||
|
N = len(aa_jk) |
||||||
|
# reconstruct full "a" array: |
||||||
|
if func == None: |
||||||
|
func = lambda x : x |
||||||
|
f_jk = func(aa_jk) |
||||||
|
mean = numpy.mean(f_jk) |
||||||
|
var = numpy.std(f_jk) * numpy.sqrt(N-1) |
||||||
|
if a != None: |
||||||
|
mean_unbiased = N * func(a.mean()) - (N-1) * mean |
||||||
|
else: |
||||||
|
mean_unbiased = None |
||||||
|
return (mean, var, mean_unbiased) |
Loading…
Reference in new issue