问题
when i use a<-rt(10,3)
and b <-rnorm(10,3
)+5 trying to get shifted to the right numbers in order to calculate power of the two sample t-test. I get wrong results. There is a lot of literature on the internet talking about the use of the noncentrality parameter to get shifted numbers in order to be able to calculate power. My question how to use noncentrality parameter to get an amount of shifting equal to 5. If I am wrong and that the only method to get shifted numbers from the t distribution is the method introduced at the beginning then please tell me.
desired_length<-1000
empty_list <- vector(mode = "list", length = desired_length)
empty_list1 <- vector(mode = "list", length = desired_length)
empty_list2<-vector(mode="list",length=desired_length)
empty_list3<-vector(mode="list",length=desired_length)
empty_list4<-vector(mode="list",length=desired_length)
for (i in 1:1000) {
h<-rt(10,1)
g<-rt(10,1)
g1<- rt(10,1)+0.5
g2<-rt(10,1)+1
g3<- rt(10,1)+1.5
g4<- rt(10,1)+2
a<-cbind(h,g)
b<-cbind(h,g1)
c<-cbind(h,g2)
d<-cbind(h,g3)
e<-cbind(h,g4)
empty_list[[i]]<-a
empty_list1[[i]]<-b
empty_list2[[i]]<-c
empty_list3[[i]]<-d
empty_list4[[i]]<-e
}
pvalue<-numeric(1000)
pvalue1<-numeric(1000)
pvalue2<-numeric(1000)
pvalue3<-numeric(1000)
pvalue4<-numeric(1000)
x<-numeric(5)
for (i in 1:1000){
pvalue[i]<-t.test(empty_list[[i]][,1],empty_list[[i]][,2])$p.value
pvalue1[i]<-t.test(empty_list1[[i]][,1],empty_list1[[i]][,2])$p.value
pvalue2[i]<-t.test(empty_list2[[i]][,1],empty_list2[[i]][,2])$p.value
pvalue3[i]<-t.test(empty_list3[[i]][,1],empty_list3[[i]][,2])$p.value
pvalue4[i]<-t.test(empty_list4[[i]][,1],empty_list4[[i]][,2])$p.value
}
x[1]<-sum(pvalue<0.05)/1000
x[2]<-sum(pvalue1<0.05)/1000
x[3]<-sum(pvalue2<0.05)/1000
x[4]<-sum(pvalue3<0.05)/1000
x[5]<-sum(pvalue4<0.05)/1000
location<-seq(0,2,by =0.5)
plot(location,x,ylab="Power for t1 distributions",xlab="location difference",type = "l",ylim=c(0,1))
combined_data<-matrix(data=NA,nrow = 20,ncol=1000,byrow = F)
for ( i in 1:1000){
combined_data[,i]<-c(empty_list[[i]][,1],empty_list[[i]][,2])
}
combined_data1<-matrix(data=NA,nrow = 20,ncol=1000,byrow = F)
for ( i in 1:1000){
combined_data1[,i]<-c(empty_list1[[i]][,1],empty_list1[[i]][,2])
}
combined_data2<-matrix(data=NA,nrow = 20,ncol=1000,byrow = F)
for ( i in 1:1000){
combined_data2[,i]<-c(empty_list2[[i]][,1],empty_list2[[i]][,2])
}
combined_data3<-matrix(data=NA,nrow = 20,ncol=1000,byrow = F)
for ( i in 1:1000){
combined_data3[,i]<-c(empty_list3[[i]][,1],empty_list3[[i]][,2])
}
combined_data4<-matrix(data=NA,nrow = 20,ncol=1000,byrow = F)
for ( i in 1:1000){
combined_data4[,i]<-c(empty_list4[[i]][,1],empty_list4[[i]][,2])
}
Pvalue_approximator<-function(m){
g1<-m[1:10]
g2<-m[11:20]
Tstatistic<- mean(g2)-mean(g1)
nreps=10000
G3 <- numeric(nreps)
for (i in 1:nreps) {
shuffled_data<-sample(c(m))
G1 <- (shuffled_data)[1:10]
G2 <- (shuffled_data)[11:20]
G3[i]<-mean(G2)-mean(G1)
}
m<-(sum(abs(G3) >= abs(Tstatistic))+1)/(nreps+1)
}
p<-numeric(5)
pval<-apply(combined_data,2,FUN=Pvalue_approximator)
p[1]<-sum( pval < 0.05)/1000
pval1<-apply(combined_data1,2,FUN=Pvalue_approximator)
p[2]<-sum( pval1 < 0.05)/1000
pval2<-apply(combined_data2,2,FUN=Pvalue_approximator)
p[3]<-sum( pval2 < 0.05)/1000
pval3<-apply(combined_data3,2,FUN=Pvalue_approximator)
p[4]<-sum( pval3 < 0.05)/1000
pval4<-apply(combined_data4,2,FUN=Pvalue_approximator)
p[5]<-sum( pval4 < 0.05)/1000
lines(location, p, col="red",lty=2)
Diff.med.Pvalue_approximator<-function(m){
g1<-m[1:10]
g2<-m[11:20]
a<-abs(c(g1-median(c(g1))))
b<-abs(c(g2-median(c(g2))))
ab<-2*median(c(a,b))
ac<-abs(median(c(g2))-median(c(g1)))
Tstatistic =ac/ab
nreps=10000
G3 <- numeric(nreps)
for (i in 1:nreps) {
shuffled_data<-sample(c(m))
G1 <- (shuffled_data)[1:10]
G2 <- (shuffled_data)[11:20]
o<-abs(c(G1-median(c(G1))))
v<-abs(c(G2-median(c(G2))))
ov<-2*median(c(o,v))
oc<-abs(median(c(G2))-median(c(G1)))
G3[i]<- oc/ov
}
m<-(sum(G3 >= Tstatistic)+1)/(nreps+1)
}
po<-numeric(5)
pval<-apply(combined_data,2,FUN=Diff.med.Pvalue_approximator)
po[1]<-sum( pval < 0.05)/1000
pval1<-apply(combined_data1,2,FUN=Diff.med.Pvalue_approximator)
po[2]<-sum( pval1 < 0.05)/1000
pval2<-apply(combined_data2,2,FUN=Diff.med.Pvalue_approximator)
po[3]<-sum( pval2 < 0.05)/1000
pval3<-apply(combined_data3,2,FUN=Diff.med.Pvalue_approximator)
po[4]<-sum( pval3 < 0.05)/1000
pval4<-apply(combined_data4,2,FUN=Diff.med.Pvalue_approximator)
po[5]<-sum(pval4 < 0.05)/1000
lines(location, po, col="green",lty=1)
wilcoxon.Pvalue_approximator<-function(m){
g1<-m[1:10]
g2<-m[11:20]
l = length(g1)
rx = rank(c(g1,g2))
rf<-rx[11:20]
Tstatistic<-sum(rf)
nreps=10000
G3 <- numeric(nreps)
for (i in 1:nreps) {
shuffled_data<-sample(c(m))
G1 <- (shuffled_data)[1:10]
G2 <- (shuffled_data)[11:20]
rt<-rank(c(G1,G2))
ra<-rt[11:20]
G3[i]<-sum(ra)
}
m<-2*(sum(abs(G3) >= abs(Tstatistic))+1)/(nreps+1)
}
pw<-numeric(5)
pval<-apply(combined_data,2,FUN=wilcoxon.Pvalue_approximator)
pw[1]<-sum( pval < 0.05)/1000
pval1<-apply(combined_data1,2,FUN=wilcoxon.Pvalue_approximator)
pw[2]<-sum( pval1 < 0.05)/1000
pval2<-apply(combined_data2,2,FUN=wilcoxon.Pvalue_approximator)
pw[3]<-sum( pval2 < 0.05)/1000
pval3<-apply(combined_data3,2,FUN=wilcoxon.Pvalue_approximator)
pw[4]<-sum( pval3 < 0.05)/1000
pval4<-apply(combined_data4,2,FUN=wilcoxon.Pvalue_approximator)
pw[5]<-sum( pval4 < 0.05)/1000
lines(location, pw, col="blue",lty=1)
HLE2.Pvalue_approximator<-function(m){
g1<-m[1:10]
g2<-m[11:20]
u<-median(c(g1))
v<-median(c(g2))
x<-c(g1-u)
y<-c(g2-v)
xy<-c(x,y)
a<-outer(xy,xy,"-")
t<-a[lower.tri(a)]
ab<- median(c(abs(t)))
ac<-abs(median(c(outer(g2,g1,"-"))))
Tstatistic = ac/ab
nreps=10000
G3 <- numeric(nreps)
for (i in 1:nreps) {
shuffled_data<-sample(c(m))
G1 <- (shuffled_data)[1:10]
G2 <- (shuffled_data)[11:20]
f<-median(c(G1))
h<-median(c(G2))
p<-c(G1-f)
r<-c(G2-h)
pr<-c(p,r)
pu<-outer(pr,pr,"-")
xc<-pu[lower.tri(pu)]
b<- median(c(abs(xc)))
acn<-abs(median(c(outer(G2,G1,"-"))))
G3[i]<- acn/b
}
m<-(sum(G3 >= Tstatistic)+1)/(nreps+1)
}
phl<-numeric(5)
pval<-apply(combined_data,2,FUN=HLE2.Pvalue_approximator)
phl[1]<-sum( pval < 0.05)/1000
pval1<-apply(combined_data1,2,FUN=HLE2.Pvalue_approximator)
phl[2]<-sum( pval1 < 0.05)/1000
pval2<-apply(combined_data2,2,FUN=HLE2.Pvalue_approximator)
phl[3]<-sum( pval2 < 0.05)/1000
pval3<-apply(combined_data3,2,FUN=HLE2.Pvalue_approximator)
phl[4]<-sum( pval3 < 0.05)/1000
pval4<-apply(combined_data4,2,FUN=HLE2.Pvalue_approximator)
phl[5]<-sum( pval4 < 0.05)/1000
lines(location, phl, col="orange",lty=1)
HLE1.Pvalue_approximator<-function(m){
g1<-m[1:10]
g2<-m[11:20]
u<-median(c(g1))
v<-median(c(g2))
x<-c(g1-u)
y<-c(g2-v)
xy<-c(x,y)
a<-outer(xy,xy,"-")
t<-a[lower.tri(a)]
ab<- median(c(abs(t)))
ma<-outer(g2,g2,"+")
deno1<-median(c(ma[lower.tri(ma)]/2))
mn<-outer(g1,g1,"+")
deno2<-median(c(mn[lower.tri(mn)]/2))
ac<-abs(deno1-deno2)
Tstatistic =ac/ab
nreps=10000
G3 <- numeric(nreps)
for (i in 1:nreps) {
shuffled_data<-sample(c(m))
G1 <- (shuffled_data)[1:10]
G2 <- (shuffled_data)[11:20]
f<-median(c(G1))
h<-median(c(G2))
p<-c(G1-f)
r<-c(G2-h)
pr<-c(p,r)
pu<-outer(pr,pr,"-")
xc<-pu[lower.tri(pu)]
b<- median(c(abs(xc)))
mas<-outer(G2,G2,"+")
dn1<-median(c(mas[lower.tri(mas)]/2))
mns<-outer(G1,G1,"+")
dn2<-median(c(mns[lower.tri(mns)]/2))
an<-abs(dn2-dn1)
G3[i]<- an/b
}
m<-(sum(G3 >= Tstatistic)+1)/(nreps+1)
}
pl<-numeric(5)
pval<-apply(combined_data,2,FUN=HLE1.Pvalue_approximator)
pl[1]<-sum( pval < 0.05)/1000
pval1<-apply(combined_data1,2,FUN=HLE1.Pvalue_approximator)
pl[2]<-sum( pval1 < 0.05)/1000
pval2<-apply(combined_data2,2,FUN=HLE1.Pvalue_approximator)
pl[3]<-sum( pval2 < 0.05)/1000
pval3<-apply(combined_data3,2,FUN=wilcoxon.Pvalue_approximator)
pl[4]<-sum( pval3 < 0.05)/1000
pval4<-apply(combined_data4,2,FUN=wilcoxon.Pvalue_approximator)
pl[5]<-sum( pval4 < 0.05)/1000
lines(location, pl, col="brown",lty=1)
median_Pvalue_approximator<-function(m){
g1<-m[1:10]
g2<-m[11:20]
rt<-rank(c(g1,g2))
rt<-rt[11:20]
Tstatistic<-sum(rt > 10.5)
nreps=10000
G3 <- numeric(nreps)
for (i in 1:nreps) {
shuffled_data<-sample(c(m))
G1 <- (shuffled_data)[1:10]
G2 <- (shuffled_data)[11:20]
ra<-rank(c(G1,G2))
ra<-ra[11:20]
G3[i]<-sum(ra > 10.5)
}
m<-(sum(G3 >= Tstatistic)+1)/(nreps+1)
}
pm<-numeric(5)
pval<-apply(combined_data,2,FUN=median_Pvalue_approximator)
pm[1]<-sum( pval < 0.05)/1000
pval1<-apply(combined_data1,2,FUN=median_Pvalue_approximator)
pm[2]<-sum( pval1 < 0.05)/1000
pval2<-apply(combined_data2,2,FUN=median_Pvalue_approximator)
pm[3]<-sum( pval2 < 0.05)/1000
pval3<-apply(combined_data3,2,FUN=median_Pvalue_approximator)
pm[4]<-sum( pval3 < 0.05)/1000
pval4<-apply(combined_data4,2,FUN=median_Pvalue_approximator)
pm[5]<-sum( pval4 < 0.05)/1000
lines(location, pm, col="yellow",lty=1)
legend("topleft", legend=c("t.test","HLE2", "HLE","Diff.med","median","wilcoxon","mean diff"),col=c( "black","orange","brown","green","yellow","blue","red"), lty=c(1,1,1,1,1,1,2), cex=0.8, text.font=4, bg='white')
回答1:
Ok, we have t-Distribution which could be written as
T(n) = N(0,1)*√[n/χ2(n)]
where N(0,1) is standard normal, and χ2(n) is Chi-squared distribtion. This is pretty standard stuff.
If we want shifted distribution, we add shift μ
, so
T(n)+μ = N(0,1)*√[n/χ2(n)] + μ (1)
If we want non-central parameter (NCP) equal to μ, and Non-central t-distribution we shift GAUSSIAN in the expression above
T(n, NCP=μ) = N(μ,1)*√[n/χ2(n)]=(N(0,1)+μ)*√[n/χ2(n)]=
=N(0,1)*√[n/χ2(n)] + μ*√[n/χ2(n)] (2)
Do you see the difference? In the eq(1) we add constant. In the eq(2 ) we add constant multiplied by some ugly looking random variable. Those distributions are different and will produce different results. Use with care.
Standard T(n)
would be symmetric wrt 0, and T(n)+μ
would be symmetric wrt μ
, but non-central T would have asymmetry, you're mixing symmetric T(n) with asymmetric term μ*√[n/χ2(n)]. You could at graphs in Wikipedia for non-central T(n)
UPDATE
running your code (yes, took quite some time, probably more than 12 hours), I've got
UPDATE II
I'm a bit more familiar with Python nowadays, so I recoded part of the test in Python and ran it, it is pretty much instant, and for t-distribution with df=3 I got a lot more close to the paper graph, values up to 0.8. You could also quickly make graph for df=1, and again should get close to the paper result. Or you could replace rng.standard_t
with rng.normal(size=N)
and you will get graph with close to 1 power at large shifts.
Code
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
rng = np.random.default_rng(312345)
N = 10 # Sample Size
α = 0.05
shift = [0.0, 0.5, 1.0, 1.5, 2.0]
power = np.zeros(len(shift))
for k in range(0, len(shift)):
s = shift[k] # current shift
c = 0 # counter how many times we reject
for _ in range(0, 1000):
a = rng.standard_t(df=3, size=N) # baseline sample
b = rng.standard_t(df=3, size=N) + s # sample with shift
t, p = stats.ttest_ind(a, b, equal_var=True) # t-Test from two independent samples, assuming equal variance
if p <= α:
c += 1
power[k] = float(c)/1000.0
fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)
ax.plot(shift, power, 'r-')
plt.show()
and graph
UPDATE III
And here is R code which is pretty much like Python one and makes about the same graph
N <- 10
shift <- c(0., 0.5, 1.0, 1.5, 2.0)
power <- c(0., 0., 0., 0., 0.)
av <- 0.05
samples <- function(n) {
rchisq(n, df=3) #rnorm(n) #rt(n, df=3) #rt(n, df=1)
}
pvalue <- function(a, b) {
t.test(a, b, var.equal = TRUE)$p.value
}
for (k in 1:5) {
s <- shift[k]
p <- replicate(1000, pvalue(samples(N), samples(N) + s))
cc <- sum(p <= av)
power[k] <- cc/1000.0
}
plot(shift, power, type="l")
UPDATE IV
No, I was unable to get their (in paper) t-test graph in Fig.1, bottom right for χ2(3), in both R and Python. What I'm getting is something like graph below.
回答2:
You are looking for the ncp
(Non Centrality Parameter) argument of rt()
rt(10, 3, ncp = 4)
Have a look at the helpfile to see how you need to set the ncp argument.
来源:https://stackoverflow.com/questions/64135392/generating-shifted-t-distributed-numbers-and-the-non-centrality-parameter