Generating shifted t-distributed numbers and the non-centrality parameter?

问题

When I use a<-rt(10,3) and b<-rnorm(10,3)+5 to get numbers shifted to the right, in order to calculate the power of the two-sample t-test, I get wrong results. There is a lot of literature on the internet about using the non-centrality parameter to get shifted numbers for power calculations. My question is: how do I use the non-centrality parameter to get a shift equal to 5? If I am wrong, and the only way to get shifted numbers from the t distribution is the method shown at the beginning, then please tell me.

# Simulate the data sets used by every power curve below.
#
# Each replication draws one baseline sample `h` and five comparison samples
# from a t-distribution with 1 degree of freedom (10 observations each); the
# comparison samples are shifted by 0, 0.5, 1, 1.5 and 2 respectively.
# empty_list .. empty_list4 hold, per replication, a 10 x 2 matrix whose first
# column is the baseline sample and whose second column is the shifted sample.
desired_length <- 1000
empty_list <- vector(mode = "list", length = desired_length)
empty_list1 <- vector(mode = "list", length = desired_length)
empty_list2 <- vector(mode = "list", length = desired_length)
empty_list3 <- vector(mode = "list", length = desired_length)
empty_list4 <- vector(mode = "list", length = desired_length)

# The loop bound follows desired_length so the lists and the loop cannot get
# out of sync. seq_len() also copes with a length of zero.
for (i in seq_len(desired_length)) {
  # The six draws are made in this fixed order so that results under a given
  # seed do not depend on how the matrices are assembled afterwards.
  h <- rt(10, 1)
  g <- rt(10, 1)
  g1 <- rt(10, 1) + 0.5
  g2 <- rt(10, 1) + 1
  g3 <- rt(10, 1) + 1.5
  g4 <- rt(10, 1) + 2

  # Store the pairs directly; no temporaries named `c` etc. that would mask
  # base functions in the workspace.
  empty_list[[i]] <- cbind(h, g)
  empty_list1[[i]] <- cbind(h, g1)
  empty_list2[[i]] <- cbind(h, g2)
  empty_list3[[i]] <- cbind(h, g3)
  empty_list4[[i]] <- cbind(h, g4)
}

# Power of the two-sample t-test (t.test() with its default arguments) at each
# location shift: the p-value is computed for every simulated pair and the
# share of p-values below 0.05 is plotted against the shift.
t_test_p <- function(pair) {
  t.test(pair[, 1], pair[, 2])$p.value
}

pvalue <- vapply(empty_list, t_test_p, numeric(1))
pvalue1 <- vapply(empty_list1, t_test_p, numeric(1))
pvalue2 <- vapply(empty_list2, t_test_p, numeric(1))
pvalue3 <- vapply(empty_list3, t_test_p, numeric(1))
pvalue4 <- vapply(empty_list4, t_test_p, numeric(1))

# Rejection rate per shift, in the same order as `location`.
x <- c(
  sum(pvalue < 0.05),
  sum(pvalue1 < 0.05),
  sum(pvalue2 < 0.05),
  sum(pvalue3 < 0.05),
  sum(pvalue4 < 0.05)
) / 1000

location <- seq(0, 2, by = 0.5)
# This opens the plot that the later lines() calls draw on.
plot(
  location, x,
  ylab = "Power for t1 distributions",
  xlab = "location difference",
  type = "l",
  ylim = c(0, 1)
)





# Reshape the simulated pairs for apply(): one column per replication, rows
# 1-10 holding the baseline sample and rows 11-20 the shifted sample.
# as.vector() on a 10 x 2 matrix stacks column 1 on top of column 2, which is
# the same as c(pair[, 1], pair[, 2]).
combined_data <- vapply(empty_list, as.vector, numeric(20))

combined_data1 <- vapply(empty_list1, as.vector, numeric(20))

combined_data2 <- vapply(empty_list2, as.vector, numeric(20))

combined_data3 <- vapply(empty_list3, as.vector, numeric(20))

combined_data4 <- vapply(empty_list4, as.vector, numeric(20))

# Two-sided Monte Carlo permutation test on the difference in group means.
#
# m: numeric vector of 20 pooled observations; elements 1-10 are treated as
#    group 1 and elements 11-20 as group 2.
# Returns (invisibly) the permutation p-value: the number of the 10000 random
# relabellings whose absolute mean difference is at least the observed one,
# with 1 added to both numerator and denominator.
Pvalue_approximator <- function(m) {
  first_idx <- 1:10
  second_idx <- 11:20
  observed <- mean(m[second_idx]) - mean(m[first_idx])

  nreps <- 10000
  permuted <- replicate(nreps, {
    shuffled <- sample(c(m))
    mean(shuffled[second_idx]) - mean(shuffled[first_idx])
  })

  invisible((sum(abs(permuted) >= abs(observed)) + 1) / (nreps + 1))
}
# Power of the mean-difference permutation test: one p-value per simulated
# data set (column), then the share of p-values below 0.05 for each shift.
pval <- apply(combined_data, 2, FUN = Pvalue_approximator)
pval1 <- apply(combined_data1, 2, FUN = Pvalue_approximator)
pval2 <- apply(combined_data2, 2, FUN = Pvalue_approximator)
pval3 <- apply(combined_data3, 2, FUN = Pvalue_approximator)
pval4 <- apply(combined_data4, 2, FUN = Pvalue_approximator)

p <- c(
  sum(pval < 0.05),
  sum(pval1 < 0.05),
  sum(pval2 < 0.05),
  sum(pval3 < 0.05),
  sum(pval4 < 0.05)
) / 1000

# Added to the already open power plot as a dashed red curve.
lines(location, p, col = "red", lty = 2)

# Monte Carlo permutation test on a scaled difference in medians.
#
# m: numeric vector of 20 pooled observations; elements 1-10 are treated as
#    group 1 and elements 11-20 as group 2.
# The statistic is |median(group 2) - median(group 1)| divided by twice the
# median of the pooled absolute deviations, each deviation being taken from
# its own group's median.
# Returns (invisibly) the permutation p-value over 10000 random relabellings,
# with 1 added to both numerator and denominator.
Diff.med.Pvalue_approximator <- function(m) {
  scaled_median_gap <- function(first, second) {
    med_first <- median(c(first))
    med_second <- median(c(second))
    pooled_dev <- c(
      abs(c(first - med_first)),
      abs(c(second - med_second))
    )
    abs(med_second - med_first) / (2 * median(pooled_dev))
  }

  observed <- scaled_median_gap(m[1:10], m[11:20])

  nreps <- 10000
  permuted <- numeric(nreps)
  for (rep_id in seq_len(nreps)) {
    shuffled <- sample(c(m))
    permuted[rep_id] <- scaled_median_gap(shuffled[1:10], shuffled[11:20])
  }

  # The statistic is already non-negative, so no abs() is needed here.
  invisible((sum(permuted >= observed) + 1) / (nreps + 1))
}
# Power of the scaled median-difference permutation test: one p-value per
# simulated data set (column), then the share below 0.05 for each shift.
pval <- apply(combined_data, 2, FUN = Diff.med.Pvalue_approximator)
pval1 <- apply(combined_data1, 2, FUN = Diff.med.Pvalue_approximator)
pval2 <- apply(combined_data2, 2, FUN = Diff.med.Pvalue_approximator)
pval3 <- apply(combined_data3, 2, FUN = Diff.med.Pvalue_approximator)
pval4 <- apply(combined_data4, 2, FUN = Diff.med.Pvalue_approximator)

po <- c(
  sum(pval < 0.05),
  sum(pval1 < 0.05),
  sum(pval2 < 0.05),
  sum(pval3 < 0.05),
  sum(pval4 < 0.05)
) / 1000

# Added to the already open power plot as a green curve.
lines(location, po, col = "green", lty = 1)






# Two-sided Monte Carlo permutation version of the Wilcoxon rank-sum test.
#
# m: numeric vector of 20 pooled observations; elements 1-10 are treated as
#    group 1 and elements 11-20 as group 2.
# The statistic is the sum of the pooled ranks that fall in group 2.
# Returns the permutation p-value over 10000 random relabellings, with 1
# added to both numerator and denominator; the result always lies in (0, 1].
wilcoxon.Pvalue_approximator <- function(m) {
  second_idx <- 11:20

  # Ranking is invariant under relabelling, so rank once and permute the
  # ranks instead of re-ranking every shuffled copy of the data.
  ranks <- rank(c(m[1:10], m[11:20]))
  observed <- sum(ranks[second_idx])

  # Expected rank sum of group 2 under the null hypothesis. Rank sums are
  # always positive, so "as extreme or more" has to be measured as distance
  # from this centre; abs() on the raw sums only ever looks at the upper tail.
  centre <- sum(ranks) * length(second_idx) / length(ranks)

  nreps <- 10000
  permuted <- numeric(nreps)
  for (rep_id in seq_len(nreps)) {
    permuted[rep_id] <- sum(sample(ranks)[second_idx])
  }

  (sum(abs(permuted - centre) >= abs(observed - centre)) + 1) / (nreps + 1)
}


# Power of the rank-sum permutation test: one p-value per simulated data set
# (column), then the share of p-values below 0.05 for each shift.
pval <- apply(combined_data, 2, FUN = wilcoxon.Pvalue_approximator)
pval1 <- apply(combined_data1, 2, FUN = wilcoxon.Pvalue_approximator)
pval2 <- apply(combined_data2, 2, FUN = wilcoxon.Pvalue_approximator)
pval3 <- apply(combined_data3, 2, FUN = wilcoxon.Pvalue_approximator)
pval4 <- apply(combined_data4, 2, FUN = wilcoxon.Pvalue_approximator)

pw <- c(
  sum(pval < 0.05),
  sum(pval1 < 0.05),
  sum(pval2 < 0.05),
  sum(pval3 < 0.05),
  sum(pval4 < 0.05)
) / 1000

# Added to the already open power plot as a blue curve.
lines(location, pw, col = "blue", lty = 1)

# Monte Carlo permutation test on a scaled median of between-group differences.
#
# m: numeric vector of 20 pooled observations; elements 1-10 are treated as
#    group 1 and elements 11-20 as group 2.
# Numerator: absolute median of all pairwise differences (group 2 value minus
# group 1 value). Denominator: median absolute pairwise difference among the
# pooled observations after each group has been centred at its own median.
# Returns (invisibly) the permutation p-value over 10000 random relabellings,
# with 1 added to both numerator and denominator.
HLE2.Pvalue_approximator <- function(m) {
  scaled_shift <- function(first, second) {
    centred <- c(
      c(first - median(c(first))),
      c(second - median(c(second)))
    )
    gaps <- outer(centred, centred, "-")
    # The lower triangle keeps each unordered pair exactly once.
    spread <- median(c(abs(gaps[lower.tri(gaps)])))
    abs(median(c(outer(second, first, "-")))) / spread
  }

  observed <- scaled_shift(m[1:10], m[11:20])

  nreps <- 10000
  permuted <- numeric(nreps)
  for (rep_id in seq_len(nreps)) {
    shuffled <- sample(c(m))
    permuted[rep_id] <- scaled_shift(shuffled[1:10], shuffled[11:20])
  }

  invisible((sum(permuted >= observed) + 1) / (nreps + 1))
}

# Power of the HLE2 permutation test: one p-value per simulated data set
# (column), then the share of p-values below 0.05 for each shift.
pval <- apply(combined_data, 2, FUN = HLE2.Pvalue_approximator)
pval1 <- apply(combined_data1, 2, FUN = HLE2.Pvalue_approximator)
pval2 <- apply(combined_data2, 2, FUN = HLE2.Pvalue_approximator)
pval3 <- apply(combined_data3, 2, FUN = HLE2.Pvalue_approximator)
pval4 <- apply(combined_data4, 2, FUN = HLE2.Pvalue_approximator)

phl <- c(
  sum(pval < 0.05),
  sum(pval1 < 0.05),
  sum(pval2 < 0.05),
  sum(pval3 < 0.05),
  sum(pval4 < 0.05)
) / 1000

# Added to the already open power plot as an orange curve.
lines(location, phl, col = "orange", lty = 1)


# Monte Carlo permutation test on a scaled difference of one-sample
# pairwise-average medians.
#
# m: numeric vector of 20 pooled observations; elements 1-10 are treated as
#    group 1 and elements 11-20 as group 2.
# Numerator: absolute difference between the two groups' medians of pairwise
# averages (x_i + x_j) / 2, taken over distinct pairs within a group.
# Denominator: median absolute pairwise difference among the pooled
# observations after each group has been centred at its own median.
# Returns (invisibly) the permutation p-value over 10000 random relabellings,
# with 1 added to both numerator and denominator.
HLE1.Pvalue_approximator <- function(m) {
  pairwise_average_median <- function(v) {
    sums <- outer(v, v, "+")
    # The lower triangle excludes the diagonal, so i == j pairs are left out.
    median(c(sums[lower.tri(sums)] / 2))
  }

  pooled_spread <- function(first, second) {
    centred <- c(
      c(first - median(c(first))),
      c(second - median(c(second)))
    )
    gaps <- outer(centred, centred, "-")
    median(c(abs(gaps[lower.tri(gaps)])))
  }

  scaled_shift <- function(first, second) {
    location_gap <- abs(
      pairwise_average_median(second) - pairwise_average_median(first)
    )
    location_gap / pooled_spread(first, second)
  }

  observed <- scaled_shift(m[1:10], m[11:20])

  nreps <- 10000
  permuted <- numeric(nreps)
  for (rep_id in seq_len(nreps)) {
    shuffled <- sample(c(m))
    permuted[rep_id] <- scaled_shift(shuffled[1:10], shuffled[11:20])
  }

  invisible((sum(permuted >= observed) + 1) / (nreps + 1))
}
# Power of the HLE1 permutation test: one p-value per simulated data set
# (column), then the share of p-values below 0.05 for each shift.
pl <- numeric(5)
pval <- apply(combined_data, 2, FUN = HLE1.Pvalue_approximator)
pl[1] <- sum(pval < 0.05) / 1000
pval1 <- apply(combined_data1, 2, FUN = HLE1.Pvalue_approximator)
pl[2] <- sum(pval1 < 0.05) / 1000
pval2 <- apply(combined_data2, 2, FUN = HLE1.Pvalue_approximator)
pl[3] <- sum(pval2 < 0.05) / 1000
# Shifts 1.5 and 2 must use the HLE1 statistic as well; calling the rank-sum
# function here would make the last two points of this curve belong to a
# different test than the first three.
pval3 <- apply(combined_data3, 2, FUN = HLE1.Pvalue_approximator)
pl[4] <- sum(pval3 < 0.05) / 1000
pval4 <- apply(combined_data4, 2, FUN = HLE1.Pvalue_approximator)
pl[5] <- sum(pval4 < 0.05) / 1000

# Added to the already open power plot as a brown curve.
lines(location, pl, col = "brown", lty = 1)



# Monte Carlo permutation test on the number of group-2 observations whose
# pooled rank exceeds 10.5, i.e. that lie in the upper half of the 20 values.
#
# m: numeric vector of 20 pooled observations; elements 1-10 are treated as
#    group 1 and elements 11-20 as group 2.
# Returns (invisibly) the permutation p-value over 10000 random relabellings,
# with 1 added to both numerator and denominator. Only permutations with a
# count at least as large as the observed one are counted, so the test is
# one-sided (upper tail).
median_Pvalue_approximator <- function(m) {
  count_upper_half <- function(pooled) {
    sum(rank(pooled)[11:20] > 10.5)
  }

  observed <- count_upper_half(c(m[1:10], m[11:20]))

  nreps <- 10000
  permuted <- vapply(
    seq_len(nreps),
    function(rep_id) count_upper_half(sample(c(m))[1:20]),
    integer(1)
  )

  invisible((sum(permuted >= observed) + 1) / (nreps + 1))
}

# Power of the median (upper-half count) permutation test: one p-value per
# simulated data set (column), then the share below 0.05 for each shift.
pval <- apply(combined_data, 2, FUN = median_Pvalue_approximator)
pval1 <- apply(combined_data1, 2, FUN = median_Pvalue_approximator)
pval2 <- apply(combined_data2, 2, FUN = median_Pvalue_approximator)
pval3 <- apply(combined_data3, 2, FUN = median_Pvalue_approximator)
pval4 <- apply(combined_data4, 2, FUN = median_Pvalue_approximator)

pm <- c(
  sum(pval < 0.05),
  sum(pval1 < 0.05),
  sum(pval2 < 0.05),
  sum(pval3 < 0.05),
  sum(pval4 < 0.05)
) / 1000

# Added to the already open power plot as a yellow curve.
lines(location, pm, col = "yellow", lty = 1)

# Legend entries, colours and line types are listed in matching order.
legend(
  "topleft",
  legend = c("t.test", "HLE2", "HLE", "Diff.med", "median", "wilcoxon", "mean diff"),
  col = c("black", "orange", "brown", "green", "yellow", "blue", "red"),
  lty = c(1, 1, 1, 1, 1, 1, 2),
  cex = 0.8,
  text.font = 4,
  bg = "white"
)

回答1:

Ok, we have t-Distribution which could be written as

T(n) = N(0,1)*√[n/χ²(n)]

where N(0,1) is the standard normal distribution and χ²(n) is the chi-squared distribution with n degrees of freedom. This is pretty standard stuff.

If we want shifted distribution, we add shift μ, so

T(n)+μ = N(0,1)*√[n/χ²(n)] + μ (1)

If we instead want the non-central t-distribution with non-centrality parameter (NCP) equal to μ, we shift the GAUSSIAN in the expression above

T(n, NCP=μ) = N(μ,1)*√[n/χ²(n)]=(N(0,1)+μ)*√[n/χ²(n)]=

=N(0,1)*√[n/χ²(n)] + μ*√[n/χ²(n)] (2)

Do you see the difference? In eq. (1) we add a constant. In eq. (2) we add a constant multiplied by an ugly-looking random variable. These distributions are different and will produce different results. Use with care.

The standard T(n) is symmetric about 0, and T(n)+μ is symmetric about μ, but the non-central T is asymmetric: you're mixing the symmetric T(n) with the asymmetric term μ*√[n/χ²(n)]. You could look at the graphs on Wikipedia for the non-central T(n).

UPDATE

running your code (yes, took quite some time, probably more than 12 hours), I've got

UPDATE II

I'm a bit more familiar with Python nowadays, so I recoded part of the test in Python and ran it. It is pretty much instant, and for the t-distribution with df=3 I got much closer to the paper's graph, with values up to 0.8. You could also quickly make the graph for df=1, and again you should get close to the paper's result. Or you could replace rng.standard_t with rng.normal(size=N), and you will get a graph with power close to 1 at large shifts.

Code

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Monte Carlo estimate of the power of the equal-variance two-sample t-test
# when both samples come from a t-distribution with 3 degrees of freedom and
# the second sample is shifted by a constant.
rng = np.random.default_rng(312345)

N = 10  # Sample Size

α = 0.05  # significance level

shift = [0.0, 0.5, 1.0, 1.5, 2.0]
power = np.zeros(len(shift))


def rejects(s):
    """Draw one baseline/shifted pair and report whether the t-test rejects."""
    a = rng.standard_t(df=3, size=N)      # baseline sample
    b = rng.standard_t(df=3, size=N) + s  # sample with shift
    # t-test for two independent samples, assuming equal variance
    _, p = stats.ttest_ind(a, b, equal_var=True)
    return p <= α


for k, s in enumerate(shift):
    # Fraction of the 1000 simulated pairs for which the test rejects.
    rejections = sum(rejects(s) for _ in range(0, 1000))
    power[k] = float(rejections) / 1000.0

fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)

ax.plot(shift, power, 'r-')

plt.show()

and graph

UPDATE III

And here is R code that is pretty much like the Python version and makes about the same graph

# Simulation settings.
N <- 10 # observations per sample

# Location shifts to evaluate, and one power estimate per shift.
shift <- seq(0, 2, by = 0.5)
power <- numeric(length(shift))

av <- 0.05 # significance level

# Draw `n` observations from the baseline distribution, currently chi-squared
# with 3 degrees of freedom. Swap in one of the alternatives listed below to
# study a different distribution.
samples <- function(n) {
  # alternatives: rnorm(n), rt(n, df=3), rt(n, df=1)
  draws <- rchisq(n, df = 3)
  draws
}

# p-value of the two-sample t-test of `a` against `b`, with the variances of
# the two groups treated as equal.
pvalue <- function(a, b) {
  test <- t.test(a, b, var.equal = TRUE)
  test$p.value
}

# Estimated power at one shift `s`: the share of 1000 simulated sample pairs
# (baseline vs. baseline + s) whose t-test p-value is at most `av`.
rejection_rate <- function(s) {
  p <- replicate(1000, pvalue(samples(N), samples(N) + s))
  sum(p <= av) / 1000.0
}

# Shifts are processed in order, one after the other.
power[1:5] <- vapply(shift[1:5], rejection_rate, numeric(1))

plot(shift, power, type = "l")

UPDATE IV

No, I was unable to get their (in paper) t-test graph in Fig.1, bottom right for χ²(3), in both R and Python. What I'm getting is something like graph below.

回答2:

You are looking for the ncp (Non Centrality Parameter) argument of rt()

# Ten draws from a t-distribution with 3 degrees of freedom and
# non-centrality parameter 4.
rt(n = 10, df = 3, ncp = 4)

Have a look at the helpfile to see how you need to set the ncp argument.

来源：https://stackoverflow.com/questions/64135392/generating-shifted-t-distributed-numbers-and-the-non-centrality-parameter

标签

simulation