Fst estimators for bialleic SNPs

Weir and Cockerham 1984

The same as the faster "fstat" from the geneland package but this script also gives the total variance, variance within individuals, variance within population and variance between populations both on a SNP level and on as a joint estimate.


WC84<-function(x,pop){
  #number ind each population
  n<-table(pop)
  #number of populations
  npop<-nrow(n)
  #average sample size of each population
  n_avg<-mean(n)
  #total number of samples
  N<-length(pop)
  #frequency in samples
  p<-apply(x,2,function(x,pop){tapply(x,pop,mean)/2},pop=pop)
  #average frequency in all samples (apply(x,2,mean)/2)
  p_avg<-as.vector(n%*%p/N )
  #the sample variance of allele 1 over populations
  s2<-1/(npop-1)*(apply(p,1,function(x){((x-p_avg)^2)})%*%n)/n_avg
  #average heterozygouts
  #  h<-apply(x==1,2,function(x,pop)tapply(x,pop,mean),pop=pop)
  #average heterozygote frequency for allele 1
  #  h_avg<-as.vector(n%*%h/N)
  #faster version than above:
   h_avg<-apply(x==1,2,sum)/N
  #nc (see page 1360 in wier and cockerhamm, 1984)
  n_c<-1/(npop-1)*(N-sum(n^2)/N)
  #variance betwen populations
  a <- n_avg/n_c*(s2-(p_avg*(1-p_avg)-(npop-1)*s2/npop-h_avg/4)/(n_avg-1))
  #variance between individuals within populations
  b <- n_avg/(n_avg-1)*(p_avg*(1-p_avg)-(npop-1)*s2/npop-(2*n_avg-1)*h_avg/(4*n_avg))
  #variance within individuals
  c <- h_avg/2

  #inbreedning (F_it)
  F <- 1-c/(a+b+c)
  #(F_st)
  theta <- a/(a+b+c)
  #(F_is)
  f <- 1-c(b+c)
  #weigted average of theta
  theta_w<-sum(a)/sum(a+b+c)
  list(F=F,theta=theta,f=f,theta_w=theta_w,a=a,b=b,c=c,total=c+b+a)
}


#example of use
nsnp=10000
x<-matrix(rbinom(160*nsnp,2,0.02),160)
pop<-rep(1:4,40)
res<-WC84(x,pop)
res$theta

to get the Fst for single sites use

nsnp=10000
x<-matrix(rbinom(160*nsnp,2,0.02),160)
pop<-rep(1:4,40)
res<-WC84(x,pop)
with(res,a/(a+b+c))

WEIR, B. S., and C. C. COCKERHAM, 1984 Estimating F-statistics for the analysis of population structure. Evolution 38: 1358–1370

Weir and Hill 2002

WH02<-function(x,pop){
  #moment estimation of fst using weir and hall 2002
  n<-table(pop)#number ind each population
  npop<-nrow(n)
  N<-length(pop) #total number of samples
  p_1<-apply(x,2,function(x,pop){tapply(x,pop,mean)/2},pop=pop) #frequency in samples
  p_1_avg<-n%*%p_1/N #average frequency in all samples (apply(x,2,mean)/2)
  msp_1<-1/(npop-1)*(apply(p_1,1,function(x){((x-p_1_avg)^2)})%*%n)
  msg_1<-1/(N-npop)*(apply(p_1,1,function(x){x*(1-x)})%*%n)

  p_2<-1-p_1
  p_2_avg<-n%*%p_2/N #average frequency in all samples (apply(x,2,mean)/2)
  msp_2<-1/(npop-1)*(apply(p_2,1,function(x){((x-p_2_avg)^2)})%*%n)
  msg_2<-1/(N-npop)*(apply(p_2,1,function(x){x*(1-x)})%*%n)


  n_c<-1/(npop-1)*(N-sum(n^2)/N)
  theta<-sum(msp_1-msg_1+msp_2-msg_2)/sum(msp_1+(n_c-1)*msg_1+msp_2+(n_c-1)*msg_2)

return(theta)
}

x<-matrix(rbinom(1600,2,0.02),160)
pop<-rep(1:4,40)
WH02(x,pop)

WEIR, B. S., and W. G. HILL, 2002 Estimating F-statistics. Annu. Rev. Genet. 36: 721–750

[ Front | MyRfun | Index ]

Updated januar 6, 2011

Fst estimators for bialleic SNPs

Home Projects Software Collaboration About

Weir and Cockerham 1984

Weir and Hill 2002