Rscripts: Difference between revisions
Jump to navigation
Jump to search
No edit summary |
No edit summary |
||
Line 1: | Line 1: | ||
= PCA/Eigensoft/Eigenstrat = | = PCA/Eigensoft/Eigenstrat = | ||
<pre> | <pre> | ||
eigenstrat<-function(geno){ #snp x ind matrix of genotypes \in 0,1,2 | |||
nMis<-rowSums(is.na(geno)) | |||
geno<-geno[nMis==0,] #remove snps with missing data | |||
avg<-rowSums(geno)/ncol(geno) # get allele frequency times 2 | |||
keep<-avg!=0&avg!=2 # remove sites with non-polymorphic data | |||
avg<-avg[keep] | |||
geno<-geno[keep,] | |||
snp<-nrow(geno) #number of snps used in analysis | |||
ind<-ncol(geno) #number of individuals used in analuysis | |||
freq<-avg/2 #frequency | |||
M <- (geno-avg)/sqrt(freq*(1-freq)) #normalize the genotype matrix | |||
X<-t(M)%*%M #get the (almost) covariance matrix | |||
X<-X/(sum(diag(X))/(snp-1)) | |||
E<-eigen(X) | |||
mu<-(sqrt(snp-1)+sqrt(ind))^2/snp #for testing significance (assuming no LD!) | |||
sigma<-(sqrt(snp-1)+sqrt(ind))/snp*(1/sqrt(snp-1)+1/sqrt(ind))^(1/3) | |||
E$TW<-(E$values[1]*ind/sum(E$values)-mu)/sigma | |||
E$mu<-mu | |||
E$sigma<-sigma | |||
class(E)<-"eigenstrat" | |||
E | |||
} | |||
plot.eigenstrat<-function(x,col=1,...) | |||
plot(x$vectors[,1:2],col=col,...) | |||
print.eigenstrat<-function(x) | |||
cat("statistic",x$TW,"\n") | |||
</pre> | </pre> | ||
Example | |||
<pre> | <pre> | ||
ind<-c(20,20) | |||
snp<-10000 | |||
freq=c(0.2,0.25) | |||
geno<-c() | |||
for(pop in 1:length(ind)) | |||
geno<-rbind(geno,matrix(rbinom(snp*ind[pop],2,freq[pop]),ind[pop])) | |||
geno<-t(geno) | |||
e<-eigenstrat(geno) | |||
plot(e,col=rep(1:length(ind),ind),xlab="PC1",ylab="PC2") | |||
</pre> | </pre> | ||
= Fst = | = Fst = | ||
The same as the faster "fstat" from the geneland package but this script also gives the total variance, variance within individuals, variance within population and variance between populations both on a SNP level and on as a joint estimate. | |||
<pre> | <pre> | ||
WC84<-function(x,pop){ | |||
#number ind each population | |||
n<-table(pop) | |||
#number of populations | |||
npop<-nrow(n) | |||
#average sample size of each population | |||
n_avg<-mean(n) | |||
#total number of samples | |||
N<-length(pop) | |||
#frequency in samples | |||
p<-apply(x,2,function(x,pop){tapply(x,pop,mean)/2},pop=pop) | |||
#average frequency in all samples (apply(x,2,mean)/2) | |||
p_avg<-as.vector(n%*%p/N ) | |||
#the sample variance of allele 1 over populations | |||
s2<-1/(npop-1)*(apply(p,1,function(x){((x-p_avg)^2)})%*%n)/n_avg | |||
#average heterozygouts | |||
# h<-apply(x==1,2,function(x,pop)tapply(x,pop,mean),pop=pop) | |||
#average heterozygote frequency for allele 1 | |||
# h_avg<-as.vector(n%*%h/N) | |||
#faster version than above: | |||
h_avg<-apply(x==1,2,sum)/N | |||
#nc (see page 1360 in wier and cockerhamm, 1984) | |||
n_c<-1/(npop-1)*(N-sum(n^2)/N) | |||
#variance betwen populations | |||
a <- n_avg/n_c*(s2-(p_avg*(1-p_avg)-(npop-1)*s2/npop-h_avg/4)/(n_avg-1)) | |||
#variance between individuals within populations | |||
b <- n_avg/(n_avg-1)*(p_avg*(1-p_avg)-(npop-1)*s2/npop-(2*n_avg-1)*h_avg/(4*n_avg)) | |||
#variance within individuals | |||
c <- h_avg/2 | |||
#inbreedning (F_it) | |||
F <- 1-c/(a+b+c) | |||
#(F_st) | |||
theta <- a/(a+b+c) | |||
#(F_is) | |||
f <- 1-c(b+c) | |||
#weigted average of theta | |||
theta_w<-sum(a)/sum(a+b+c) | |||
list(F=F,theta=theta,f=f,theta_w=theta_w,a=a,b=b,c=c,total=c+b+a) | |||
} | |||
</pre> | </pre> | ||
Line 20: | Line 99: | ||
<pre> | <pre> | ||
#example of use | |||
nsnp=10000 | |||
x<-matrix(rbinom(160*nsnp,2,0.02),160) | |||
pop<-rep(1:4,40) | |||
res<-WC84(x,pop) | |||
res$theta | |||
</pre> | </pre> | ||
= LD pruning in R = | = LD pruning in R = | ||
Install R package | |||
<pre> | <pre> | ||
wget http://www.popgen.dk/albrecht/misc_Rpackages/Rpakker/pruning_0.51.tar.gz | |||
R CMD INSTALL pruning_0.51.tar.gz | |||
</pre> | </pre> | ||
Revision as of 11:33, 15 August 2013
PCA/Eigensoft/Eigenstrat
eigenstrat<-function(geno){ #snp x ind matrix of genotypes \in 0,1,2 nMis<-rowSums(is.na(geno)) geno<-geno[nMis==0,] #remove snps with missing data avg<-rowSums(geno)/ncol(geno) # get allele frequency times 2 keep<-avg!=0&avg!=2 # remove sites with non-polymorphic data avg<-avg[keep] geno<-geno[keep,] snp<-nrow(geno) #number of snps used in analysis ind<-ncol(geno) #number of individuals used in analuysis freq<-avg/2 #frequency M <- (geno-avg)/sqrt(freq*(1-freq)) #normalize the genotype matrix X<-t(M)%*%M #get the (almost) covariance matrix X<-X/(sum(diag(X))/(snp-1)) E<-eigen(X) mu<-(sqrt(snp-1)+sqrt(ind))^2/snp #for testing significance (assuming no LD!) sigma<-(sqrt(snp-1)+sqrt(ind))/snp*(1/sqrt(snp-1)+1/sqrt(ind))^(1/3) E$TW<-(E$values[1]*ind/sum(E$values)-mu)/sigma E$mu<-mu E$sigma<-sigma class(E)<-"eigenstrat" E } plot.eigenstrat<-function(x,col=1,...) plot(x$vectors[,1:2],col=col,...) print.eigenstrat<-function(x) cat("statistic",x$TW,"\n")
Example
ind<-c(20,20) snp<-10000 freq=c(0.2,0.25) geno<-c() for(pop in 1:length(ind)) geno<-rbind(geno,matrix(rbinom(snp*ind[pop],2,freq[pop]),ind[pop])) geno<-t(geno) e<-eigenstrat(geno) plot(e,col=rep(1:length(ind),ind),xlab="PC1",ylab="PC2")
Fst
The same as the faster "fstat" from the geneland package but this script also gives the total variance, variance within individuals, variance within population and variance between populations both on a SNP level and on as a joint estimate.
WC84<-function(x,pop){ #number ind each population n<-table(pop) #number of populations npop<-nrow(n) #average sample size of each population n_avg<-mean(n) #total number of samples N<-length(pop) #frequency in samples p<-apply(x,2,function(x,pop){tapply(x,pop,mean)/2},pop=pop) #average frequency in all samples (apply(x,2,mean)/2) p_avg<-as.vector(n%*%p/N ) #the sample variance of allele 1 over populations s2<-1/(npop-1)*(apply(p,1,function(x){((x-p_avg)^2)})%*%n)/n_avg #average heterozygouts # h<-apply(x==1,2,function(x,pop)tapply(x,pop,mean),pop=pop) #average heterozygote frequency for allele 1 # h_avg<-as.vector(n%*%h/N) #faster version than above: h_avg<-apply(x==1,2,sum)/N #nc (see page 1360 in wier and cockerhamm, 1984) n_c<-1/(npop-1)*(N-sum(n^2)/N) #variance betwen populations a <- n_avg/n_c*(s2-(p_avg*(1-p_avg)-(npop-1)*s2/npop-h_avg/4)/(n_avg-1)) #variance between individuals within populations b <- n_avg/(n_avg-1)*(p_avg*(1-p_avg)-(npop-1)*s2/npop-(2*n_avg-1)*h_avg/(4*n_avg)) #variance within individuals c <- h_avg/2 #inbreedning (F_it) F <- 1-c/(a+b+c) #(F_st) theta <- a/(a+b+c) #(F_is) f <- 1-c(b+c) #weigted average of theta theta_w<-sum(a)/sum(a+b+c) list(F=F,theta=theta,f=f,theta_w=theta_w,a=a,b=b,c=c,total=c+b+a) }
#example of use nsnp=10000 x<-matrix(rbinom(160*nsnp,2,0.02),160) pop<-rep(1:4,40) res<-WC84(x,pop) res$theta
LD pruning in R
Install R package
wget http://www.popgen.dk/albrecht/misc_Rpackages/Rpakker/pruning_0.51.tar.gz R CMD INSTALL pruning_0.51.tar.gz