|
|
(19 intermediate revisions by the same user not shown) |
Line 1: |
Line 1: |
| | ;THIS PAGE IS OBSOLETE, PLEASE SEE FST AND PCA in the sidebar for the latest versions |
| Matteo Fumagalli has been working on methods to estimate Fst and doing PCA/Covariance based on ANGSD output files. | | Matteo Fumagalli has been working on methods to estimate Fst and doing PCA/Covariance based on ANGSD output files. |
|
| |
|
Line 4: |
Line 5: |
| https://github.com/mfumagalli/ngsTools | | https://github.com/mfumagalli/ngsTools |
|
| |
|
| We are also working on a new implementation that generalizes the above approach to multiple populations.
| | =Fst= |
| | |
| =New Fancy Version= | |
| | |
| ==Two Populations real data==
| |
| <pre>
| |
| #this is with 2pops
| |
| #first calculate per pop saf for each populatoin
| |
| ../angsd -b list1 -anc hg19ancNoChr.fa -out pop1 -dosaf 1 -gl 1
| |
| ../angsd -b list2 -anc hg19ancNoChr.fa -out pop1 -dosaf 1 -gl 1
| |
| #calculate the 2dsfs prior
| |
| ../misc/realSFS pop1.saf.idx pop2.saf.idx >pop1.pop2.ml
| |
| #prepare the fst for easy window analysis etc
| |
| ../misc/realSFS fst index pop1.saf.idx pop2.saf.idx -sfs pop1.pop2.ml -fstout here
| |
| #get the global estimate
| |
| ../misc/realSFS fst stats here.fst.idx
| |
| -> FST.Unweight:0.069395 Fst.Weight:0.042349
| |
| #below is not tested that much, but seems to work
| |
| ../misc/realSFS fst stats here.fst.idx -win 50000 -step 10000 >slidingwindow
| |
| </pre>
| |
| ==3 Populations real data==
| |
| <pre>
| |
| #this is with 2pops
| |
| #first calculate per pop saf for each populatoin
| |
| ../angsd -b list10 -anc hg19ancNoChr.fa -out pop1 -dosaf 1 -gl 1
| |
| ../angsd -b list11 -anc hg19ancNoChr.fa -out pop2 -dosaf 1 -gl 1
| |
| ../angsd -b list12 -anc hg19ancNoChr.fa -out pop3 -dosaf 1 -gl 1
| |
| #calculate the 2dsfs prior
| |
| ../misc/realSFS pop1.saf.idx pop2.saf.idx >pop1.pop2.ml
| |
| #prepare the fst for easy window analysis etc
| |
| ../misc/realSFS fst index pop1.saf.idx pop2.saf.idx -sfs pop1.pop2.ml -fstout here
| |
| #get the global estimate
| |
| ../misc/realSFS fst stats here.fst.idx
| |
| -> FST.Unweight:0.069395 Fst.Weight:0.042349
| |
| #below is not tested that much, but seems to work
| |
| ../misc/realSFS fst stats here.fst.idx -win 50000 -step 10000 >slidingwindow
| |
| </pre>
| |
| | |
| == 3 Populations simulated data==
| |
| <pre>
| |
| msms -ms 44 10 -t 930 -r 400 -I 3 12 14 18 -n 1 1.682020 -n 2 3.736830 -n 3 7.292050 -eg 0 2 116.010723 -eg 0 3 160.246047 -ma x 0.881098 0.561966 0.881098 x 2.797460 0.561966 2.797460 x -ej 0.028985 3 2 -en 0.028985 2 0.287184 -ema 0.028985 3 x 7.293140 x 7.293140 x x x x x -ej 0.197963 2 1 -en 0.303501 1 1 >msoutput.txt
| |
| ../misc/msToGlf -in msoutput.txt -out raw -singleOut 1 -regLen 0 -depth 8 -err 0.005
| |
| ../misc/splitgl raw.glf.gz 22 1 6 >pop1.glf.gz
| |
| ../misc/splitgl raw.glf.gz 22 7 13 >pop2.glf.gz
| |
| ../misc/splitgl raw.glf.gz 22 14 22 >pop3.glf.gz
| |
| echo \"1 250000000\" >fai.fai
| |
| ../angsd -glf pop1.glf.gz -nind 6 -doSaf 1 -out pop1 -fai fai.fai -issim 1
| |
| ../angsd -glf pop2.glf.gz -nind 7 -doSaf 1 -out pop2 -fai fai.fai -issim 1
| |
| ../angsd -glf pop3.glf.gz -nind 9 -doSaf 1 -out pop3 -fai fai.fai -issim 1
| |
| ../misc/realSFS pop1.saf.idx >pop1.saf.idx.ml
| |
| ../misc/realSFS pop2.saf.idx >pop2.saf.idx.ml
| |
| ../misc/realSFS pop3.saf.idx >pop3.saf.idx.ml
| |
| ../misc/realSFS pop1.saf.idx pop2.saf.idx -p 20 >pop1.pop2.saf.idx.ml
| |
| ../misc/realSFS pop1.saf.idx pop3.saf.idx -p 20 >pop1.pop3.saf.idx.ml
| |
| ../misc/realSFS pop2.saf.idx pop3.saf.idx -p 20 >pop2.pop3.saf.idx.ml
| |
| ../misc/realSFS fst index pop1.saf.idx pop2.saf.idx -fstout pop1.pop2 -sfs pop1.pop2.saf.idx.ml
| |
| ../misc/realSFS fst index pop1.saf.idx pop3.saf.idx -fstout pop1.pop3 -sfs pop1.pop3.saf.idx.ml
| |
| ../misc/realSFS fst index pop2.saf.idx pop3.saf.idx -fstout pop2.pop3 -sfs pop2.pop3.saf.idx.ml
| |
| ../misc/realSFS fst index pop1.saf.idx pop2.saf.idx pop3.saf.idx -fstout pop1.pop2.pop3 -sfs pop1.pop2.saf.idx.ml -sfs pop1.pop3.saf.idx.ml -sfs pop2.pop3.saf.idx.ml
| |
| ../misc/realSFS fst stats pop1.pop2.fst.idx
| |
| ../misc/realSFS fst stats pop1.pop3.fst.idx
| |
| ../misc/realSFS fst stats pop2.pop3.fst.idx
| |
| ../misc/realSFS fst stats pop1.pop2.pop3.fst.idx
| |
| </pre>
| |
| Which gives the following output
| |
| <pre>
| |
| $ ../misc/realSFS fst stats pop1.pop2.fst.idx
| |
| -> You are printing the optimized SFS to the terminal consider dumping into a file
| |
| -> E.g.: './realSFS fst stats pop1.pop2.fst.idx >sfs.ml.txt'
| |
| -> Assuming idxname:pop1.pop2.fst.idx
| |
| -> Assuming .fst.gz file: pop1.pop2.fst.gz
| |
| -> FST.Unweight[nObs:51085]:0.114638 Fst.Weight:0.186980
| |
| $ ../misc/realSFS fst stats pop1.pop3.fst.idx
| |
| -> You are printing the optimized SFS to the terminal consider dumping into a file
| |
| -> E.g.: './realSFS fst stats pop1.pop3.fst.idx >sfs.ml.txt'
| |
| -> Assuming idxname:pop1.pop3.fst.idx
| |
| -> Assuming .fst.gz file: pop1.pop3.fst.gz
| |
| -> FST.Unweight[nObs:51085]:0.121007 Fst.Weight:0.192111
| |
| $ ../misc/realSFS fst stats pop2.pop3.fst.idx
| |
| -> You are printing the optimized SFS to the terminal consider dumping into a file
| |
| -> E.g.: './realSFS fst stats pop2.pop3.fst.idx >sfs.ml.txt'
| |
| -> Assuming idxname:pop2.pop3.fst.idx
| |
| -> Assuming .fst.gz file: pop2.pop3.fst.gz
| |
| -> FST.Unweight[nObs:51085]:0.069462 Fst.Weight:0.125002
| |
| $ ../misc/realSFS fst stats pop1.pop2.pop3.fst.idx
| |
| -> You are printing the optimized SFS to the terminal consider dumping into a file
| |
| -> E.g.: './realSFS fst stats pop1.pop2.pop3.fst.idx >sfs.ml.txt'
| |
| -> Assuming idxname:pop1.pop2.pop3.fst.idx
| |
| -> Assuming .fst.gz file: pop1.pop2.pop3.fst.gz
| |
| -> FST.Unweight[nObs:51085]:0.114638 Fst.Weight:0.186980
| |
| -> FST.Unweight[nObs:51085]:0.121007 Fst.Weight:0.192111
| |
| -> FST.Unweight[nObs:51085]:0.069462 Fst.Weight:0.125002
| |
| </pre>
| |
| | |
| ==Two populations (sim data with R implementation of functionality)==
| |
| <pre>
| |
| nRep <- 100
| |
| nPop1 <- 24
| |
| nPop2 <- 16
| |
| cmd <- paste("msms -ms",nPop1+nPop2,nRep,"-t 930 -r 400 -I 2",nPop1,nPop2,"0 -g 1 9.70406 -n 1 2 -n 2 1 -ma x 0.0 0.0 x -ej 0.07142857 2 1 >msoutput.txt ",sep=" ")
| |
| system(cmd)
| |
| ##system("msms -ms 40 1 -t 930 -r 400 -I 2 20 20 0 -g 1 9.70406 -n 1 2 -n 2 1 -ma x 0.0 0.0 x -ej 0.07142857 2 1 >msoutput.txt ")
| |
| source("../R/readms.output.R")
| |
| to2dSFS <- function(p1.d,p2.d,nPop1,nPop2)
| |
| sapply(0:nPop1,function(x) table(factor(p2.d[p1.d==x],levels=0:nPop2)))
| |
| | |
| source("../R/readms.output.R")
| |
| a<- read.ms.output(file="msoutput.txt")
| |
|
| |
| p1.d <- unlist((sapply(a$gam,function(x) colSums(x[1:nPop1,]))))
| |
| p2.d <- unlist((sapply(a$gam,function(x) colSums(x[-c(1:nPop1),]))))
| |
| par(mfrow=c(1,2))
| |
| barplot(table(p1.d))
| |
| barplot(table(p2.d))
| |
| sfs.2d <- t(sapply(0:nPop1,function(x) table(factor(p2.d[p1.d==x],levels=0:nPop2))))
| |
| system("../misc/msToGlf -in msoutput.txt -out raw -singleOut 1 -regLen 0 -depth 8 -err 0.005")
| |
| system("../misc/splitgl raw.glf.gz 20 1 12 >pop1.glf.gz")
| |
| system("../misc/splitgl raw.glf.gz 20 13 20 >pop2.glf.gz")
| |
| system("echo \"1 250000000\" >fai.fai")
| |
| system("../angsd -glf pop1.glf.gz -nind 12 -doSaf 1 -out pop1 -fai fai.fai -issim 1")
| |
| system("../angsd -glf pop2.glf.gz -nind 8 -doSaf 1 -out pop2 -fai fai.fai -issim 1")
| |
| system("../misc/realSFS pop1.saf.idx >pop1.saf.idx.ml")
| |
| system("../misc/realSFS pop2.saf.idx >pop2.saf.idx.ml")
| |
| system("../misc/realSFS pop1.saf.idx pop2.saf.idx -maxIter 500 -p 20 >pop1.pop2.saf.idx.ml")
| |
| getFst<-function(est){
| |
| N1<-nrow(est)-1
| |
| N2<-ncol(est)-1
| |
| cat("N1: ",N1 ," N2: ",N2,"\n")
| |
| est0<-est
| |
| est0[1,1]<-0
| |
| est0[N1+1,N2+1]<-0
| |
| est0<-est0/sum(est0)
| |
|
| |
| aMat<<-matrix(NA,nrow=N1+1,ncol=N2+1)
| |
| baMat<<-matrix(NA,nrow=N1+1,ncol=N2+1)
| |
| for(a1 in 0:(N1))
| |
| for(a2 in 0:(N2)){
| |
| p1 <- a1/N1
| |
| p2 <- a2/N2
| |
| q1 <- 1 - p1
| |
| q2 <- 1 - p2
| |
| alpha1 <- 1 - (p1^2 + q1^2)
| |
| alpha2 <- 1 - (p2^2 + q2^2)
| |
|
| |
| al <- 1/2 * ( (p1-p2)^2 + (q1-q2)^2) - (N1+N2) * (N1*alpha1 + N2*alpha2) / (4*N1*N2*(N1+N2-1))
| |
| bal <- 1/2 * ( (p1-p2)^2 + (q1-q2)^2) + (4*N1*N2-N1-N2)*(N1*alpha1 + N2*alpha2) / (4*N1*N2*(N1+N2-1))
| |
| aMat[a1+1,a2+1]<<-al
| |
| baMat[a1+1,a2+1]<<-bal
| |
| ## print(signif(c(a1=a1,a2=a2,p1=p1,p2=p2,al1=alpha1,al2=alpha2,al),2))
| |
| }
| |
| ## unweighted average of single-locus ratio estimators
| |
| fstU <- sum(est0*(aMat/baMat),na.rm=T)
| |
| ## weighted average of single-locus ratio estimators
| |
| fstW <- sum(est0*aMat,na.rm=T)/sum(est0*baMat,na.rm=T)
| |
| c(fstW=fstW,fstU=fstU)
| |
| }
| |
| | |
| > getFst(sfs.2d)
| |
| N1: 24 N2: 16
| |
| fstW fstU
| |
| 0.11945801 0.08249571
| |
| | |
| est <- matrix(as.integer(scan("pop1.pop2.saf.idx.ml")),byrow=T,ncol=nPop2+1)
| |
| > getFst(est)
| |
| N1: 24 N2: 16
| |
| fstW fstU
| |
| 0.11925903 0.08241461
| |
| | |
| cmd<"fst index pop1.saf.idx pop2.saf.idx -sfs pop1.pop2.saf.idx.ml -fstout testing"
| |
| system(cmd)
| |
| | |
| ##view the per site 'alpha' 'beta' if you want
| |
| cmd<-"../misc/realSFS fst print testing.fst.idx |head"
| |
| | |
| ##use fancy new emperical bayes
| |
| cmd<- "../misc/realSFS fst stats testing.fst.idx "
| |
| system(cmd)
| |
| -> FST.Unweight:0.083316 Fst.Weight:0.119372
| |
| </pre>
| |
| | |
| =Old nice version=
| |
| ==Fst==
| |
| # Generate .saf files from each population using ANGSD [[SFS Estimation]] | | # Generate .saf files from each population using ANGSD [[SFS Estimation]] |
| ## using a 2D-SFS as a prior, estimated using ngs2dSFS | | ## using a 2D-SFS as a prior, estimated using ngs2dSFS |
| ## using marginal spectra as priors, estimated using '''realSFS''' | | ## using marginal spectra as priors, estimated using '''realSFS''' |
|
| |
|
| ==PCA==
| | =PCA= |
| More information here: | | More information here: |
| https://github.com/mfumagalli/ngsTools#ngscovar | | https://github.com/mfumagalli/ngsTools#ngscovar |