ANGSD: Analysis of next generation Sequencing Data

Latest tar.gz version is (0.938/0.939 on github), see Change_log for changes, and download it here.

Fst PCA: Difference between revisions

From angsd
Jump to navigation Jump to search
No edit summary
 
(39 intermediate revisions by the same user not shown)
Line 1: Line 1:
;THIS PAGE IS OBSOLETE, PLEASE SEE FST AND PCA in the sidebar for the latest versions
Matteo Fumagalli has been working on methods to estimate Fst and doing PCA/Covariance based on ANGSD output files.
Matteo Fumagalli has been working on methods to estimate Fst and doing PCA/Covariance based on ANGSD output files.


Line 15: Line 16:
=cite=
=cite=
If you use these methods, you should cite the m. fumagalli paper http://www.ncbi.nlm.nih.gov/pubmed/23979584
If you use these methods, you should cite the m. fumagalli paper http://www.ncbi.nlm.nih.gov/pubmed/23979584
=New Fancy Method (sim data)=
<pre>
  nRep <- 100
      nPop1 <- 24
      nPop2 <- 16
      cmd <- paste("msms -ms",nPop1+nPop2,nRep,"-t 930 -r 400 -I 2",nPop1,nPop2,"0 -g 1 9.70406 -n 1 2 -n 2 1 -ma x 0.0 0.0 x -ej 0.07142857 2 1  >msoutput.txt ",sep=" ")
      system(cmd)
      ##system("msms -ms 40 1 -t 930 -r 400 -I 2 20 20 0 -g 1 9.70406 -n 1 2 -n 2 1 -ma x 0.0 0.0 x -ej 0.07142857 2 1  >msoutput.txt  ")
source("../R/readms.output.R")
  to2dSFS <- function(p1.d,p2.d,nPop1,nPop2)
        sapply(0:nPop1,function(x) table(factor(p2.d[p1.d==x],levels=0:nPop2)))
  source("../R/readms.output.R")
        a<- read.ms.output(file="msoutput.txt")
       
        p1.d <- unlist((sapply(a$gam,function(x) colSums(x[1:nPop1,]))))
        p2.d <- unlist((sapply(a$gam,function(x) colSums(x[-c(1:nPop1),]))))
        par(mfrow=c(1,2))
        barplot(table(p1.d))
        barplot(table(p2.d))
        sfs.2d <- t(sapply(0:nPop1,function(x) table(factor(p2.d[p1.d==x],levels=0:nPop2))))
  system("../misc/msToGlf -in msoutput.txt -out raw -singleOut 1 -regLen 0 -depth 8 -err 0.005")
      system("../misc/splitgl raw.glf.gz 20 1 12 >pop1.glf.gz")
      system("../misc/splitgl raw.glf.gz 20 13 20 >pop2.glf.gz")
      system("echo \"1 250000000\" >fai.fai")
      system("../angsd -glf pop1.glf.gz -nind 12 -doSaf 1 -out pop1 -fai fai.fai -issim 1")
      system("../angsd -glf pop2.glf.gz -nind 8 -doSaf 1 -out pop2 -fai fai.fai -issim 1")
      system("../misc/realSFS pop1.saf.idx >pop1.saf.idx.ml")
      system("../misc/realSFS pop2.saf.idx >pop2.saf.idx.ml")
      system("../misc/realSFS pop1.saf.idx pop2.saf.idx -maxIter 500 -p 20  >pop1.pop2.saf.idx.ml")
getFst<-function(est){
    N1<-nrow(est)-1
    N2<-ncol(est)-1
    cat("N1: ",N1 ," N2: ",N2,"\n")
    est0<-est
    est0[1,1]<-0
    est0[N1+1,N2+1]<-0
    est0<-est0/sum(est0)
   
    aMat<<-matrix(NA,nrow=N1+1,ncol=N2+1)
    baMat<<-matrix(NA,nrow=N1+1,ncol=N2+1)
    for(a1 in 0:(N1))
        for(a2 in 0:(N2)){
            p1 <- a1/N1
            p2 <- a2/N2
            q1 <- 1 - p1
            q2 <- 1 - p2
            alpha1 <- 1 - (p1^2 + q1^2)
            alpha2 <- 1 - (p2^2 + q2^2)
           
            al <-  1/2 * ( (p1-p2)^2 + (q1-q2)^2) - (N1+N2) *  (N1*alpha1 + N2*alpha2) / (4*N1*N2*(N1+N2-1))
            bal <- 1/2 * ( (p1-p2)^2 + (q1-q2)^2) + (4*N1*N2-N1-N2)*(N1*alpha1 + N2*alpha2) / (4*N1*N2*(N1+N2-1))
            aMat[a1+1,a2+1]<<-al
            baMat[a1+1,a2+1]<<-bal
            ##  print(signif(c(a1=a1,a2=a2,p1=p1,p2=p2,al1=alpha1,al2=alpha2,al),2))
        }
    ## unweighted average of single-locus ratio estimators
    fstU <-  sum(est0*(aMat/baMat),na.rm=T)
    ## weighted average of single-locus ratio estimators
    fstW <-  sum(est0*aMat,na.rm=T)/sum(est0*baMat,na.rm=T)
    c(fstW=fstW,fstU=fstU)
}
> getFst(sfs.2d)
N1:  24  N2:  16
      fstW      fstU
0.11945801 0.08249571
est <- matrix(as.integer(scan("pop1.pop2.saf.idx.ml")),byrow=T,ncol=nPop2+1)
> getFst(est)
N1:  24  N2:  16
      fstW      fstU
0.11925903 0.08241461
cmd<"fst index pop1.saf.idx pop2.saf.idx -sfs pop1.pop2.saf.idx.ml  -fstout testing"
system(cmd)
##view the per site 'alpha' 'beta' if you want
cmd<-"../misc/realSFS fst print testing.fst.idx |head"
##use fancy new emperical bayes
cmd<- "../misc/realSFS fst stats testing.fst.idx "
system(cmd)
-> FST.Unweight:0.083316 Fst.Weight:0.119372
</pre>
=new fancy version=
<pre>
#this is with 2pops
../angsd -b list1  -anc hg19ancNoChr.fa -out pop1 -dosaf 1 -gl 1
../angsd -b list2  -anc hg19ancNoChr.fa -out pop1 -dosaf 1 -gl 1
../misc/realSFS pop1.saf.idx pop2.saf.idx >pop1.pop2.ml
../misc/realSFS fst index pop1.saf.idx pop2.saf.idx -sfs pop1.pop2.ml -fstout here
../misc/realSFS fst stats here.fst.idx
-> FST.Unweight:0.069395 Fst.Weight:0.042349

Latest revision as of 15:21, 30 July 2015

THIS PAGE IS OBSOLETE, PLEASE SEE FST AND PCA in the sidebar for the latest versions

Matteo Fumagalli has been working on methods to estimate Fst and doing PCA/Covariance based on ANGSD output files.

The main documentation for this is found here: https://github.com/mfumagalli/ngsTools

Fst

  1. Generate .saf files from each population using ANGSD SFS Estimation
    1. using a 2D-SFS as a prior, estimated using ngs2dSFS
    2. using marginal spectra as priors, estimated using realSFS

PCA

More information here: https://github.com/mfumagalli/ngsTools#ngscovar

cite

If you use these methods, you should cite the m. fumagalli paper http://www.ncbi.nlm.nih.gov/pubmed/23979584