library(genio)
library(data.table)

# ── 1. Read BED ───────────────────────────────────────────────────────────────
bed   <- read_plink("example/plink")
geno  <- bed$X        # variants × samples, 0/1/2/NA

# ── 2. Hardcalls → dosages with Gaussian noise ────────────────────────────────
hardcall_to_dosage <- function(geno_matrix, noise_sd = 0.15, seed = 42) {
  set.seed(seed)
  out              <- geno_matrix + 0.0   # coerce to numeric
  not_na           <- !is.na(geno_matrix)
  out[not_na]      <- out[not_na] + rnorm(sum(not_na), 0, noise_sd)
  out[not_na]      <- pmin(pmax(out[not_na], 0), 2)
  out
}

dosage <- hardcall_to_dosage(geno, noise_sd = 0.15)

# ── 3. Write dosage VCF (DS field) ───────────────────────────────────────────
write_dosage_vcf <- function(dosage_matrix, bim, fam, out_vcf) {

  n_var  <- nrow(dosage_matrix)
  n_samp <- ncol(dosage_matrix)

  # VCF header
  header <- c(
    "##fileformat=VCFv4.2",
    '##FORMAT=<ID=DS,Number=1,Type=Float,Description="Dosage">',
    paste(c("#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",
            fam$id), collapse = "\t")
  )

  # Build one row per variant
  # DS values: round to 4 dp to keep file size sane
  rows <- vapply(seq_len(n_var), function(i) {
    ds <- ifelse(is.na(dosage_matrix[i, ]),
                 ".",
                 formatC(dosage_matrix[i, ], digits = 4, format = "f"))
    paste(c(
      bim$chr[i],          # CHROM
      bim$pos[i],          # POS
      bim$id[i],           # ID
      bim$ref[i],          # REF  (allele 2 in BIM)
      bim$alt[i],          # ALT  (allele 1 in BIM)
      ".", ".", ".",        # QUAL FILTER INFO
      "DS",                # FORMAT
      ds                   # one DS value per sample
    ), collapse = "\t")
  }, character(1))

  writeLines(c(header, rows), con = out_vcf)
  message("VCF written: ", out_vcf)
}

write_dosage_vcf(dosage, bim = bed$bim, fam = bed$fam,
                 out_vcf = "data/my_cohort_dosage.vcf")

# ── 4. Convert VCF → PGEN via PLINK2 ─────────────────────────────────────────
vcf_to_pgen <- function(vcf_file, out_prefix, plink2 = "plink2") {
  args <- c(
    "--vcf",        vcf_file,
    "dosage=DS",             # tell PLINK2 to ingest the DS field
    "--make-pgen",
    "--out",        out_prefix
  )
  result <- system2(plink2, args, stdout = TRUE, stderr = TRUE)
  cat(result, sep = "\n")

  if (!file.exists(paste0(out_prefix, ".pgen")))
    stop("PLINK2 conversion failed — check output above")

  message("PGEN written: ", out_prefix, ".pgen/.pvar/.psam")
}

vcf_to_pgen("data/my_cohort_dosage.vcf", "data/my_cohort_dosage")

# ── 5. Verify with pgenlibr ───────────────────────────────────────────────────
# library(pgenlibr)

# pvar <- NewPvar("data/my_cohort_dosage.pvar")
# pgen <- NewPgen("data/my_cohort_dosage.pgen", pvar = pvar)
# buf  <- Buf(pgen)

# ReadDosages(pgen, buf, 1L)
# cat("Variant 1 dosages (first 10 samples):\n")
# print(round(head(buf, 10), 4))

# ClosePgen(pgen)
