Previous results

.read.v3 <- function(hdr) {

    .files.1 <- list.files(hdr %&% "/result/",
                           pattern = "eval", full.names=TRUE)

    .files.2 <- list.files(hdr %&% "/summary/",
                           pattern = "eval.gz", full.names=TRUE)

    .files <- c(.files.1, .files.2)

    ## method p1 p0 pa rseed power10 auprc auroc r r0 s0

    .ct <- c("character", "character","character","character",
             "integer",
             "double","double","double","double","double","double")

    .dat <- lapply(.files, .fread, header=TRUE, fill=TRUE,
                   colClasses = .ct)

    .dat <- .dat %>% 
        do.call(what=rbind) %>%
        mutate(p1 = as.numeric("0." %&% p1)) %>% 
        mutate(pa = as.numeric("0." %&% pa)) %>% 
        mutate(p0 = as.numeric("0." %&% p0))
}

.read.named.v3 <- function(x) {
    xx <-
        str_remove(x, "sim_v3_") %>%
        str_remove_all("[NMS]") %>% 
        str_split("[_B]") %>%
        unlist %>%
        as.integer

    names(xx) <- c("N","M","S","B")
    ret <- .read.v3(x) %>% as.data.table
    ret[, N := xx[1]]
    ret[, M := xx[2]]
    ret[, S := xx[3]]
    ret[, B := xx[4]]
    return(ret)
}

.file <- ".simulation.v3.rdata"
if(!file.exists(.file)) {

    result.dt <-
        list.files(".", "sim_v3") %>%
        lapply(.read.named.v3) %>% 
        do.call(what = rbind)

    save(list="result.dt", file=.file)
} else {
    load(.file)
}

result.dt <- result.dt[N %in% c(20, 40)]

col.lty.df <- 
    tibble(
        method = c("cocoa", "cocoa.pc.safe", "cocoa.pc.full",
                   "tot", "tot.pc.safe", "tot.pc.full", "avg", "mu", "cf"),
        label = c("CoCoA", "CoCoA + safe PC", "CoCoA + 10 PC",
                  "Total", "Tot + safe PC", "Tot + 10 PC",
                  "Average", "Bayesian", "confounder"),
        lty = c(1, 2, 3, 1, 2, 3, 1, 1, 1),
        pch = c(19, 21, 32, 17, 24, 32, 4, 5, 3),
        col = c("magenta", "magenta", "magenta", "#009900", "#009900", "#009900", "#00FF00", "gray40", "#000099")
    )

plot.v3 <- function(.sum) {

    .max.dt <- .sum[, .(y.max = max(y)), by = .(method)]
    .method <- .max.dt[order(.max.dt$y.max, decreasing=TRUE), .(method)] %>% unlist

    .temp <-
        tibble(method = .method) %>% 
        left_join(col.lty.df, by = "method") %>%
        na.omit

    .sum <- .sum %>%
        mutate(method = factor(method, .temp$method, .temp$label))

    .lty <- .temp$lty
    .cols <- .temp$col
    .pch <- .temp$pch

    .aes <- aes(x = p1, y = y,
                ymin = y - y.se,
                ymax = y + y.se,
                colour = method,
                lty = method)

    .df <- .sum %>%
        as_tibble %>% 
        mutate(.p0 = "expr. V[Y|X]=" %&% p0) %>%
        mutate(.pa = "label V[W|X]=" %&% pa)

    .ggplot(.df, .aes) +
        geom_linerange(.aes, lty=1, size=.2) +
        geom_line() +
        geom_point(aes(shape=method), stroke=.3, size=1.5, fill="white") +
        scale_shape_manual(values = .pch) +
        scale_colour_manual(values = .cols) +
        scale_linetype_manual(values = .lty)
}

1. Our counterfactual inference algorithm effectively adjusts confounding factors

.take.cor <- function(.dt, .method = c("cocoa","tot","cf")) {

    .dt[method %in% .method,
        .(y = mean(r), y.se = 2 * sd(r)),
        by = .(p1, p0, pa, method)] %>%
        as.data.table
}

.take.cor.0 <- function(.dt, .method = c("cocoa","tot","cf")) {

    .dt[method %in% .method,
        .(y = mean(r0), y.se = 2 * sd(r0)),
        by = .(p1, p0, pa, method)] %>%
        as.data.table
}

.take.cor.null <- function(.dt, .method = c("cocoa","tot","cf")) {

    .dt[method %in% .method,
        .(y = mean(s0), y.se = 2 * sd(s0)),
        by = .(p1, p0, pa, method)] %>%
        as.data.table
}

for(ss in c(2, 5)) {

    .dt <- result.dt[S==ss & B == (5- ss) & N == 40 & M == 50 & p1 <= .3]

    p1 <- 
        plot.v3(.take.cor(.dt)) +
        xlab("V[Y|W]") +
        ylab("correlation with the unconfounded\n(causal genes)") +
        ggtitle("#confounder: " %&% ss %&%
                ", #non-confounding covariate: " %&% (5 - ss))

    p2 <- 
        plot.v3(.take.cor.0(.dt)) +
        xlab("V[Y|W]") +
        ylab("correlation with the unconfounded\n(non-causal genes)")

    p3 <- 
        plot.v3(.take.cor.null(.dt)) +
        xlab("V[Y|W]") +
        ylab("correlation with the confounders\n(non-causal genes)")

    plt <- p1/p2/p3
    print(plt)

    .file <- fig.dir %&% "/Fig_Q1_S" %&% ss %&% ".pdf"
    .gg.save(.file, plot = plt, width=4, height=6)
}

PDF

2. CoCoA would not over-correct the variance even if there was no confounding effect

.take.roc <- function(.dt, .method = c("cocoa", "tot", "avg", "mu", "cf")) {

    .ret1 <- .dt[method %in% .method,
                 .(y = mean(auprc), y.se = 2 * sd(auprc)/sqrt(.N)),
                 by = .(p1, p0, pa, method)] %>%
        mutate(metric = "auprc")

    .ret2 <- .dt[method %in% .method,
                 .(y = mean(power10), y.se = 2 * sd(power10)/sqrt(.N)),
                 by = .(p1, p0, pa, method)] %>% 
        mutate(metric = "power10")

    rbind(.ret1, .ret2) %>%
        as.data.table
}

.dt <- result.dt[B==5 & N == 40 & p1 <= .3] %>% .take.roc

p1 <-
    plot.v3(.dt[metric=="power10"]) +
    ylab("Power (FDR < 10%)")

p2 <-
    plot.v3(.dt[metric=="auprc"]) +
    ylab("AUPRC")

plt <- p1/p2
print(plt)

.file <- fig.dir %&% "/Fig_Q2_no_confounding.pdf"
.gg.save(.file, plot = plt, width=4, height=4)

PDF

3. CoCoA improves statistical power

for(ss in c(0, 2, 5)){

    .dt <- result.dt[S==ss & N == 40 & M == 50] %>% .take.roc

    p1 <-
        plot.v3(.dt[metric=="power10"]) +
        ylab("Power (FDR < 10%)") +
        ggtitle("#confounder: " %&% ss)

    p2 <-
        plot.v3(.dt[metric=="auprc"]) +
        ylab("AUPRC")

    plt <- p1/p2
    print(plt)
    .file <- fig.dir %&% "/Fig_Q3_S" %&% ss %&% ".pdf"
    .gg.save(.file, plot = plt, width=4, height=4)
}

PDF

4. Additional principal component analysis can help, but over-correction by matrix factorization can hurt

.method <- c("cocoa", "tot", "cocoa.pc.safe", "tot.pc.safe")

.dt <- result.dt[M == 50 & pa == .5 & method %in% .method] %>%
    mutate(N = factor(N, c(40, 20), c("N = 40", "N = 20")))

.aes <- aes(x = as.factor(p1), y = auprc, fill = method)

plt <-
    .ggplot(.dt, .aes) +
    facet_grid(N ~ S) +
    ylab("AUPRC") +
    xlab("Variance Caused by Disease Effect") +
    geom_boxplot(outlier.size = 0, outlier.stroke = 0, size = .2) +
    scale_fill_brewer(palette = "Paired")

print(plt)

.file <- fig.dir %&% "/Fig_Q4_AUPRC.pdf"
.gg.save(.file, plot = plt, width=7, height=5)

PDF

5. What if we increase the sample size?

.methods <- c("cocoa", "tot")
.lab <- c("CoCoA", "Total")

.dt <- result.dt[S == 5 & M == 50 & pa == .5 & p0 == .5 & method %in% .methods] %>%
    mutate(method = factor(method, .methods, .lab))

.aes <- aes(x = as.factor(N),
            y = power10,
            fill = method)

p1 <-
    .gg.plot(.dt) +
    ylab("Power (FDR < 10%)") + xlab("# individuals") +
    facet_wrap(~ p1, nrow = 1) +
    geom_violin(.aes, size = .2, scale="width", draw_quantiles = c(.5)) +
    scale_y_continuous(limits = c(0, 1)) +
    scale_fill_manual(values = c("#f768a1", "gray")) +
    theme(legend.position = "bottom")

.aes <- aes(x = as.factor(N),
            y = auprc,
            fill = method)

p2 <-
    .gg.plot(.dt) +
    ylab("AUPRC") + xlab("# individuals") +
    facet_wrap(~ p1, nrow = 1) +
    geom_violin(.aes, size = .2, scale="width", draw_quantiles = c(.5)) +
    scale_y_continuous(limits = c(0, 1)) +
    scale_fill_manual(values = c("#f768a1", "gray")) +
    theme(legend.position = "bottom")

plt <- p1/p2
print(plt)

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

.file <- fig.dir %&% "/Fig_Q5_sample_size.pdf"
.gg.save(.file, plot = plt, width=6, height=4)

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

PDF

Simulation results for the causal discovery of DEGs

Yongjin Park

Previous results

1. Our counterfactual inference algorithm effectively adjusts confounding factors

2. CoCoA would not over-correct the variance even if there was no confounding effect

3. CoCoA improves statistical power

4. Additional principal component analysis can help, but over-correction by matrix factorization can hurt

5. What if we increase the sample size?