.read.named.v5 <- function(x) {
    xx <-
      str_remove(x, "sim_v5_") %>%
      str_remove_all("[NMS]") %>% 
      str_split("[_B]") %>%
      unlist %>%
      as.integer

    names(xx) <- c("N","M","S","B")
    ret <- .read.v5(x) %>% as.data.table
    ret[, N := xx[1]]
    ret[, M := xx[2]]
    ret[, S := xx[3]]
    ret[, B := xx[4]]
    return(ret)
}

.file <- ".simulation.v5.rdata"
if(!file.exists(.file)) {
  result.dt <- .read.named.v5("sim_v5_N40_M50_S5B0")
  save(list="result.dt", file=.file)
} else {
  load(.file)
}
result.dt[, knn := factor(knn, c(1,10,50,100,200))]
result.dt[, p1.lab := paste("Var. by Causal ", p1*100, "%")]
result.dt[, p0.lab := paste("Var. by Confounder", p0*100, "%")]

1. Performance of causal gene discovery

.aes <- aes(x=as.factor(p1*100),
            y=auprc,
            fill=knn)

plt <-
  .ggplot(result.dt, .aes) +
  facet_grid(~p0.lab) +
  geom_boxplot(size=.3, outlier.size=0, outlier.stroke=0) +
  scale_fill_brewer("# neighbours") +
  ylab("AUPRC") +
  xlab("Variance Caused by Disease Effect (%)")

print(plt)

.file <- fig.dir %&% "/Fig_knn_auprc.pdf"
.gg.save(.file, plt, width=6, height=3)

PDF

2. Empirical Discovery Rate

We sought to control false discovery rates without looking at actual causal gene labels using John Storey’s empirical FDR calibration method implemented in qvalue package.

When FDR controlled at 1% by qvalue package

.aes <- aes(x=as.factor(p1*100),
            y=edr01,
            fill=knn)

plt <- 
  .ggplot(result.dt, .aes) +
  facet_grid(~p0.lab) +
  geom_boxplot(size=.3, outlier.size=0, outlier.stroke=0) +
  scale_y_continuous(minor_breaks = c(), breaks = c(1e-2, 5e-2, (1:9)/10)) +
  geom_hline(yintercept = .01, lty = 2, colour = 2, size = .5) +
  scale_fill_brewer("# neighbours") +
  ylab("Discovery Rate (out of 10k genes)") +
  xlab("Variance Caused by Disease Effect (%)")

print(plt)

.file <- fig.dir %&% "/Fig_knn_edr01.pdf"
.gg.save(.file, plt, width=6, height=3)

PDF

When FDR controlled at 5% by qvalue package

.aes <- aes(x=as.factor(p1*100),
            y=edr05,
            fill=knn)

plt <- 
  .ggplot(result.dt, .aes) +
  facet_grid(~p0.lab) +
  geom_boxplot(size=.3, outlier.size=0, outlier.stroke=0) +
  scale_y_continuous(minor_breaks = c(), breaks = c(1e-2, 5e-2, (1:9)/10)) +
  geom_hline(yintercept = .05, lty = 2, colour = 2, size = .5) +
  scale_fill_brewer("# neighbours") +
  ylab("Discovery Rate (out of 10k genes)") +
  xlab("Variance Caused by Disease Effect (%)")

print(plt)

.file <- fig.dir %&% "/Fig_knn_edr05.pdf"
.gg.save(.file, plt, width=6, height=3)

PDF

When FDR controlled at 10% by qvalue package

.aes <- aes(x=as.factor(p1*100),
            y=edr10,
            fill=knn)

plt <- 
  .ggplot(result.dt, .aes) +
  facet_grid(~p0.lab) +
  geom_boxplot(size=.3, outlier.size=0, outlier.stroke=0) +
  scale_y_continuous(minor_breaks = c(), breaks = c(1e-2, 5e-2, (1:9)/10)) +
  geom_hline(yintercept = .10, lty = 2, colour = 2, size = .5) +
  scale_fill_brewer("# neighbours") +
  ylab("Discovery Rate (out of 10k genes)") +
  xlab("Variance Caused by Disease Effect (%)")

print(plt)

.file <- fig.dir %&% "/Fig_knn_edr10.pdf"
.gg.save(.file, plt, width=6, height=3)

PDF

3. Measuring actual (empirical) False Discovery Rate

When FDR controlled at 1% by qvalue package

.aes <- aes(x=as.factor(p1*100),
            y=efdr01,
            fill=knn)

plt <- 
  .ggplot(result.dt, .aes) +
  facet_grid(~p0.lab) +
  geom_boxplot(size=.3, outlier.size=0, outlier.stroke=0) +
  scale_y_continuous(minor_breaks = c(), breaks = c(1e-2, 5e-2, (1:9)/10)) +
  geom_hline(yintercept = .01, lty = 2, colour = 2, size = .5) +
  scale_fill_brewer("# neighbours") +
  ylab("empirical False Discovery Rate") +
  xlab("Variance Caused by Disease Effect (%)")

print(plt)

.file <- fig.dir %&% "/Fig_knn_efdr01.pdf"
.gg.save(.file, plt, width=6, height=3)

PDF

When FDR controlled at 5% by qvalue package

.aes <- aes(x=as.factor(p1*100),
            y=efdr05,
            fill=knn)

plt <- 
  .ggplot(result.dt, .aes) +
  facet_grid(~p0.lab) +
  geom_boxplot(size=.3, outlier.size=0, outlier.stroke=0) +
  scale_y_continuous(minor_breaks = c(), breaks = c(1e-2, 5e-2, (1:9)/10)) +
  geom_hline(yintercept = .05, lty = 2, colour = 2, size = .5) +
  scale_fill_brewer("# neighbours") +
  ylab("empirical False Discovery Rate") +
  xlab("Variance Caused by Disease Effect (%)")

print(plt)

.file <- fig.dir %&% "/Fig_knn_efdr05.pdf"
.gg.save(.file, plt, width=6, height=3)

PDF

When FDR controlled at 10% by qvalue package

.aes <- aes(x=as.factor(p1*100),
            y=efdr10,
            fill=knn)

plt <- 
  .ggplot(result.dt, .aes) +
  facet_grid(~p0.lab) +
  geom_boxplot(size=.3, outlier.size=0, outlier.stroke=0) +
  scale_y_continuous(minor_breaks = c(), breaks = c(1e-2, 5e-2, (1:9)/10)) +
  geom_hline(yintercept = .1, lty = 2, colour = 2, size = .5) +
  scale_fill_brewer("# neighbours") +
  ylab("empirical False Discovery Rate") +
  xlab("Variance Caused by Disease Effect (%)")

print(plt)

.file <- fig.dir %&% "/Fig_knn_efdr10.pdf"
.gg.save(.file, plt, width=6, height=3)

PDF