.hdr <- "result/step1/final_matrix"
.data <- fileset.list(.hdr)

.hash.hdr <- "result/step1/hash"
.hash.data <- fileset.list(.hash.hdr)
.hash.info <- read.hash(.hash.data) %>%
    dplyr::select(-hash)

0. Surface Marker Proteins

1. Adjust batch-specific displacement using Batch-balancing k-Nearest Neighbour graph

.file <- "result/step2/bbknn.rds"
.mkdir(dirname(.file))

if.needed(.file, {

    .cells <-
        fread(.data$col, header=F, col.names = "tag") %>%
        parse.tag()

    .svd <- rcpp_mmutil_svd(.data$mtx, RANK=50, TAKE_LN=T, EM_ITER = 20, NUM_THREADS=16)

    .bbknn <- rcpp_mmutil_bbknn(r_svd_v = .svd$V,
                                r_svd_u = .svd$U,
                                r_svd_d = .svd$D,
                                r_batches = .cells$batch,
                                knn = 50,
                                NUM_THREADS = 16,
                                USE_SINGULAR_VALUES = F)

    saveRDS(.bbknn, .file)
})

.bbknn <- readRDS(.file)
.file <- "result/step2/annotation_bbknn.txt.gz"
if.needed(.file, {

    .annot.out <-
        rcpp_mmutil_annotate_columns(
            pos_labels = list(p1=.pos.markers),
            r_neg_labels = list(n1=.neg.markers),
            r_U = .bbknn$U,
            r_D = .bbknn$D,
            r_V = .bbknn$factors.adjusted,
            row_file = .data$row,
            col_file = .data$col,
            EM_TOL = 1e-8,
            EM_ITER = 500,
            TAKE_LN = F)

    .col <- c("tag", "celltype", "prob", "ln.prob")
    names(.annot.out$annotation) <- .col
    annot.dt <- setDT(.annot.out$annotation) %>%
        parse.tag()

    fwrite(annot.dt, .file)
})

2. Make major cell type annotation based on marker proteins + Leiden clustering

.file <- "Tab/step2_cell_type.txt.gz"
if.needed(.file, {
    .cells <-
        fread(.data$col, header=F, col.names = "tag") %>%
        parse.tag()

    .leiden <- run.leiden(.bbknn$knn.adj, .cells$tag, res=1, nrepeat = 100)

    .tab <-
        .leiden %>% 
        left_join(annot.dt) %>%
        na.omit()

    ## identify problematic clusters
    .fraction <- .tab[, .(.N), by = .(celltype, membership, component)]
    .fraction[, Ntot := sum(`N`), by = .(membership, component)]
    .mem.qc <- .fraction[order(`N`, decreasing = T), head(.SD, 1), by = .(membership, component)]

    .remove <- .mem.qc[`N` / `Ntot` < .5, .(membership, component)]

    ## Map cell group membership to cell type
    .ct.map <- .mem.qc[, .(membership, component, celltype)]

    final.cell.type <- .leiden %>%
        anti_join(.remove) %>%
        na.omit() %>% 
        left_join(.ct.map) %>%
        as.data.table() %>%
        parse.tag()
    fwrite(final.cell.type, .file)
})
final.cell.type <- fread(.file)
.file <- "Tab/step2_umap_coord.txt.gz"
if.needed(.file, {
    .umap <- uwot::tumap(.bbknn$factors.adjusted,
                         learning_rate = .1,
                         n_epochs = 2000,
                         n_sgd_threads = 16,
                         init = "laplacian",
                         verbose = T,
                         init_sdev = .01,
                         scale = F)

    .tags <- readLines(.data$col)
    colnames(.umap) <- "UMAP" %&% 1:ncol(.umap)
    umap.dt <- data.table(.umap, tag = .tags)
    fwrite(umap.dt, .file)
})
umap.dt <- fread(.file) %>% 
    left_join(final.cell.type) %>% 
    left_join(.hash.info) %>% 
    na.omit()
.file <- "Tab/step2_tsne_coord.txt.gz"
if.needed(.file, {
    .tsne <- Rtsne::Rtsne(.bbknn$factors.adjusted,
                          num_threads = 16,
                          verbose = T,
                          check_duplicates = F)

    .tags <- readLines(.data$col)
    colnames(.tsne$Y) <- "tSNE" %&% 1:ncol(.tsne$Y)
    tsne.dt <- data.table(.tsne$Y, tag = .tags)
    fwrite(tsne.dt, .file)
})
tsne.dt <- fread(.file) %>%
    left_join(final.cell.type) %>%
    left_join(.hash.info) %>% 
    na.omit()

t-UMAP

p1 <- .gg.plot(umap.dt[sample(.N)], aes(UMAP1, UMAP2, color=celltype)) +
    ggrastr::rasterise(geom_point(stroke = 0, size=1), dpi=300) +
    theme(legend.position = c(0,0), legend.justification = c(0,0)) +
    ggtitle("cell types") +
    scale_color_brewer("", palette = "Paired")

p2 <- .gg.plot(umap.dt[sample(.N)], aes(UMAP1, UMAP2, color=as.factor(batch))) +
    ggrastr::rasterise(geom_point(stroke = 0, size=1), dpi=300) +
    theme(legend.position = c(0,0), legend.justification = c(0,0)) +
    ggtitle("batch membership") +
    scale_color_brewer("", palette = "Set3")

p3 <- .gg.plot(umap.dt[sample(.N)], aes(UMAP1, UMAP2, color=as.factor(membership))) +
    ggrastr::rasterise(geom_point(stroke = 0, size=1), dpi=300) +
    theme(legend.position = c(0,0), legend.justification = c(0,0)) +
    ggtitle("clustering") +
    scale_color_brewer("", palette = "Set1")

p4 <- .gg.plot(umap.dt[sample(.N)], aes(UMAP1, UMAP2, color=as.factor(disease))) +
    ggrastr::rasterise(geom_point(stroke = 0, size=1), dpi=300) +
    theme(legend.position = c(0,0), legend.justification = c(0,0)) +
    ggtitle("clustering") +
    scale_color_brewer("", palette = "Dark2")

plt <- p1 | p2 | p3 | p4
print(plt)

PDF

tSNE

p1 <-
    .gg.plot(tsne.dt[sample(.N)], aes(tSNE1, tSNE2, color=celltype)) +
    ggrastr::rasterise(geom_point(stroke = 0, size=1), dpi=300) +
    theme(legend.position = c(0,0), legend.justification = c(0,0)) +
    ggtitle("cell types") +
    scale_color_brewer("", palette = "Paired")

p2 <-
    .gg.plot(tsne.dt[sample(.N)], aes(tSNE1, tSNE2, color=as.factor(batch))) +
    ggrastr::rasterise(geom_point(stroke = 0, size=1), dpi=300) +
    theme(legend.position = c(0,0), legend.justification = c(0,0)) +
    ggtitle("batch membership") +
    scale_color_brewer("", palette = "Set3")

p3 <-
    .gg.plot(tsne.dt[sample(.N)], aes(tSNE1, tSNE2, color=as.factor(membership))) +
    ggrastr::rasterise(geom_point(stroke = 0, size=1), dpi=300) +
    theme(legend.position = c(0,0), legend.justification = c(0,0)) +
    ggtitle("clustering") +
    scale_color_brewer("", palette = "Set1")

p4 <-
    .gg.plot(tsne.dt[sample(.N)], aes(tSNE1, tSNE2, color=disease)) +
    ggrastr::rasterise(geom_point(stroke = 0, size=1), dpi=300) +
    theme(legend.position = c(0,0), legend.justification = c(0,0)) +
    ggtitle("clustering") +
    scale_color_brewer("", palette = "Dark2")

plt <- p1 | p2 | p3 | p4
print(plt)

PDF

3. Confirm cell type assignment with raw surface marker proteins

Two-dimensional density plot on the raw CD marker concentrations.

.ct <- c("mTreg","nTreg","mTconv","nTconv")
plt <- plt.scatter.ct.2(.ct, final.cell.type, marker.raw.mtx)
print(plt)

PDF

4. Confirm by other marker genes/proteins (normalized expression)

.markers <-
    c("FOXP3", "ID3", "BACH2", "CXCR3", "PRDM1", "SGK1", "TCF7", "LEF1",
      "SELL", "IL2RA", "IL7R", "IKZF2", "CCR6", "CCR4", "CCR7", "CTLA4",
      "HLA-DRA", "CD25", "CD127", "CD183", "CD196",
      "CD197", "CD194", "CD45RA", "CD45RO",
      "HLA") %>%
    unique

UMAP for the marker genes and proteins

.markers <- sort(as.character(unique(marker.dt$marker)))
for(g in .markers){
    plt <- plot.marker.scatter(g, show.tsne = F)
    print(plt)
    .file <- fig.dir %&% "/Fig_marker_" %&% g %&% "_umap.pdf"
    .gg.save(filename = .file, plot = plt, width=3, height=5)
}

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

tSNE for the marker genes and proteins

.markers <- sort(as.character(unique(marker.dt$marker)))
for(g in .markers){
    plt <- plot.marker.scatter(g, show.tsne = T)
    print(plt)
    .file <- fig.dir %&% "/Fig_marker_" %&% g %&% "_tsne.pdf"
    .gg.save(filename = .file, plot = plt, width=3, height=3)
}

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

5. Basic statistics for the first round annotation (22,403 cells)

PDF

PDF

6. PRDM1 short vs. long

prdm.dt <- fread("data/PRDM1/PRDM1_SL.csv.gz")

.dt <-
    copy(final.cell.type) %>%
    left_join(prdm.dt) %>%
    left_join(umap.dt) %>%
    left_join(tsne.dt) %>%
    na.omit() %>%
    as.data.table()

.dt[, membership := as.factor(`membership`)]
.aes <- aes(UMAP1, UMAP2, colour=pmin(PRDM1_short, 3))

p1 <-
    ggplot(.dt[order(PRDM1_short)], .aes) +
    xlab("UMAP 1") + ylab("UMAP 2") + prdm.thm +
    ggrastr::rasterise(geom_point(stroke=0, size=1), dpi = 300) +
    scale_colour_distiller("PRDM1\nshort",
                           palette="RdPu",
                           direction=1)

.aes <- aes(UMAP1, UMAP2, colour = pmin(PRDM1_long, 3))

p2 <-
    ggplot(.dt[order(PRDM1_long)], .aes) +
    xlab("UMAP 1") + ylab("UMAP 2") + prdm.thm +
    ggrastr::rasterise(geom_point(stroke=0, size=1), dpi = 300) +
    scale_colour_distiller("PRDM1\nlong",
                           palette="RdPu",
                           direction=1)

plt <- p1 | p2
print(plt)

PDF

.aes <- aes(tSNE1, tSNE2, colour=pmin(PRDM1_short, 3))

p1 <-
    ggplot(.dt[order(PRDM1_short)], .aes) +
    xlab("tSNE 1") + ylab("tSNE 2") + prdm.thm +
    ggrastr::rasterise(geom_point(stroke=0, size=1), dpi = 300) +
    scale_colour_distiller("PRDM1\nshort",
                           palette="RdPu",
                           direction=1)

.aes <- aes(tSNE1, tSNE2, colour = pmin(PRDM1_long, 3))

p2 <-
    ggplot(.dt[order(PRDM1_long)], .aes) +
    xlab("tSNE 1") + ylab("tSNE 2") + prdm.thm +
    ggrastr::rasterise(geom_point(stroke=0, size=1), dpi = 300) +
    scale_colour_distiller("PRDM1\nlong",
                           palette="RdPu",
                           direction=1)

plt <- p1 | p2
print(plt)

PDF

5. Tables