## ----setup, include = FALSE---------------------------------------------------
library(fluentGenomics)
dir <- system.file("extdata", package = "macrophage")

library(tximeta)
makeLinkedTxome(
  indexDir = file.path(dir, "gencode.v29_salmon_0.12.0"),
  source = "Gencode",
  organism = "Homo sapiens",
  release = "29",
  genome = "GRCh38",
  fasta = "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.transcripts.fa.gz",
  gtf = file.path(dir, "gencode.v29.annotation.gtf.gz"), # local version
  write = FALSE
)

knitr::opts_chunk$set(
  fig.align = "center"
)

## ----workflow, fig.cap = "(ref:workflow)", fig.align='center', out.width="\\textwidth", echo = FALSE----
knitr::include_graphics("workflow.png")

## ----read-cache, eval = FALSE-------------------------------------------------
# library(fluentGenomics)
# atac <- readRDS(cache_atac_se())

## ----dir, eval=FALSE----------------------------------------------------------
# dir <- "/path/to/quant/files"

## ----setdir-------------------------------------------------------------------
dir <- system.file("extdata", package = "macrophage")

## ----coldata-rna--------------------------------------------------------------
library(readr)
colfile <- file.path(dir, "coldata.csv")
coldata <- read_csv(colfile) |>
  dplyr::select(
    names,
    id = sample_id,
    line = line_id,
    condition = condition_name
  ) |>
  dplyr::mutate(
    files = file.path(dir, "quants", names, "quant.sf.gz"),
    line = factor(line),
    condition = relevel(factor(condition), "naive")
  )
coldata

## ----tximeta-run--------------------------------------------------------------
suppressPackageStartupMessages(library(SummarizedExperiment))
library(tximeta)
se <- tximeta(coldata, dropInfReps = TRUE)
se

## ----linkedtxome-ex, eval = FALSE---------------------------------------------
# makeLinkedTxome(
#   indexDir = file.path(dir, "gencode.v29_salmon_0.12.0"),
#   source = "Gencode",
#   organism = "Homo sapiens",
#   release = "29",
#   genome = "GRCh38",
#   fasta = "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.transcripts.fa.gz",
#   gtf = file.path(dir, "gencode.v29.annotation.gtf.gz"), # local version
#   write = FALSE
# )

## ----gse----------------------------------------------------------------------
gse <- summarizeToGene(se)

## ----coldata-atac, eval=FALSE-------------------------------------------------
# atac_coldata <- read_tsv("ATAC_sample_metadata.txt.gz") |>
#   select(
#     sample_id,
#     donor,
#     condition = condition_name
#   ) |>
#   mutate(condition = relevel(factor(condition), "naive"))

## ----mat-atac, eval = FALSE---------------------------------------------------
# atac_mat <- read_tsv("ATAC_cqn_matrix.txt.gz",
#                      skip = 1,
#                      col_names = c("rownames", atac_coldata[["sample_id"]]))
# rownames <- atac_mat[["rownames"]]
# atac_mat <- as.matrix(atac_mat[, -1])
# rownames(atac_mat) <- rownames

## ----peaks-atac, eval=FALSE---------------------------------------------------
# library(plyranges)
# peaks_df <- read_tsv("ATAC_peak_metadata.txt.gz", col_types = c("cidciicdc"))
# 
# peaks_gr <- peaks_df |>
#   as_granges(seqnames = chr) |>
#   select(peak_id = gene_id) |>
#   set_genome_info(genome = "GRCh38")

## ----atac-se, eval = FALSE----------------------------------------------------
# atac <- SummarizedExperiment(assays = list(cqndata = atac_mat),
#                              rowRanges = peaks_gr,
#                              colData = atac_coldata)

## ----setup-deseq--------------------------------------------------------------
library(DESeq2)
dds <- DESeqDataSet(gse, ~ line + condition)
# filter out lowly expressed genes
# at least 10 counts in at least 6 samples
keep <- rowSums(counts(dds) >= 10) >= 6
dds <- dds[keep, ]

## ----fit-model----------------------------------------------------------------
dds <- DESeq(dds)

## ----results-DFrame-----------------------------------------------------------
res <- results(dds,
               contrast = c("condition", "IFNg", "naive"),
               lfcThreshold = 1, alpha = 0.01)

## ----ma-plot, fig.cap = "(ref:maplot)"----------------------------------------
summary(res)
DESeq2::plotMA(res, ylim = c(-10, 10))

## ----results-GRanges----------------------------------------------------------
suppressPackageStartupMessages(library(plyranges))
de_genes <- results(dds,
                    contrast = c("condition", "IFNg", "naive"),
                    lfcThreshold = 1,
                    format = "GRanges") |>
  names_to_column("gene_id")
de_genes

## ----de-genes-----------------------------------------------------------------
de_genes <- de_genes |>
  filter(padj < 0.01) |>
  select(gene_id, de_log2FC = log2FoldChange, de_padj = padj)

## ----not-de-genes-------------------------------------------------------------
other_genes <- results(dds,
                       contrast = c("condition", "IFNg", "naive"),
                       lfcThreshold = 1,
                       altHypothesis = "lessAbs",
                       format = "GRanges") |>
  filter(padj < 0.01) |>
  names_to_column("gene_id") |>
  dplyr::select(gene_id,
                de_log2FC = log2FoldChange,
                de_padj = padj)

## ----limma, eval = FALSE------------------------------------------------------
# library(limma)
# design <- model.matrix(~ donor + condition, colData(atac))
# fit <- lmFit(assay(atac), design)
# fit <- eBayes(fit)
# idx <- which(colnames(fit$coefficients) == "conditionIFNg")
# tt <- topTable(fit, coef = idx, sort.by = "none", n = nrow(atac))

## ----peaks-tidy, eval = FALSE-------------------------------------------------
# atac_peaks <- rowRanges(atac) |>
#   remove_names() |>
#   mutate(
#     da_log2FC = tt$logFC,
#     da_padj = tt$adj.P.Val
#   ) |>
#   set_genome_info(genome = "hg38")
# 
# seqlevelsStyle(atac_peaks) <- "UCSC"

## ----load-peaks---------------------------------------------------------------
library(fluentGenomics)
peaks

## ----filter-peaks-------------------------------------------------------------
da_peaks <- peaks |>
  filter(da_padj < 0.01)

## ----slice-example------------------------------------------------------------
size <- length(de_genes)
slice(other_genes, sample.int(n(), size))

## ----boot-set-01--------------------------------------------------------------
# set a seed for the results
set.seed(2019 - 08 - 02)
boot_genes <- replicate(10,
                        slice(other_genes, sample.int(n(), size)),
                        simplify = FALSE)

## ----boot-set-02--------------------------------------------------------------
boot_genes <- bind_ranges(boot_genes, .id = "resample")

## ----combine-results----------------------------------------------------------
all_genes <- bind_ranges(
  de = de_genes,
  not_de = boot_genes,
  .id = "origin"
) |>
  mutate(
    origin = factor(origin, c("not_de", "de")),
    resample = ifelse(is.na(resample), 0L, as.integer(resample))
  )
all_genes

## ----resize-01----------------------------------------------------------------
all_genes <- all_genes |>
  anchor_5p() |>
  mutate(width = 1)

## ----resize-02----------------------------------------------------------------
all_genes <- all_genes |>
  anchor_center() |>
  mutate(width = 2 * 1e4)

## ----olap-join----------------------------------------------------------------
genes_olap_peaks <- all_genes |>
  join_overlap_left(da_peaks)
genes_olap_peaks

## ----reduce-ex01--------------------------------------------------------------
gene_peak_max_lfc <- genes_olap_peaks |>
  group_by(gene_id, origin) |>
  reduce_ranges_directed(
    peak_count = sum(!is.na(da_padj)) / n_distinct(resample),
    peak_max_lfc = max(abs(da_log2FC))
  )

## ----boxplot, fig.cap = "(ref:boxplot)"---------------------------------------
library(ggplot2)
gene_peak_max_lfc |>
  filter(peak_count > 0) |>
  as.data.frame() |>
  ggplot(aes(origin, peak_max_lfc)) +
  geom_boxplot()

## ----summarize-ex01-----------------------------------------------------------
origin_peak_lfc <- genes_olap_peaks |>
  group_by(origin) |>
  summarize(
    peak_count = sum(!is.na(da_padj)) / n_distinct(resample),
    lfc1_peak_count = sum(
      abs(da_log2FC) > 1, na.rm = TRUE
    ) / n_distinct(resample),
    lfc2_peak_count = sum(
      abs(da_log2FC) > 2, na.rm = TRUE
    ) / n_distinct(resample)
  )
origin_peak_lfc

## ----pivot-enrich-------------------------------------------------------------
origin_peak_lfc |>
  as.data.frame() |>
  tidyr::pivot_longer(cols = -origin) |>
  tidyr::pivot_wider(names_from = origin, values_from = value) |>
  mutate(enrichment = de / not_de)

## ----reduce-summarize---------------------------------------------------------
genes_olap_peaks |>
  group_by(gene_id, origin, resample) |>
  reduce_ranges_directed(
    lfc1 = sum(abs(da_log2FC) > 1, na.rm = TRUE),
    lfc2 = sum(abs(da_log2FC) > 2, na.rm = TRUE)
  ) |>
  group_by(origin) |>
  summarize(
    lfc1_gene_count = sum(lfc1 > 0) / n_distinct(resample),
    lfc1_peak_count = sum(lfc1) / n_distinct(resample),
    lfc2_gene_count = sum(lfc2 > 0) / n_distinct(resample),
    lfc2_peak_count = sum(lfc2) / n_distinct(resample)
  )

## ----count-fn-----------------------------------------------------------------
count_if_above_threshold <- function(var, thresholds) {
  lapply(thresholds, function(.) sum(abs(var) > ., na.rm = TRUE))
}

## ----thresholds---------------------------------------------------------------
thresholds <- da_peaks |>
  mutate(abs_lfc = abs(da_log2FC)) |>
  with(
    seq(min(abs_lfc), max(abs_lfc), length.out = 100)
  )

## ----reduce-ex02--------------------------------------------------------------
genes_peak_all_thresholds <- genes_olap_peaks |>
  group_by(gene_id, origin, resample) |>
  reduce_ranges_directed(
    value = count_if_above_threshold(da_log2FC, thresholds),
    threshold = list(thresholds)
  )
genes_peak_all_thresholds

## ----expand-summarize---------------------------------------------------------
origin_peak_all_thresholds <- genes_peak_all_thresholds |>
  expand_ranges() |>
  group_by(origin, threshold) |>
  summarize(
    gene_count = sum(value > 0) / n_distinct(resample),
    peak_count = sum(value) / n_distinct(resample)
  )
origin_peak_all_thresholds

## ----line-chart, fig.cap = "(ref:linechart)"----------------------------------
origin_threshold_counts <- origin_peak_all_thresholds |>
  as.data.frame() |>
  tidyr::pivot_longer(cols = -c(origin, threshold),
                      names_to = c("type", "var"),
                      names_sep = "_",
                      values_to = "count") |>
  select(-var)

origin_threshold_counts |>
  filter(type == "peak") |>
  tidyr::pivot_wider(names_from = origin, values_from = count) |>
  mutate(enrichment = de / not_de) |>
  ggplot(aes(x = threshold, y = enrichment)) +
  geom_line() +
  labs(x = "logFC threshold", y = "Relative Enrichment")

## ----line-chart2, fig.cap = "(ref:linechart2)"--------------------------------
origin_threshold_counts |>
  ggplot(aes(x = threshold,
             y = count + 1,
             color = origin,
             linetype = type)) +
  geom_line() +
  scale_y_log10()

## ----eval = FALSE-------------------------------------------------------------
# # development version from Github
# BiocManager::install("sa-lee/fluentGenomics")
# # version available from Bioconductor
# BiocManager::install("fluentGenomics")

## -----------------------------------------------------------------------------
sessionInfo()

