DESeq2

博客

2024-03-23 15:03

library(readr)

library(dplyr)

library(ggplot2)

library(tidyverse)

library(DESeq2)

mycounts <- read.csv("C:/Users/~/Desktop/datacounts.csv")

metadata <- read.csv("C:/Users/~/Desktop/datametadata.csv")

countData <- mycounts[!(duplicated(mycounts[[1]]) | duplicated(mycounts[[1]], fromLast=TRUE)), ]

rownames(mycounts) = make.names(mycounts$gene, unique=TRUE)

mycounts <- as.data.frame(mycounts)

metadata <- as.data.frame(metadata)

head(mycounts)

head(metadata)

class(mycounts)

class(metadata)

dds <- DESeqDataSetFromMatrix(countData=mycounts,

colData=metadata,

design=~1,

tidy=TRUE)

dds

sizeFactors(dds)

dispersions(dds)

dds <- DESeq(dds)

res = results(dds, contrast=c("Group", "CNL", "Target"), tidy=TRUE)

res <- as_tibble(res)

res

res %>%

filter(padj<0.05) %>%

write_csv("sigresults.csv")

#smallestGroupSize <- 3

#keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize

#dds <- dds[keep,]

vsdata <- vst(dds, blind=FALSE)

plotPCA(vsdata, intgroup="Group") + geom_label(aes(label = colnames(vsdata))) + labs(title=" ")

df <- as.data.frame(colData(vsdata)[,c("Group","R","NR")])

H <- c("A","B","C")])

mat <- assay(vsdata)[ H, ]

mat <- as.data.frame(mat)

mat1 <- mat %>% arrange(desc(TD))

pheatmap(mat1, annotation_col=df, cluster_rows=F, cluster_cols=F,main = "")

plotCounts(dds, gene="MYC", intgroup="Group", returnData=TRUE) %>%

ggplot(aes(Group, count)) + geom_boxplot(aes(fill=Group)) + scale_y_log10() + ggtitle("MYC")

vsdata <- varianceStabilizingTransformation(dds) (less row)

Volcanoplot

p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue))) + geom_point()

# Add more simple "theme"

p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue))) + geom_point() + theme_minimal()

# Add vertical lines for log2FoldChange thresholds, and one horizontal line for the p-value threshold

p2 <- p + geom_vline(xintercept=c(-0.6, 0.6), col="red") +

geom_hline(yintercept=-log10(0.05), col="red")

# The significantly differentially expressed genes are the ones found in the upper-left and upper-right corners.

# Add a column to the data frame to specify if they are UP- or DOWN- regulated (log2FoldChange respectively positive or negative)

# add a column of NAs

res$diffexpressed <- "NO"

# if log2Foldchange > 0.6 and pvalue < 0.05, set as "UP"

res$diffexpressed[res$log2FoldChange > 0.6 & res$pvalue < 0.05] <- "UP"

# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"

res$diffexpressed[res$log2FoldChange < -0.6 & res$pvalue < 0.05] <- "DOWN"

# Re-plot but this time color the points with "diffexpressed"

p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed)) + geom_point() + theme_minimal()

# Add lines as before...

p2 <- p + geom_vline(xintercept=c(-0.6, 0.6), col="red") +

geom_hline(yintercept=-log10(0.05), col="red")

## Change point color

# 1. by resfault, it is assigned to the categories in an alphabetical orresr):

p3 <- p2 + scale_color_manual(values=c("blue", "black", "red"))

# 2. to automate a bit: ceate a named vector: the values are the colors to be used, the names are the categories they will be assigned to:

mycolors <- c("blue", "red", "black")

names(mycolors) <- c("DOWN", "UP", "NO")

p3 <- p2 + scale_colour_manual(values = mycolors)

# Now write down the name of genes besires the points...

# Create a new column "reslabel" to res, that will contain the name of genes differentially expressed (NA in case they are not)

res$delabel <- NA

res$delabel[res$diffexpressed != "NO"] <- res$row[res$diffexpressed != "NO"]

ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) +

geom_point() +

theme_minimal() +

geom_text()

library(ggrepel)

# plot adding up all layers we have seen so far

ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) +

geom_point() +

theme_minimal() +

geom_text_repel() +

scale_color_manual(values=c("blue", "black", "red")) +

geom_vline(xintercept=c(-0.6, 0.6), col="red") +

geom_hline(yintercept=-log10(0.05), col="red")

plotCounts(dds, gene="FSTL5", intgroup="Treatment", returnData=TRUE) %>%

ggplot(aes(Treatment, count)) + geom_boxplot(aes(fill=Treatment)) + scale_y_log10() + ggtitle("FSTL5")

P + scale_x_discrete(name ="Group", limits=c("CNL","Target"))

library("genefilter")

topVarGenes <- head(order(-rowVars(assay(rld))),30)

The heatmap becomes more interesting if we do not look at absolute expression strength but rather at the amount by which each gene deviates in a specific sample from the gene’s average across all samples. Hence, we center each genes’ values across samples, and plot a heatmap. We provide the column side colors to help identify the treated samples (in blue) from the untreated samples (in grey).

vsdata <- vst(dds, blind=FALSE)

plotPCA(vsdata, intgroup="Group")

plotPCA(vsdata, intgroup="Group") + geom_label(aes(label = colnames(vsdata)))

topVarGenes <- head(order(-rowVars(assay(vsdata))),50)

mat <- assay(vsdata)[ topVarGenes, ]

mat <- mat - rowMeans(mat)

df <- as.data.frame(colData(vsdata)[,c("ID","R","D","M","Treatment")])

pheatmap(mat, annotation_col=df, cluster_rows=FALSE, cluster_cols=FALSE)

res1 <- res %>% filter(padj<0.05) %>% arrange(padj)

res2 <- res1[1:100,] %>% arrange(desc(log2FoldChange))

Sig <- res2$row

mat <- assay(vsdata)[ Sig, ]

mat <- mat - rowMeans(mat)

df <- as.data.frame(colData(vsdata)[,c("ID","Treatment","HPV","PIK3CAmutation","Response.Primary","Responder")])

df <- as.data.frame(colData(vsdata)[,c("ID","Group")])

pheatmap(mat, annotation_col=df, cluster_rows=F, cluster_cols=F)

mat <- assay(vsdata)[ G, ]

mat1 <- as.data.frame(mat)

mat1 <- mat1 %>% arrange(desc(TD3))

pheatmap(mat1, annotation_col=df, cluster_rows=F, cluster_cols=F)

zmat <- scale(tmat1, center = TRUE, scale = TRUE) upon column

sacPlsda <- opls(vsdata, "Treatment", orthoI = 1)

PLS-DA

GSEA

library(org.Hs.eg.db)

ens2symbol <- AnnotationDbi::select(org.Hs.eg.db,

key=res$row,

columns="SYMBOL",

keytype="SYMBOL")

ens2symbol <- as_tibble(ens2symbol)

ens2symbol

res2 <- res %>%

dplyr::select(SYMBOL, stat) %>%

na.omit() %>%

distinct() %>%

group_by(SYMBOL) %>%

summarize(stat=mean(stat))

res2

library(fgsea)

ranks <- deframe(res2)

head(ranks, 20)

pathways.hallmark <- gmtPathways("C:/Users/~/Desktop/GSEA dataset/ h.all.v2023.1.Hs.symbols.gmt")

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, eps = 0.0,

minSize = 15,

maxSize = 500)

fgseaResTidy <- fgseaRes %>%

as_tibble() %>%

arrange(desc(NES))

# Show in a nice table:

fgseaResTidy %>%

dplyr::select(-leadingEdge, -ES, -nMoreExtreme) %>%

arrange(padj)

ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.05)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA") +
theme_minimal()

plotEnrichment(pathways.hallmark[["HALLMARK_APOPTOSIS"]],

ranks) + labs(title=" HALLMARK_APOPTOSIS")

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

go <- fgsea(pathways=gmtPathways("C:/Users/~/Desktop/GSEA dataset/c5.all.v7.2.symbols.gmt"), ranks, nperm=1000) %>% as_tibble()

go <- as.matrix(go)

write.csv(go, "go.csv")

	pathway	pval	padj	NES	size
1	HALLMARK_E2F_TARGETS	0.003125	0.016	1.50255883270908	196
2	HALLMARK_P53_PATHWAY	0.00307692307692308	0.016	1.46976609146818	193

ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +

geom_col(aes(fill=padj<0.05)) +

coord_flip() +

labs(x="Pathway", y="Normalized Enrichment Score",

title="Hallmark pathways NES from GSEA") +

theme_minimal()

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

go <- fgsea(pathways=gmtPathways("C:/Users/~/Desktop/GSEA dataset/c5.all.v7.2.symbols.gmt"), ranks, nperm=1000) %>% as_tibble()

Re <- fgsea(pathways=gmtPathways("C:/Users/~/Downloads/c2.cp.reactome.v2023.1.Hs.symbols.gmt"), ranks, nperm=1000) %>%

as_tibble()

go <- as.matrix(go)

write.csv(go, "go.csv")

Re <- as.matrix(Re)

write.csv(Re, "Re.csv")

hallmark <- fgsea(pathways=gmtPathways("C:/Users/~/Desktop/GSEA dataset/h.all.v7.2.symbols.gmt "), ranks, nperm=1000) %>% as_tibble()

hallmark <- as.matrix(hallmark)

write.csv(hallmark, "hallmark Bulk PRE HPV+ R vs NR.csv")

plotEnrichment(pathways.hallmark[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]],

ranks) + labs(title="HALLMARK_INTERFERON_GAMMA_RESPONSE")

戳这里 Claim your page

来源: 文学城-longteng24