longteng

随笔记录, 工作日记.
正文

DESeq2

(2024-03-23 15:45:30) 下一个

library(readr)

library(dplyr)

library(ggplot2)

library(tidyverse)

library(DESeq2)

 

mycounts <- read.csv("C:/Users/~/Desktop/datacounts.csv")

metadata <-  read.csv("C:/Users/~/Desktop/datametadata.csv")

countData <- mycounts[!(duplicated(mycounts[[1]]) | duplicated(mycounts[[1]], fromLast=TRUE)), ]

rownames(mycounts) = make.names(mycounts$gene, unique=TRUE)

mycounts <- as.data.frame(mycounts)

metadata <- as.data.frame(metadata)

head(mycounts)

head(metadata)

class(mycounts)

class(metadata)

dds <- DESeqDataSetFromMatrix(countData=mycounts, 

                              colData=metadata, 

                              design=~1, 

                              tidy=TRUE)

dds

sizeFactors(dds)

dispersions(dds)

dds <- DESeq(dds)

res = results(dds, contrast=c("Group", "CNL", "Target"), tidy=TRUE)

res <- as_tibble(res)

res

res %>% 

    filter(padj<0.05) %>% 

    write_csv("sigresults.csv")

#smallestGroupSize <- 3

#keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize

#dds <- dds[keep,]

vsdata <- vst(dds, blind=FALSE)

plotPCA(vsdata, intgroup="Group") + geom_label(aes(label = colnames(vsdata))) + labs(title=" ")

df <- as.data.frame(colData(vsdata)[,c("Group","R","NR")])

H <- c("A","B","C")])

mat <- assay(vsdata)[ H, ]

mat <- as.data.frame(mat)

mat1 <- mat %>% arrange(desc(TD))

pheatmap(mat1, annotation_col=df, cluster_rows=F, cluster_cols=F,main = "")

plotCounts(dds, gene="MYC", intgroup="Group", returnData=TRUE) %>% 

  ggplot(aes(Group, count)) + geom_boxplot(aes(fill=Group)) + scale_y_log10() + ggtitle("MYC")

vsdata <- varianceStabilizingTransformation(dds) (less row)

Volcanoplot

p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue))) + geom_point()

# Add more simple "theme"

p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue))) + geom_point() + theme_minimal()

# Add vertical lines for log2FoldChange thresholds, and one horizontal line for the p-value threshold 

p2 <- p + geom_vline(xintercept=c(-0.6, 0.6), col="red") +

    geom_hline(yintercept=-log10(0.05), col="red")

# The significantly differentially expressed genes are the ones found in the upper-left and upper-right corners.

# Add a column to the data frame to specify if they are UP- or DOWN- regulated (log2FoldChange respectively positive or negative)

# add a column of NAs

res$diffexpressed <- "NO"

# if log2Foldchange > 0.6 and pvalue < 0.05, set as "UP" 

res$diffexpressed[res$log2FoldChange > 0.6 & res$pvalue < 0.05] <- "UP"

# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"

res$diffexpressed[res$log2FoldChange < -0.6 & res$pvalue < 0.05] <- "DOWN"

# Re-plot but this time color the points with "diffexpressed"

p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed)) + geom_point() + theme_minimal()

# Add lines as before...

p2 <- p + geom_vline(xintercept=c(-0.6, 0.6), col="red") +

        geom_hline(yintercept=-log10(0.05), col="red")

## Change point color 

# 1. by resfault, it is assigned to the categories in an alphabetical orresr):

p3 <- p2 + scale_color_manual(values=c("blue", "black", "red"))

# 2. to automate a bit: ceate a named vector: the values are the colors to be used, the names are the categories they will be assigned to:

mycolors <- c("blue", "red", "black")

names(mycolors) <- c("DOWN", "UP", "NO")

p3 <- p2 + scale_colour_manual(values = mycolors)

# Now write down the name of genes besires the points...

# Create a new column "reslabel" to res, that will contain the name of genes differentially expressed (NA in case they are not)

res$delabel <- NA

res$delabel[res$diffexpressed != "NO"] <- res$row[res$diffexpressed != "NO"]

ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) + 

    geom_point() + 

    theme_minimal() +

    geom_text()

library(ggrepel)

# plot adding up all layers we have seen so far

ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) +

        geom_point() + 

        theme_minimal() +

        geom_text_repel() +

        scale_color_manual(values=c("blue", "black", "red")) +

        geom_vline(xintercept=c(-0.6, 0.6), col="red") +

        geom_hline(yintercept=-log10(0.05), col="red")

 

plotCounts(dds, gene="FSTL5", intgroup="Treatment", returnData=TRUE) %>% 

   ggplot(aes(Treatment, count)) + geom_boxplot(aes(fill=Treatment)) + scale_y_log10() + ggtitle("FSTL5")

P + scale_x_discrete(name ="Group", limits=c("CNL","Target"))

 

 

 

library("genefilter")

topVarGenes <- head(order(-rowVars(assay(rld))),30)

The heatmap becomes more interesting if we do not look at absolute expression strength but rather at the amount by which each gene deviates in a specific sample from the gene’s average across all samples. Hence, we center each genes’ values across samples, and plot a heatmap. We provide the column side colors to help identify the treated samples (in blue) from the untreated samples (in grey).

 

vsdata <- vst(dds, blind=FALSE)

plotPCA(vsdata, intgroup="Group")

plotPCA(vsdata, intgroup="Group") + geom_label(aes(label = colnames(vsdata)))

topVarGenes <- head(order(-rowVars(assay(vsdata))),50)

mat <- assay(vsdata)[ topVarGenes, ]

mat <- mat - rowMeans(mat)

df <- as.data.frame(colData(vsdata)[,c("ID","R","D","M","Treatment")])

pheatmap(mat, annotation_col=df, cluster_rows=FALSE, cluster_cols=FALSE)

 

res1 <-  res %>% filter(padj<0.05) %>% arrange(padj)

res2 <- res1[1:100,] %>% arrange(desc(log2FoldChange))

Sig <- res2$row

mat <- assay(vsdata)[ Sig, ]

 mat <- mat - rowMeans(mat)

df <- as.data.frame(colData(vsdata)[,c("ID","Treatment","HPV","PIK3CAmutation","Response.Primary","Responder")])

df <- as.data.frame(colData(vsdata)[,c("ID","Group")])

pheatmap(mat, annotation_col=df, cluster_rows=F, cluster_cols=F)

 

mat <- assay(vsdata)[ G, ]

mat1 <- as.data.frame(mat)

mat1 <- mat1 %>% arrange(desc(TD3))

pheatmap(mat1, annotation_col=df, cluster_rows=F, cluster_cols=F)

zmat <- scale(tmat1, center = TRUE, scale = TRUE)  upon column

 

sacPlsda <- opls(vsdata, "Treatment", orthoI = 1)

PLS-DA

GSEA

library(org.Hs.eg.db)

ens2symbol <- AnnotationDbi::select(org.Hs.eg.db,

                                    key=res$row, 

                                    columns="SYMBOL",

                                    keytype="SYMBOL")

ens2symbol <- as_tibble(ens2symbol)

ens2symbol

res2 <- res %>% 

  dplyr::select(SYMBOL, stat) %>% 

  na.omit() %>% 

  distinct() %>% 

  group_by(SYMBOL) %>% 

  summarize(stat=mean(stat))

res2

 

library(fgsea)

ranks <- deframe(res2)

head(ranks, 20)

 

pathways.hallmark <- gmtPathways("C:/Users/~/Desktop/GSEA dataset/ h.all.v2023.1.Hs.symbols.gmt")

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, eps      = 0.0,

                  minSize  = 15,

                  maxSize  = 500)

fgseaResTidy <- fgseaRes %>%

  as_tibble() %>%

  arrange(desc(NES))

 

# Show in a nice table:

fgseaResTidy %>% 

  dplyr::select(-leadingEdge, -ES, -nMoreExtreme) %>% 

  arrange(padj) 

ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.05)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Hallmark pathways NES from GSEA") + 
  theme_minimal()

plotEnrichment(pathways.hallmark[["HALLMARK_APOPTOSIS"]],

                            ranks) + labs(title=" HALLMARK_APOPTOSIS")

 

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

go <- fgsea(pathways=gmtPathways("C:/Users/~/Desktop/GSEA dataset/c5.all.v7.2.symbols.gmt"), ranks, nperm=1000) %>%  as_tibble()

go <- as.matrix(go)

write.csv(go, "go.csv")

 

pathway

pval

padj

NES

size

1

HALLMARK_E2F_TARGETS

0.003125

0.016

1.50255883270908

196

2

HALLMARK_P53_PATHWAY

0.00307692307692308

0.016

1.46976609146818

193

 

ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +

  geom_col(aes(fill=padj<0.05)) +

  coord_flip() +

  labs(x="Pathway", y="Normalized Enrichment Score",

       title="Hallmark pathways NES from GSEA") + 

  theme_minimal()

 

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

go <- fgsea(pathways=gmtPathways("C:/Users/~/Desktop/GSEA dataset/c5.all.v7.2.symbols.gmt"), ranks, nperm=1000) %>%  as_tibble()

Re <- fgsea(pathways=gmtPathways("C:/Users/~/Downloads/c2.cp.reactome.v2023.1.Hs.symbols.gmt"), ranks, nperm=1000) %>% 

   as_tibble()

go <- as.matrix(go)

write.csv(go, "go.csv")

Re <- as.matrix(Re)

write.csv(Re, "Re.csv")

hallmark <- fgsea(pathways=gmtPathways("C:/Users/~/Desktop/GSEA dataset/h.all.v7.2.symbols.gmt "), ranks, nperm=1000) %>%   as_tibble()

hallmark <- as.matrix(hallmark)

write.csv(hallmark, "hallmark Bulk PRE HPV+ R vs NR.csv")

plotEnrichment(pathways.hallmark[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]],

                            ranks) + labs(title="HALLMARK_INTERFERON_GAMMA_RESPONSE")

[ 打印 ]
阅读 ()评论 (0)
评论
目前还没有任何评论
登录后才可评论.