跳转至

CellMiner数据库

作者:DoubleHelix

微信公众号:生物信息云

CellMiner数据库,主要是通过国家癌症研究所癌症研究中心(NCI)所列出的60种癌细胞为基础而建立的。该数据库最初发表于2009年,后于2012年在Cancer Research杂志上进行了更新,题目为“CellMiner: a web-based suite of genomic and pharmacologic tools to explore transcript and drug patterns in the NCI-60 cell line set”。大家后期在使用该数据库记得应用相关文献。

数据库地址:CellMiner - Analysis Tools | Genomics and Pharmacology Facility (nih.gov)

视频讲解:微信 B站

案例:BioInfoCloud/CellMiner: CellMiner (github.com)

1.数据下载

Download Data Sets 》》》》Processed Data Set:

勾选RNA: RNA-seq和Compound activity: DTP NCI-60,点击下载即可

2.读入药物数据

library(readxl)
dat1 <- read_excel(path = "data/DTP_NCI60_ZSCORE.xlsx", skip = 7)


colnames(dat1) <- dat1[1,]
dat1 <- dat1[-1,-c(67,68)]

# 筛选药物标准

table(dat1$`FDA status`)

# 选取经过临床试验(Clinical trial)和FDA批准(FDA approved)的药物结果
dat1 <- dat1[dat1$`FDA status` %in% c("FDA approved", "Clinical trial"),]
dat1 <- dat1[,-c(1, 3:6)]

ifelse(dir.exists("output"),FALSE,dir.create("output"))
write.table(dat1, file = "output/drug.txt",sep = "\t",row.names = F,quote = F)

3.读入表达数据

###==============读入表达数据
dat2 <- read_excel(path = "data/RNA__RNA_seq_composite_expression.xls", skip = 9)
colnames(dat2) <- dat2[1,]
dat2 <- dat2[-1,-c(2:6)]

write.table(dat2, file = "output/geneExp.txt",sep = "\t",row.names = F,quote = F)

4.整理数据

library(impute)
library(limma)

#读取药物输入文件
drugDat <- read.table("output/drug.txt",sep="\t",header=T,check.names=F)
drugDat <- as.matrix(drugDat)
rownames(drugDat) <- drugDat[,1]
drug <- drugDat[,2:ncol(drugDat)]
dimnames <- list(rownames(drug),colnames(drug))
data <- matrix(as.numeric(as.matrix(drug)),nrow=nrow(drug),dimnames=dimnames)

# 考虑到药物敏感性数据中存在部分NA缺失值,通过impute.knn()函数来评估并补齐药物数据。其中,impute.knn()函数是一个使用最近邻平均来估算缺少的表达式数据的函数。
mat <- impute.knn(data)
drug <- mat$data
drug <- avereps(drug) %>% t() %>% as.data.frame()

colnames(drug)[1:12]

# 读取表达输入文件
exp <- read.table("output/geneExp.txt", sep="\t", header=T, row.names = 1, check.names=F)
dim(exp)
exp[1:4, 1:4]
# 提取特定基因表达
library(WGCNA)
library(tidyr)
inputgene <- c("TP53","PTEN","BCAT2","EGFR","TMEM178A")
gl <- intersect(inputgene,row.names(exp))
exp <- exp[gl,] %>% t() %>% as.data.frame()

identical(rownames(exp),rownames(drug))

5.药敏相关性分析

##======药物敏感性计算
corTab <-cor(drug,exp,method="pearson")
corPval <- corPvalueStudent(corTab,nSamples = nrow(drug))

##=========筛选有显著性的
##以EGFR为例

fitercor <- lapply(gl, function(g){
  index <- abs(corTab[,g])> 0.3 & corPval[,g] < 0.05
  df <- cbind(corTab[index,g],corPval[index,g])
  colnames(df) <- c("pearson","Pvalue")
  write.csv(df,file = paste0("output/",g,"-cor.csv"))
  df
})
length(fitercor) == length(gl)
names(fitercor) <- gl

save(fitercor,drug,exp,file = "output/fitercor.Rdata")

6.相关性拟合曲线

###======可视化========
rm(list = ls())
library(ggplot2)
library(ggpubr)

load("output/fitercor.Rdata")
names(fitercor)


ifelse(dir.exists("opFig"),FALSE,dir.create("opFig"))
g <- names(fitercor)[1]
lapply(names(fitercor), function(g){
  data <- fitercor[[g]]
  data <- na.omit(data)
  if(!is.null(data)){
    #dr <- rownames(data)[1]
    for(dr in rownames(data)){
      df <- data.frame(exp = exp[,g],dr = drug[,dr])
      tit <- paste0("R:",round(data[dr,1],2),",p value = ",round(data[dr,2],3))

      p <- ggplot(data = df, aes(x = exp, y = dr)) + #数据映射
        geom_point(alpha = 0.6,shape = 19,size=3,color="#DC143C") +#散点图,alpha就是点的透明度
        #geom_abline()+
        labs(title = tit)+
        geom_smooth(method = lm, formula = y ~ x,aes(colour = "lm"), size = 1.2,se = T)+
        scale_color_manual(values = c("#808080")) + #手动调颜色c("#DC143C","#00008B", "#808080")
        theme_bw() +#设定主题
        theme(axis.title=element_text(size=15,face="plain",color="black"),
              axis.text = element_text(size=12,face="plain",color="black"),
              legend.position =  "none",
              panel.background = element_rect(fill = "transparent",colour = "black"),
              plot.background = element_blank(),
              plot.title = element_text(size=15, lineheight=.8,hjust=0.5, face="plain"),
              legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"))+
        ylab(paste0("Activity z scores of ",dr)) + #expression的作用就是让log10的10下标
        xlab(paste0("The expression of ",g))
      ggsave(filename = paste0("opFig/",g,"-",dr,"-cor.pdf"),plot = p,width = 5,height = 5)
    }
  }

})

7.基因高低表达分组小提琴图

###======可视化========
rm(list = ls())
library(ggplot2)
library(ggpubr)
library(ggsignif)
library(RColorBrewer)
load("output/fitercor.Rdata")
names(fitercor)

ifelse(dir.exists("opFig"),FALSE,dir.create("opFig"))
g <- names(fitercor)[1]
lapply(names(fitercor), function(g){
  data <- fitercor[[g]]
  data <- na.omit(data)
  if(!is.null(data)){
    #dr <- rownames(data)[1]
    for(dr in rownames(data)){
      df <- data.frame(exp = exp[,g],dr = drug[,dr])
      med <- median(df$exp)
      df$group <- ifelse(df$exp > med,"High","Low")
      head(df)

      p = ggplot(df, aes(group,dr,fill= group))+ 
        geom_violin(aes(fill = group),trim = FALSE)+
        geom_signif(comparisons = list(c("High","Low")),
                    step_increase = 0.1,
                    map_signif_level = T,
                    margin_top=0.2,
                    tip_length =0.02,
                    test = "t.test")+
        geom_boxplot(width = 0.1,fill = "white")+
        scale_fill_manual(values=c(brewer.pal(2,"Dark2")))+
        theme_classic()+
        labs(y= paste0("Activity z scores of ",dr),title= g)+
        theme(panel.background=element_rect(fill="white",colour="black",size=0.25),
              plot.title = element_text(hjust = 0.5),
              axis.line=element_line(colour="black",size=0.25),
              axis.title.x = element_blank(),
              axis.text.x = element_text(face = "plain",colour = "black"),
              axis.text = element_text(size=12,face="plain",color="black"),
              legend.position="none"
        )
      p
      ggsave(filename = paste0("opFig/",g,"-",dr,"-violin.pdf"),plot = p,
             width = 3,height = 4)
    }
  }
})