#BiocManager::install('Biobase')
#install.packages('NMF') ## install.extras('NMF')
#install.packages("ggsci")
#BiocManager::install('limma')
单细胞研究避免不了要回答两个问题:组织中有哪些细胞类型,每个细胞类型又有哪些表达模式?NMF解决这类问题具有天然的优势,因为它分解的因子很容易与细胞类型或表达模式对应起来。Github上有很多基于NMF和其变种算法的单细胞分析工具,我比较喜欢的有单细胞整合分析工具liger和空间转录组去卷积工具SPOTlight。应用NMF分析方法发表的高分文章也有很多,我给大家介绍一篇,更多的文章请自己搜索。
对比PCA分析的结果,NMF虽然毫不逊色,但是它的运行时间更长。(1min vs 1h)
我们为什么要用NMF呢?一个很重要的原因是NMF的因子可解释性更强,每个因子贡献度最大的基因基本代表了某种或某个状态细胞的表达模式,相比差异分析得到marker基因更有代表性。
NMF包通过nmf()函数实现矩阵分解,它的用法及重要参数如下:
> nmf(x, rank, method, seed, nrun, ...)
x:待分解非负矩阵,数据格式可以是matrix,data.frame, ExpressionSet
rank:分解的基数量,对于单细胞数据,可以设置为期望的细胞类型数量或表达模式数量
method:因式分解的常用方法,这里介绍三种常用的
1、基于KL 散度进行度量目标函数的多重迭代梯度下降算法——brunet(默认算法)
2、基于欧几里得距离度量目标函数的多重迭代梯度下降算法——lee
3、交替最小二乘法(Alternating Least Squares(ALS))——snmf/r
seed:因式分解的初始化种子
nrun:运行次数
rank值一般是要通过测试评估后确定的,但是分析单细胞数据这是一个很难完成的工作,5000个细胞的测试时间可能超过10个小时。 替代办法是使用经验或先验知识指定,可以尝试略多于细胞类型或细胞状态(细胞亚群再聚类时)的一个数值,例如我在本帖的PBMC数据分解中就指定为rank=10。
因为NMF一般是从随机数开始,通过迭代算法收敛误差的方法求出最优W和H矩阵,所以seed不同最后的结果也不同。为了减少seed的影响求得最优解,常规的办法是通过nrun参数设置运行100-200次矩阵分解选取最优值,也可以使用特殊的算法选择一个最佳的seed(设置seed=‘nndsvd’或seed=’ica’),这样运行一次也能得到最优解。
library(Biobase)
## 载入需要的程辑包:BiocGenerics
##
## 载入程辑包:'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
library(NMF)
## 载入需要的程辑包:registry
## 载入需要的程辑包:rngtools
## 载入需要的程辑包:cluster
## NMF - BioConductor layer [OK] | Shared memory capabilities [NO: bigmemory] | Cores 2/2
## To enable shared memory capabilities, try: install.extras('
## NMF
## ')
library(ggplot2)
library(dplyr) #tidyverse
##
## 载入程辑包:'dplyr'
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(patchwork)
## 参数测试
data("esGolub")
#str(esGolub)
if(0){
#esGolub <- esGolub[1:500,]
t1 <- nmf(esGolub, 3, method = "brunet", seed = 219)
runtime(t1) # elapsed: 1.358
t2 <- nmf(esGolub, 3, method = "lee", seed = 219)
runtime(t2) # elapsed: 1.884
t3 <- nmf(esGolub, 3, method = "snmf/r", seed = 219)
runtime(t3) # elapsed: 1.774
t4 <- nmf(esGolub, 3, method = "brunet", seed = 'nndsvd')
runtime(t4) # elapsed: 3.058
t5 <- nmf(esGolub, 3, method = "brunet", nrun = 100)
runtime(t5) # elapsed: 2.089
t6 <- nmf(esGolub, 3, method = "nsNMF", nrun = 100)
runtime(t6) # elapsed: 2.207
}
library(Seurat)
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
## (status 2 uses the sf package in place of rgdal)
## Attaching SeuratObject
scObj = readRDS("../result/data/pbmc_tutorial.rds")
DimPlot(scObj, label = T)
if(0){
library(NMF)
##(1) 创建seurat对象----
pbmc <- Read10X_h5("pbmc.h5")
pbmc <- CreateSeuratObject(pbmc, project = "pbmc", min.cells = 3, min.features = 500)
pbmc$percent.mt <- PercentageFeatureSet(pbmc, pattern = "^MT-")
pbmc <- subset(pbmc, percent.mt<20)
pbmc <- NormalizeData(pbmc) %>% FindVariableFeatures() %>% ScaleData(do.center = F)
##(2) 使用pca的分解结果降维聚类----
pbmc <- RunPCA(pbmc)
set.seed(219)
pbmc.pca <- RunUMAP(pbmc, dims = 1:15) %>% FindNeighbors(dims = 1:15) %>% FindClusters()
## 结果可视化
DimPlot(pbmc.pca, label = T) + ggsci::scale_color_igv()
#ggsave("pbmc_pca.png", p, width = 9, height = 6)
p <- FeaturePlot(pbmc.pca, features = c('CD3D', 'CD3E', 'MS4A1', 'CD79A', 'GNLY', 'NKG7', 'CD14',
'FCGR3A', 'PPBP', 'FCER1A', 'CD4', 'CD8A'), ncol = 4)
p
#ggsave("pbmc_pca_markers.png", p, width = 12, height = 8)
}
pbmc=scObj
## 高变基因表达矩阵的分解
# pbmc大体可分成T,B,NK,CD14 Mono,CD16 Mono,DC,Platelet等类型,考虑冗余后设置rank=10
vm <- pbmc@assays$RNA@scale.data
# 直接做 NMF 会报错,因为 起始矩阵有负数。
# 错误: NMF::nmf - Input matrix x contains some negative entries.
# 过滤
vm[vm < 0] <- 0 #负数设置为 0
dim(vm)
## [1] 13714 2689
vm <- vm[apply(vm, 1, var) > 0, ] #全是0的行去掉
dim(vm)
## [1] 13713 2689
if(0){
res <- nmf(vm, rank=11,
nrun=1, #默认1
method = "snmf/r", seed = 'nndsvd')
# 默认交替最小二乘法(Alternating Least Squares(ALS))——snmf/r
# 参数rank=,是期望的细胞亚群数量
# 保存结果
saveRDS(res, file = "../result/data/NMF_res.list.rds")
}else{
res=readRDS("../result/data/NMF_res.list.rds")
}
if(0){
# 聚类效果: 不好,有很长的飘尾
res2 <- nmf(vm, rank=11,
nrun=1, #seed = 'ica',
method = 'nsNMF')
# 12:07 --> 5871.753 = 97 min
# 保存结果
saveRDS(res2, file = "data/NMF_res2.list.rds")
#res=res2
#res=readRDS("data/NMF_res.list.rds")
}
if(0){
# 聚类效果: 不好,很零散,15类
#install.packages("fastICA")
res3 <- nmf(vm, rank=11,
nrun=1, seed = 'ica',
method = 'nsNMF')
# 15:31-->5851.158 = 97min
runtime(res3)
}
pbmc@reductions$nmf <- pbmc@reductions$pca
pbmc@reductions$nmf@cell.embeddings <- t(coef(res))
pbmc@reductions$nmf@feature.loadings <- basis(res)
# 查看结果://todo?
plot(t(coef(res) ),
col= scObj$seurat_clusters)
head(basis(res))
## [,1] [,2] [,3] [,4] [,5]
## AL627309.1 0.002468800 0.015954603 0.000000000 0.000000e+00 0.000000e+00
## AP006222.2 0.000000000 0.001779544 0.000000000 0.000000e+00 0.000000e+00
## RP11-206L10.2 0.000000000 0.000000000 0.004452893 0.000000e+00 0.000000e+00
## RP11-206L10.9 0.000000000 0.005160232 0.000000000 3.394568e-05 1.553167e-05
## LINC00115 0.006045406 0.009026734 0.007213186 9.008296e-03 0.000000e+00
## NOC2L 0.018250833 0.011114251 0.026230897 2.305267e-02 0.000000e+00
## [,6] [,7] [,8] [,9] [,10]
## AL627309.1 0.000000000 0.001609292 0.0008995114 0.000000000 0.0000000000
## AP006222.2 0.000000000 0.000000000 0.0000000000 0.000000000 0.0082183674
## RP11-206L10.2 0.000000000 0.028332082 0.0005637506 0.000000000 0.0000000000
## RP11-206L10.9 0.002832633 0.001856657 0.0000000000 0.000236175 0.0007506968
## LINC00115 0.006097497 0.000000000 0.0055467242 0.004283768 0.0000000000
## NOC2L 0.021350290 0.018019947 0.0315238261 0.005395589 0.0354739973
## [,11]
## AL627309.1 0.002883569
## AP006222.2 0.003663774
## RP11-206L10.2 0.000000000
## RP11-206L10.9 0.000000000
## LINC00115 0.004088279
## NOC2L 0.020387763
set.seed(219)
pbmc.nmf <- RunUMAP(pbmc, reduction = 'nmf', dims = 1:11) %>%
FindNeighbors(reduction = 'nmf', dims = 1:11) %>%
FindClusters(resolution = 0.6)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 19:55:50 UMAP embedding parameters a = 0.9922 b = 1.112
## 19:55:50 Read 2689 rows and found 11 numeric columns
## 19:55:50 Using Annoy for neighbor search, n_neighbors = 30
## 19:55:50 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 19:55:50 Writing NN index file to temp file /tmp/Rtmpv0NlAF/file35402fa4db83
## 19:55:50 Searching Annoy index using 1 thread, search_k = 3000
## 19:55:51 Annoy recall = 100%
## 19:55:52 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 19:55:52 10 smooth knn distance failures
## 19:55:52 Initializing from normalized Laplacian + noise (using irlba)
## 19:55:52 Commencing optimization for 500 epochs, with 105106 positive edges
## 19:55:56 Optimization finished
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 2689
## Number of edges: 81155
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8849
## Number of communities: 11
## Elapsed time: 0 seconds
## 结果可视化
p <- DimPlot(pbmc.nmf, label = T) +
ggsci::scale_color_igv()+ggtitle("NMF")
p
#ggsave("pbmc_nmf.png", p, width = 9, height = 6)
p2 <- FeaturePlot(pbmc.nmf, features = c('CD3D', 'CD3E', 'MS4A1', 'CD79A', 'GNLY', 'NKG7', 'CD14',
'FCGR3A', 'PPBP', 'FCER1A', 'CD4', 'CD8A'), ncol = 4)
p2
#ggsave("pbmc_nmf_markers.png", p, width = 12, height = 8)
pbmc.markers <- FindAllMarkers(pbmc.nmf, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)
## # A tibble: 22 × 7
## # Groups: cluster [11]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 0 5.33 0.998 0.234 0 0 S100A9
## 2 0 5.29 0.986 0.14 0 0 S100A8
## 3 5.31e- 51 0.865 0.907 0.631 7.29e- 47 1 LDHB
## 4 1.83e- 17 1.18 0.258 0.103 2.51e- 13 1 LDLRAP1
## 5 0 4.27 0.936 0.042 0 2 CD79A
## 6 2.70e-278 3.60 0.627 0.022 3.71e-274 2 TCL1A
## 7 2.77e-215 3.14 0.961 0.228 3.80e-211 3 CCL5
## 8 6.92e-186 3.01 0.584 0.051 9.48e-182 3 GZMK
## 9 3.00e- 96 1.58 0.616 0.137 4.11e- 92 4 CCR7
## 10 1.07e- 39 1.29 0.352 0.095 1.47e- 35 4 CD8B
## # ℹ 12 more rows
# auto top N
DotPlot(pbmc.nmf, features = c(
pbmc.markers %>% group_by(cluster) %>% top_n(n = 7, wt = avg_log2FC) %>% pull(gene) %>% unique()
), cluster.idents = T)+RotatedAxis()+ggtitle("pbmc3k NMF")
# manual
DotPlot(pbmc.nmf, features = unique(c(
"CD3D", "CD3E", "CD3G", #T
"CD4", "CD8A", "CD8B", #CD4 or CD8
"CD27", "CCR7", #naive CD4+T
"IL7R", "S100A4", #memory CD4+
"ISG15",
"GZMK", #CD8+ effective?
"GZMH", "GZMB", "GZMA",
"NCR3", "NKG7", "GNLY","CCL3", #NK
#"CCR10", "IL2RA", "CD52", "CMTM7", "FOXP3", #Treg
"CD79A", "CD79B", "MS4A1", #B
"CD1C","FCER1A", "CST3", #mDC
"CD68","S100A12",
"LYZ", "CD14", #CD14+Mono
"FCGR3A","MS4A7", #CD16+monocyte
"PPBP",#Platelet
"GAPDH"
)))+RotatedAxis()+ggtitle("pbmc3k NMF")
## plot correlation ----
DimPlot(pbmc, label = T)+ggtitle("based on PCA") +
(DimPlot(pbmc.nmf, label = T) + ggtitle("based on NMF"))
pbmc$cid=rownames(pbmc@meta.data)
dat1=FetchData(pbmc, vars = c("cid", "seurat_clusters"))
colnames(dat1) =c("cid", "cluster1")
head(dat1)
## cid cluster1
## AAACATACAACCAC-1 AAACATACAACCAC-1 0
## AAACATTGAGCTAC-1 AAACATTGAGCTAC-1 3
## AAACATTGATCAGC-1 AAACATTGATCAGC-1 2
## AAACCGTGCTTCCG-1 AAACCGTGCTTCCG-1 1
## AAACCGTGTATGCG-1 AAACCGTGTATGCG-1 6
## AAACGCACTGGTAC-1 AAACGCACTGGTAC-1 2
#
pbmc.nmf$cid=rownames(pbmc.nmf@meta.data)
dat2=FetchData(pbmc.nmf, vars = c("cid", "seurat_clusters"))
colnames(dat2) =c("cid", "cluster2")
dat2=dat2[rownames(dat1),] #order
head(dat2)
## cid cluster2
## AAACATACAACCAC-1 AAACATACAACCAC-1 3
## AAACATTGAGCTAC-1 AAACATTGAGCTAC-1 2
## AAACATTGATCAGC-1 AAACATTGATCAGC-1 7
## AAACCGTGCTTCCG-1 AAACCGTGCTTCCG-1 6
## AAACCGTGTATGCG-1 AAACCGTGTATGCG-1 8
## AAACGCACTGGTAC-1 AAACGCACTGGTAC-1 7
table(dat1$cluster1, dat2$cluster2)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 0 0 295 0 46 308 57 0 43 1 4 0
## 1 430 0 0 0 0 0 54 0 0 4 0
## 2 0 91 0 15 9 204 0 124 2 0 0
## 3 0 0 345 1 0 0 0 3 0 0 0
## 4 0 1 1 268 1 6 0 4 17 0 0
## 5 0 0 0 0 0 0 162 0 0 0 0
## 6 0 0 0 2 0 0 0 0 142 0 0
## 7 0 0 0 0 0 0 0 0 0 34 0
## 8 0 0 0 0 0 0 0 0 0 0 15
table(dat2$cluster2)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 430 387 346 332 318 267 216 174 162 42 15
table(pbmc.nmf$seurat_clusters)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 430 387 346 332 318 267 216 174 162 42 15
pbmc.nmf$celltype <- pbmc.nmf$seurat_clusters
pbmc.nmf$celltype <- recode(pbmc.nmf$celltype,
"0" = "CD14 Mono",
"6"= "CD16 Mono",
"1" = "Naive CD8",
"4" = "Naive CD8",
'2' = "B cells",
'3' = "Eff CD8+ T",
'5' = "CD4+ T",
'7' = "CD4+ T",
'8' = "NKs",
'9' = "DCs",
'10' = "Platelet"
)
p3 <- DimPlot(pbmc.nmf, group.by = 'celltype', label = T, label.size = 3) +
ggsci::scale_color_npg(alpha = 0.6); p3
tmp <- data.frame(t(coef(res)), check.names = F)
colnames(tmp) <- paste0("factor", 1:ncol(tmp))
pbmc.nmf <- AddMetaData(pbmc.nmf, metadata = tmp) #给 str(pbmc.nmf@meta.data) 添加若干列
p4 <- FeaturePlot(pbmc.nmf, features = paste0("factor", 1:11), ncol = 4); p4
#ggsave("pbmc_nmf_factors.png", p4, width = 12, height = 8)
## 查看细胞主成分上的荷载
p5 <- FeaturePlot(pbmc.nmf, features = paste0("PC_", 1:12), ncol = 4); #p5 #略
#ggsave("pbmc_nmf_PCs.png", p, width = 12, height = 8)
# 对比上下两张图,很容易发现NMF的因子比PCA的PC轴解释性更强。
## 提取每个因子贡献度最大的20个基因
f <- extractFeatures(res, 20L)
f <- lapply(f, function(x) rownames(res)[x])
f <- do.call("rbind", f)
head(t(f) )
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] "IL7R" "S100A8" "CD79A" "GZMB" "PF4" "MS4A7"
## [2,] "AQP3" "S100A9" "MS4A1" "FGFBP2" "SDPR" "FCGR3A"
## [3,] "CRIP2" "LGALS2" "TCL1A" "SPON2" "PPBP" "HES4"
## [4,] "TNFRSF4" "CD14" "CD79B" "GNLY" "GNG11" "RP11-290F20.3"
## [5,] "TRADD" "FCN1" "LINC00926" "PRF1" "SPARC" "CDKN1C"
## [6,] "CD40LG" "LYZ" "HLA-DQA1" "AKR1C3" "HIST1H2AC" "CKB"
## [,7] [,8] [,9] [,10] [,11]
## [1,] "FCER1A" "CCR7" "TYMS" "IFIT1" "GZMK"
## [2,] "CLEC10A" "PRKCQ-AS1" "MKI67" "RTP4" "CCL5"
## [3,] "SERPINF1" "LINC00176" "RRM2" "HERC5" "CD8A"
## [4,] "CD1C" "LEF1" "KIAA0101" "IFIT3" "KLRG1"
## [5,] "ENHO" "FHIT" "TK1" "SPATS2L" "GZMA"
## [6,] "CLIC2" "PIK3IP1" "MYBL2" "STAT1" "LYAR"
#DT::datatable(t(f))
# for DotPlot
{
factors <- extractFeatures(res, 10L)
factors <- lapply(factors, function(x) rownames(res)[x])
factors <- do.call("rbind", factors)
print(dim(factors))
#DotPlot(pbmc.nmf, features = unique(unlist(lapply(factors, c))), cluster.idents = T) + RotatedAxis()
}
## [1] 11 10
gene_list = unique(unlist(lapply(factors, c))); length(gene_list)
## [1] 108
#jsonlite::toJSON(gene_list)
dat=pbmc.nmf@assays$RNA@data
#features=rownames(pbmc)
features=gene_list
#
rs=NULL;
for(i in levels(pbmc.nmf)){
cells.1=WhichCells(pbmc.nmf, idents = i)
rs1=rowSums( dat[features, cells.1, drop=F] >0 ) / length(cells.1)
rs = cbind(rs, rs1)
print(dim(rs))
}
## [1] 108 1
## [1] 108 2
## [1] 108 3
## [1] 108 4
## [1] 108 5
## [1] 108 6
## [1] 108 7
## [1] 108 8
## [1] 108 9
## [1] 108 10
## [1] 108 11
colnames(rs)=paste0("Cluster",levels(pbmc.nmf))
dim(rs)
## [1] 108 11
head(rs)
## Cluster0 Cluster1 Cluster2 Cluster3 Cluster4 Cluster5
## IL7R 0.11395349 0.682170543 0.11560694 0.49397590 0.572327044 0.81647940
## S100A8 0.98604651 0.090439276 0.07514451 0.04518072 0.100628931 0.09737828
## CD79A 0.03255814 0.041343669 0.93641618 0.01506024 0.044025157 0.05992509
## GZMB 0.03720930 0.025839793 0.02023121 0.24397590 0.031446541 0.04119850
## PF4 0.01395349 0.005167959 0.01156069 0.01204819 0.003144654 0.00000000
## MS4A7 0.23488372 0.015503876 0.05780347 0.02108434 0.022012579 0.02621723
## Cluster6 Cluster7 Cluster8 Cluster9 Cluster10
## IL7R 0.13888889 0.563218391 0.12962963 0.33333333 0.13333333
## S100A8 0.59722222 0.097701149 0.09259259 0.40476190 0.33333333
## CD79A 0.05555556 0.068965517 0.01851852 0.14285714 0.06666667
## GZMB 0.07870370 0.034482759 0.96296296 0.11904762 0.20000000
## PF4 0.04166667 0.005747126 0.00000000 0.04761905 0.93333333
## MS4A7 0.71759259 0.005747126 0.00617284 0.19047619 0.06666667
# 在每个cluster中都低于 0.3 的基因个数
rs2=rs[rowSums(rs<0.3) >0, ]
dim(rs) #108
## [1] 108 11
dim(rs2) #105
## [1] 105 11
# 按每个基因的最大百分比值排序
rs3=apply(rs, 1, max)
rs3=rs3[order(-rs3)]
head(rs3)
## CCL5 PPBP LYZ NKG7 NRGN TYROBP
## 1 1 1 1 1 1
tail(rs3)
## RRM2 TK1 MKI67 KIAA0101 ZWINT DEPDC1B
## 0.03086420 0.03086420 0.02469136 0.02469136 0.02469136 0.01851852
#plot(rs3, n=100)
library(pheatmap)
#pheatmap(rs, border_color =NA)
#dim(rs1)
length(rs1)
## [1] 108
#
#DotPlot(pbmc.nmf, features = c(
# head(names(rs1), n=60)
#), cluster.idents = T) + RotatedAxis()
#
mean.fxn = function(x, pseudocount.use=1, base=2) {
return(log(x = rowMeans(x = expm1(x = x)) + pseudocount.use, base = base))
}
rsM=NULL;
for(i in levels(pbmc.nmf)){
cells.1 = WhichCells(pbmc.nmf, idents = i)
rs0 = mean.fxn(dat[features, cells.1, drop=F])
rsM = cbind(rsM, rs0)
print(dim(rsM))
}
## [1] 108 1
## [1] 108 2
## [1] 108 3
## [1] 108 4
## [1] 108 5
## [1] 108 6
## [1] 108 7
## [1] 108 8
## [1] 108 9
## [1] 108 10
## [1] 108 11
colnames(rsM)=paste0("Cluster",levels(pbmc.nmf))
head(rsM)
## Cluster0 Cluster1 Cluster2 Cluster3 Cluster4 Cluster5 Cluster6
## IL7R 0.5783467 2.97218212 0.79030927 2.7901951 2.65087165 3.3609040 0.5133748
## S100A8 6.4833172 0.54379009 0.48324592 0.2794916 0.50299391 0.5752853 3.2575690
## CD79A 0.2300565 0.30918779 4.55617698 0.1203183 0.27274330 0.2712897 0.2403736
## GZMB 0.2565375 0.14356850 0.19463312 2.1670096 0.19217687 0.2363497 0.3053534
## PF4 0.1351051 0.03487235 0.07499719 0.1609550 0.01904058 0.0000000 0.3824003
## MS4A7 1.2618183 0.09782055 0.43514025 0.1232816 0.12966611 0.1445432 2.9259239
## Cluster7 Cluster8 Cluster9 Cluster10
## IL7R 2.69812613 0.84073909 0.9562834 0.4354341
## S100A8 0.45948327 0.55654875 2.6684399 1.9170040
## CD79A 0.79957323 0.11260199 0.6478446 0.1044942
## GZMB 0.20584260 5.65870171 2.7392979 1.1244498
## PF4 0.01589696 0.00000000 0.2130774 7.2228179
## MS4A7 0.03116309 0.04864571 0.5776897 0.2931964
p =pheatmap(rsM, border_color = NA, clustering_method = "ward.D2")
# 获取聚类后的基因顺序
row_cluster = cutree(p$tree_row,k=6)
# 对聚类后的数据进行重新排序
newOrder = rsM[p$tree_row$order,]
#newOrder[,ncol(newOrder)+1]= row_cluster[match(rownames(newOrder),names(row_cluster))]
#colnames(newOrder)[ncol(newOrder)]="Cluster"
# 查看重新排序后的数据
head(newOrder)
## Cluster0 Cluster1 Cluster2 Cluster3 Cluster4 Cluster5 Cluster6
## LGALS2 4.296763 0.2194642 0.2065756 0.1187146 0.2786202 0.06569327 2.249148
## MS4A6A 3.232930 0.1824591 0.1188825 0.1281771 0.1592001 0.08310864 1.390507
## GRN 3.199818 0.2318531 0.6841047 0.4564896 0.3730964 0.48195845 2.001191
## FCN1 4.797550 0.5578412 0.5910947 0.4149588 0.5384594 0.65455950 4.007145
## IFITM3 2.922208 0.2003130 0.2609213 0.4901171 0.4235872 0.39331258 4.166580
## LYZ 7.597019 1.9862370 2.0864374 1.9135665 1.9784074 2.15867502 5.570402
## Cluster7 Cluster8 Cluster9 Cluster10
## LGALS2 0.1706343 0.14313213 3.487993 1.501095
## MS4A6A 0.1148936 0.08192313 2.268643 0.812377
## GRN 0.3579583 0.45463957 2.835850 1.458060
## FCN1 0.5144093 0.71581304 2.305610 1.018420
## IFITM3 0.1971241 1.08732299 2.540592 1.229016
## LYZ 2.0239575 1.95027818 6.967274 4.232046
# 基因在每个cluster的最大值排序
rsM1=rev(sort(apply(rsM, 1, max)))
head(rsM1)
## PPBP LYZ S100A9 PF4 NKG7 RPS27
## 8.699382 7.597019 7.223132 7.222818 6.956882 6.770116
# plot(rsM1)
# 求交集
rs_inter = intersect( head(names(rsM1),n=50), head(names(rs3),n=50) )
length(rs_inter)
## [1] 47
head(rs_inter)
## [1] "PPBP" "LYZ" "S100A9" "PF4" "NKG7" "RPS27"
#DotPlot(pbmc.nmf, features = rs_inter, cluster.idents = T) + RotatedAxis()
# 聚类的结果
#DotPlot(pbmc.nmf, features = intersect( rownames(newOrder), head( names(rs3),n=90 ) ),
# #rownames(newOrder),
# cluster.idents = T) + RotatedAxis()
# 高表达比例 + 聚类后结果
#DotPlot(pbmc.nmf, features = intersect( rownames(newOrder), head( names(rs3),n=75 ) ),
# cluster.idents = T) + RotatedAxis()
DotPlot(pbmc.nmf, features = intersect( rownames(newOrder), head( names(rsM1),n=50 ) ),
#rownames(newOrder),
cluster.idents = T) + RotatedAxis()
== End ==