#BiocManager::install('Biobase')
#install.packages('NMF') ## install.extras('NMF')
#install.packages("ggsci")
#BiocManager::install('limma')

0. test time consuming

(1)intro

单细胞研究避免不了要回答两个问题:组织中有哪些细胞类型,每个细胞类型又有哪些表达模式?NMF解决这类问题具有天然的优势,因为它分解的因子很容易与细胞类型或表达模式对应起来。Github上有很多基于NMF和其变种算法的单细胞分析工具,我比较喜欢的有单细胞整合分析工具liger和空间转录组去卷积工具SPOTlight。应用NMF分析方法发表的高分文章也有很多,我给大家介绍一篇,更多的文章请自己搜索。

对比PCA分析的结果,NMF虽然毫不逊色,但是它的运行时间更长。(1min vs 1h)

我们为什么要用NMF呢?一个很重要的原因是NMF的因子可解释性更强,每个因子贡献度最大的基因基本代表了某种或某个状态细胞的表达模式,相比差异分析得到marker基因更有代表性。

NMF包通过nmf()函数实现矩阵分解,它的用法及重要参数如下: > nmf(x, rank, method, seed, nrun, ...)

x:待分解非负矩阵,数据格式可以是matrix,data.frame, ExpressionSet
rank:分解的基数量,对于单细胞数据,可以设置为期望的细胞类型数量或表达模式数量
method:因式分解的常用方法,这里介绍三种常用的
1、基于KL 散度进行度量目标函数的多重迭代梯度下降算法——brunet(默认算法)
2、基于欧几里得距离度量目标函数的多重迭代梯度下降算法——lee
3、交替最小二乘法(Alternating Least Squares(ALS))——snmf/r
seed:因式分解的初始化种子
nrun:运行次数

rank值一般是要通过测试评估后确定的,但是分析单细胞数据这是一个很难完成的工作,5000个细胞的测试时间可能超过10个小时。 替代办法是使用经验或先验知识指定,可以尝试略多于细胞类型或细胞状态(细胞亚群再聚类时)的一个数值,例如我在本帖的PBMC数据分解中就指定为rank=10。

因为NMF一般是从随机数开始,通过迭代算法收敛误差的方法求出最优W和H矩阵,所以seed不同最后的结果也不同。为了减少seed的影响求得最优解,常规的办法是通过nrun参数设置运行100-200次矩阵分解选取最优值,也可以使用特殊的算法选择一个最佳的seed(设置seed=‘nndsvd’或seed=’ica’),这样运行一次也能得到最优解。

(2)耗时

library(Biobase)
## 载入需要的程辑包:BiocGenerics
## 
## 载入程辑包:'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
library(NMF)
## 载入需要的程辑包:registry
## 载入需要的程辑包:rngtools
## 载入需要的程辑包:cluster
## NMF - BioConductor layer [OK] | Shared memory capabilities [NO: bigmemory] | Cores 2/2
##   To enable shared memory capabilities, try: install.extras('
## NMF
## ')
library(ggplot2)
library(dplyr) #tidyverse
## 
## 载入程辑包:'dplyr'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(patchwork)

## 参数测试
data("esGolub")
#str(esGolub)

if(0){
  #esGolub <- esGolub[1:500,]
  t1 <- nmf(esGolub, 3, method = "brunet", seed = 219)
  runtime(t1)   # elapsed: 1.358
  
  t2 <- nmf(esGolub, 3, method = "lee", seed = 219)
  runtime(t2)   # elapsed: 1.884 
  
  t3 <- nmf(esGolub, 3, method = "snmf/r", seed = 219)
  runtime(t3)   # elapsed: 1.774
  
  t4 <- nmf(esGolub, 3, method = "brunet", seed = 'nndsvd')
  runtime(t4)   # elapsed: 3.058 
  
  t5 <- nmf(esGolub, 3, method = "brunet", nrun = 100)
  runtime(t5)   # elapsed: 2.089
  
  t6 <- nmf(esGolub, 3, method = "nsNMF", nrun = 100)
  runtime(t6)   # elapsed: 2.207 
}

1. pbmc3k

library(Seurat)
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
##      (status 2 uses the sf package in place of rgdal)
## Attaching SeuratObject
scObj = readRDS("../result/data/pbmc_tutorial.rds")
DimPlot(scObj, label = T)

if(0){
  library(NMF)
  
  ##(1) 创建seurat对象----
  pbmc <- Read10X_h5("pbmc.h5")
  pbmc <- CreateSeuratObject(pbmc, project = "pbmc", min.cells = 3, min.features = 500)
  pbmc$percent.mt <- PercentageFeatureSet(pbmc, pattern = "^MT-")
  pbmc <- subset(pbmc, percent.mt<20)
  pbmc <- NormalizeData(pbmc) %>% FindVariableFeatures() %>% ScaleData(do.center = F)
  
  ##(2) 使用pca的分解结果降维聚类----
  pbmc <- RunPCA(pbmc)
  set.seed(219)
  pbmc.pca <- RunUMAP(pbmc, dims = 1:15) %>% FindNeighbors(dims = 1:15) %>% FindClusters()
  
  ## 结果可视化
  DimPlot(pbmc.pca, label = T) + ggsci::scale_color_igv()
  #ggsave("pbmc_pca.png", p, width = 9, height = 6)
  
  p <- FeaturePlot(pbmc.pca, features = c('CD3D', 'CD3E', 'MS4A1', 'CD79A', 'GNLY', 'NKG7', 'CD14', 
                                          'FCGR3A', 'PPBP', 'FCER1A', 'CD4', 'CD8A'), ncol = 4)
  p
  #ggsave("pbmc_pca_markers.png", p, width = 12, height = 8)
}

(3)基于NMF分解的降维聚类

pbmc=scObj

## 高变基因表达矩阵的分解
# pbmc大体可分成T,B,NK,CD14 Mono,CD16 Mono,DC,Platelet等类型,考虑冗余后设置rank=10
vm <- pbmc@assays$RNA@scale.data
# 直接做 NMF 会报错,因为 起始矩阵有负数。
# 错误: NMF::nmf - Input matrix x contains some negative entries.
# 过滤
vm[vm < 0] <- 0 #负数设置为 0
dim(vm)
## [1] 13714  2689
vm <- vm[apply(vm, 1, var) > 0, ] #全是0的行去掉
dim(vm)
## [1] 13713  2689

1) 做非负矩阵分解

if(0){
  res <- nmf(vm, rank=11, 
             nrun=1, #默认1
             method = "snmf/r", seed = 'nndsvd')
  # 默认交替最小二乘法(Alternating Least Squares(ALS))——snmf/r  
  # 参数rank=,是期望的细胞亚群数量
  
  # 保存结果
  saveRDS(res, file = "../result/data/NMF_res.list.rds")
}else{
  res=readRDS("../result/data/NMF_res.list.rds")
}


if(0){
  # 聚类效果: 不好,有很长的飘尾
  res2 <- nmf(vm, rank=11, 
             nrun=1, #seed = 'ica', 
             method = 'nsNMF')
  # 12:07 --> 5871.753  = 97 min
  # 保存结果
  saveRDS(res2, file = "data/NMF_res2.list.rds")
  #res=res2
  #res=readRDS("data/NMF_res.list.rds")
}
if(0){
  # 聚类效果: 不好,很零散,15类
  #install.packages("fastICA")
  res3 <- nmf(vm, rank=11, 
              nrun=1, seed = 'ica', 
              method = 'nsNMF')
  # 15:31-->5851.158 = 97min
  runtime(res3)
}

2)分解结果返回suerat对象

pbmc@reductions$nmf <- pbmc@reductions$pca
pbmc@reductions$nmf@cell.embeddings <- t(coef(res))    
pbmc@reductions$nmf@feature.loadings <- basis(res)  


# 查看结果://todo?
plot(t(coef(res) ), 
     col= scObj$seurat_clusters)

head(basis(res))
##                      [,1]        [,2]        [,3]         [,4]         [,5]
## AL627309.1    0.002468800 0.015954603 0.000000000 0.000000e+00 0.000000e+00
## AP006222.2    0.000000000 0.001779544 0.000000000 0.000000e+00 0.000000e+00
## RP11-206L10.2 0.000000000 0.000000000 0.004452893 0.000000e+00 0.000000e+00
## RP11-206L10.9 0.000000000 0.005160232 0.000000000 3.394568e-05 1.553167e-05
## LINC00115     0.006045406 0.009026734 0.007213186 9.008296e-03 0.000000e+00
## NOC2L         0.018250833 0.011114251 0.026230897 2.305267e-02 0.000000e+00
##                      [,6]        [,7]         [,8]        [,9]        [,10]
## AL627309.1    0.000000000 0.001609292 0.0008995114 0.000000000 0.0000000000
## AP006222.2    0.000000000 0.000000000 0.0000000000 0.000000000 0.0082183674
## RP11-206L10.2 0.000000000 0.028332082 0.0005637506 0.000000000 0.0000000000
## RP11-206L10.9 0.002832633 0.001856657 0.0000000000 0.000236175 0.0007506968
## LINC00115     0.006097497 0.000000000 0.0055467242 0.004283768 0.0000000000
## NOC2L         0.021350290 0.018019947 0.0315238261 0.005395589 0.0354739973
##                     [,11]
## AL627309.1    0.002883569
## AP006222.2    0.003663774
## RP11-206L10.2 0.000000000
## RP11-206L10.9 0.000000000
## LINC00115     0.004088279
## NOC2L         0.020387763

3) 基于nmf分解做降维聚类

  • 使用 nmf 代替 PCA,后续处理不变。
set.seed(219)
pbmc.nmf <- RunUMAP(pbmc, reduction = 'nmf', dims = 1:11) %>% 
  FindNeighbors(reduction = 'nmf', dims = 1:11) %>% 
  FindClusters(resolution = 0.6)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 19:55:50 UMAP embedding parameters a = 0.9922 b = 1.112
## 19:55:50 Read 2689 rows and found 11 numeric columns
## 19:55:50 Using Annoy for neighbor search, n_neighbors = 30
## 19:55:50 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 19:55:50 Writing NN index file to temp file /tmp/Rtmpv0NlAF/file35402fa4db83
## 19:55:50 Searching Annoy index using 1 thread, search_k = 3000
## 19:55:51 Annoy recall = 100%
## 19:55:52 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 19:55:52 10 smooth knn distance failures
## 19:55:52 Initializing from normalized Laplacian + noise (using irlba)
## 19:55:52 Commencing optimization for 500 epochs, with 105106 positive edges
## 19:55:56 Optimization finished
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2689
## Number of edges: 81155
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8849
## Number of communities: 11
## Elapsed time: 0 seconds
## 结果可视化  
p <- DimPlot(pbmc.nmf, label = T) +
  ggsci::scale_color_igv()+ggtitle("NMF")
p

#ggsave("pbmc_nmf.png", p, width = 9, height = 6)
p2 <- FeaturePlot(pbmc.nmf, features = c('CD3D', 'CD3E', 'MS4A1', 'CD79A', 'GNLY', 'NKG7', 'CD14', 
                                        'FCGR3A', 'PPBP', 'FCER1A', 'CD4', 'CD8A'), ncol = 4)
p2

#ggsave("pbmc_nmf_markers.png", p, width = 12, height = 8)

(4). biomarker

pbmc.markers <- FindAllMarkers(pbmc.nmf, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)
## # A tibble: 22 × 7
## # Groups:   cluster [11]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene   
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>  
##  1 0              5.33  0.998 0.234 0         0       S100A9 
##  2 0              5.29  0.986 0.14  0         0       S100A8 
##  3 5.31e- 51      0.865 0.907 0.631 7.29e- 47 1       LDHB   
##  4 1.83e- 17      1.18  0.258 0.103 2.51e- 13 1       LDLRAP1
##  5 0              4.27  0.936 0.042 0         2       CD79A  
##  6 2.70e-278      3.60  0.627 0.022 3.71e-274 2       TCL1A  
##  7 2.77e-215      3.14  0.961 0.228 3.80e-211 3       CCL5   
##  8 6.92e-186      3.01  0.584 0.051 9.48e-182 3       GZMK   
##  9 3.00e- 96      1.58  0.616 0.137 4.11e- 92 4       CCR7   
## 10 1.07e- 39      1.29  0.352 0.095 1.47e- 35 4       CD8B   
## # ℹ 12 more rows
# auto top N
DotPlot(pbmc.nmf, features = c(
  pbmc.markers %>% group_by(cluster) %>% top_n(n = 7, wt = avg_log2FC) %>% pull(gene) %>% unique()
), cluster.idents = T)+RotatedAxis()+ggtitle("pbmc3k NMF")

# manual
DotPlot(pbmc.nmf, features = unique(c(
  "CD3D", "CD3E", "CD3G", #T
  "CD4", "CD8A", "CD8B",  #CD4 or CD8
  "CD27", "CCR7",           #naive CD4+T
  "IL7R", "S100A4", #memory CD4+
  "ISG15", 
  "GZMK", #CD8+ effective?
  "GZMH", "GZMB", "GZMA",
  "NCR3", "NKG7", "GNLY","CCL3", #NK
  
  #"CCR10", "IL2RA", "CD52", "CMTM7", "FOXP3", #Treg
  
  "CD79A", "CD79B", "MS4A1", #B
  "CD1C","FCER1A", "CST3", #mDC
  
  "CD68","S100A12",
  "LYZ", "CD14",   #CD14+Mono
  "FCGR3A","MS4A7", #CD16+monocyte
  
  "PPBP",#Platelet
  "GAPDH"
)))+RotatedAxis()+ggtitle("pbmc3k NMF")

## plot correlation ----
DimPlot(pbmc, label = T)+ggtitle("based on PCA") +
 (DimPlot(pbmc.nmf, label = T) + ggtitle("based on NMF"))

分类的一致性:PCA vs NMF

pbmc$cid=rownames(pbmc@meta.data)
dat1=FetchData(pbmc, vars = c("cid", "seurat_clusters"))
colnames(dat1) =c("cid", "cluster1")
head(dat1)
##                               cid cluster1
## AAACATACAACCAC-1 AAACATACAACCAC-1        0
## AAACATTGAGCTAC-1 AAACATTGAGCTAC-1        3
## AAACATTGATCAGC-1 AAACATTGATCAGC-1        2
## AAACCGTGCTTCCG-1 AAACCGTGCTTCCG-1        1
## AAACCGTGTATGCG-1 AAACCGTGTATGCG-1        6
## AAACGCACTGGTAC-1 AAACGCACTGGTAC-1        2
#
pbmc.nmf$cid=rownames(pbmc.nmf@meta.data)
dat2=FetchData(pbmc.nmf, vars = c("cid", "seurat_clusters"))
colnames(dat2) =c("cid", "cluster2")
dat2=dat2[rownames(dat1),] #order
head(dat2)
##                               cid cluster2
## AAACATACAACCAC-1 AAACATACAACCAC-1        3
## AAACATTGAGCTAC-1 AAACATTGAGCTAC-1        2
## AAACATTGATCAGC-1 AAACATTGATCAGC-1        7
## AAACCGTGCTTCCG-1 AAACCGTGCTTCCG-1        6
## AAACCGTGTATGCG-1 AAACCGTGTATGCG-1        8
## AAACGCACTGGTAC-1 AAACGCACTGGTAC-1        7
table(dat1$cluster1, dat2$cluster2)
##    
##       0   1   2   3   4   5   6   7   8   9  10
##   0   0 295   0  46 308  57   0  43   1   4   0
##   1 430   0   0   0   0   0  54   0   0   4   0
##   2   0  91   0  15   9 204   0 124   2   0   0
##   3   0   0 345   1   0   0   0   3   0   0   0
##   4   0   1   1 268   1   6   0   4  17   0   0
##   5   0   0   0   0   0   0 162   0   0   0   0
##   6   0   0   0   2   0   0   0   0 142   0   0
##   7   0   0   0   0   0   0   0   0   0  34   0
##   8   0   0   0   0   0   0   0   0   0   0  15
table(dat2$cluster2)
## 
##   0   1   2   3   4   5   6   7   8   9  10 
## 430 387 346 332 318 267 216 174 162  42  15
table(pbmc.nmf$seurat_clusters)
## 
##   0   1   2   3   4   5   6   7   8   9  10 
## 430 387 346 332 318 267 216 174 162  42  15

(5)人工定义细胞类型

pbmc.nmf$celltype <- pbmc.nmf$seurat_clusters
pbmc.nmf$celltype <- recode(pbmc.nmf$celltype,
                            "0" = "CD14 Mono",
                            "6"= "CD16 Mono",
                            "1" = "Naive CD8",
                            "4" = "Naive CD8",
                            '2' = "B cells", 
                            '3' = "Eff CD8+ T", 
                            '5' = "CD4+ T", 
                            '7' = "CD4+ T", 
                            '8' = "NKs", 
                            '9' = "DCs", 
                            '10' = "Platelet"
                            )
p3 <- DimPlot(pbmc.nmf, group.by = 'celltype', label = T, label.size = 3) +  
  ggsci::scale_color_npg(alpha = 0.6); p3

1) 查看细胞的因子上的荷载

tmp <- data.frame(t(coef(res)), check.names = F)
colnames(tmp) <- paste0("factor", 1:ncol(tmp))
pbmc.nmf <- AddMetaData(pbmc.nmf, metadata = tmp) #给 str(pbmc.nmf@meta.data) 添加若干列

p4 <- FeaturePlot(pbmc.nmf, features = paste0("factor", 1:11), ncol = 4); p4

#ggsave("pbmc_nmf_factors.png", p4, width = 12, height = 8)

## 查看细胞主成分上的荷载
p5 <- FeaturePlot(pbmc.nmf, features = paste0("PC_", 1:12), ncol = 4); #p5 #略
#ggsave("pbmc_nmf_PCs.png", p, width = 12, height = 8)

# 对比上下两张图,很容易发现NMF的因子比PCA的PC轴解释性更强。

(6) 提取celltype的signatures

## 提取每个因子贡献度最大的20个基因
f <- extractFeatures(res, 20L)
f <- lapply(f, function(x) rownames(res)[x])
f <- do.call("rbind", f)
head(t(f) )
##      [,1]      [,2]     [,3]        [,4]     [,5]        [,6]           
## [1,] "IL7R"    "S100A8" "CD79A"     "GZMB"   "PF4"       "MS4A7"        
## [2,] "AQP3"    "S100A9" "MS4A1"     "FGFBP2" "SDPR"      "FCGR3A"       
## [3,] "CRIP2"   "LGALS2" "TCL1A"     "SPON2"  "PPBP"      "HES4"         
## [4,] "TNFRSF4" "CD14"   "CD79B"     "GNLY"   "GNG11"     "RP11-290F20.3"
## [5,] "TRADD"   "FCN1"   "LINC00926" "PRF1"   "SPARC"     "CDKN1C"       
## [6,] "CD40LG"  "LYZ"    "HLA-DQA1"  "AKR1C3" "HIST1H2AC" "CKB"          
##      [,7]       [,8]        [,9]       [,10]     [,11]  
## [1,] "FCER1A"   "CCR7"      "TYMS"     "IFIT1"   "GZMK" 
## [2,] "CLEC10A"  "PRKCQ-AS1" "MKI67"    "RTP4"    "CCL5" 
## [3,] "SERPINF1" "LINC00176" "RRM2"     "HERC5"   "CD8A" 
## [4,] "CD1C"     "LEF1"      "KIAA0101" "IFIT3"   "KLRG1"
## [5,] "ENHO"     "FHIT"      "TK1"      "SPATS2L" "GZMA" 
## [6,] "CLIC2"    "PIK3IP1"   "MYBL2"    "STAT1"   "LYAR"
#DT::datatable(t(f))

(7) DotPlot

# for DotPlot
{
  factors <- extractFeatures(res, 10L)
  factors <- lapply(factors, function(x) rownames(res)[x])
  factors <- do.call("rbind", factors)
  print(dim(factors))
  #DotPlot(pbmc.nmf, features = unique(unlist(lapply(factors, c))), cluster.idents = T) + RotatedAxis()
}
## [1] 11 10
gene_list = unique(unlist(lapply(factors, c))); length(gene_list)
## [1] 108
#jsonlite::toJSON(gene_list)

1) 每个基因在各个cluster的 pct

dat=pbmc.nmf@assays$RNA@data
#features=rownames(pbmc)
features=gene_list

#
rs=NULL;
for(i in levels(pbmc.nmf)){
  cells.1=WhichCells(pbmc.nmf, idents = i)
  rs1=rowSums( dat[features, cells.1, drop=F] >0 ) / length(cells.1)
  rs = cbind(rs, rs1)
  print(dim(rs))
}
## [1] 108   1
## [1] 108   2
## [1] 108   3
## [1] 108   4
## [1] 108   5
## [1] 108   6
## [1] 108   7
## [1] 108   8
## [1] 108   9
## [1] 108  10
## [1] 108  11
colnames(rs)=paste0("Cluster",levels(pbmc.nmf))
dim(rs)
## [1] 108  11
head(rs)
##          Cluster0    Cluster1   Cluster2   Cluster3    Cluster4   Cluster5
## IL7R   0.11395349 0.682170543 0.11560694 0.49397590 0.572327044 0.81647940
## S100A8 0.98604651 0.090439276 0.07514451 0.04518072 0.100628931 0.09737828
## CD79A  0.03255814 0.041343669 0.93641618 0.01506024 0.044025157 0.05992509
## GZMB   0.03720930 0.025839793 0.02023121 0.24397590 0.031446541 0.04119850
## PF4    0.01395349 0.005167959 0.01156069 0.01204819 0.003144654 0.00000000
## MS4A7  0.23488372 0.015503876 0.05780347 0.02108434 0.022012579 0.02621723
##          Cluster6    Cluster7   Cluster8   Cluster9  Cluster10
## IL7R   0.13888889 0.563218391 0.12962963 0.33333333 0.13333333
## S100A8 0.59722222 0.097701149 0.09259259 0.40476190 0.33333333
## CD79A  0.05555556 0.068965517 0.01851852 0.14285714 0.06666667
## GZMB   0.07870370 0.034482759 0.96296296 0.11904762 0.20000000
## PF4    0.04166667 0.005747126 0.00000000 0.04761905 0.93333333
## MS4A7  0.71759259 0.005747126 0.00617284 0.19047619 0.06666667
# 在每个cluster中都低于 0.3 的基因个数
rs2=rs[rowSums(rs<0.3) >0, ]
dim(rs) #108
## [1] 108  11
dim(rs2) #105
## [1] 105  11
# 按每个基因的最大百分比值排序
rs3=apply(rs, 1, max)
rs3=rs3[order(-rs3)]
head(rs3)
##   CCL5   PPBP    LYZ   NKG7   NRGN TYROBP 
##      1      1      1      1      1      1
tail(rs3)
##       RRM2        TK1      MKI67   KIAA0101      ZWINT    DEPDC1B 
## 0.03086420 0.03086420 0.02469136 0.02469136 0.02469136 0.01851852
#plot(rs3, n=100)


library(pheatmap)
#pheatmap(rs, border_color =NA)
#dim(rs1)
length(rs1)
## [1] 108
#
#DotPlot(pbmc.nmf, features = c(
#  head(names(rs1), n=60)
#), cluster.idents = T) + RotatedAxis()
#

2) 每个群中的 mean exp

mean.fxn = function(x, pseudocount.use=1, base=2) {
  return(log(x = rowMeans(x = expm1(x = x)) + pseudocount.use, base = base))
}
rsM=NULL;
for(i in levels(pbmc.nmf)){
  cells.1 = WhichCells(pbmc.nmf, idents = i)
  rs0 = mean.fxn(dat[features, cells.1, drop=F])
  rsM = cbind(rsM, rs0)
  print(dim(rsM))
}
## [1] 108   1
## [1] 108   2
## [1] 108   3
## [1] 108   4
## [1] 108   5
## [1] 108   6
## [1] 108   7
## [1] 108   8
## [1] 108   9
## [1] 108  10
## [1] 108  11
colnames(rsM)=paste0("Cluster",levels(pbmc.nmf))
head(rsM)
##         Cluster0   Cluster1   Cluster2  Cluster3   Cluster4  Cluster5  Cluster6
## IL7R   0.5783467 2.97218212 0.79030927 2.7901951 2.65087165 3.3609040 0.5133748
## S100A8 6.4833172 0.54379009 0.48324592 0.2794916 0.50299391 0.5752853 3.2575690
## CD79A  0.2300565 0.30918779 4.55617698 0.1203183 0.27274330 0.2712897 0.2403736
## GZMB   0.2565375 0.14356850 0.19463312 2.1670096 0.19217687 0.2363497 0.3053534
## PF4    0.1351051 0.03487235 0.07499719 0.1609550 0.01904058 0.0000000 0.3824003
## MS4A7  1.2618183 0.09782055 0.43514025 0.1232816 0.12966611 0.1445432 2.9259239
##          Cluster7   Cluster8  Cluster9 Cluster10
## IL7R   2.69812613 0.84073909 0.9562834 0.4354341
## S100A8 0.45948327 0.55654875 2.6684399 1.9170040
## CD79A  0.79957323 0.11260199 0.6478446 0.1044942
## GZMB   0.20584260 5.65870171 2.7392979 1.1244498
## PF4    0.01589696 0.00000000 0.2130774 7.2228179
## MS4A7  0.03116309 0.04864571 0.5776897 0.2931964
p =pheatmap(rsM, border_color = NA, clustering_method = "ward.D2")

# 获取聚类后的基因顺序
row_cluster = cutree(p$tree_row,k=6)
# 对聚类后的数据进行重新排序
newOrder = rsM[p$tree_row$order,]
#newOrder[,ncol(newOrder)+1]= row_cluster[match(rownames(newOrder),names(row_cluster))]
#colnames(newOrder)[ncol(newOrder)]="Cluster"
# 查看重新排序后的数据
head(newOrder)
##        Cluster0  Cluster1  Cluster2  Cluster3  Cluster4   Cluster5 Cluster6
## LGALS2 4.296763 0.2194642 0.2065756 0.1187146 0.2786202 0.06569327 2.249148
## MS4A6A 3.232930 0.1824591 0.1188825 0.1281771 0.1592001 0.08310864 1.390507
## GRN    3.199818 0.2318531 0.6841047 0.4564896 0.3730964 0.48195845 2.001191
## FCN1   4.797550 0.5578412 0.5910947 0.4149588 0.5384594 0.65455950 4.007145
## IFITM3 2.922208 0.2003130 0.2609213 0.4901171 0.4235872 0.39331258 4.166580
## LYZ    7.597019 1.9862370 2.0864374 1.9135665 1.9784074 2.15867502 5.570402
##         Cluster7   Cluster8 Cluster9 Cluster10
## LGALS2 0.1706343 0.14313213 3.487993  1.501095
## MS4A6A 0.1148936 0.08192313 2.268643  0.812377
## GRN    0.3579583 0.45463957 2.835850  1.458060
## FCN1   0.5144093 0.71581304 2.305610  1.018420
## IFITM3 0.1971241 1.08732299 2.540592  1.229016
## LYZ    2.0239575 1.95027818 6.967274  4.232046
# 基因在每个cluster的最大值排序
rsM1=rev(sort(apply(rsM, 1, max)))
head(rsM1)
##     PPBP      LYZ   S100A9      PF4     NKG7    RPS27 
## 8.699382 7.597019 7.223132 7.222818 6.956882 6.770116
# plot(rsM1)


# 求交集
rs_inter = intersect( head(names(rsM1),n=50), head(names(rs3),n=50) )
length(rs_inter)
## [1] 47
head(rs_inter)
## [1] "PPBP"   "LYZ"    "S100A9" "PF4"    "NKG7"   "RPS27"
#DotPlot(pbmc.nmf, features = rs_inter, cluster.idents = T) + RotatedAxis()

# 聚类的结果
#DotPlot(pbmc.nmf, features = intersect( rownames(newOrder), head( names(rs3),n=90 ) ),
#          #rownames(newOrder), 
#        cluster.idents = T) + RotatedAxis()

# 高表达比例 + 聚类后结果
#DotPlot(pbmc.nmf, features = intersect( rownames(newOrder), head( names(rs3),n=75 ) ), 
#        cluster.idents = T) + RotatedAxis()

高表达mean + 聚类后结果

DotPlot(pbmc.nmf, features = intersect( rownames(newOrder), head( names(rsM1),n=50 ) ),
        #rownames(newOrder), 
        cluster.idents = T) + RotatedAxis()