Liver Development scRNA-seq Analysis

Set up

Install packages as necessary.

dir.create('liver_scrnaseq_analysis/')

## Warning in dir.create("liver_scrnaseq_analysis/"):
## 'liver_scrnaseq_analysis' already exists

list.of.bioconductor.packages <- c("edgeR","limma","pathifier","KEGGREST","biomaRt","GSVA","GSEABase")
new.bioc.packages <- list.of.bioconductor.packages[!(list.of.bioconductor.packages %in% installed.packages()[,"Package"])]
if(length(new.bioc.packages)) BiocManager::install(new.bioc.packages)

list.of.packages <- c("aricode","ggplot2","mclust","Rmisc","ggpubr")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

Read in and preprocess scRNA-seq data

Count data downloaded from GEO GSE90047.

# Read in data downloaded from GEO
gene_exp <- read.table("./input_data/GSE90047_Single-cell_RNA-seq_Read_Count.txt",header=TRUE,quote="",sep="\t",row.names=1)
gene_symbols <- gene_exp$Symbol
gene_lengths <- gene_exp$GeneLength
gene_exp$Symbol <- NULL
gene_exp$GeneLength <- NULL
gene_exp <- as.data.frame(t(gene_exp))
dim(gene_exp)

## [1]   529 40916

# Get rid of spike-in controls
i <- grep("ERCC", names(gene_exp))
gene_exp[,i] <- NULL
dim(gene_exp)

## [1]   529 40824

# Keep only cells from main time course experiment
gene_exp <- as.matrix(gene_exp)
gene_exp <- gene_exp[1:447,]

Filter out lowly expressed genes and use voom to convert counts to log2(cpm) with associated weights.

# Filter out lowly expressed genes
keep <- edgeR::filterByExpr(t(gene_exp))
table(keep)

## keep
## FALSE  TRUE 
## 35753  5071

gene_exp <- gene_exp[,keep]

# Apply voom
exp_voom <- limma::voom(t(gene_exp),plot=T)

exp_voom <- as.data.frame(t(exp_voom$E))

write.table(exp_voom,file="./liver_scrnaseq_analysis/liver_counts.exp_filt.voom.txt",quote=F,sep="\t")

Keep only 3,000 most highly variant genes.

# Keep top 3,000 most highly variant genes
exp_voom <- ssnpa::filter_by_variance(exp_voom, 3000)
dim(exp_voom)

## [1]  447 3000

write.table(exp_voom,file='./liver_scrnaseq_analysis/liver_counts.exp_filt.voom.var_filt.txt',quote=F, sep='\t')

Parse cell names for timepoint and read in cell type information.

m <- row.names(exp_voom)
n <- read.table(text=m, sep="_", colClasses="character")
time <- n$V1
time <- substring(time,1,5)
class <- as.data.frame(time, row.names=m)

cell <- read.table('./input_data/liver_cell_types.txt',header=T,sep="\t",row.names=1,quote="")
class <- merge(class,cell, by="row.names")
row.names(class) <- class$Row.names
class$Row.names <- NULL
class$timexcell <- paste(class$time, class$cell_type,sep = '_')

table(class$time)

## 
## E10.5 E11.5 E12.5 E13.5 E14.5 E15.5 E17.5 
##    54    70    41    65    70    77    70

table(class$cell_type)

## 
##          cholangiocyte hepatoblast/hepatocyte 
##                    102                    345

table(class$timexcell)

## 
## E10.5_hepatoblast/hepatocyte          E11.5_cholangiocyte 
##                           54                            2 
## E11.5_hepatoblast/hepatocyte          E12.5_cholangiocyte 
##                           68                            3 
## E12.5_hepatoblast/hepatocyte          E13.5_cholangiocyte 
##                           38                           10 
## E13.5_hepatoblast/hepatocyte          E14.5_cholangiocyte 
##                           55                           17 
## E14.5_hepatoblast/hepatocyte          E15.5_cholangiocyte 
##                           53                           36 
## E15.5_hepatoblast/hepatocyte          E17.5_cholangiocyte 
##                           41                           34 
## E17.5_hepatoblast/hepatocyte 
##                           36

Test ssNPA with different reference groups and penalty discounts

Scan a range of penalty discount values and reference groups and evaluate performance of ssNPA by NMI and ARI

dir.create('./liver_scrnaseq_analysis/test_refxpd/')
for (ref in c("E10.5","E11.5","E12.5","E13.5","E14.5","E15.5","E17.5")){
  dir.create(paste0('./liver_scrnaseq_analysis/test_refxpd/',ref))
  for (pd in 5:12){
    ssnpa_cluster <- ssnpa::run_ssnpa(gene_exp = exp_voom,
                 reference_samples = (class$time==ref), 
                 out_dir = paste0('./liver_scrnaseq_analysis/test_refxpd/',ref),
                 seed_value = 1,
                 fgs_pd = pd,
                 n_pc = 10,
                 cluster_resolution = 0.6,
                 cluster_ref_samples = T)
  write.table(ssnpa_cluster$clustering,file=paste0('./liver_scrnaseq_analysis/test_refxpd/',ref,'/ssNPA_cluster_output_',ref,'_pd_',toString(pd),'_.txt'),quote = F, sep='\t')
  write.table(ssnpa_cluster$loadings,file=paste0('./liver_scrnaseq_analysis/test_refxpd/',ref,'/ssNPA_loading_output_',ref,'_pd_',toString(pd),'_.txt'),quote = F, sep='\t')
  }
}

for (ref in c("E10.5","E11.5","E13.5","E14.5","E15.5","E17.5")){
  for (pd in 4){
    ssnpa_cluster <- ssnpa::run_ssnpa(gene_exp = exp_voom,
                 reference_samples = (class$time==ref), 
                 out_dir = paste0('./liver_scrnaseq_analysis/test_refxpd/',ref),
                 seed_value = 1,
                 fgs_pd = pd,
                 n_pc = 10,
                 cluster_resolution = 0.6,
                 cluster_ref_samples = T)
  write.table(ssnpa_cluster$clustering,file=paste0('./liver_scrnaseq_analysis/test_refxpd/',ref,'/ssNPA_cluster_output_',ref,'_pd_',toString(pd),'_.txt'),quote = F, sep='\t')
  write.table(ssnpa_cluster$loadings,file=paste0('./liver_scrnaseq_analysis/test_refxpd/',ref,'/ssNPA_loading_output_',ref,'_pd_',toString(pd),'_.txt'),quote = F, sep='\t')
  }
}

# add in E17.5 Hepatocytes
dir.create('./liver_scrnaseq_analysis/test_refxpd/E17.5hepatocytes/')
for (pd in 5:12){
  ssnpa_cluster <- ssnpa::run_ssnpa(gene_exp = exp_voom,
                                    reference_samples = (class$timexcell=='E17.5_hepatoblast/hepatocyte'), 
                                    out_dir = paste0('./liver_scrnaseq_analysis/test_refxpd/','E17.5hepatocytes'),
                                    seed_value = 1,
                                    fgs_pd = pd,
                                    n_pc = 10,
                                    cluster_resolution = 0.6,
                                    cluster_ref_samples = T)
  write.table(ssnpa_cluster$clustering,file=paste0('./liver_scrnaseq_analysis/test_refxpd/','E17.5hepatocytes','/ssNPA_cluster_output_','E17.5hepatocytes','_pd_',toString(pd),'_.txt'),quote = F, sep='\t')
  write.table(ssnpa_cluster$loadings,file=paste0('./liver_scrnaseq_analysis/test_refxpd/','E17.5hepatocytes','/ssNPA_loading_output_','E17.5hepatocytes','_pd_',toString(pd),'_.txt'),quote = F, sep='\t')
}

Generate random reference groups.

dir.create('./liver_scrnaseq_analysis/random_reference_cells/')
set.seed(1)
for (i in 1:5){
  s <- sample(row.names(exp_voom), median(table(class$time)), replace = FALSE)
  write.table(s,paste0('./liver_scrnaseq_analysis/random_reference_cells/reference_cells_',toString(i),'_.txt'),quote=F,sep='\t',row.names = F, col.names=F)  
}

Test ssNPA with random reference groups.

ref="Random"
for (i in 1:5){
  dir.create(paste0('./liver_scrnaseq_analysis/test_refxpd/random',toString(i)))
  ref_list <- read.table(paste0('./liver_scrnaseq_analysis/random_reference_cells/reference_cells_',toString(i),'_.txt'), quote="", sep='\t', header =F)
  ref_list <- ref_list$V1
  ref_sam <- (row.names(exp_voom) %in% ref_list)
  for (pd in 4:12){
    ssnpa_cluster <- ssnpa::run_ssnpa(gene_exp = exp_voom,
                                    reference_samples = ref_sam, 
                                    out_dir = paste0('./liver_scrnaseq_analysis/test_refxpd/','random',toString(i)),
                                    seed_value = 1,
                                    fgs_pd = pd,
                                    n_pc = 10,
                                    cluster_resolution = 0.6,
                                    cluster_ref_samples = T)
  write.table(ssnpa_cluster$clustering,file=paste0('./liver_scrnaseq_analysis/test_refxpd/','random',toString(i),'/ssNPA_cluster_output_','random','_pd_',toString(pd),'_.txt'),quote = F, sep='\t')
  write.table(ssnpa_cluster$loadings,file=paste0('./liver_scrnaseq_analysis/test_refxpd/','random',toString(i),'/ssNPA_loading_output_','random','_pd_',toString(pd),'_.txt'),quote = F, sep='\t')
  }
}

Read in all ssNPA results files and calculate NMI and ARI

result_files <- list.files(path = "./liver_scrnaseq_analysis/test_refxpd", pattern = 'ssNPA_cluster_output',full.names = T, recursive = T,ignore.case = FALSE, include.dirs = T)

method <- character()
reference_set <- character()
pds <- integer()
num_clusters <- integer()
nmi <- double()
ari <- double()
for (file in result_files){
  a <- strsplit(file,"_",fixed=T)
  method <- c(method, 'ssNPA')
  reference_set <- c(reference_set, a[[1]][7])
  pds <- c(pds,a[[1]][9])
  df <- read.table(file,header=T,sep='\t', quote="",row.names=1,stringsAsFactors=T)
  num_clusters <- c(num_clusters,length(unique(df$Cluster)))
  nmi <- c(nmi,aricode::NMI(class$timexcell,df$Cluster))
  ari <- c(ari,mclust::adjustedRandIndex(class$timexcell, df$Cluster))
}
ssnpa_results <- data.frame(method,reference_set,pds,num_clusters,nmi,ari)
ssnpa_results$pds <- factor(ssnpa_results$pds, levels = c(4,5,6,7,8,9,10,11,12))
levels(ssnpa_results$reference_set)[levels(ssnpa_results$reference_set)=="E17.5hepatocytes"] <- "E17.5 hepatocytes"
levels(ssnpa_results$reference_set)[levels(ssnpa_results$reference_set)=="random"] <- "Random"

ssnpa_results

##     method     reference_set pds num_clusters       nmi       ari
## 1    ssNPA             E10.5  10            6 0.4989061 0.3629863
## 2    ssNPA             E10.5  11            6 0.4996008 0.3632776
## 3    ssNPA             E10.5  12            6 0.4985045 0.3628323
## 4    ssNPA             E10.5   4            6 0.5190910 0.3941733
## 5    ssNPA             E10.5   5            6 0.5115482 0.3732445
## 6    ssNPA             E10.5   6            6 0.5126917 0.3783058
## 7    ssNPA             E10.5   7            6 0.5076537 0.3721858
## 8    ssNPA             E10.5   8            6 0.4984591 0.3607016
## 9    ssNPA             E10.5   9            6 0.4949195 0.3575609
## 10   ssNPA             E11.5  10            7 0.5061272 0.3711821
## 11   ssNPA             E11.5  11            7 0.5055431 0.3688952
## 12   ssNPA             E11.5  12            7 0.5021115 0.3647993
## 13   ssNPA             E11.5   4            7 0.5806941 0.5046954
## 14   ssNPA             E11.5   5            7 0.5536136 0.4661542
## 15   ssNPA             E11.5   6            7 0.5235163 0.4111588
## 16   ssNPA             E11.5   7            7 0.5139211 0.3944585
## 17   ssNPA             E11.5   8            7 0.5257517 0.3955257
## 18   ssNPA             E11.5   9            7 0.5188797 0.3866566
## 19   ssNPA             E12.5  10            6 0.4627493 0.3496546
## 20   ssNPA             E12.5  11            6 0.4881712 0.3675188
## 21   ssNPA             E12.5  12            6 0.4805727 0.3593541
## 22   ssNPA             E12.5   5            7 0.5388598 0.4157324
## 23   ssNPA             E12.5   6            6 0.5170250 0.3849483
## 24   ssNPA             E12.5   7            6 0.4708107 0.3427727
## 25   ssNPA             E12.5   8            6 0.4923602 0.3669023
## 26   ssNPA             E12.5   9            6 0.4823793 0.3323744
## 27   ssNPA             E13.5  10            6 0.4674087 0.3593620
## 28   ssNPA             E13.5  11            6 0.4649576 0.3608429
## 29   ssNPA             E13.5  12            6 0.4664099 0.3562749
## 30   ssNPA             E13.5   4            7 0.5512526 0.4297797
## 31   ssNPA             E13.5   5            6 0.4785982 0.3424778
## 32   ssNPA             E13.5   6            7 0.5343838 0.4101959
## 33   ssNPA             E13.5   7            6 0.4720886 0.3514341
## 34   ssNPA             E13.5   8            6 0.4687013 0.3434875
## 35   ssNPA             E13.5   9            6 0.4696692 0.3576294
## 36   ssNPA             E14.5  10            7 0.5110207 0.3762143
## 37   ssNPA             E14.5  11            7 0.5218555 0.3817916
## 38   ssNPA             E14.5  12            7 0.5058115 0.3705352
## 39   ssNPA             E14.5   4            6 0.5740522 0.4723229
## 40   ssNPA             E14.5   5            7 0.5714402 0.4315123
## 41   ssNPA             E14.5   6            7 0.5401851 0.3833062
## 42   ssNPA             E14.5   7            7 0.4937949 0.3686072
## 43   ssNPA             E14.5   8            7 0.5066050 0.3846853
## 44   ssNPA             E14.5   9            7 0.4931975 0.3606796
## 45   ssNPA             E15.5  10            6 0.5014041 0.3901273
## 46   ssNPA             E15.5  11            6 0.5002681 0.3930109
## 47   ssNPA             E15.5  12            6 0.5032696 0.3938568
## 48   ssNPA             E15.5   4            8 0.6150530 0.4873983
## 49   ssNPA             E15.5   5            8 0.5976367 0.4478579
## 50   ssNPA             E15.5   6            7 0.5148128 0.3515344
## 51   ssNPA             E15.5   7            6 0.4934968 0.3591223
## 52   ssNPA             E15.5   8            6 0.5037886 0.3964979
## 53   ssNPA             E15.5   9            7 0.5137178 0.3802511
## 54   ssNPA             E17.5  10            6 0.4595449 0.3449952
## 55   ssNPA             E17.5  11            6 0.4502055 0.3345621
## 56   ssNPA             E17.5  12            6 0.4325164 0.3125855
## 57   ssNPA             E17.5   4            7 0.5188686 0.3782065
## 58   ssNPA             E17.5   5            7 0.5380958 0.4139808
## 59   ssNPA             E17.5   6            6 0.4937677 0.3811921
## 60   ssNPA             E17.5   7            6 0.4927237 0.3814886
## 61   ssNPA             E17.5   8            6 0.4863517 0.3675997
## 62   ssNPA             E17.5   9            6 0.4596659 0.3496078
## 63   ssNPA E17.5 hepatocytes  10            6 0.4423047 0.3399355
## 64   ssNPA E17.5 hepatocytes  11            6 0.4483608 0.3443706
## 65   ssNPA E17.5 hepatocytes  12            6 0.4417953 0.3414855
## 66   ssNPA E17.5 hepatocytes   5            7 0.5244172 0.4103018
## 67   ssNPA E17.5 hepatocytes   6            6 0.4588314 0.3620383
## 68   ssNPA E17.5 hepatocytes   7            6 0.4579592 0.3556687
## 69   ssNPA E17.5 hepatocytes   8            6 0.4390137 0.3349598
## 70   ssNPA E17.5 hepatocytes   9            6 0.4486123 0.3457244
## 71   ssNPA            Random  10            5 0.3994849 0.2940187
## 72   ssNPA            Random  11            6 0.4615890 0.3325610
## 73   ssNPA            Random  12            5 0.3952019 0.2968861
## 74   ssNPA            Random   4            5 0.2971578 0.1786121
## 75   ssNPA            Random   5            5 0.2951630 0.2001413
## 76   ssNPA            Random   6            5 0.3449468 0.2433838
## 77   ssNPA            Random   7            5 0.3774365 0.2693926
## 78   ssNPA            Random   8            5 0.4055045 0.2950569
## 79   ssNPA            Random   9            5 0.3927834 0.2829422
## 80   ssNPA            Random  10            5 0.4036376 0.3017254
## 81   ssNPA            Random  11            5 0.4107778 0.3040372
## 82   ssNPA            Random  12            5 0.4107587 0.3029480
## 83   ssNPA            Random   4            5 0.2900579 0.1844669
## 84   ssNPA            Random   5            6 0.3414785 0.2011797
## 85   ssNPA            Random   6            6 0.4170360 0.2886322
## 86   ssNPA            Random   7            5 0.3752135 0.2631230
## 87   ssNPA            Random   8            6 0.4419078 0.3080828
## 88   ssNPA            Random   9            6 0.4319798 0.3082374
## 89   ssNPA            Random  10            5 0.3908403 0.2875533
## 90   ssNPA            Random  11            5 0.3899106 0.2877471
## 91   ssNPA            Random  12            5 0.3981384 0.2979477
## 92   ssNPA            Random   4            5 0.2940171 0.1872200
## 93   ssNPA            Random   5            6 0.3385853 0.2241929
## 94   ssNPA            Random   6            5 0.3349233 0.2278658
## 95   ssNPA            Random   7            5 0.3513398 0.2317307
## 96   ssNPA            Random   8            5 0.4063303 0.2823776
## 97   ssNPA            Random   9            5 0.3975408 0.2912891
## 98   ssNPA            Random  10            7 0.4605585 0.3216140
## 99   ssNPA            Random  11            6 0.4532103 0.3282535
## 100  ssNPA            Random  12            6 0.4543818 0.3314441
## 101  ssNPA            Random   4            5 0.3012069 0.1922797
## 102  ssNPA            Random   5            6 0.3374999 0.2291097
## 103  ssNPA            Random   6            5 0.3404424 0.2119682
## 104  ssNPA            Random   7            6 0.4252663 0.2786030
## 105  ssNPA            Random   8            6 0.4219663 0.2748916
## 106  ssNPA            Random   9            6 0.4284912 0.2825565
## 107  ssNPA            Random  10            5 0.4294952 0.3208720
## 108  ssNPA            Random  11            5 0.4201252 0.3109494
## 109  ssNPA            Random  12            5 0.4192327 0.3142967
## 110  ssNPA            Random   4            5 0.2863549 0.1888866
## 111  ssNPA            Random   5            5 0.3301963 0.2209320
## 112  ssNPA            Random   6            6 0.3465440 0.2381293
## 113  ssNPA            Random   7            5 0.3663433 0.2449496
## 114  ssNPA            Random   8            5 0.3817106 0.2816567
## 115  ssNPA            Random   9            5 0.3910132 0.2999313

mean(ssnpa_results$nmi[ssnpa_results$reference_set=="Random"])

## [1] 0.3819507

s <- Rmisc::summarySE(ssnpa_results, measurevar="nmi", groupvars=c("reference_set","pds"))

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

g <- ggplot2::ggplot(s,ggplot2::aes(x=reference_set,y=nmi, fill=as.factor(pds),width=0.75))
g <- g + ggplot2::geom_bar(stat="identity", position=ggplot2::position_dodge(width=0.9))
g <- g + ggplot2::geom_errorbar(ggplot2::aes(ymin=nmi-se, ymax=nmi+se), width=0.5,position=ggplot2::position_dodge(.9))
g <- g + ggplot2::theme_bw()
g <- g + ggplot2::xlab('Reference Cells')
g <- g + ggplot2::ylab('NMI')
g <- g + ggplot2::guides(fill=ggplot2::guide_legend(title="Penalty\nDiscount"))
g <- g + ggplot2::theme(axis.title.y = ggplot2::element_text(margin = ggplot2::margin(t = 0, r = 15, b = 0, l = 0)),
                        axis.text.x = ggplot2::element_text(angle = 45, hjust = 1),
                        text = ggplot2::element_text(size=20),
                        legend.text=ggplot2::element_text(size=18),
                        axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t = 15, r = 0, b = 0, l = 0)))
pdf('ssnpa_liver_referencecells_by_pd.nmi.pdf', width=12, height=6)
g

## Warning: Removed 70 rows containing missing values (geom_errorbar).

dev.off()

## quartz_off_screen 
##                 2

## Warning: Removed 70 rows containing missing values (geom_errorbar).

s <- Rmisc::summarySE(ssnpa_results, measurevar="ari", groupvars=c("reference_set","pds"))

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

g <- ggplot2::ggplot(s,ggplot2::aes(x=reference_set,y=ari, fill=as.factor(pds),width=0.75))
g <- g + ggplot2::geom_bar(stat="identity", position=ggplot2::position_dodge(width=0.9))
g <- g + ggplot2::geom_errorbar(ggplot2::aes(ymin=ari-se, ymax=ari+se), width=0.5,position=ggplot2::position_dodge(.9))
g <- g + ggplot2::theme_bw()
g <- g + ggplot2::xlab('Reference Cells')
g <- g + ggplot2::ylab('ARI')
g <- g + ggplot2::guides(fill=ggplot2::guide_legend(title="Penalty\nDiscount"))
g <- g + ggplot2::theme(axis.title.y = ggplot2::element_text(margin = ggplot2::margin(t = 0, r = 15, b = 0, l = 0)),
                        axis.text.x = ggplot2::element_text(angle = 45, hjust = 1),
                        text = ggplot2::element_text(size=20),
                        legend.text=ggplot2::element_text(size=18),
                        axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t = 15, r = 0, b = 0, l = 0)))
pdf('ssnpa_liver_referencecells_by_pd.ari.pdf', width=12, height=6)
g

## Warning: Removed 70 rows containing missing values (geom_errorbar).

dev.off()

## quartz_off_screen 
##                 2

## Warning: Removed 70 rows containing missing values (geom_errorbar).

ssnpa_features_selected <- read.table('./liver_scrnaseq_analysis/test_refxpd/E15.5/selectedfeatures.fges_pd4.txt', header=T, sep='\t',quote = "")
mean(ssnpa_features_selected$num_var)

## [1] 3.584

Test ssNPA-LR with different reference groups

dir.create('./liver_scrnaseq_analysis/test_lr_ref/')
for (ref in c("E10.5","E11.5","E12.5","E13.5","E14.5","E15.5","E17.5")){
    dir.create(paste0('./liver_scrnaseq_analysis/test_lr_ref/',ref))
    ssnpa_cluster <- ssnpa::run_ssnpa_lr(gene_exp = exp_voom,
                                      reference_samples = (class$time==ref), 
                                      out_dir = paste0('./liver_scrnaseq_analysis/test_lr_ref/',ref),
                                      seed_value = 1,
                                      n_pc = 10,
                                      cluster_resolution = 0.6,
                                      cluster_ref_samples = T)
    write.table(ssnpa_cluster$clustering,file=paste0('./liver_scrnaseq_analysis/test_lr_ref/',ref,'/ssNPA-LR_cluster_output_',ref,'_.txt'),quote = F, sep='\t')
    write.table(ssnpa_cluster$loadings,file=paste0('./liver_scrnaseq_analysis/test_lr_ref/',ref,'/ssNPA-LR_loading_output_',ref,'_.txt'),quote = F, sep='\t')
}

# add in E17.5 Hepatocytes
dir.create(paste0('./liver_scrnaseq_analysis/test_lr_ref/E17.5hepatocytes/'))
ssnpa_cluster <- ssnpa::run_ssnpa_lr(gene_exp = exp_voom,
                                    reference_samples = (class$timexcell=='E17.5_hepatoblast/hepatocyte'), 
                                    out_dir = paste0('./liver_scrnaseq_analysis/test_lr_ref/','E17.5hepatocytes'),
                                    seed_value = 1,
                                    n_pc = 10,
                                    cluster_resolution = 0.6,
                                    cluster_ref_samples = T)
write.table(ssnpa_cluster$clustering,file=paste0('./liver_scrnaseq_analysis/test_lr_ref/','E17.5hepatocytes','/ssNPA-LR_cluster_output_','E17.5hepatocytes','_.txt'),quote = F, sep='\t')
write.table(ssnpa_cluster$loadings,file=paste0('./liver_scrnaseq_analysis/test_lr_ref/','E17.5hepatocytes','/ssNPA-LR_loading_output_','E17.5hepatocytes','_.txt'),quote = F, sep='\t')


for (i in 1:5){
  dir.create(paste0("./liver_scrnaseq_analysis/test_lr_ref/random",toString(i)))
  ref_list <- read.table(paste0('./liver_scrnaseq_analysis/random_reference_cells/reference_cells_',toString(i),'_.txt'), quote="", sep='\t', header =F)
  ref_list <- ref_list$V1
  ref_sam <- (row.names(exp_voom) %in% ref_list)
  ssnpa_cluster <- ssnpa::run_ssnpa_lr(gene_exp = exp_voom,
                                    reference_samples = ref_sam, 
                                    out_dir = paste0('./liver_scrnaseq_analysis/test_lr_ref/','random',toString(i)),
                                    seed_value = 1,
                                    n_pc = 10,
                                    cluster_resolution = 0.6,
                                    cluster_ref_samples = T)
  write.table(ssnpa_cluster$clustering,file=paste0('./liver_scrnaseq_analysis/test_lr_ref/','random',toString(i),'/ssNPA-LR_cluster_output_','random','_.txt'),quote = F, sep='\t')
  write.table(ssnpa_cluster$loadings,file=paste0('./liver_scrnaseq_analysis/test_lr_ref/','random',toString(i),'/ssNPA-LR_loading_output_','random','_.txt'),quote = F, sep='\t')
}

Read in all ssNPA-LR results files and calculate NMI and ARI

result_files <- list.files(path = "./liver_scrnaseq_analysis/test_lr_ref", pattern = 'ssNPA-LR_cluster_output',full.names = T, recursive = T,ignore.case = F, include.dirs = T)

method <- character()
reference_set <- character()
pds <- integer()
num_clusters <- integer()
nmi <- double()
ari <- double()
for (file in result_files){
  a <- strsplit(file,"_",fixed=T)
  method <- c(method, 'ssNPA-LR')
  reference_set <- c(reference_set, a[[1]][8])
  pds <- c(pds,NA)
  df <- read.table(file,header=T,sep='\t', quote="",row.names=1,stringsAsFactors=F)
  num_clusters <- c(num_clusters,length(unique(df$Cluster)))
  nmi <- c(nmi,aricode::NMI(class$timexcell,df$Cluster))
  ari <- c(ari,mclust::adjustedRandIndex(class$timexcell, df$Cluster))
}
ssnpa_lr_results <- data.frame(method,reference_set,pds,num_clusters,nmi,ari)
levels(ssnpa_lr_results$reference_set)[levels(ssnpa_lr_results$reference_set)=="E17.5hepatocytes"] <- "E17.5 hepatocytes"
levels(ssnpa_lr_results$reference_set)[levels(ssnpa_lr_results$reference_set)=="random"] <- "Random"

ssnpa_lr_results

##      method     reference_set pds num_clusters       nmi       ari
## 1  ssNPA-LR             E10.5  NA            6 0.5343157 0.4103438
## 2  ssNPA-LR             E11.5  NA            7 0.5619260 0.4851154
## 3  ssNPA-LR             E12.5  NA            7 0.5401574 0.4101360
## 4  ssNPA-LR             E13.5  NA            7 0.5956836 0.4771295
## 5  ssNPA-LR             E14.5  NA            6 0.5944397 0.4973185
## 6  ssNPA-LR             E15.5  NA            7 0.5919097 0.4755229
## 7  ssNPA-LR             E17.5  NA            6 0.5100085 0.3861713
## 8  ssNPA-LR E17.5 hepatocytes  NA            7 0.5199538 0.4069017
## 9  ssNPA-LR            Random  NA            6 0.3327361 0.2236332
## 10 ssNPA-LR            Random  NA            6 0.3348679 0.2181527
## 11 ssNPA-LR            Random  NA            6 0.3153343 0.2041835
## 12 ssNPA-LR            Random  NA            6 0.3155976 0.1994630
## 13 ssNPA-LR            Random  NA            6 0.3513737 0.2508438

mean(ssnpa_lr_results$nmi[ssnpa_lr_results$reference_set=="Random"])

## [1] 0.3299819

s <- Rmisc::summarySE(ssnpa_lr_results, measurevar="nmi", groupvars=c("reference_set"))

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

g <- ggplot2::ggplot(s,ggplot2::aes(x=reference_set,y=nmi, width=0.75))
g <- g + ggplot2::geom_bar(stat="identity", position=ggplot2::position_dodge(width=0.9),fill="dodgerblue3")
g <- g + ggplot2::geom_errorbar(ggplot2::aes(ymin=nmi-se, ymax=nmi+se), width=0.5,position=ggplot2::position_dodge(.9))
g <- g + ggplot2::theme_bw()
g <- g + ggplot2::xlab('Reference Cells')
g <- g + ggplot2::ylab('NMI')
g <- g + ggplot2::theme(axis.title.y = ggplot2::element_text(margin = ggplot2::margin(t = 0, r = 15, b = 0, l = 0)),
                        axis.text.x = ggplot2::element_text(angle = 45, hjust = 1),
                        text = ggplot2::element_text(size=20),
                        legend.text=ggplot2::element_text(size=18),
                        axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t = 15, r = 0, b = 0, l = 0)))
pdf('liver_ssnpa-lr_by_referencecells.nmi.pdf', width=8, height=6)
g

## Warning: Removed 8 rows containing missing values (geom_errorbar).

dev.off()

## quartz_off_screen 
##                 2

## Warning: Removed 8 rows containing missing values (geom_errorbar).

s <- Rmisc::summarySE(ssnpa_lr_results, measurevar="ari", groupvars=c("reference_set"))

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

g <- ggplot2::ggplot(s,ggplot2::aes(x=reference_set,y=ari, width=0.75))
g <- g + ggplot2::geom_bar(stat="identity", position=ggplot2::position_dodge(width=0.9),fill="dodgerblue3")
g <- g + ggplot2::geom_errorbar(ggplot2::aes(ymin=ari-se, ymax=ari+se), width=0.5,position=ggplot2::position_dodge(.9))
g <- g + ggplot2::theme_bw()
g <- g + ggplot2::xlab('Reference Cells')
g <- g + ggplot2::ylab('ARI')
g <- g + ggplot2::theme(axis.title.y = ggplot2::element_text(margin = ggplot2::margin(t = 0, r = 15, b = 0, l = 0)),
                        axis.text.x = ggplot2::element_text(angle = 45, hjust = 1),
                        text = ggplot2::element_text(size=20),
                        legend.text=ggplot2::element_text(size=18),
                        axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t = 15, r = 0, b = 0, l = 0)))
pdf('liver_ssnpa-lr_by_referencecells.ari.pdf', width=8, height=6)
g

## Warning: Removed 8 rows containing missing values (geom_errorbar).

dev.off()

## quartz_off_screen 
##                 2

## Warning: Removed 8 rows containing missing values (geom_errorbar).

ssnpalr_features_selected <- read.table('./liver_scrnaseq_analysis/test_lr_ref/E13.5/ssnpa_lr_selectedfeatures.txt', header=T, sep='\t',quote = "")
mean(ssnpalr_features_selected$num_var)

## [1] 13.51167

Cluster cells just based on gene expression

dir.create(paste0('./liver_scrnaseq_analysis/test_gex'))
gex_cluster <- ssnpa::cluster_samples(features = exp_voom,
                                      meta_data_df = class,
                                      n_pc = 10,
                                      cluster_resolution =0.6,
                                      cluster_ref_samples = T,
                                      ref_samples = rep(T,dim(exp_voom)[1]),
                                      seed_value = 1)$clustering
method <- 'Gene Expression'
reference_set <- NA
pds <- NA
num_clusters <- length(unique(gex_cluster$Cluster))
nmi <- aricode::NMI(class$timexcell,gex_cluster$Cluster)
ari <- mclust::adjustedRandIndex(class$timexcell, gex_cluster$Cluster)
write.table(gex_cluster,file=paste0('./liver_scrnaseq_analysis/test_gex/gex_cluster_output.txt'),quote = F, sep='\t')

Read in gene expression result file and calcule NMI and ARI

result_files <- list.files(path = "./liver_scrnaseq_analysis/test_gex", pattern = 'gex_cluster_output',full.names = T, recursive = T,ignore.case = F, include.dirs = T)

method <- character()
reference_set <- character()
pds <- integer()
num_clusters <- integer()
nmi <- double()
ari <- double()
for (file in result_files){
  method <- c(method, 'Gene Expression')
  reference_set <- c(reference_set, NA)
  pds <- c(pds,NA)
  df <- read.table(file,header=T,sep='\t', quote="",row.names=1,stringsAsFactors=F)
  num_clusters <- c(num_clusters,length(unique(df$Cluster)))
  nmi <- c(nmi,aricode::NMI(class$timexcell,df$Cluster))
  ari <- c(ari,mclust::adjustedRandIndex(class$timexcell, df$Cluster))
}
gex_results <- data.frame(method,reference_set,pds,num_clusters,nmi,ari)
gex_results

##            method reference_set pds num_clusters       nmi       ari
## 1 Gene Expression          <NA>  NA            5 0.4967235 0.3748872

Compare performance to Pathifier

Use Pathifier with KEGG pathways to cluster these data. First, read in KEGG pathways.

dir.create(paste0('./output_data'))
kegg_full <- list()
pathways <- KEGGREST::keggList("pathway", "mmu") #mmu for mouse
count2 <- 1
for (i in 1:length(pathways)){
    x <- attr(pathways,"names")[i]
    x <- gsub("path:","",x, fixed=T)
    path_full <- KEGGREST::keggGet(x)
    p <- path_full[[1]]$GENE
    if (length(p)>0){
    even_indexes<-seq(2,length(p),2)
    q <- p[even_indexes]
    r <- strsplit(q,";",fixed=T)
    gene_names <- NULL
    count <- 1
    for (j in 1:length(r)){
        if (length(r[[j]])>1){
        gene_names[count] <- r[[j]][1]
        count <- count+1
        }
    }
    # only keep pathways with at least 10 genes
        if (length(gene_names)>10){
    kegg_full$gs[count2] <- list(gene_names)
    kegg_full$pathwaynames[count2] <- path_full[[1]]$NAME
    count2 <- count2 + 1
    }
    }
}
save(kegg_full,file="./output_data/kegg_full.mmu.RData")
summary(kegg_full)

Convert ENSEMBL IDs to gene names to match KEGG.

ensembl <- biomaRt::useEnsembl(biomart="ensembl", dataset="mmusculus_gene_ensembl")
lookup <- biomaRt::getBM(attributes=c('ensembl_gene_id','external_gene_name'), filters ='ensembl_gene_id', values =names(exp_voom), mart = ensembl)

t <- match(names(exp_voom), lookup$ensembl_gene_id)
gene_names <- lookup$external_gene_name[t]
exp_voom_genename <- exp_voom
names(exp_voom_genename)[!is.na(gene_names)] <- gene_names[!is.na(gene_names)]

Run Pathifier with all groups of cells as the reference set.

data <- t(exp_voom_genename)
dir.create(paste0('./liver_scrnaseq_analysis/test_pathifier_ref/'))
for (ref in c("E10.5","E11.5","E12.5","E13.5","E14.5","E15.5","E17.5")){
    dir.create(paste0('./liver_scrnaseq_analysis/test_pathifier_ref/',ref))
    pathway_dereg <- pathifier::quantify_pathways_deregulation(data,allgenes = row.names(data),syms = kegg_full$gs,pathwaynames = kegg_full$pathwaynames,normals = (class$time==ref), attempts=100)
    scores <- do.call(rbind,pathway_dereg$scores)
    scores <- as.data.frame(t(scores))
    names(scores) <- names(pathway_dereg$scores)
    names(scores) <- make.names(names(scores))
    row.names(scores) <- row.names(exp_voom_genename)
    pathifier_cluster <- ssnpa::cluster_samples(features = scores,
                                      meta_data_df = class,
                                      n_pc = 10,
                                      cluster_resolution =0.6,
                                      cluster_ref_samples = T,
                                      ref_samples = (class$time==ref),
                                      seed_value = 1)$clustering
    write.table(pathifier_cluster,file=paste0('./liver_scrnaseq_analysis/test_pathifier_ref/',ref,'/pathifier_cluster_output_',ref,'_.txt'),quote = F, sep='\t')
}


for (i in 1:5){
  dir.create(paste0("./liver_scrnaseq_analysis/test_pathifier_ref/random",toString(i)))
  ref_list <- read.table(paste0('./liver_scrnaseq_analysis/random_reference_cells/reference_cells_',toString(i),'_.txt'), quote="", sep='\t', header =F)
  ref_list <- ref_list$V1
  ref_sam <- (row.names(exp_voom) %in% ref_list)
  
  pathway_dereg <- pathifier::quantify_pathways_deregulation(data,allgenes = row.names(data),syms = kegg_full$gs,pathwaynames = kegg_full$pathwaynames,normals = ref_sam, attempts=100)
  scores <- do.call(rbind,pathway_dereg$scores)
  scores <- as.data.frame(t(scores))
  names(scores) <- names(pathway_dereg$scores)
  names(scores) <- make.names(names(scores))
  row.names(scores) <- row.names(exp_voom_genename)
  write.table(scores, file=paste0('./liver_scrnaseq_analysis/test_pathifier_ref/random', toString(i),'/pathifier_scores_random_.txt'), quote=F, sep='\t')
  pathifier_cluster <- ssnpa::cluster_samples(features = scores,
                                      meta_data_df = class,
                                      n_pc = 10,
                                      cluster_resolution =0.6,
                                      cluster_ref_samples = T,
                                      ref_samples = ref_sam,
                                      seed_value = 1)$clustering
  write.table(pathifier_cluster,file=paste0('./liver_scrnaseq_analysis/test_pathifier_ref/','random',toString(i),'/pathifier_cluster_output_','random','_.txt'),quote = F, sep='\t')
}

Read in Pathifier result files and calcule NMI and ARI

result_files <- list.files(path = "./liver_scrnaseq_analysis/test_pathifier_ref", pattern = 'pathifier_cluster_output',full.names = T, recursive = T,ignore.case = F, include.dirs = T)

method <- character()
reference_set <- character()
pds <- integer()
num_clusters <- integer()
nmi <- double()
ari <- double()
for (file in result_files){
  a <- strsplit(file,"_",fixed=T)
  method <- c(method, 'Pathifier')
  reference_set <- c(reference_set, a[[1]][8])
  pds <- c(pds,NA)
  df <- read.table(file,header=T,sep='\t', quote="",row.names=1,stringsAsFactors=F)
  num_clusters <- c(num_clusters,length(unique(df$Cluster)))
  nmi <- c(nmi,aricode::NMI(class$timexcell,df$Cluster))
  ari <- c(ari,mclust::adjustedRandIndex(class$timexcell, df$Cluster))
}
pathifier_results <- data.frame(method,reference_set,pds,num_clusters,nmi,ari)
levels(pathifier_results$reference_set)[levels(pathifier_results$reference_set)=="random"] <- "Random"


pathifier_results

##       method reference_set pds num_clusters       nmi       ari
## 1  Pathifier         E10.5  NA            6 0.5039087 0.3571078
## 2  Pathifier         E11.5  NA            6 0.4930281 0.3824889
## 3  Pathifier         E12.5  NA            6 0.5103271 0.3741414
## 4  Pathifier         E13.5  NA            6 0.5289707 0.3993093
## 5  Pathifier         E14.5  NA            6 0.5072126 0.3641602
## 6  Pathifier         E15.5  NA            5 0.4475906 0.3258259
## 7  Pathifier         E17.5  NA            5 0.4361635 0.3074956
## 8  Pathifier        Random  NA            5 0.4424782 0.3142321
## 9  Pathifier        Random  NA            6 0.4601782 0.3006597
## 10 Pathifier        Random  NA            6 0.5026206 0.3678690
## 11 Pathifier        Random  NA            5 0.4747649 0.3528858
## 12 Pathifier        Random  NA            5 0.4448886 0.3192647

mean(pathifier_results$nmi[pathifier_results$reference_set=="Random"])

## [1] 0.4649861

s <- Rmisc::summarySE(pathifier_results, measurevar="nmi", groupvars=c("reference_set"))

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

g <- ggplot2::ggplot(s,ggplot2::aes(x=reference_set,y=nmi, width=0.75))
g <- g + ggplot2::geom_bar(stat="identity", position=ggplot2::position_dodge(width=0.9),fill="dodgerblue3")
g <- g + ggplot2::geom_errorbar(ggplot2::aes(ymin=nmi-se, ymax=nmi+se), width=0.5,position=ggplot2::position_dodge(.9))
g <- g + ggplot2::theme_bw()
g <- g + ggplot2::xlab('Reference Cells')
g <- g + ggplot2::ylab('NMI')
g <- g + ggplot2::theme(axis.title.y = ggplot2::element_text(margin = ggplot2::margin(t = 0, r = 15, b = 0, l = 0)),
                        axis.text.x = ggplot2::element_text(angle = 45, hjust = 1),
                        text = ggplot2::element_text(size=20),
                        legend.text=ggplot2::element_text(size=18),
                        axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t = 15, r = 0, b = 0, l = 0)))
pdf('liver_pathifier_by_referencecells.nmi.pdf', width=8, height=6)
g

## Warning: Removed 7 rows containing missing values (geom_errorbar).

dev.off()

## quartz_off_screen 
##                 2

## Warning: Removed 7 rows containing missing values (geom_errorbar).

s <- Rmisc::summarySE(pathifier_results, measurevar="ari", groupvars=c("reference_set"))

## Warning in qt(conf.interval/2 + 0.5, datac$N - 1): NaNs produced

g <- ggplot2::ggplot(s,ggplot2::aes(x=reference_set,y=ari, width=0.75))
g <- g + ggplot2::geom_bar(stat="identity", position=ggplot2::position_dodge(width=0.9),fill="dodgerblue3")
g <- g + ggplot2::geom_errorbar(ggplot2::aes(ymin=ari-se, ymax=ari+se), width=0.5,position=ggplot2::position_dodge(.9))
g <- g + ggplot2::theme_bw()
g <- g + ggplot2::xlab('Reference Cells')
g <- g + ggplot2::ylab('ARI')
g <- g + ggplot2::theme(axis.title.y = ggplot2::element_text(margin = ggplot2::margin(t = 0, r = 15, b = 0, l = 0)),
                        axis.text.x = ggplot2::element_text(angle = 45, hjust = 1),
                        text = ggplot2::element_text(size=20),
                        legend.text=ggplot2::element_text(size=18),
                        axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t = 15, r = 0, b = 0, l = 0)))
pdf('liver_pathifier_by_referencecells.ari.pdf', width=8, height=6)
g

## Warning: Removed 7 rows containing missing values (geom_errorbar).

dev.off()

## quartz_off_screen 
##                 2

## Warning: Removed 7 rows containing missing values (geom_errorbar).

Compare performance to ssGSEA

Use ssGSEA with MSigDB pathways to cluster these data. First, convert ENSEMBL IDs to Entrez gene IDs.

ensembl <- biomaRt::useEnsembl(biomart="ensembl", dataset="mmusculus_gene_ensembl")
lookup <- biomaRt::getBM(attributes=c('ensembl_gene_id','entrezgene_id'), filters ='ensembl_gene_id', values =names(exp_voom), mart = ensembl)

t <- match(names(exp_voom), lookup$ensembl_gene_id)
gene_names <- lookup$entrezgene[t]
exp_voom_entrez <- exp_voom
names(exp_voom_entrez)[!is.na(gene_names)] <- gene_names[!is.na(gene_names)]

Load in the C2 collection from MSigDB (version 5.2 with mouse identifiers) downloaded from http://bioinf.wehi.edu.au/software/MSigDB/index.html

load('./input_data/mouse_c2_v5p2.rdata')

Run ssGSEA and cluster cells.

dir.create(paste0('./liver_scrnaseq_analysis/test_ssgsea'))
ssgsea_features <- GSVA::gsva(as.matrix(t(exp_voom_entrez)), Mm.c2, method="ssgsea")
ssgsea_cluster_input <- as.data.frame(t(ssgsea_features))
names(ssgsea_cluster_input) <- make.names(names(ssgsea_cluster_input))
ssgsea_cluster <- ssnpa::cluster_samples(features = as.data.frame(t(ssgsea_features)),
                                      meta_data_df = class,
                                      n_pc = 10,
                                      cluster_resolution =0.6,
                                      cluster_ref_samples = T,
                                      ref_samples = rep(T,dim(class)[1]),
                                      seed_value = 1)$clustering
write.table(ssgsea_cluster,file=paste0('./liver_scrnaseq_analysis/test_ssgsea/ssgsea_cluster_output.txt'),quote = F, sep='\t')

Read in ssGSEA result file and calculate NMI and ARI

result_files <- list.files(path = "./liver_scrnaseq_analysis/test_ssgsea", pattern = 'ssgsea_cluster_output',full.names = T, recursive = T,ignore.case = F, include.dirs = T)

method <- character()
reference_set <- character()
pds <- integer()
num_clusters <- integer()
nmi <- double()
ari <- double()
for (file in result_files){
  method <- c(method, 'ssGSEA')
  reference_set <- c(reference_set, NA)
  pds <- c(pds,NA)
  df <- read.table(file,header=T,sep='\t', quote="",row.names=1,stringsAsFactors=F)
  num_clusters <- c(num_clusters,length(unique(df$Cluster)))
  nmi <- c(nmi,aricode::NMI(class$timexcell,df$Cluster))
  ari <- c(ari,mclust::adjustedRandIndex(class$timexcell, df$Cluster))
}
ssgsea_results <- data.frame(method,reference_set,pds,num_clusters,nmi,ari)
ssgsea_results

##   method reference_set pds num_clusters       nmi       ari
## 1 ssGSEA          <NA>  NA            5 0.4776922 0.3531489

Check performance with fewer genes

Generate random datasets.

dir.create(paste0('./liver_scrnaseq_analysis/test_fewgenes/datasets'))
set.seed(1)
for (p in c(0.95, 0.9, 0.85, 0.8, 0.75, 0.5, 0.25)){
for (i in 1:5){
  s <- sample(names(exp_voom), floor(dim(exp_voom)[2]*p), replace = FALSE)
  exp_s <- exp_voom[,s]
  write.table(exp_s,paste0('./liver_scrnaseq_analysis/test_fewgenes/datasets/expvoom_p_',toString(p),'_',toString(i),'_.txt'),quote=F,sep='\t',row.names = T, col.names=T)
}
}

Run ssNPA on each of these random datasets.

ref <- 'E15.5'
pd <- 4

result_files <- list.files(path = "./liver_scrnaseq_analysis/test_fewgenes/datasets", pattern = 'expvoom',full.names = T, recursive = T,ignore.case = F, include.dirs = T)

for (file in result_files){
  df <- read.table(file,header=T,sep='\t',quote="",row.names=1)
  a <- strsplit(file,'_',fixed = T)
  p <- a[[1]][6]
  i <- a[[1]][7]
  dir.create(paste0('./liver_scrnaseq_analysis/test_fewgenes/p',toString(p),'/',toString(i)))
  ssnpa_cluster <- ssnpa::run_ssnpa(gene_exp = df,
                 reference_samples = (class$time==ref), 
                 out_dir = paste0('./liver_scrnaseq_analysis/test_fewgenes/p',toString(p),'/',toString(i)),
                 seed_value = 1,
                 fgs_pd = pd,
                 n_pc = 10,
                 cluster_resolution = 0.6,
                 cluster_ref_samples = T)$clustering
  write.table(ssnpa_cluster,file=paste0('./liver_scrnaseq_analysis/test_fewgenes/ssNPA_cluster_output_',ref,'_pd_',toString(pd),'_',toString(p),'_',toString(i),'_.txt'),quote = F, sep='\t')
}
  #Apply ssNPA to full dataset (all 3,000 genes)
  p <- 1
  i <- 1
  dir.create(paste0('./liver_scrnaseq_analysis/test_fewgenes/p',toString(p),'/',toString(i)))
  ssnpa_cluster <- ssnpa::run_ssnpa(gene_exp = exp_voom,
                 reference_samples = (class$time==ref), 
                 out_dir = paste0('./liver_scrnaseq_analysis/test_fewgenes/p',toString(p),'/',toString(i)),
                 seed_value = 1,
                 fgs_pd = pd,
                 n_pc = 10,
                 cluster_resolution = 0.6,
                 cluster_ref_samples = T)$clustering
  write.table(ssnpa_cluster,file=paste0('./liver_scrnaseq_analysis/test_fewgenes/ssNPA_cluster_output_',ref,'_pd_',toString(pd),'_',toString(p),'_',toString(i),'_.txt'),quote = F, sep='\t')

Read in reduced gene result files and calculate NMI and ARI

result_files <- list.files(path = "./liver_scrnaseq_analysis/test_fewgenes", pattern = 'ssNPA_cluster_output',full.names = T, recursive = T,ignore.case = F, include.dirs = T)

f <- double()
num_clusters <- integer()
nmi <- double()
ari <- double()
for (file in result_files){
  a <- strsplit(file,'_',fixed = T)
  f <- c(f, as.numeric(a[[1]][10])*100)
  df <- read.table(file,header=T,sep='\t', quote="",row.names=1,stringsAsFactors=F)
  num_clusters <- c(num_clusters,length(unique(df$Cluster)))
  nmi <- c(nmi,aricode::NMI(class$timexcell,df$Cluster))
  ari <- c(ari,mclust::adjustedRandIndex(class$timexcell, df$Cluster))
}
ssnpa_fewgene_results <- data.frame(f,num_clusters,nmi,ari)

s <- Rmisc::summarySE(ssnpa_fewgene_results, measurevar="nmi", groupvars=c("f"))

s

##     f N       nmi         sd         se         ci
## 1  25 5 0.3855473 0.03936100 0.01760277 0.04887314
## 2  50 5 0.5216856 0.03135548 0.01402260 0.03893297
## 3  75 5 0.5421200 0.04445185 0.01987947 0.05519426
## 4  80 5 0.5592278 0.04314140 0.01929342 0.05356712
## 5  85 5 0.5833929 0.02407098 0.01076487 0.02988807
## 6  90 5 0.5823763 0.02749929 0.01229806 0.03414488
## 7  95 5 0.5948758 0.03501905 0.01566099 0.04348189
## 8 100 1 0.6150530         NA         NA        NaN

g_genes <- ggplot2::ggplot(s, ggplot2::aes(x=as.numeric(f), y=nmi)) + 
    ggplot2::geom_errorbar(ggplot2::aes(ymin=nmi-se, ymax=nmi+se), width=1) + ggplot2::geom_line() + ggplot2::geom_point() + ggplot2::theme_bw() + ggplot2::xlab("Percentage of 3,000 Genes Selected") + ggplot2::ylab("NMI")
g_genes

pdf('ssnpa_liver_percentgenes.pdf',width=7, heigh=5)
g_genes
dev.off()

## quartz_off_screen 
##                 2

Check performance with fewer cells