Strategies for Large-scale Single-cell Data Subsetting and Computation

If your computer storage is enough, ignore this item.With the rapid development of single-cell data analysis in recent years, the number of single cells has also increased significantly. Analyzing the entire single-cell expression matrix can sometimes lead to memory overflow, making it difficult for many individuals to run scPagwas with the required amount of memory. To address this issue, we provide two solutions corresponding to different memory situations:

  1. Solution 1: If there is enough memory to load all the single-cell data and run until the “Pathway_pcascore_run” function without encountering memory overflow, the solution involves splitting the data after this step.

  2. Solution 2: If there is not enough memory to run the “Pathway_pcascore_run” function, the solution involves splitting the data based on the initial input of single-cell data.

    Please note that both of these approaches are specific to single-cell data processing. For cell type calculations, large memory requirements are typically not necessary. If there is still insufficient memory, an alternative approach is to randomly select a subset of single cells, ensuring that each cell type is adequately represented in the subset. Cell type calculations can then be performed based on this subset, as it represents a pseudo-bulk analysis. However, this vignette does not cover cell type calculations.

1.Solution 1

1.1 Input and compute the single-cell data.

Solution 1 involves following the scPagwas workflow and proceeding with the calculations until the second step:

library(scPagwas)
Pagwas <- list()
#Input and preprocess the single-cell data
Single_data <- readRDS(system.file("extdata", "scRNAexample.rds", package = "scPagwas"))
Pagwas <- scPagwas::Single_data_input(
      Pagwas = Pagwas,
      assay = "RNA",
      Single_data = Single_data,
      Pathway_list = Genes_by_pathway_kegg
    )
Single_data <- Single_data[, colnames(Pagwas$data_mat)]

#Run svd function
Pagwas <- scPagwas::Pathway_pcascore_run(
        Pagwas = Pagwas,
        Pathway_list = Genes_by_pathway_kegg
      )

Pagwas <- scPagwas::pa_meanexpr(Pagwas)
#save this result
save(Pagwas,file="./single.pagwas.RData")

1.2 The single-cell processing results will be divided into different groups and saved as output

In this step, the number of divisions to which the data will be split depends on the size of the computer’s memory.

library(scPagwas)
library(readr)
library(dplyr)
library(Seurat)
library(tidyverse)

seudata_path="./single.pagwas.RData"
load(seudata_path)

pca_scCell_mat<-Pagwas$pca_scCell_mat
data_mat<-Pagwas$data_mat

Pagwas$pca_scCell_mat=pca_scCell_mat[,1:350] 
Pagwas$data_mat=data_mat[,1:350]
save(Pagwas,file="1_Pagwas_singledata.RData")

Pagwas$pca_scCell_mat=pca_scCell_mat[,351:700] 
Pagwas$data_mat=data_mat[,351:700]
save(Pagwas,file="2_Pagwas_singledata.RData")

Pagwas$pca_scCell_mat=pca_scCell_mat[,701:1000] 
Pagwas$data_mat=data_mat[,701:1000]
save(Pagwas,file="3_Pagwas_singledata.RData")

1.3 Input the gwas data and save output

gwasfile= system.file("extdata", "GWAS_summ_example.txt", package = "scPagwas")
out_file= "./gwas_pagwas.RData"
Pagwas<-list()
gwas_data <- bigreadr::fread2(gwasfile)
Pagwas <- scPagwas::GWAS_summary_input(
    Pagwas = Pagwas,
    gwas_data = gwas_data
  )
Pagwas$snp_gene_df <- scPagwas::SnpToGene(gwas_data = Pagwas$gwas_data,
        block_annotation = block_annotation,
        marg = 10000)
save(Pagwas,file=out_file)

1.4 The core computational pathway for the regression process

Given that we have divided the single-cell results into two partitions, the following code illustrates the regression process using one of the partitions as an example.

gwas_path <- "gwas_pagwas.RData"
scmat_path <- "1_Pagwas_singledata.RData"
#scmat_path <- "2_Pagwas_singledata.RData"

#Import and integrate the two data results into a single list.
load(gwas_path)
Pagwas1<-Pagwas
rm(Pagwas)
load(scmat_path)
Pagwas<- c(Pagwas1,Pagwas)

rm(Pagwas1)
gc()

output.prefix='1'
#output.prefix='2'
output.dirs='Test'
if (!dir.exists(output.dirs)) {
    dir.create(output.dirs)
}
if (!dir.exists(output.dirs)) {
  dir.create(paste0("./", output.dirs,"/temp_",output.prefix))
}

Pagwas <- scPagwas::Pathway_annotation_input(
      Pagwas = Pagwas,
      block_annotation = block_annotation
    )
Pagwas <- scPagwas::Link_pathway_blocks_gwas(
      Pagwas = Pagwas,
      chrom_ld = chrom_ld,
      singlecell = T,
      celltype = F,
      backingpath=paste0("./", output.dirs,"/temp_",output.prefix),
      n.cores=1
      )
pmat<-Pagwas$Pathway_sclm_results

#save result
save(pmat,file=paste0("./", output.dirs, "/",output.prefix,"_Pathway_sclm_results.RData"))
#save(pmat,file=paste0("./", output.dirs, "/",output.prefix,"_Pathway_sclm_results.RData"))

Here, it is necessary to run all the split single-cell data separately, and obtain the “_Pathway_sclm_results.RData” result for each one.

1.5 Aggregate all the single-cell regression results after splitting.

load(paste0("./", output.dirs, "/1_Pathway_sclm_results.RData"))
n=2
pmat_merge<-pmat
for (i in 2:n) {
	print(i)
	load(paste0("./", output.dirs, "/",i,"_Pathway_sclm_results.RData"))
	pmat_merge<-rbind(pmat_merge,pmat)
}
save(pmat_merge,file="pmat_merge.RData")

1.6 Calculating the scPagwas.gPAS.score for eahc cell and PCC for genes

After calculating the scPagwas.gPAS.score, genetic association gene analysis is performed based on this score.

In this process, 200 cells are randomly sampled from the large-scale single-cell data for computation. This process is repeated five times, and the results are integrated to obtain the final PCC for genes.

The number of cells randomly selected and the number of random iterations depend on the specific quantity of single cells available.

#Note: Here, we need to import the "single.pagwas.RData" data again.
load("single.pagwas.RData")
load("pmat_merge.RData")

#compute the gPas score.
scPagwas.gPAS.score <- scPagwas::Merge_gPas(Pagwas,pmat_merge)

#compute the heritability correlation(PCC) for each gene. 
PCC<-scPagwas::Corr_Random(Pagwas$data_mat,
                          scPagwas.gPAS.score,
                          seed=1234,
                          random=T,
                          Nrandom=5,# you need change this parameter
                          Nselect=200 # you need change this parameter based on your cell numbers.
    )

mean_gpas<-mean(scPagwas.gPAS.score)
a1<-which(scPagwas.gPAS.score >= mean_gpas)
a2<-which(scPagwas.gPAS.score < mean_gpas)

PCC_up <- scPagwas::Corr_Random(scPagwas.gPAS.score=scPagwas.gPAS.score[a1],data_mat=Pagwas$data_mat[,a1])
PCC_down <- scPagwas::Corr_Random(scPagwas.gPAS.score=scPagwas.gPAS.score[a2],data_mat=Pagwas$data_mat[,a2])

scPagwas_topgenes <- names(PCC[order(PCC, decreasing = T)])[1:500]
scPagwas_upgenes <- names(PCC_up)[order(PCC_up, decreasing = T)[1:500]]
scPagwas_downgenes <- names(PCC_down)[order(PCC_down, decreasing = F)[1:500]]

1.7 Compute the TRS and background correction p-values.

In this step, we need to utilize the initially imported single-cell data once again.

Single_data <- Seurat::AddModuleScore(Single_data, assay = 'RNA', list(scPagwas_topgenes,scPagwas_upgenes,scPagwas_downgenes), name = c("scPagwas.TRS.Score","scPagwas.upTRS.Score","scPagwas.downTRS.Score"))
correct_pdf <- scPagwas::Get_CorrectBg_p(Single_data=Single_data,
                                     scPagwas.TRS.Score=Single_data$scPagwas.TRS.Score1,
                                     iters_singlecell=100,
                                     n_topgenes=500,
                                     scPagwas_topgenes=scPagwas_topgenes)

Pagwas$Random_Correct_BG_pdf <- correct_pdf
Pagwas$Merged_celltype_pvalue<-scPagwas::Merge_celltype_p(single_p=correct_pdf$pooled_p,
                                                          celltype=Pagwas$Celltype_anno$annotation)

All the results can be found in the Pagwas result list.

2.Solution 2

2.1. Splice scRNA-seq data

I do not recommend using method 2 unless your server memory cannot handle the single-cell data preprocessing. The results of method 2 may differ from the conventional results.

In the second approach, the single-cell data was initially divided into several partitions, and then each partition was individually processed using scPagwas. When you split the scRNA-seq data in random, you should set the min_clustercells=1.

library(Seurat)
Single_data <-readRDS(system.file("extdata", "scRNAexample.rds", package = "scPagwas"))
#set the number of split time, it depend on the size of your single cell data.
#There have two form of spliting the data:

#Create the random index number.
n_split=2
Split_index <- rep(1:n_split, time = ceiling(ncol(Single_data)/n_split), length = ncol(Single_data))

for (i in 1:n_split) {
  Example_splice <- Single_data[,Split_index==i]
  saveRDS(Example_splice,file = paste0("Example_splice",i,".rds"))
}

2.2. Run gwas data input

Second, we run the GWAS_summary_input and Snp2Gene firstly, because the following function share the same gwas data. Sometimes, we need not run these functions for small gwas data, because a little time for these functions.

Pagwas<-list()
gwas_data <- bigreadr::fread2(system.file("extdata", "GWAS_summ_example.txt", package = "scPagwas"))
Pagwas <- GWAS_summary_input(
    Pagwas = Pagwas,
    gwas_data = gwas_data
  )
Pagwas$snp_gene_df <- SnpToGene(gwas_data = Pagwas$gwas_data, 
                                block_annotation = block_annotation, marg = 10000)
names(Pagwas)

2.3. Run scPagwas for sub-data

for (i in 1:n_split) {
 scPagwas_main(Pagwas =Pagwas,
                     gwas_data =NULL,
                     Single_data = paste0("Example_splice",i,".rds"), #the 
                     output.prefix=i, #the prefix can be the circulation coefficient i.
                     output.dirs="splice_scPagwas", #the output.dirs shoukd be the same for each circulation
                     Pathway_list=Genes_by_pathway_kegg,
                     run_split=TRUE, #You must set the key parameter.This parameter is set to run single result for each split result.
                     min_clustercells=10, #the minimum cluster cell number should be 1.
                     assay="RNA",
                     block_annotation = block_annotation,
                     chrom_ld = chrom_ld)
  gc()
}

2.4. Integrate the gPAS.score result for splice data.

Read all the “_singlecell_scPagwas.gPAS.score.Result.csv” result files and integrate them together.

output.dirs="splice_scPagwas"
oriDir <- paste0("./",output.dirs)
files <- list.files(oriDir, pattern="*_singlecell_scPagwas.gPAS.score.Result.csv")
#integrate the scPagwas.gPAS.score
scPagwas.gPAS.score<-unlist(lapply( files,function(file){
    gs<-read.csv(file=paste0(oriDir,"/",file))
    ga <- gs$scPagwas.gPAS.score
    names(ga) <- gs$cellnames
    return(ga)
  }))
Single_data <- Single_data[,names(scPagwas.gPAS.score)]
Single_data$scPagwas.gPAS.score<-scPagwas.gPAS.score

2.5 Compute the TRS and background correction p-values

This step is identical to the later steps in Solution 1.

data_mat <- GetAssayData(Single_data, slot = "data", assay = "RNA")
#compute the heritability correlation(PCC) for each gene. 
PCC<-scPagwas::Corr_Random(data_mat=data_mat,
                          scPagwas.gPAS.score,
                          seed=1234,
                          random=T,
                          Nrandom=5,# you need change this parameter
                          Nselect=200 # you need change this parameter based on your cell numbers.
    )

mean_gpas<-mean(scPagwas.gPAS.score)
a1<-which(scPagwas.gPAS.score >= mean_gpas)
a2<-which(scPagwas.gPAS.score < mean_gpas)

PCC_up <- scPagwas::Corr_Random(scPagwas.gPAS.score=scPagwas.gPAS.score[a1],data_mat=data_mat[,a1])
PCC_down <- scPagwas::Corr_Random(scPagwas.gPAS.score=scPagwas.gPAS.score[a2],data_mat=data_mat[,a2])

scPagwas_topgenes <- names(PCC[order(PCC, decreasing = T)])[1:500]
scPagwas_upgenes <- names(PCC_up)[order(PCC_up, decreasing = T)[1:500]]
scPagwas_downgenes <- names(PCC_down)[order(PCC_down, decreasing = F)[1:500]]
Single_data <- Seurat::AddModuleScore(Single_data, assay = 'RNA', list(scPagwas_topgenes,scPagwas_upgenes,scPagwas_downgenes), name = c("scPagwas.TRS.Score","scPagwas.upTRS.Score","scPagwas.downTRS.Score"))
correct_pdf <- scPagwas::Get_CorrectBg_p(Single_data=Single_data,
                                     scPagwas.TRS.Score=Single_data$scPagwas.TRS.Score1,
                                     iters_singlecell=100,
                                     n_topgenes=500,
                                     scPagwas_topgenes=scPagwas_topgenes)
Pagwas$scPagwas.TRS.Score = Single_data$scPagwas.TRS.Score1
Pagwas$Random_Correct_BG_pdf <- correct_pdf
Pagwas$Merged_celltype_pvalue<-scPagwas::Merge_celltype_p(single_p=correct_pdf$pooled_p,
                                                         celltype=Idents(Single_data))