4  SCTransform & integration

Published

August 18, 2025

suppressPackageStartupMessages({
    library(tidyverse)
    library(Seurat)
    # BiocManager::install('glmGamPoi') 
    library(glmGamPoi)
    library(future)
    library(compareGroups)
    })


### helper functions
source("20250725-helper_functions.R")

4.1 Subset cells with nFeature_RNA > 2000

# Load the per-sample list of Seurat objects. The .Rdata file is presumably
# what restores `seurat_objects` into the workspace (it is not defined
# anywhere else in this file) — verify if the file is regenerated.
load("data/20200620-list_of_Seurat_objects.Rdata")

# Keep only cells with more than 2000 detected features.
seurat_objects <- map(
  seurat_objects,
  \(x) subset(x, subset = nFeature_RNA > 2000)
)

# One row per retained cell: sample name, number of detected features, and
# the percentage of counts from genes whose names match "^Rp[ls]" (`ribo`).
# PercentageFeatureSet() without `col.name` returns a one-column data frame
# in some Seurat versions and a named vector in others; unlist() flattens
# either form to a plain numeric vector, so `ribo` is an ordinary numeric
# column rather than a packed data-frame column that ggplot cannot map.
features <- imap(
  seurat_objects,
  \(x, name) tibble(
    seurat_object = name,
    n_features = x$nFeature_RNA,
    ribo = unlist(
      PercentageFeatureSet(x, pattern = "^Rp[ls]"),
      use.names = FALSE
    )
  )
) |>
  list_rbind()

# Per-sample distribution of detected features per cell after filtering.
# X-axis ticks every 1000 features, labelled in thousands ("K").
# density_decor() is not defined in this file — presumably it comes from the
# sourced helper script and supplies the density layer and styling.
ggplot(features, aes(x = n_features, colour = seurat_object)) +
  scale_x_continuous(
    name = " ",
    breaks = seq(from = 0, to = 12000, by = 1000),
    labels = scales::label_number(scale = 1 / 1000, suffix = "K")
  ) +
  labs(title = "Number of features /cell after subsetting", y = "") +
  density_decor()

# Per-sample distribution of the `ribo` percentage per cell after filtering.
# The x-axis is restricted to 0-30 and labelled with a "%" suffix.
# density_decor() is not defined in this file — presumably it comes from the
# sourced helper script and supplies the density layer and styling.
ggplot(features, aes(x = ribo, colour = seurat_object)) +
  scale_x_continuous(
    name = " ",
    limits = c(0, 30),
    labels = scales::label_number(scale = 1, suffix = "%")
  ) +
  labs(title = "ribo content /cell after subsetting", y = "") +
  density_decor()


Number of cells (N) and features per cell (median [IQR]) in the subsetted dataset

# Summarise features per cell by sample.
# method = 2 requests the non-normal summary, reported as median [IQR] to
# match the table caption; max.ylev = 6 raises the allowed number of
# grouping levels to 6 (presumably one per sample — confirm if samples are
# added).
c1 <- compareGroups(
  seurat_object ~ n_features,
  features,
  method = 2,
  max.ylev = 6
)

# Build the table without the overall p-value column. Spell out FALSE:
# `F` is an ordinary variable that can be reassigned.
t1 <- createTable(c1, show.p.overall = FALSE)

# Render the table as markdown with an empty caption.
export2md(t1, caption = "")
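|            | KK4_464          | KK4_465          | KK4_492          | KK4_496          | KK4_502          | KK4_504          |
|------------|------------------|------------------|------------------|------------------|------------------|------------------|
|            | N=11291          | N=7728           | N=9799           | N=6665           | N=9201           | N=8098           |
| n_features | 3534 [3078;4080] | 4094 [3562;4649] | 3833 [3348;4338] | 4235 [3604;4828] | 3797 [3298;4318] | 3752 [3234;4303] |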

4.2 Apply SCTransform

Apply SCTransform normalization to each subsetted Seurat object in the list.

# Run SCTransform() with its default arguments on every object in the list,
# replacing each list element with the function's result.
# NOTE(review): glmGamPoi is attached at the top of the file, but no `method`
# argument is passed here; whether it is actually used depends on the
# installed Seurat version's defaults — confirm.
seurat_objects <- map(seurat_objects, \(x) SCTransform(x))

4.3 Integration

4.3.1 Select integration features

4.4 SCTransform-based integration of multiple Seurat objects

  1. Selects the 2,000 features (genes) to use for integration, ranking genes by the number of Seurat objects in which they are highly variable. These features should be shared across datasets and informative for aligning them.

  2. Prepares the SCT assays for integration: ensures that the Pearson residuals for the selected features are present in every object (computing any that are missing), so that all objects share the same set of anchor features. This must be done after SCTransform and before FindIntegrationAnchors.

# Select the features used for integration (nfeatures = 2000).
# NOTE: this reassigns `features`, which earlier in this file holds the
# per-cell QC tibble used for the density plots and the summary table; from
# this line on, that tibble is no longer available under this name.
features <- SelectIntegrationFeatures(object.list = seurat_objects, nfeatures = 2000)

# Prepare every object in the list for integration using the selected
# features; the prepared list replaces `seurat_objects`.
seurat_objects <- PrepSCTIntegration(seurat_objects, anchor.features = features)

4.4.1 Find anchors

  1. Identifies anchors—pairs of cells from different datasets that are biologically similar. These are used to align the datasets during integration.
# Find integration anchors across the objects in `seurat_objects`, using
# SCT normalization, the "cca" reduction, and the integration features held
# in `features`. The result is kept in `anchors` for the integration step.
anchors <- FindIntegrationAnchors(
  object.list = seurat_objects,
  normalization.method = "SCT",
  reduction = "cca",
  anchor.features = features
)

4.4.2 Integrate the datasets

  1. Performs the actual integration, combining all datasets into a single Seurat object where batch effects are minimized. Output is stored in a new "integrated" assay.
# Integrate the datasets from the anchor set, using SCT normalization; the
# combined result is stored in `seurat_integrated`.
seurat_integrated <- IntegrateData(
  anchorset = anchors,
  normalization.method = "SCT"
)

# Persist the integrated object to disk for later use.
# NOTE(review): the file name contains ">", which is not a valid file-name
# character on Windows — confirm this path is only used on Unix-like
# systems, and update any code that loads it if the name is ever changed.
save(seurat_integrated,file ="data/20250619-seurat_integrated_nFeat>2000.Rdata")