4 SCTransform & integration

Published

August 18, 2025

suppressPackageStartupMessages({
    library(tidyverse)
    library(Seurat)
    # BiocManager::install('glmGamPoi') 
    library(glmGamPoi)
    library(future)
    library(compareGroups)
    })


### helper functions
source("20250725-helper_functions.R")

4.1 Subset cells with nFeature_RNA > 2000

# Restore the saved list of per-sample Seurat objects into the workspace.
# The code below relies on the file providing an object named `seurat_objects`.
load("data/20200620-list_of_Seurat_objects.Rdata")

# Keep only cells with more than 2000 detected features in every object.
seurat_objects <- map(
  seurat_objects,
  \(x) subset(x, subset = nFeature_RNA > 2000)
)

# One row per cell: the list element name of the object it came from, its
# feature count, and the percentage of counts from features matching
# "^Rp[ls]".
# Depending on the Seurat version, PercentageFeatureSet() returns either a
# one-column data.frame or a named numeric vector. unlist() flattens both to
# a plain numeric vector so `ribo` is always an ordinary column that ggplot
# can map, never a nested data-frame column.
features <- imap(seurat_objects, \(x, name) {
  tibble(
    seurat_object = name,
    n_features = x$nFeature_RNA,
    ribo = unlist(
      PercentageFeatureSet(x, pattern = "^Rp[ls]"),
      use.names = FALSE
    )
  )
}) |>
  list_rbind()

# Per-sample distribution of detected features per cell after the filter.
# The x axis is labelled in thousands ("K") with a break every 1000 features;
# the geoms and theme come from density_decor() in the sourced helper script.
features |>
  ggplot(aes(x = n_features, colour = seurat_object)) +
  scale_x_continuous(
    name = " ",
    breaks = seq(0, 12000, by = 1000),
    labels = scales::label_number(scale = 1/1000, suffix = "K")
  ) +
  labs(title = "Number of features /cell after subsetting", y = "") +
  density_decor()

# Per-sample distribution of the `ribo` percentage per cell after the filter.
# The x axis is restricted to 0-30 and labelled with a "%" suffix; the geoms
# and theme come from density_decor() in the sourced helper script.
features |>
  ggplot(aes(x = ribo, colour = seurat_object)) +
  scale_x_continuous(
    name = " ",
    limits = c(0, 30),
    labels = scales::label_number(scale = 1, suffix = "%")
  ) +
  labs(title = "ribo content /cell after subsetting", y = "") +
  density_decor()

Number of cells (N) and features per cell (median [IQR]) in the subsetted dataset

# Summarise features per cell by sample. `method = 2` is what produces the
# "median [IQR]" summary named in the table heading.
# `max.ylev` caps the number of grouping levels compareGroups accepts; it is
# derived from the data (never below the previous fixed value of 6) so that
# adding a sample does not break the table.
c1 <- compareGroups(
  seurat_object ~ n_features,
  data = features,
  method = 2,
  max.ylev = max(6L, n_distinct(features$seurat_object))
)

# No overall p-value column: the table is purely descriptive.
t1 <- createTable(c1, show.p.overall = FALSE)
export2md(t1, caption = "")

	KK4_464	KK4_465	KK4_492	KK4_496	KK4_502	KK4_504
	N=11291	N=7728	N=9799	N=6665	N=9201	N=8098
n_features	3534 [3078;4080]	4094 [3562;4649]	3833 [3348;4338]	4235 [3604;4828]	3797 [3298;4318]	3752 [3234;4303]

4.2 Apply SCTransform

Apply SCTransform normalization to each subsetted Seurat object in the list.

# Run SCTransform() with its default arguments on every object in the list,
# replacing the list in place (element names are preserved by map()).
seurat_objects <- map(seurat_objects, SCTransform)

4.3 Integration

4.3.1 Select integration features

4.4 SCTransform-based integration of multiple Seurat objects

`SelectIntegrationFeatures()` identifies the top 2,000 highly variable genes across all Seurat objects to use for integration. These features should be common and informative for aligning datasets.
`PrepSCTIntegration()` then prepares the SCT assays for integration: it calculates the necessary Pearson residuals and scales the data based on the selected features. This step must be run after `SCTransform()` and before `FindIntegrationAnchors()`.

# Choose the 2000 features to integrate on.
# NOTE: this reuses the name `features`, so the per-cell tibble built earlier
# in the script is replaced by the selected integration features from here on.
features <- SelectIntegrationFeatures(
  object.list = seurat_objects,
  nfeatures = 2000
)

# Prepare every SCT-normalised object for anchor finding on those features.
seurat_objects <- PrepSCTIntegration(
  object.list = seurat_objects,
  anchor.features = features
)

4.4.1 Find anchors

`FindIntegrationAnchors()` identifies anchors — pairs of cells from different datasets that are biologically similar. These anchors are used to align the datasets during integration.

# Search for integration anchors across the prepared objects, using the
# selected integration features, SCT normalisation and the "cca" reduction.
anchors <- FindIntegrationAnchors(
  object.list = seurat_objects,
  anchor.features = features,
  normalization.method = "SCT",
  reduction = "cca"
)

4.4.2 Integrate the datasets

`IntegrateData()` performs the actual integration, combining all datasets into a single Seurat object in which batch effects are minimized. The output is stored in a new "integrated" assay.

# Combine the datasets into one object from the anchor set, with the same
# SCT normalisation setting that was used when the anchors were found.
seurat_integrated <- IntegrateData(
  normalization.method = "SCT",
  anchorset = anchors
)

# Persist the integrated object for later steps.
# NOTE(review): the file name contains ">", which Windows file systems do not
# allow and which shells treat as redirection. The path is left unchanged
# here because other scripts may load it by this exact name.
save(
  list = "seurat_integrated",
  file = "data/20250619-seurat_integrated_nFeat>2000.Rdata"
)