Normalization, scaling, batch correction

1. Introduction

This document takes the QCd data, performs normalization, variable genes identification and scaling.

#Install and load required R packages. Install the packages if you do not already have them installed
library(Seurat)
library(kableExtra)
library(ggplot2)
library(plotly)
library(tidyverse)
library(rmarkdown)

2. Load QC’d data

# Load QCd data
load('/data/../TotalSeq/Results/qualitycontrol/SeuratObjectAfterQc.RData')
# Input path
datadirectory <- "/data/../TotalSeq/Results/qualitycontrol/"
# Output path
outputpath="/data/../TotalSeq/Results/normalization/"
# Set working directory
setwd(outputpath)

3. Normalization

###################### Normalize Data
# Feature counts for each cell are divided by the total counts for that cell and multiplied by the scale.factor. This is then natural-log transformed
# Cell level normalization - accounts for sequencing depth
allsamples <- NormalizeData(
  object = allsamples,
  normalization.method = "LogNormalize",
  scale.factor = 10000)

allsamples <- NormalizeData(
  object = allsamples,
  assay = "ADT",
  normalization.method = "CLR")

# ADT names
rownames(allsamples[["ADT"]])

##  [1] "CD107a-LAMP-1"                "CD115-CSF-1R"
##  [3] "CD119-IFN-g-R-a-chain"        "CD11b"
##  [5] "CD11c"                        "CD123"
##  [7] "CD127-IL-7Ra"                 "CD135-Flt-3-Flk-2"
##  [9] "CD137-4-1BB"                  "CD137L-4-1BB-Ligand"
## [11] "CD14.1"                       "CD141-Thrombomodulin"
## [13] "CD152-CTLA-4"                 "CD154"
## [15] "CD158e1-KIR3DL1-NKB1"         "CD16"
## [17] "CD163.1"                      "CD172a-SIRPa"
## [19] "CD178-Fas-L"                  "CD183-CXCR3"
## [21] "CD185-CXCR5"                  "CD19.1"
## [23] "CD196-CCR6"                   "CD197-CCR7"
## [25] "CD1c"                         "CD20"
## [27] "CD218a-IL-18Ra"               "CD223-LAG-3"
## [29] "CD226.1"                      "CD24.1"
## [31] "CD244-2B4"                    "CD25"
## [33] "CD27.1"                       "CD273-B7-DC-PD-L2"
## [35] "CD274-B7-H1-PD-L1"            "CD275"
## [37] "CD278-ICOS"                   "CD279-PD-1"
## [39] "CD28.1"                       "CD3"
## [41] "CD303-BDCA-2"                 "CD307c-FcRL3"
## [43] "CD307d-FcRL4"                 "CD307e-FcRL5"
## [45] "CD314-NKG2D"                  "CD335-NKp46"
## [47] "CD336"                        "CD337-NKp30"
## [49] "CD34.1"                       "CD357-GITR"
## [51] "CD366-Tim-3"                  "CD38.1"
## [53] "CD39"                         "CD4.1"
## [55] "CD40.1"                       "CD44.1"
## [57] "CD45"                         "CD45RA"
## [59] "CD45RO"                       "CD47.1"
## [61] "CD48.1"                       "CD56-NCAM"
## [63] "CD57-Recombinant"             "CD62L"
## [65] "CD64"                         "CD69.1"
## [67] "CD70.1"                       "CD8"
## [69] "CD80.1"                       "CD86.1"
## [71] "CD8a"                         "CD94"
## [73] "CD95-Fas"                     "HLA-A-B-C"
## [75] "HLA-A2"                       "HLA-DR"
## [77] "HLA-E"                        "IgD"
## [79] "IgM"                          "KLRG1-MAFA"
## [81] "Lymphotoxin-b-Receptor-LT-bR" "NKp80"
## [83] "TCR-g-d"                      "TIGIT-VSTM3"
## [85] "b2-microglobulin"

##################### Find variable genes
# Returns top 2000 variable genes
# VST is uses LOESS method
# FindVariableFeatures needs normalization
allsamples <- FindVariableFeatures(
  object = allsamples,
  selection.method = "vst")
top10 <- head(VariableFeatures(allsamples), 10)
top10

##  [1] "IGLC1"  "PPBP"   "IGKC"   "IGHA1"  "IGLC2"  "IGLC3"  "PF4"    "CCL4"
##  [9] "NRGN"   "JCHAIN"

# Plot variable features and label the top 10
vfp1 <- VariableFeaturePlot(allsamples)
vfp1 <- LabelPoints(plot = vfp1, points = top10, repel = TRUE)
vfp1

ggsave(paste(outputpath,"variablefeatureplot.pdf",sep=""), plot = vfp1, width = 20, height = 15, units = "cm")

4. Scaling & Batch Correction

##################### SCALE DATA
# PCA needs scaled data
# Zero centers and scales (mean/sd) gene/feature data in each cell (for across sample comparison), so extreme ranges in expression do not affect clustering (done for making cells with similar expression cluster together)
dim(allsamples)

## [1] 36601 30189

allsamples <- ScaleData(
  object = allsamples,
  # Remove unwanted sources of variation (technical, batch etc.)
  vars.to.regress = c("percent.mito", "nFeature_RNA", "percent.ribo", "nCount_RNA", "batch"))

# Save Normalized, Scaled session data in the working directory
save.image(file=paste(outputpath,"SeuratObjectAfterNormalization.RData",sep=""))

# Reference
# Doublet prediction
# https://ucdavis-bioinformatics-training.github.io/2021-March-Single-Cell-RNA-Seq-Analysis/data_analysis/scRNA_Workshop-PART2_fixed
sessionInfo()

## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /usr/local/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_rt.so
##
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base
##
## other attached packages:
##  [1] rmarkdown_2.18      forcats_0.5.2       stringr_1.4.1
##  [4] dplyr_1.0.10        purrr_0.3.5         readr_2.1.3
##  [7] tidyr_1.2.1         tibble_3.1.8        tidyverse_1.3.2
## [10] kableExtra_1.3.4    KernSmooth_2.23-20  fields_14.1
## [13] viridis_0.6.2       viridisLite_0.4.1   spam_2.9-1
## [16] plotly_4.10.1       ggplot2_3.4.0       SeuratObject_4.1.3
## [19] Seurat_4.2.1        DoubletFinder_2.0.3
##
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2             spatstat.explore_3.0-5 reticulate_1.26
##   [4] tidyselect_1.2.0       htmlwidgets_1.5.4      grid_4.2.2
##   [7] Rtsne_0.16             munsell_0.5.0          codetools_0.2-18
##  [10] ragg_1.2.4             ica_1.0-3              future_1.29.0
##  [13] miniUI_0.1.1.1         withr_2.5.0            spatstat.random_3.0-1
##  [16] colorspace_2.0-3       progressr_0.11.0       highr_0.9
##  [19] knitr_1.40             rstudioapi_0.14        ROCR_1.0-11
##  [22] tensor_1.5             listenv_0.8.0          labeling_0.4.2
##  [25] polyclip_1.10-4        bit64_4.0.5            farver_2.1.1
##  [28] parallelly_1.32.1      vctrs_0.5.0            generics_0.1.3
##  [31] xfun_0.34              timechange_0.1.1       R6_2.5.1
##  [34] ggbeeswarm_0.6.0       hdf5r_1.3.7            spatstat.utils_3.0-1
##  [37] cachem_1.0.6           assertthat_0.2.1       promises_1.2.0.1
##  [40] scales_1.2.1           googlesheets4_1.0.1    beeswarm_0.4.0
##  [43] gtable_0.3.1           globals_0.16.1         goftest_1.2-3
##  [46] rlang_1.0.6            systemfonts_1.0.4      splines_4.2.2
##  [49] lazyeval_0.2.2         gargle_1.2.1           spatstat.geom_3.0-3
##  [52] broom_1.0.1            yaml_2.3.6             reshape2_1.4.4
##  [55] abind_1.4-5            modelr_0.1.9           backports_1.4.1
##  [58] httpuv_1.6.6           tools_4.2.2            ellipsis_0.3.2
##  [61] jquerylib_0.1.4        RColorBrewer_1.1-3     ggridges_0.5.4
##  [64] Rcpp_1.0.9             plyr_1.8.7             deldir_1.0-6
##  [67] pbapply_1.5-0          cowplot_1.1.1          zoo_1.8-11
##  [70] haven_2.5.1            ggrepel_0.9.2          cluster_2.1.4
##  [73] fs_1.5.2               magrittr_2.0.3         data.table_1.14.4
##  [76] scattermore_0.8        reprex_2.0.2           lmtest_0.9-40
##  [79] RANN_2.6.1             googledrive_2.0.0      fitdistrplus_1.1-8
##  [82] matrixStats_0.62.0     hms_1.1.2              patchwork_1.1.2
##  [85] mime_0.12              evaluate_0.18          xtable_1.8-4
##  [88] readxl_1.4.1           gridExtra_2.3          compiler_4.2.2
##  [91] maps_3.4.1             crayon_1.5.2           htmltools_0.5.3
##  [94] later_1.3.0            tzdb_0.3.0             lubridate_1.9.0
##  [97] DBI_1.1.3              dbplyr_2.2.1           MASS_7.3-58.1
## [100] Matrix_1.5-3           cli_3.4.1              parallel_4.2.2
## [103] dotCall64_1.0-2        igraph_1.3.5           pkgconfig_2.0.3
## [106] sp_1.5-1               spatstat.sparse_3.0-0  xml2_1.3.3
## [109] svglite_2.1.0          vipor_0.4.5            bslib_0.4.1
## [112] webshot_0.5.4          rvest_1.0.3            digest_0.6.30
## [115] sctransform_0.3.5      RcppAnnoy_0.0.20       spatstat.data_3.0-0
## [118] cellranger_1.1.0       leiden_0.4.3           uwot_0.1.14
## [121] curl_4.3.3             shiny_1.7.3            lifecycle_1.0.3
## [124] nlme_3.1-160           jsonlite_1.8.3         fansi_1.0.3
## [127] pillar_1.8.1           lattice_0.20-45        ggrastr_1.0.1
## [130] fastmap_1.1.0          httr_1.4.4             survival_3.4-0
## [133] glue_1.6.2             remotes_2.4.2          png_0.1-7
## [136] bit_4.0.4              stringi_1.7.8          sass_0.4.2
## [139] textshaping_0.3.6      irlba_2.3.5.1          future.apply_1.10.0

5. Generate R script

file.copy(from = "/data/../TotalSeq/Tools/normalization.html",
          to   = "/data/../TotalSeq/Reports/site/normalization.html")

## [1] FALSE

file.remove("/data/../TotalSeq/Tools/normalization.html")

## [1] FALSE