This document takes the QCd data, performs normalization, variable genes identification and scaling.
#Install and load required R packages. Install the packages if you do not already have them installed
library(Seurat)
library(kableExtra)
library(ggplot2)
library(plotly)
library(tidyverse)
library(rmarkdown)
# Load QCd data
load('/data/../TotalSeq/Results/qualitycontrol/SeuratObjectAfterQc.RData')
# Input path
datadirectory <- "/data/../TotalSeq/Results/qualitycontrol/"
# Output path
outputpath="/data/../TotalSeq/Results/normalization/"
# Set working directory
setwd(outputpath)
###################### Normalize Data
# Feature counts for each cell are divided by the total counts for that cell and multiplied by the scale.factor. This is then natural-log transformed
# Cell level normalization - accounts for sequencing depth
allsamples <- NormalizeData(
object = allsamples,
normalization.method = "LogNormalize",
scale.factor = 10000)
allsamples <- NormalizeData(
object = allsamples,
assay = "ADT",
normalization.method = "CLR")
# ADT names
rownames(allsamples[["ADT"]])
## [1] "CD107a-LAMP-1" "CD115-CSF-1R"
## [3] "CD119-IFN-g-R-a-chain" "CD11b"
## [5] "CD11c" "CD123"
## [7] "CD127-IL-7Ra" "CD135-Flt-3-Flk-2"
## [9] "CD137-4-1BB" "CD137L-4-1BB-Ligand"
## [11] "CD14.1" "CD141-Thrombomodulin"
## [13] "CD152-CTLA-4" "CD154"
## [15] "CD158e1-KIR3DL1-NKB1" "CD16"
## [17] "CD163.1" "CD172a-SIRPa"
## [19] "CD178-Fas-L" "CD183-CXCR3"
## [21] "CD185-CXCR5" "CD19.1"
## [23] "CD196-CCR6" "CD197-CCR7"
## [25] "CD1c" "CD20"
## [27] "CD218a-IL-18Ra" "CD223-LAG-3"
## [29] "CD226.1" "CD24.1"
## [31] "CD244-2B4" "CD25"
## [33] "CD27.1" "CD273-B7-DC-PD-L2"
## [35] "CD274-B7-H1-PD-L1" "CD275"
## [37] "CD278-ICOS" "CD279-PD-1"
## [39] "CD28.1" "CD3"
## [41] "CD303-BDCA-2" "CD307c-FcRL3"
## [43] "CD307d-FcRL4" "CD307e-FcRL5"
## [45] "CD314-NKG2D" "CD335-NKp46"
## [47] "CD336" "CD337-NKp30"
## [49] "CD34.1" "CD357-GITR"
## [51] "CD366-Tim-3" "CD38.1"
## [53] "CD39" "CD4.1"
## [55] "CD40.1" "CD44.1"
## [57] "CD45" "CD45RA"
## [59] "CD45RO" "CD47.1"
## [61] "CD48.1" "CD56-NCAM"
## [63] "CD57-Recombinant" "CD62L"
## [65] "CD64" "CD69.1"
## [67] "CD70.1" "CD8"
## [69] "CD80.1" "CD86.1"
## [71] "CD8a" "CD94"
## [73] "CD95-Fas" "HLA-A-B-C"
## [75] "HLA-A2" "HLA-DR"
## [77] "HLA-E" "IgD"
## [79] "IgM" "KLRG1-MAFA"
## [81] "Lymphotoxin-b-Receptor-LT-bR" "NKp80"
## [83] "TCR-g-d" "TIGIT-VSTM3"
## [85] "b2-microglobulin"
##################### Find variable genes
# Returns top 2000 variable genes
# VST is uses LOESS method
# FindVariableFeatures needs normalization
allsamples <- FindVariableFeatures(
object = allsamples,
selection.method = "vst")
top10 <- head(VariableFeatures(allsamples), 10)
top10
## [1] "IGLC1" "PPBP" "IGKC" "IGHA1" "IGLC2" "IGLC3" "PF4" "CCL4"
## [9] "NRGN" "JCHAIN"
# Plot variable features and label the top 10
vfp1 <- VariableFeaturePlot(allsamples)
vfp1 <- LabelPoints(plot = vfp1, points = top10, repel = TRUE)
vfp1
ggsave(paste(outputpath,"variablefeatureplot.pdf",sep=""), plot = vfp1, width = 20, height = 15, units = "cm")
##################### SCALE DATA
# PCA needs scaled data
# Zero centers and scales (mean/sd) gene/feature data in each cell (for across sample comparison), so extreme ranges in expression do not affect clustering (done for making cells with similar expression cluster together)
dim(allsamples)
## [1] 36601 30189
allsamples <- ScaleData(
object = allsamples,
# Remove unwanted sources of variation (technical, batch etc.)
vars.to.regress = c("percent.mito", "nFeature_RNA", "percent.ribo", "nCount_RNA", "batch"))
# Save Normalized, Scaled session data in the working directory
save.image(file=paste(outputpath,"SeuratObjectAfterNormalization.RData",sep=""))
# Reference
# Doublet prediction
# https://ucdavis-bioinformatics-training.github.io/2021-March-Single-Cell-RNA-Seq-Analysis/data_analysis/scRNA_Workshop-PART2_fixed
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /usr/local/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_rt.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rmarkdown_2.18 forcats_0.5.2 stringr_1.4.1
## [4] dplyr_1.0.10 purrr_0.3.5 readr_2.1.3
## [7] tidyr_1.2.1 tibble_3.1.8 tidyverse_1.3.2
## [10] kableExtra_1.3.4 KernSmooth_2.23-20 fields_14.1
## [13] viridis_0.6.2 viridisLite_0.4.1 spam_2.9-1
## [16] plotly_4.10.1 ggplot2_3.4.0 SeuratObject_4.1.3
## [19] Seurat_4.2.1 DoubletFinder_2.0.3
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 spatstat.explore_3.0-5 reticulate_1.26
## [4] tidyselect_1.2.0 htmlwidgets_1.5.4 grid_4.2.2
## [7] Rtsne_0.16 munsell_0.5.0 codetools_0.2-18
## [10] ragg_1.2.4 ica_1.0-3 future_1.29.0
## [13] miniUI_0.1.1.1 withr_2.5.0 spatstat.random_3.0-1
## [16] colorspace_2.0-3 progressr_0.11.0 highr_0.9
## [19] knitr_1.40 rstudioapi_0.14 ROCR_1.0-11
## [22] tensor_1.5 listenv_0.8.0 labeling_0.4.2
## [25] polyclip_1.10-4 bit64_4.0.5 farver_2.1.1
## [28] parallelly_1.32.1 vctrs_0.5.0 generics_0.1.3
## [31] xfun_0.34 timechange_0.1.1 R6_2.5.1
## [34] ggbeeswarm_0.6.0 hdf5r_1.3.7 spatstat.utils_3.0-1
## [37] cachem_1.0.6 assertthat_0.2.1 promises_1.2.0.1
## [40] scales_1.2.1 googlesheets4_1.0.1 beeswarm_0.4.0
## [43] gtable_0.3.1 globals_0.16.1 goftest_1.2-3
## [46] rlang_1.0.6 systemfonts_1.0.4 splines_4.2.2
## [49] lazyeval_0.2.2 gargle_1.2.1 spatstat.geom_3.0-3
## [52] broom_1.0.1 yaml_2.3.6 reshape2_1.4.4
## [55] abind_1.4-5 modelr_0.1.9 backports_1.4.1
## [58] httpuv_1.6.6 tools_4.2.2 ellipsis_0.3.2
## [61] jquerylib_0.1.4 RColorBrewer_1.1-3 ggridges_0.5.4
## [64] Rcpp_1.0.9 plyr_1.8.7 deldir_1.0-6
## [67] pbapply_1.5-0 cowplot_1.1.1 zoo_1.8-11
## [70] haven_2.5.1 ggrepel_0.9.2 cluster_2.1.4
## [73] fs_1.5.2 magrittr_2.0.3 data.table_1.14.4
## [76] scattermore_0.8 reprex_2.0.2 lmtest_0.9-40
## [79] RANN_2.6.1 googledrive_2.0.0 fitdistrplus_1.1-8
## [82] matrixStats_0.62.0 hms_1.1.2 patchwork_1.1.2
## [85] mime_0.12 evaluate_0.18 xtable_1.8-4
## [88] readxl_1.4.1 gridExtra_2.3 compiler_4.2.2
## [91] maps_3.4.1 crayon_1.5.2 htmltools_0.5.3
## [94] later_1.3.0 tzdb_0.3.0 lubridate_1.9.0
## [97] DBI_1.1.3 dbplyr_2.2.1 MASS_7.3-58.1
## [100] Matrix_1.5-3 cli_3.4.1 parallel_4.2.2
## [103] dotCall64_1.0-2 igraph_1.3.5 pkgconfig_2.0.3
## [106] sp_1.5-1 spatstat.sparse_3.0-0 xml2_1.3.3
## [109] svglite_2.1.0 vipor_0.4.5 bslib_0.4.1
## [112] webshot_0.5.4 rvest_1.0.3 digest_0.6.30
## [115] sctransform_0.3.5 RcppAnnoy_0.0.20 spatstat.data_3.0-0
## [118] cellranger_1.1.0 leiden_0.4.3 uwot_0.1.14
## [121] curl_4.3.3 shiny_1.7.3 lifecycle_1.0.3
## [124] nlme_3.1-160 jsonlite_1.8.3 fansi_1.0.3
## [127] pillar_1.8.1 lattice_0.20-45 ggrastr_1.0.1
## [130] fastmap_1.1.0 httr_1.4.4 survival_3.4-0
## [133] glue_1.6.2 remotes_2.4.2 png_0.1-7
## [136] bit_4.0.4 stringi_1.7.8 sass_0.4.2
## [139] textshaping_0.3.6 irlba_2.3.5.1 future.apply_1.10.0
file.copy(from = "/data/../TotalSeq/Tools/normalization.html",
to = "/data/../TotalSeq/Reports/site/normalization.html")
## [1] FALSE
file.remove("/data/../TotalSeq/Tools/normalization.html")
## [1] FALSE