This document takes the QCd, normalized, scaled, batch corrected data and performs clustering.
#Install and load required R packages. Install the packages if you do not already have them installed
library(Seurat)
library(kableExtra)
library(ggplot2)
library(plotly)
library(tidyverse)
library(rmarkdown)
# Load QCd, normalized data
load('/data/../TotalSeq/Results/normalization/SeuratObjectAfterNormalization.RData')
# Input path
datadirectory <- "/data/../TotalSeq/Results/normalization/"
# Output path
outputpath="/data/../TotalSeq/Results/clustering/"
# Set working directory
setwd(outputpath)
##################### PCA
# Run PCA with 50 pcs
allsamples <- RunPCA(object = allsamples, npcs=50)
# Print genes in the top 5 pcs
print(allsamples[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: MALAT1, CD247, BACH2, RPS27, CST7
## Negative: S100A9, LYZ, CST3, S100A8, KYNU
## PC_ 2
## Positive: CD79A, MS4A1, BANK1, PLEKHG1, IGHD
## Negative: ANXA1, S100A4, VIM, PITPNC1, CD247
## PC_ 3
## Positive: GNLY, NKG7, GZMB, KLRD1, FGFBP2
## Negative: PDE3B, PLCL1, LEF1, LDLRAD4, TSHZ2
## PC_ 4
## Positive: GP1BB, PF4, PPBP, GNG11, CAVIN2
## Negative: LMNA, EREG, IL1B, CXCL8, S100A10
## PC_ 5
## Positive: TXK, LEF1, MAML2, PLCL1, FHIT
## Negative: LMNA, S100A10, NIBAN1, S100A4, S100A11
# Identify number of PCs that explains majority of variations
ElbowPlot(allsamples, ndims=50, reduction = "pca")
# Visualize the genes in the first PC
VizDimLoadings(allsamples, dims = 1, ncol = 1) + theme_minimal(base_size = 8)
# Visualize the genes in the second PC
VizDimLoadings(allsamples, dims = 2, ncol = 1) + theme_minimal(base_size = 8)
# Set the identity column to WT vs TG
Idents(allsamples) <- "orig.ident"
# Visualize the cells after pca
DimPlot(object = allsamples, reduction = "pca")
# Plot heatmap with cells=500 plotting cells with extreme cells on both ends of spectrum
#DimHeatmap(object = allsamples, dims = 1:6, cells = 500, balanced = TRUE)
DimHeatmap(object = allsamples, dims = 1:6, cells = 500, balanced = TRUE)
##################### Clustering
# use the first 20 pc's
use.pcs = 1:20
# FindNeighbors with 20 pcs
allsamples <- FindNeighbors(allsamples, reduction="pca", dims = use.pcs)
# FindClusters with resolution starting at 0.25 and ending at 4, with 0.5 increment
allsamples <- FindClusters(
object = allsamples,
resolution = seq(0.25,0.75,0.5),
verbose = FALSE
)
head(allsamples)
## orig.ident nCount_RNA nFeature_RNA nCount_ADT
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS 2676 1078 1191
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS 2329 1271 1135
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS 7926 2660 2034
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS 2339 1291 1058
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS 3537 1608 936
## AAACCCATCAGCTGAT-NS3R189BTS NS3R189BTS 1758 948 1375
## AAACCCATCATTCACT-NS3R189BTS NS3R189BTS 1478 930 186
## AAACCCATCCGGACGT-NS3R189BTS NS3R189BTS 2045 1003 883
## AAACCCATCCGTGGTG-NS3R189BTS NS3R189BTS 5150 2003 2014
## AAACCCATCCTCTCTT-NS3R189BTS NS3R189BTS 2287 1306 645
## nFeature_ADT percent.mito percent.ribo percent.hb
## AAACCCACACTGGAAG-NS3R189BTS 81 0.8221226 42.52616 0.00000000
## AAACCCACATGACGTT-NS3R189BTS 84 4.3795620 18.97810 0.04293688
## AAACCCAGTCCAAATC-NS3R189BTS 81 4.6177139 30.05299 0.02523341
## AAACCCAGTGAGTTTC-NS3R189BTS 79 5.3014109 21.67593 0.00000000
## AAACCCAGTTCTTAGG-NS3R189BTS 78 2.2900763 23.35312 0.00000000
## AAACCCATCAGCTGAT-NS3R189BTS 84 5.1194539 26.67804 0.00000000
## AAACCCATCATTCACT-NS3R189BTS 48 2.6387009 24.56022 0.00000000
## AAACCCATCCGGACGT-NS3R189BTS 78 3.4229829 30.66015 0.00000000
## AAACCCATCCGTGGTG-NS3R189BTS 82 0.6019417 35.28155 0.00000000
## AAACCCATCCTCTCTT-NS3R189BTS 83 2.9296021 19.45780 0.00000000
## batch type RNA_snn_res.0.25 RNA_snn_res.0.75
## AAACCCACACTGGAAG-NS3R189BTS 1 HC 2 2
## AAACCCACATGACGTT-NS3R189BTS 1 HC 1 9
## AAACCCAGTCCAAATC-NS3R189BTS 1 HC 0 15
## AAACCCAGTGAGTTTC-NS3R189BTS 1 HC 1 3
## AAACCCAGTTCTTAGG-NS3R189BTS 1 HC 0 4
## AAACCCATCAGCTGAT-NS3R189BTS 1 HC 1 3
## AAACCCATCATTCACT-NS3R189BTS 1 HC 0 0
## AAACCCATCCGGACGT-NS3R189BTS 1 HC 2 2
## AAACCCATCCGTGGTG-NS3R189BTS 1 HC 0 4
## AAACCCATCCTCTCTT-NS3R189BTS 1 HC 0 0
## seurat_clusters
## AAACCCACACTGGAAG-NS3R189BTS 2
## AAACCCACATGACGTT-NS3R189BTS 9
## AAACCCAGTCCAAATC-NS3R189BTS 15
## AAACCCAGTGAGTTTC-NS3R189BTS 3
## AAACCCAGTTCTTAGG-NS3R189BTS 4
## AAACCCATCAGCTGAT-NS3R189BTS 3
## AAACCCATCATTCACT-NS3R189BTS 0
## AAACCCATCCGGACGT-NS3R189BTS 2
## AAACCCATCCGTGGTG-NS3R189BTS 4
## AAACCCATCCTCTCTT-NS3R189BTS 0
# Count number of clusters at each resolution
sapply(grep("res",colnames(allsamples@meta.data),value = TRUE),
function(x) length(unique(allsamples@meta.data[,x])))
## RNA_snn_res.0.25 RNA_snn_res.0.75
## 13 22
####################### TSNE
# Run tsne with 20 pcs
allsamples <- RunTSNE(
object = allsamples,
reduction.use = "pca",
dims = use.pcs,
do.fast = TRUE)
# Plot all clusters in all resolutions
DimPlot(object = allsamples, group.by=grep("res",colnames(allsamples@meta.data),value = TRUE)[1:4], ncol=2 , pt.size=3.0, reduction = "tsne", label = T)
# change default identity
Idents(allsamples) <- "RNA_snn_res.0.25"
# list cell number in each cluster for WT vs TG
table(Idents(allsamples),allsamples$type)
##
## BCR-UV HC
## 0 3460 7882
## 1 15 4068
## 2 356 3386
## 3 1300 760
## 4 1970 69
## 5 891 982
## 6 535 1065
## 7 755 695
## 8 628 197
## 9 404 250
## 10 191 155
## 11 44 104
## 12 10 17
# Visualize cluster at 0.75 resolution
DimPlot(object = allsamples, pt.size=0.5, reduction = "tsne", label = T)
# Color by HC vs BCR-UV
DimPlot(object = allsamples, pt.size=0.5, reduction = "tsne", group.by = "type" )
########################## UMAP
# Run umap with 20 pcs
allsamples <- RunUMAP(
object = allsamples,
reduction.use = "pca",
dims = use.pcs)
# Plot umap
DimPlot(object = allsamples, pt.size=0.5, reduction = "umap", label = T)
# Color by HC vs BCR-UV
DimPlot(object = allsamples, pt.size=0.5, reduction = "umap", group.by = "type")
# Plot umap
dimplot1<-DimPlot(object = allsamples, pt.size=0.5, reduction = "umap", label = T)
# Color by HC vs BCR-UV
dimplot2<-DimPlot(object = allsamples, pt.size=0.5, reduction = "umap", group.by = "type")
# Save umaps
ggsave(paste(outputpath,"umap-clusters.pdf",sep=""), plot = dimplot1, width = 20, height = 15, units = "cm")
ggsave(paste(outputpath,"umap-clusters-type.pdf",sep=""), plot = dimplot2, width = 20, height = 15, units = "cm")
# Visualizations
# expression of variable genes across clusters
# Custom list of genes to visualize
#top10
mygenes = c("NKG7","CCL5")
RidgePlot(allsamples, features = mygenes)
#VlnPlot(allsamples, features = top10)
VlnPlot(allsamples, features = mygenes, pt.size=0)
#VlnPlot(WT_TG_Filt_Scaled, features = mygenes, split.by = "orig.ident")
FeaturePlot(allsamples, features = mygenes)
#FeaturePlot(WT_TG_Filt_Scaled, features = mygenes, split.by = "orig.ident")
DotPlot(allsamples, features = mygenes)
DoHeatmap(subset(allsamples), features = mygenes, size = 3)
# Visualize ADT with GEX data
# Specify key for rna features and ADT features
# Add rna_ or adt_ to any gene to visualize
rownames(allsamples[["ADT"]])
## [1] "CD107a-LAMP-1" "CD115-CSF-1R"
## [3] "CD119-IFN-g-R-a-chain" "CD11b"
## [5] "CD11c" "CD123"
## [7] "CD127-IL-7Ra" "CD135-Flt-3-Flk-2"
## [9] "CD137-4-1BB" "CD137L-4-1BB-Ligand"
## [11] "CD14.1" "CD141-Thrombomodulin"
## [13] "CD152-CTLA-4" "CD154"
## [15] "CD158e1-KIR3DL1-NKB1" "CD16"
## [17] "CD163.1" "CD172a-SIRPa"
## [19] "CD178-Fas-L" "CD183-CXCR3"
## [21] "CD185-CXCR5" "CD19.1"
## [23] "CD196-CCR6" "CD197-CCR7"
## [25] "CD1c" "CD20"
## [27] "CD218a-IL-18Ra" "CD223-LAG-3"
## [29] "CD226.1" "CD24.1"
## [31] "CD244-2B4" "CD25"
## [33] "CD27.1" "CD273-B7-DC-PD-L2"
## [35] "CD274-B7-H1-PD-L1" "CD275"
## [37] "CD278-ICOS" "CD279-PD-1"
## [39] "CD28.1" "CD3"
## [41] "CD303-BDCA-2" "CD307c-FcRL3"
## [43] "CD307d-FcRL4" "CD307e-FcRL5"
## [45] "CD314-NKG2D" "CD335-NKp46"
## [47] "CD336" "CD337-NKp30"
## [49] "CD34.1" "CD357-GITR"
## [51] "CD366-Tim-3" "CD38.1"
## [53] "CD39" "CD4.1"
## [55] "CD40.1" "CD44.1"
## [57] "CD45" "CD45RA"
## [59] "CD45RO" "CD47.1"
## [61] "CD48.1" "CD56-NCAM"
## [63] "CD57-Recombinant" "CD62L"
## [65] "CD64" "CD69.1"
## [67] "CD70.1" "CD8"
## [69] "CD80.1" "CD86.1"
## [71] "CD8a" "CD94"
## [73] "CD95-Fas" "HLA-A-B-C"
## [75] "HLA-A2" "HLA-DR"
## [77] "HLA-E" "IgD"
## [79] "IgM" "KLRG1-MAFA"
## [81] "Lymphotoxin-b-Receptor-LT-bR" "NKp80"
## [83] "TCR-g-d" "TIGIT-VSTM3"
## [85] "b2-microglobulin"
Key(allsamples[["RNA"]])
##
## "rna_"
Key(allsamples[["ADT"]])
##
## "adt_"
p1 <- FeaturePlot(allsamples, "adt_CD19.1") + ggtitle("CD19.1 protein")
p2 <- FeaturePlot(allsamples, "rna_CD19") + ggtitle("CD19 RNA")
p1 | p2
save.image(file=paste(outputpath,"SeuratObjectAfterClustering.RData",sep=""))
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /usr/local/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_rt.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rmarkdown_2.18 forcats_0.5.2 stringr_1.4.1
## [4] dplyr_1.0.10 purrr_0.3.5 readr_2.1.3
## [7] tidyr_1.2.1 tibble_3.1.8 tidyverse_1.3.2
## [10] kableExtra_1.3.4 KernSmooth_2.23-20 fields_14.1
## [13] viridis_0.6.2 viridisLite_0.4.1 spam_2.9-1
## [16] plotly_4.10.1 ggplot2_3.4.0 SeuratObject_4.1.3
## [19] Seurat_4.2.1 DoubletFinder_2.0.3
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 spatstat.explore_3.0-5 reticulate_1.26
## [4] tidyselect_1.2.0 htmlwidgets_1.5.4 grid_4.2.2
## [7] Rtsne_0.16 munsell_0.5.0 codetools_0.2-18
## [10] ragg_1.2.4 ica_1.0-3 future_1.29.0
## [13] miniUI_0.1.1.1 withr_2.5.0 spatstat.random_3.0-1
## [16] colorspace_2.0-3 progressr_0.11.0 highr_0.9
## [19] knitr_1.40 rstudioapi_0.14 ROCR_1.0-11
## [22] tensor_1.5 listenv_0.8.0 labeling_0.4.2
## [25] polyclip_1.10-4 bit64_4.0.5 farver_2.1.1
## [28] parallelly_1.32.1 vctrs_0.5.0 generics_0.1.3
## [31] xfun_0.34 timechange_0.1.1 R6_2.5.1
## [34] ggbeeswarm_0.6.0 hdf5r_1.3.7 spatstat.utils_3.0-1
## [37] cachem_1.0.6 assertthat_0.2.1 promises_1.2.0.1
## [40] scales_1.2.1 googlesheets4_1.0.1 beeswarm_0.4.0
## [43] gtable_0.3.1 globals_0.16.1 goftest_1.2-3
## [46] rlang_1.0.6 systemfonts_1.0.4 splines_4.2.2
## [49] lazyeval_0.2.2 gargle_1.2.1 spatstat.geom_3.0-3
## [52] broom_1.0.1 yaml_2.3.6 reshape2_1.4.4
## [55] abind_1.4-5 modelr_0.1.9 backports_1.4.1
## [58] httpuv_1.6.6 tools_4.2.2 ellipsis_0.3.2
## [61] jquerylib_0.1.4 RColorBrewer_1.1-3 ggridges_0.5.4
## [64] Rcpp_1.0.9 plyr_1.8.7 deldir_1.0-6
## [67] pbapply_1.5-0 cowplot_1.1.1 zoo_1.8-11
## [70] haven_2.5.1 ggrepel_0.9.2 cluster_2.1.4
## [73] fs_1.5.2 magrittr_2.0.3 data.table_1.14.4
## [76] scattermore_0.8 reprex_2.0.2 lmtest_0.9-40
## [79] RANN_2.6.1 googledrive_2.0.0 fitdistrplus_1.1-8
## [82] matrixStats_0.62.0 hms_1.1.2 patchwork_1.1.2
## [85] mime_0.12 evaluate_0.18 xtable_1.8-4
## [88] readxl_1.4.1 gridExtra_2.3 compiler_4.2.2
## [91] maps_3.4.1 crayon_1.5.2 htmltools_0.5.3
## [94] later_1.3.0 tzdb_0.3.0 lubridate_1.9.0
## [97] DBI_1.1.3 dbplyr_2.2.1 MASS_7.3-58.1
## [100] Matrix_1.5-3 cli_3.4.1 parallel_4.2.2
## [103] dotCall64_1.0-2 igraph_1.3.5 pkgconfig_2.0.3
## [106] sp_1.5-1 spatstat.sparse_3.0-0 xml2_1.3.3
## [109] svglite_2.1.0 vipor_0.4.5 bslib_0.4.1
## [112] webshot_0.5.4 rvest_1.0.3 digest_0.6.30
## [115] sctransform_0.3.5 RcppAnnoy_0.0.20 spatstat.data_3.0-0
## [118] cellranger_1.1.0 leiden_0.4.3 uwot_0.1.14
## [121] curl_4.3.3 shiny_1.7.3 lifecycle_1.0.3
## [124] nlme_3.1-160 jsonlite_1.8.3 fansi_1.0.3
## [127] pillar_1.8.1 lattice_0.20-45 ggrastr_1.0.1
## [130] fastmap_1.1.0 httr_1.4.4 survival_3.4-0
## [133] glue_1.6.2 remotes_2.4.2 png_0.1-7
## [136] bit_4.0.4 stringi_1.7.8 sass_0.4.2
## [139] textshaping_0.3.6 irlba_2.3.5.1 future.apply_1.10.0
#https://ucdavis-bioinformatics-training.github.io/2021-March-Single-Cell-RNA-Seq-Analysis/data_analysis/scRNA_Workshop-PART4_fixed
file.copy(from = "/data/../TotalSeq/Tools/clustering.html",
to = "/data/../TotalSeq/Reports/site/clustering.html")
## [1] FALSE
file.remove("/data/../TotalSeq/Tools/clustering.html")
## [1] FALSE