Clustering

1. Introduction

This document takes the QCd, normalized, scaled, batch corrected data and performs clustering.

#Install and load required R packages. Install the packages if you do not already have them installed
library(Seurat)
library(kableExtra)
library(ggplot2)
library(plotly)
library(tidyverse)
library(rmarkdown)

2. Load QC’d, normalized, scaled, batch corrected data

# Load QCd, normalized data
load('/data/../TotalSeq/Results/normalization/SeuratObjectAfterNormalization.RData')
# Input path
datadirectory <- "/data/../TotalSeq/Results/normalization/"
# Output path
outputpath="/data/../TotalSeq/Results/clustering/"
# Set working directory
setwd(outputpath)

3. PCA

##################### PCA
# Run PCA with 50 pcs
allsamples <- RunPCA(object = allsamples, npcs=50)
# Print genes in the top 5 pcs
print(allsamples[["pca"]], dims = 1:5, nfeatures = 5)

## PC_ 1
## Positive:  MALAT1, CD247, BACH2, RPS27, CST7
## Negative:  S100A9, LYZ, CST3, S100A8, KYNU
## PC_ 2
## Positive:  CD79A, MS4A1, BANK1, PLEKHG1, IGHD
## Negative:  ANXA1, S100A4, VIM, PITPNC1, CD247
## PC_ 3
## Positive:  GNLY, NKG7, GZMB, KLRD1, FGFBP2
## Negative:  PDE3B, PLCL1, LEF1, LDLRAD4, TSHZ2
## PC_ 4
## Positive:  GP1BB, PF4, PPBP, GNG11, CAVIN2
## Negative:  LMNA, EREG, IL1B, CXCL8, S100A10
## PC_ 5
## Positive:  TXK, LEF1, MAML2, PLCL1, FHIT
## Negative:  LMNA, S100A10, NIBAN1, S100A4, S100A11

# Identify number of PCs that explains majority of variations
ElbowPlot(allsamples, ndims=50, reduction = "pca")

# Visualize the genes in the first PC
VizDimLoadings(allsamples, dims = 1, ncol = 1) + theme_minimal(base_size = 8)

# Visualize the genes in the second PC
VizDimLoadings(allsamples, dims = 2, ncol = 1) + theme_minimal(base_size = 8)

# Set the identity column to WT vs TG
Idents(allsamples) <- "orig.ident"
# Visualize the cells after pca
DimPlot(object = allsamples, reduction = "pca")

# Plot heatmap with cells=500 plotting cells with extreme cells on both ends of spectrum
#DimHeatmap(object = allsamples, dims = 1:6, cells = 500, balanced = TRUE)
DimHeatmap(object = allsamples, dims = 1:6, cells = 500, balanced = TRUE)

3. Clustering

##################### Clustering
# use the first 20 pc's
use.pcs = 1:20
# FindNeighbors with 20 pcs
allsamples <- FindNeighbors(allsamples, reduction="pca", dims = use.pcs)
# FindClusters with resolution starting at 0.25 and ending at 4, with 0.5 increment
allsamples <- FindClusters(
  object = allsamples,
  resolution = seq(0.25,0.75,0.5),
  verbose = FALSE
)
head(allsamples)

##                             orig.ident nCount_RNA nFeature_RNA nCount_ADT
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS       2676         1078       1191
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS       2329         1271       1135
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS       7926         2660       2034
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS       2339         1291       1058
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS       3537         1608        936
## AAACCCATCAGCTGAT-NS3R189BTS NS3R189BTS       1758          948       1375
## AAACCCATCATTCACT-NS3R189BTS NS3R189BTS       1478          930        186
## AAACCCATCCGGACGT-NS3R189BTS NS3R189BTS       2045         1003        883
## AAACCCATCCGTGGTG-NS3R189BTS NS3R189BTS       5150         2003       2014
## AAACCCATCCTCTCTT-NS3R189BTS NS3R189BTS       2287         1306        645
##                             nFeature_ADT percent.mito percent.ribo percent.hb
## AAACCCACACTGGAAG-NS3R189BTS           81    0.8221226     42.52616 0.00000000
## AAACCCACATGACGTT-NS3R189BTS           84    4.3795620     18.97810 0.04293688
## AAACCCAGTCCAAATC-NS3R189BTS           81    4.6177139     30.05299 0.02523341
## AAACCCAGTGAGTTTC-NS3R189BTS           79    5.3014109     21.67593 0.00000000
## AAACCCAGTTCTTAGG-NS3R189BTS           78    2.2900763     23.35312 0.00000000
## AAACCCATCAGCTGAT-NS3R189BTS           84    5.1194539     26.67804 0.00000000
## AAACCCATCATTCACT-NS3R189BTS           48    2.6387009     24.56022 0.00000000
## AAACCCATCCGGACGT-NS3R189BTS           78    3.4229829     30.66015 0.00000000
## AAACCCATCCGTGGTG-NS3R189BTS           82    0.6019417     35.28155 0.00000000
## AAACCCATCCTCTCTT-NS3R189BTS           83    2.9296021     19.45780 0.00000000
##                             batch type RNA_snn_res.0.25 RNA_snn_res.0.75
## AAACCCACACTGGAAG-NS3R189BTS     1   HC                2                2
## AAACCCACATGACGTT-NS3R189BTS     1   HC                1                9
## AAACCCAGTCCAAATC-NS3R189BTS     1   HC                0               15
## AAACCCAGTGAGTTTC-NS3R189BTS     1   HC                1                3
## AAACCCAGTTCTTAGG-NS3R189BTS     1   HC                0                4
## AAACCCATCAGCTGAT-NS3R189BTS     1   HC                1                3
## AAACCCATCATTCACT-NS3R189BTS     1   HC                0                0
## AAACCCATCCGGACGT-NS3R189BTS     1   HC                2                2
## AAACCCATCCGTGGTG-NS3R189BTS     1   HC                0                4
## AAACCCATCCTCTCTT-NS3R189BTS     1   HC                0                0
##                             seurat_clusters
## AAACCCACACTGGAAG-NS3R189BTS               2
## AAACCCACATGACGTT-NS3R189BTS               9
## AAACCCAGTCCAAATC-NS3R189BTS              15
## AAACCCAGTGAGTTTC-NS3R189BTS               3
## AAACCCAGTTCTTAGG-NS3R189BTS               4
## AAACCCATCAGCTGAT-NS3R189BTS               3
## AAACCCATCATTCACT-NS3R189BTS               0
## AAACCCATCCGGACGT-NS3R189BTS               2
## AAACCCATCCGTGGTG-NS3R189BTS               4
## AAACCCATCCTCTCTT-NS3R189BTS               0

# Count number of clusters at each resolution
sapply(grep("res",colnames(allsamples@meta.data),value = TRUE),
       function(x) length(unique(allsamples@meta.data[,x])))

## RNA_snn_res.0.25 RNA_snn_res.0.75
##               13               22

4. tSNE

####################### TSNE
# Run tsne with 20 pcs
allsamples <- RunTSNE(
  object = allsamples,
  reduction.use = "pca",
  dims = use.pcs,
  do.fast = TRUE)

# Plot all clusters in all resolutions
DimPlot(object = allsamples, group.by=grep("res",colnames(allsamples@meta.data),value = TRUE)[1:4], ncol=2 , pt.size=3.0, reduction = "tsne", label = T)

# change default identity
Idents(allsamples) <- "RNA_snn_res.0.25"
# list cell number in each cluster for WT vs TG
table(Idents(allsamples),allsamples$type)

##
##      BCR-UV   HC
##   0    3460 7882
##   1      15 4068
##   2     356 3386
##   3    1300  760
##   4    1970   69
##   5     891  982
##   6     535 1065
##   7     755  695
##   8     628  197
##   9     404  250
##   10    191  155
##   11     44  104
##   12     10   17

# Visualize cluster at 0.75 resolution
DimPlot(object = allsamples, pt.size=0.5, reduction = "tsne", label = T)

# Color by HC vs BCR-UV
DimPlot(object = allsamples, pt.size=0.5, reduction = "tsne", group.by = "type" )

5. UMAP

########################## UMAP
# Run umap with 20 pcs
allsamples <- RunUMAP(
  object = allsamples,
  reduction.use = "pca",
  dims = use.pcs)
# Plot umap
DimPlot(object = allsamples, pt.size=0.5, reduction = "umap", label = T)

# Color by HC vs BCR-UV
DimPlot(object = allsamples, pt.size=0.5, reduction = "umap", group.by = "type")

# Plot umap
dimplot1<-DimPlot(object = allsamples, pt.size=0.5, reduction = "umap", label = T)
# Color by HC vs BCR-UV
dimplot2<-DimPlot(object = allsamples, pt.size=0.5, reduction = "umap", group.by = "type")
# Save umaps
ggsave(paste(outputpath,"umap-clusters.pdf",sep=""), plot = dimplot1, width = 20, height = 15, units = "cm")
ggsave(paste(outputpath,"umap-clusters-type.pdf",sep=""), plot = dimplot2, width = 20, height = 15, units = "cm")

# Visualizations
# expression of variable genes across clusters
# Custom list of genes to visualize
#top10
mygenes = c("NKG7","CCL5")
RidgePlot(allsamples, features = mygenes)

#VlnPlot(allsamples, features = top10)
VlnPlot(allsamples, features = mygenes, pt.size=0)

#VlnPlot(WT_TG_Filt_Scaled, features = mygenes, split.by = "orig.ident")
FeaturePlot(allsamples, features = mygenes)

#FeaturePlot(WT_TG_Filt_Scaled, features = mygenes, split.by = "orig.ident")
DotPlot(allsamples, features = mygenes)

DoHeatmap(subset(allsamples), features = mygenes, size = 3)

# Visualize ADT with GEX data
# Specify key for rna features and ADT features
# Add rna_ or adt_ to any gene to visualize
rownames(allsamples[["ADT"]])

##  [1] "CD107a-LAMP-1"                "CD115-CSF-1R"
##  [3] "CD119-IFN-g-R-a-chain"        "CD11b"
##  [5] "CD11c"                        "CD123"
##  [7] "CD127-IL-7Ra"                 "CD135-Flt-3-Flk-2"
##  [9] "CD137-4-1BB"                  "CD137L-4-1BB-Ligand"
## [11] "CD14.1"                       "CD141-Thrombomodulin"
## [13] "CD152-CTLA-4"                 "CD154"
## [15] "CD158e1-KIR3DL1-NKB1"         "CD16"
## [17] "CD163.1"                      "CD172a-SIRPa"
## [19] "CD178-Fas-L"                  "CD183-CXCR3"
## [21] "CD185-CXCR5"                  "CD19.1"
## [23] "CD196-CCR6"                   "CD197-CCR7"
## [25] "CD1c"                         "CD20"
## [27] "CD218a-IL-18Ra"               "CD223-LAG-3"
## [29] "CD226.1"                      "CD24.1"
## [31] "CD244-2B4"                    "CD25"
## [33] "CD27.1"                       "CD273-B7-DC-PD-L2"
## [35] "CD274-B7-H1-PD-L1"            "CD275"
## [37] "CD278-ICOS"                   "CD279-PD-1"
## [39] "CD28.1"                       "CD3"
## [41] "CD303-BDCA-2"                 "CD307c-FcRL3"
## [43] "CD307d-FcRL4"                 "CD307e-FcRL5"
## [45] "CD314-NKG2D"                  "CD335-NKp46"
## [47] "CD336"                        "CD337-NKp30"
## [49] "CD34.1"                       "CD357-GITR"
## [51] "CD366-Tim-3"                  "CD38.1"
## [53] "CD39"                         "CD4.1"
## [55] "CD40.1"                       "CD44.1"
## [57] "CD45"                         "CD45RA"
## [59] "CD45RO"                       "CD47.1"
## [61] "CD48.1"                       "CD56-NCAM"
## [63] "CD57-Recombinant"             "CD62L"
## [65] "CD64"                         "CD69.1"
## [67] "CD70.1"                       "CD8"
## [69] "CD80.1"                       "CD86.1"
## [71] "CD8a"                         "CD94"
## [73] "CD95-Fas"                     "HLA-A-B-C"
## [75] "HLA-A2"                       "HLA-DR"
## [77] "HLA-E"                        "IgD"
## [79] "IgM"                          "KLRG1-MAFA"
## [81] "Lymphotoxin-b-Receptor-LT-bR" "NKp80"
## [83] "TCR-g-d"                      "TIGIT-VSTM3"
## [85] "b2-microglobulin"

Key(allsamples[["RNA"]])

##
## "rna_"

Key(allsamples[["ADT"]])

##
## "adt_"

p1 <- FeaturePlot(allsamples, "adt_CD19.1") + ggtitle("CD19.1 protein")
p2 <- FeaturePlot(allsamples, "rna_CD19") + ggtitle("CD19 RNA")
p1 | p2

save.image(file=paste(outputpath,"SeuratObjectAfterClustering.RData",sep=""))
sessionInfo()

## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /usr/local/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_rt.so
##
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base
##
## other attached packages:
##  [1] rmarkdown_2.18      forcats_0.5.2       stringr_1.4.1
##  [4] dplyr_1.0.10        purrr_0.3.5         readr_2.1.3
##  [7] tidyr_1.2.1         tibble_3.1.8        tidyverse_1.3.2
## [10] kableExtra_1.3.4    KernSmooth_2.23-20  fields_14.1
## [13] viridis_0.6.2       viridisLite_0.4.1   spam_2.9-1
## [16] plotly_4.10.1       ggplot2_3.4.0       SeuratObject_4.1.3
## [19] Seurat_4.2.1        DoubletFinder_2.0.3
##
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2             spatstat.explore_3.0-5 reticulate_1.26
##   [4] tidyselect_1.2.0       htmlwidgets_1.5.4      grid_4.2.2
##   [7] Rtsne_0.16             munsell_0.5.0          codetools_0.2-18
##  [10] ragg_1.2.4             ica_1.0-3              future_1.29.0
##  [13] miniUI_0.1.1.1         withr_2.5.0            spatstat.random_3.0-1
##  [16] colorspace_2.0-3       progressr_0.11.0       highr_0.9
##  [19] knitr_1.40             rstudioapi_0.14        ROCR_1.0-11
##  [22] tensor_1.5             listenv_0.8.0          labeling_0.4.2
##  [25] polyclip_1.10-4        bit64_4.0.5            farver_2.1.1
##  [28] parallelly_1.32.1      vctrs_0.5.0            generics_0.1.3
##  [31] xfun_0.34              timechange_0.1.1       R6_2.5.1
##  [34] ggbeeswarm_0.6.0       hdf5r_1.3.7            spatstat.utils_3.0-1
##  [37] cachem_1.0.6           assertthat_0.2.1       promises_1.2.0.1
##  [40] scales_1.2.1           googlesheets4_1.0.1    beeswarm_0.4.0
##  [43] gtable_0.3.1           globals_0.16.1         goftest_1.2-3
##  [46] rlang_1.0.6            systemfonts_1.0.4      splines_4.2.2
##  [49] lazyeval_0.2.2         gargle_1.2.1           spatstat.geom_3.0-3
##  [52] broom_1.0.1            yaml_2.3.6             reshape2_1.4.4
##  [55] abind_1.4-5            modelr_0.1.9           backports_1.4.1
##  [58] httpuv_1.6.6           tools_4.2.2            ellipsis_0.3.2
##  [61] jquerylib_0.1.4        RColorBrewer_1.1-3     ggridges_0.5.4
##  [64] Rcpp_1.0.9             plyr_1.8.7             deldir_1.0-6
##  [67] pbapply_1.5-0          cowplot_1.1.1          zoo_1.8-11
##  [70] haven_2.5.1            ggrepel_0.9.2          cluster_2.1.4
##  [73] fs_1.5.2               magrittr_2.0.3         data.table_1.14.4
##  [76] scattermore_0.8        reprex_2.0.2           lmtest_0.9-40
##  [79] RANN_2.6.1             googledrive_2.0.0      fitdistrplus_1.1-8
##  [82] matrixStats_0.62.0     hms_1.1.2              patchwork_1.1.2
##  [85] mime_0.12              evaluate_0.18          xtable_1.8-4
##  [88] readxl_1.4.1           gridExtra_2.3          compiler_4.2.2
##  [91] maps_3.4.1             crayon_1.5.2           htmltools_0.5.3
##  [94] later_1.3.0            tzdb_0.3.0             lubridate_1.9.0
##  [97] DBI_1.1.3              dbplyr_2.2.1           MASS_7.3-58.1
## [100] Matrix_1.5-3           cli_3.4.1              parallel_4.2.2
## [103] dotCall64_1.0-2        igraph_1.3.5           pkgconfig_2.0.3
## [106] sp_1.5-1               spatstat.sparse_3.0-0  xml2_1.3.3
## [109] svglite_2.1.0          vipor_0.4.5            bslib_0.4.1
## [112] webshot_0.5.4          rvest_1.0.3            digest_0.6.30
## [115] sctransform_0.3.5      RcppAnnoy_0.0.20       spatstat.data_3.0-0
## [118] cellranger_1.1.0       leiden_0.4.3           uwot_0.1.14
## [121] curl_4.3.3             shiny_1.7.3            lifecycle_1.0.3
## [124] nlme_3.1-160           jsonlite_1.8.3         fansi_1.0.3
## [127] pillar_1.8.1           lattice_0.20-45        ggrastr_1.0.1
## [130] fastmap_1.1.0          httr_1.4.4             survival_3.4-0
## [133] glue_1.6.2             remotes_2.4.2          png_0.1-7
## [136] bit_4.0.4              stringi_1.7.8          sass_0.4.2
## [139] textshaping_0.3.6      irlba_2.3.5.1          future.apply_1.10.0

6. Reference

#https://ucdavis-bioinformatics-training.github.io/2021-March-Single-Cell-RNA-Seq-Analysis/data_analysis/scRNA_Workshop-PART4_fixed

7. Generate R script

file.copy(from = "/data/../TotalSeq/Tools/clustering.html",
          to   = "/data/../TotalSeq/Reports/site/clustering.html")

## [1] FALSE

file.remove("/data/../TotalSeq/Tools/clustering.html")

## [1] FALSE