1. Introduction

This document takes the QCd, normalized, scaled, batch corrected, clustered data and performs differential expression analysis.

#Install and load required R packages. Install the packages if you do not already have them installed
#BiocManager::install('multtest')
#install.packages('metap')
library(Seurat)
library(kableExtra)
library(ggplot2)
library(plotly)
library(tidyverse)
library(rmarkdown)
library(cowplot)

2. Load QC’d, normalized, scaled, batch corrected data and clustered data

# Load QCd, normalized data
load('/data/../TotalSeq/Results/clustering/SeuratObjectAfterClustering.RData')
# Input path
datadirectory <- "/data/../TotalSeq/Results/clustering/"
# Output path
outputpath="/data/../TotalSeq/Results/markers/"
# Set working directory
#setwd(outputpath)

3. Differential expression (HC vs BCR-UV)

########################### Marker genes between groups
# Current identity levels
levels(allsamples)
##  [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"
head(Idents(allsamples))
## AAACCCACACTGGAAG-NS3R189BTS AAACCCACATGACGTT-NS3R189BTS
##                           2                           1
## AAACCCAGTCCAAATC-NS3R189BTS AAACCCAGTGAGTTTC-NS3R189BTS
##                           0                           1
## AAACCCAGTTCTTAGG-NS3R189BTS AAACCCATCAGCTGAT-NS3R189BTS
##                           0                           1
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12
# Change identity to HC vs BCR-UV
Idents(allsamples) <- "type"
head(Idents(allsamples))
## AAACCCACACTGGAAG-NS3R189BTS AAACCCACATGACGTT-NS3R189BTS
##                          HC                          HC
## AAACCCAGTCCAAATC-NS3R189BTS AAACCCAGTGAGTTTC-NS3R189BTS
##                          HC                          HC
## AAACCCAGTTCTTAGG-NS3R189BTS AAACCCATCAGCTGAT-NS3R189BTS
##                          HC                          HC
## Levels: HC BCR-UV
levels(allsamples)
## [1] "HC"     "BCR-UV"
########################### tables
# Proportion and counts of cells in clusters and groups
prop.table(table(Idents(allsamples)))
##
##        HC    BCR-UV
## 0.6502368 0.3497632
table(Idents(allsamples))
##
##     HC BCR-UV
##  19630  10559
# Find differentially expressed genes for UV vs BCR-UV
markershcbcruv = FindMarkers(allsamples, ident.1="BCR-UV", ident.2="HC")
head(markershcbcruv)
##         p_val avg_log2FC pct.1 pct.2 p_val_adj
## EPB41       0 -0.8413380 0.545 0.641         0
## SF3B4       0  0.3139616 0.237 0.075         0
## C1orf56     0 -1.2897434 0.197 0.414         0
## S100A9      0 -1.3083059 0.101 0.477         0
## S100A8      0 -1.1964336 0.064 0.367         0
## RPS27       0  0.9883480 1.000 0.998         0
# Set marker rownames
markershcbcruv$gene=rownames(markershcbcruv)
# Save the list of differentially expressed genes as a tab delimited text file
write.table(markershcbcruv, file=paste(outputpath,"markershcbcruv.txt",sep=""), quote=F, row.names = FALSE, sep = "\t")
VlnPlot(allsamples,features=c("HLA-B","TMSB4X"), pt.size = 0)

FeaturePlot(allsamples, features="HLA-B")

table(allsamples$type)
##
## BCR-UV     HC
##  10559  19630
# Get all the expression count matrix
#head(GetAssayData(WT_TG_Filt_Scaled, slot="counts"))
# Get expression values for a single gene from all cells
head(FetchData(allsamples, vars="HLA-B"))
##                                HLA-B
## AAACCCACACTGGAAG-NS3R189BTS 2.136984
## AAACCCACATGACGTT-NS3R189BTS 2.900033
## AAACCCAGTCCAAATC-NS3R189BTS 2.285611
## AAACCCAGTGAGTTTC-NS3R189BTS 3.778567
## AAACCCAGTTCTTAGG-NS3R189BTS 3.275078
## AAACCCATCAGCTGAT-NS3R189BTS 3.382402

4. Differential expression between clusters

########################### tables
# Proportion and counts of cells in clusters and groups
# Change identity to 0.25 cluster resolution
Idents(allsamples) <- "RNA_snn_res.0.25"
prop.table(table(Idents(allsamples)))
##
##            0            1            2            3            4            5
## 0.3756997582 0.1352479380 0.1239524330 0.0682367750 0.0675411574 0.0620424658
##            6            7            8            9           10           11
## 0.0529994369 0.0480307397 0.0273278346 0.0216635198 0.0114611282 0.0049024479
##           12
## 0.0008943655
table(Idents(allsamples), allsamples$type)
##
##      BCR-UV   HC
##   0    3460 7882
##   1      15 4068
##   2     356 3386
##   3    1300  760
##   4    1970   69
##   5     891  982
##   6     535 1065
##   7     755  695
##   8     628  197
##   9     404  250
##   10    191  155
##   11     44  104
##   12     10   17
########################### Marker genes between clusters
# Current identity levels
levels(allsamples)
##  [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"
head(Idents(allsamples))
## AAACCCACACTGGAAG-NS3R189BTS AAACCCACATGACGTT-NS3R189BTS
##                           2                           1
## AAACCCAGTCCAAATC-NS3R189BTS AAACCCAGTGAGTTTC-NS3R189BTS
##                           0                           1
## AAACCCAGTTCTTAGG-NS3R189BTS AAACCCATCAGCTGAT-NS3R189BTS
##                           0                           1
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12
# Find differentially expressed genes for cluster 0 vs 1
markerscluster = FindMarkers(allsamples, ident.1=0, ident.2=1)
head(markerscluster)
##        p_val avg_log2FC pct.1 pct.2 p_val_adj
## S100A9     0 -1.7039381 0.361 0.896         0
## S100A8     0 -1.4970085 0.245 0.741         0
## FCER1G     0 -1.2172374 0.148 0.520         0
## GNLY       0 -0.9698375 0.471 0.843         0
## IGKC       0 -3.9915160 0.170 0.466         0
## IL1B       0 -1.5641778 0.224 0.711         0
write.table(markerscluster, file=paste(outputpath,"markerscluster.txt",sep=""), quote=F, row.names = FALSE, sep = "\t")
VlnPlot(allsamples,features=c("TNFRSF4","TNFRSF25"), pt.size = 0)

FeaturePlot(allsamples, features="TNFRSF25")

table(allsamples$orig.ident)
##
## NS3R189BTS NS7R65BBTS
##      19630      10559
# Get all the expression count matrix
#head(GetAssayData(WT_TG_Filt_Scaled, slot="counts"))
# Get expression values for a single gene from all cells
head(FetchData(allsamples, vars="TNFRSF25"))
##                             TNFRSF25
## AAACCCACACTGGAAG-NS3R189BTS        0
## AAACCCACATGACGTT-NS3R189BTS        0
## AAACCCAGTCCAAATC-NS3R189BTS        0
## AAACCCAGTGAGTTTC-NS3R189BTS        0
## AAACCCAGTTCTTAGG-NS3R189BTS        0
## AAACCCATCAGCTGAT-NS3R189BTS        0

5. Differential expression between every clusters

########################### Find all markers
markersall=FindAllMarkers(allsamples)
head(markersall)
##         p_val avg_log2FC pct.1 pct.2 p_val_adj cluster    gene
## ANXA1       0  0.7068765 0.968 0.858         0       0   ANXA1
## LMNA        0  0.6774440 0.881 0.725         0       0    LMNA
## NIBAN1      0  0.5952121 0.635 0.398         0       0  NIBAN1
## LDLRAD4     0  0.5930851 0.726 0.553         0       0 LDLRAD4
## IL7R        0  0.5203522 0.874 0.678         0       0    IL7R
## S100A11     0  0.4987982 0.867 0.716         0       0 S100A11
#View(markersall)
# Save all markers as tab delimited file
write.table(markersall, file=paste(outputpath,"markersall.txt",sep=""),quote=F, row.names = FALSE, sep = ",")
# Plot selected markers
VlnPlot(allsamples,features=c("CYP1B1","IL1B"), pt.size = 0)

FeaturePlot(allsamples, features="IL1B")

# Extract the top marker for each cluster
head(markersall)
##         p_val avg_log2FC pct.1 pct.2 p_val_adj cluster    gene
## ANXA1       0  0.7068765 0.968 0.858         0       0   ANXA1
## LMNA        0  0.6774440 0.881 0.725         0       0    LMNA
## NIBAN1      0  0.5952121 0.635 0.398         0       0  NIBAN1
## LDLRAD4     0  0.5930851 0.726 0.553         0       0 LDLRAD4
## IL7R        0  0.5203522 0.874 0.678         0       0    IL7R
## S100A11     0  0.4987982 0.867 0.716         0       0 S100A11
markersall %>%
  group_by(cluster) %>%
    slice(1) %>%
      pull(gene) -> best.gene.per.cluster

best.gene.per.cluster
##  [1] "ANXA1"   "IGHA1"   "LEF1"    "CCL5"    "RPS26"   "GNLY"    "S100A9"
##  [8] "CD79A"   "KLRB1"   "IKZF2"   "PPBP"    "PLEKHA6" "RHEX"
# Store plot as ggplot list
vlnplot1<-VlnPlot(allsamples,features=best.gene.per.cluster, pt.size = 0, combine = FALSE)
# vlnplot1<-VlnPlot(allsamples,features=best.gene.per.cluster, pt.size = 0, same.y.lims = TRUE, combine = FALSE)

# Set properties for each plot in the list
p1 <- list()
for (i in seq_along(vlnplot1)){
    #Change x and y tick label font size.
    p1[[i]] = vlnplot1[[i]] +
      theme(legend.position = "none",
            axis.text.x = element_text(size = 8),
            axis.text.y = element_text(size = 8),
            axis.title.y = element_text(size = 6),
            axis.title.x = element_blank(),
            title = element_text(size=8))
}
# Plot all the plots in the list
plot_grid(plotlist = p1, ncol = 3)

# Save plot to file
vlnplot1=plot_grid(plotlist = p1, ncol = 3)
ggsave(paste(outputpath,"vlnplot-best-per-cluster.pdf",sep=""), plot = vlnplot1, width = 20, height = 15, units = "cm")

# Plot features in umap
fplot1<-FeaturePlot(allsamples,features=best.gene.per.cluster, pt.size = 0, combine = FALSE)
p2 <- list()
for (i in seq_along(fplot1)){
    #Change x and y label font size.
    p2[[i]] = fplot1[[i]] +
      theme(legend.position = "none",
            axis.text.x = element_text(size = 8),
            axis.text.y = element_text(size = 8),
            axis.title.y = element_blank(),
            axis.title.x = element_blank(),
            title = element_text(size=8))
}
plot_grid(plotlist = p2, ncol = 4)

fplot1=plot_grid(plotlist = p2, ncol = 4)
ggsave(paste(outputpath,"fplot-best-per-cluster.pdf",sep=""), plot = fplot1, width = 20, height = 15, units = "cm")

6. Differential expression groups within a specific cluster

########################### tables
# Proportion and counts of cells in clusters and groups
# Change identity to 0.25 cluster resolution
Idents(allsamples) <- "RNA_snn_res.0.25"
prop.table(table(Idents(allsamples)))
##
##            0            1            2            3            4            5
## 0.3756997582 0.1352479380 0.1239524330 0.0682367750 0.0675411574 0.0620424658
##            6            7            8            9           10           11
## 0.0529994369 0.0480307397 0.0273278346 0.0216635198 0.0114611282 0.0049024479
##           12
## 0.0008943655
table(Idents(allsamples))
##
##     0     1     2     3     4     5     6     7     8     9    10    11    12
## 11342  4083  3742  2060  2039  1873  1600  1450   825   654   346   148    27
# Current identity levels
levels(allsamples)
##  [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"
head(Idents(allsamples))
## AAACCCACACTGGAAG-NS3R189BTS AAACCCACATGACGTT-NS3R189BTS
##                           2                           1
## AAACCCAGTCCAAATC-NS3R189BTS AAACCCAGTGAGTTTC-NS3R189BTS
##                           0                           1
## AAACCCAGTTCTTAGG-NS3R189BTS AAACCCATCAGCTGAT-NS3R189BTS
##                           0                           1
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12
# Take all cells in cluster 1, and find markers that separate 'HC' and 'BCR-UV' that are in the 'orig.ident' column. subset.ident would the default identity of clusters
markersingroup <- FindMarkers(allsamples, ident.1 = "HC", ident.2 = "BCR-UV", group.by = 'type', subset.ident = 0)

head(markersingroup)
##         p_val avg_log2FC pct.1 pct.2 p_val_adj
## S100A11     0 -0.8963638 0.820 0.973         0
## S100A9      0  2.4467055 0.497 0.051         0
## S100A8      0  1.8369587 0.346 0.016         0
## RPS27       0 -1.1221617 0.999 0.999         0
## LMNA        0 -1.3431421 0.833 0.989         0
## RPS7        0 -0.5535502 0.985 0.999         0
# Save markers
write.table(markersingroup, file="/data/../TotalSeq/Results/markers/markersingroup.txt", quote=F, row.names = FALSE, sep = "\t")
VlnPlot(allsamples,features=c("HLA-B","IFITM1"), group.by = "type", pt.size = 0)

FeaturePlot(allsamples, features=c("HLA-B"), split.by = "type")

table(allsamples$type)
##
## BCR-UV     HC
##  10559  19630
# Get all the expression count matrix
#head(GetAssayData(allsamples, slot="counts"))
# Get expression values for a single gene from all cells
head(FetchData(allsamples, vars="HLA-B"))
##                                HLA-B
## AAACCCACACTGGAAG-NS3R189BTS 2.136984
## AAACCCACATGACGTT-NS3R189BTS 2.900033
## AAACCCAGTCCAAATC-NS3R189BTS 2.285611
## AAACCCAGTGAGTTTC-NS3R189BTS 3.778567
## AAACCCAGTTCTTAGG-NS3R189BTS 3.275078
## AAACCCATCAGCTGAT-NS3R189BTS 3.382402
save.image(file=paste(outputpath,"SeuratObjectAfterMarker.RData",sep=""))
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /usr/local/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_rt.so
##
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base
##
## other attached packages:
##  [1] cowplot_1.1.1       rmarkdown_2.18      forcats_0.5.2
##  [4] stringr_1.4.1       dplyr_1.0.10        purrr_0.3.5
##  [7] readr_2.1.3         tidyr_1.2.1         tibble_3.1.8
## [10] tidyverse_1.3.2     kableExtra_1.3.4    KernSmooth_2.23-20
## [13] fields_14.1         viridis_0.6.2       viridisLite_0.4.1
## [16] spam_2.9-1          plotly_4.10.1       ggplot2_3.4.0
## [19] SeuratObject_4.1.3  Seurat_4.2.1        DoubletFinder_2.0.3
##
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2             spatstat.explore_3.0-5 reticulate_1.26
##   [4] tidyselect_1.2.0       htmlwidgets_1.5.4      grid_4.2.2
##   [7] Rtsne_0.16             munsell_0.5.0          codetools_0.2-18
##  [10] ragg_1.2.4             ica_1.0-3              future_1.29.0
##  [13] miniUI_0.1.1.1         withr_2.5.0            spatstat.random_3.0-1
##  [16] colorspace_2.0-3       progressr_0.11.0       highr_0.9
##  [19] knitr_1.40             rstudioapi_0.14        ROCR_1.0-11
##  [22] tensor_1.5             listenv_0.8.0          labeling_0.4.2
##  [25] polyclip_1.10-4        bit64_4.0.5            farver_2.1.1
##  [28] parallelly_1.32.1      vctrs_0.5.0            generics_0.1.3
##  [31] xfun_0.34              timechange_0.1.1       R6_2.5.1
##  [34] ggbeeswarm_0.6.0       hdf5r_1.3.7            spatstat.utils_3.0-1
##  [37] cachem_1.0.6           assertthat_0.2.1       promises_1.2.0.1
##  [40] scales_1.2.1           googlesheets4_1.0.1    beeswarm_0.4.0
##  [43] gtable_0.3.1           globals_0.16.1         goftest_1.2-3
##  [46] rlang_1.0.6            systemfonts_1.0.4      splines_4.2.2
##  [49] lazyeval_0.2.2         gargle_1.2.1           spatstat.geom_3.0-3
##  [52] broom_1.0.1            yaml_2.3.6             reshape2_1.4.4
##  [55] abind_1.4-5            modelr_0.1.9           backports_1.4.1
##  [58] httpuv_1.6.6           tools_4.2.2            ellipsis_0.3.2
##  [61] jquerylib_0.1.4        RColorBrewer_1.1-3     ggridges_0.5.4
##  [64] Rcpp_1.0.9             plyr_1.8.7             deldir_1.0-6
##  [67] pbapply_1.5-0          zoo_1.8-11             haven_2.5.1
##  [70] ggrepel_0.9.2          cluster_2.1.4          fs_1.5.2
##  [73] magrittr_2.0.3         data.table_1.14.4      scattermore_0.8
##  [76] reprex_2.0.2           lmtest_0.9-40          RANN_2.6.1
##  [79] googledrive_2.0.0      fitdistrplus_1.1-8     matrixStats_0.62.0
##  [82] hms_1.1.2              patchwork_1.1.2        mime_0.12
##  [85] evaluate_0.18          xtable_1.8-4           readxl_1.4.1
##  [88] gridExtra_2.3          compiler_4.2.2         maps_3.4.1
##  [91] crayon_1.5.2           htmltools_0.5.3        later_1.3.0
##  [94] tzdb_0.3.0             lubridate_1.9.0        DBI_1.1.3
##  [97] dbplyr_2.2.1           MASS_7.3-58.1          Matrix_1.5-3
## [100] cli_3.4.1              parallel_4.2.2         dotCall64_1.0-2
## [103] igraph_1.3.5           pkgconfig_2.0.3        sp_1.5-1
## [106] spatstat.sparse_3.0-0  xml2_1.3.3             svglite_2.1.0
## [109] vipor_0.4.5            bslib_0.4.1            webshot_0.5.4
## [112] rvest_1.0.3            digest_0.6.30          sctransform_0.3.5
## [115] RcppAnnoy_0.0.20       spatstat.data_3.0-0    cellranger_1.1.0
## [118] leiden_0.4.3           uwot_0.1.14            curl_4.3.3
## [121] shiny_1.7.3            lifecycle_1.0.3        nlme_3.1-160
## [124] jsonlite_1.8.3         limma_3.54.0           fansi_1.0.3
## [127] pillar_1.8.1           lattice_0.20-45        ggrastr_1.0.1
## [130] fastmap_1.1.0          httr_1.4.4             survival_3.4-0
## [133] glue_1.6.2             remotes_2.4.2          png_0.1-7
## [136] bit_4.0.4              stringi_1.7.8          sass_0.4.2
## [139] textshaping_0.3.6      irlba_2.3.5.1          future.apply_1.10.0

7. Generate R script

# Reference
# https://satijalab.org/seurat/articles/multimodal_vignette.html
#file.copy(from = "/data/../TotalSeq/Tools/clustering.html",
#          to   = "/data/../TotalSeq/Reports/site/clustering.html")
#file.remove("/data/../TotalSeq/Tools/clustering.html")