Quality analysis and filtering

1. Introduction

This document takes the cellranger count data for each of the samples, does qc anlaysis, filtering, generates before and after qc plots and combines the data.

# Install hdf5 library if its not already installed
#sudo apt-get install libhdf5-dev or brew install hdf5
#install.packages("hdf5r")
remotes::install_github('chris-mcginnis-ucsf/DoubletFinder')
suppressMessages(require(DoubletFinder))
# Load below libraries
library(Seurat)
library(ggplot2)
library(plotly)

2. Data setup

# Set the input, output paths
datapath="/data/../TotalSeq/Results/cellranger"
outputpath="/data/../TotalSeq/Results/qualitycontrol/"
#datapath="/data/../TotalSeq/Results/cellranger"
#outputpath="/data/../TotalSeq/Reports/"
sampleinfopath="/data/../TotalSeq/SampleInformation.tab"
setwd(outputpath)

# Read the results folders
file_list <- list.files(path=datapath)
# List result folders
print("Samples in this analysis")

## [1] "Samples in this analysis"

print(file_list)

## [1] "NS3R189BTS" "NS7R65BBTS"

##################################### Extract and map short names
# Read sample information file
sampleinformation=read.csv(sampleinfopath,sep="\t")
#View(sampleinformation)

# Specify samples for analysis. Sample names should correspond to the cellranger output path folder names for the samples
samples=c("NS3R189BTS","NS7R65BBTS")

# Create empty lists to hold the combined data set
allsamplestomerge=list()
allsamplestomergeprefilter=list()

3. Quality Analysis and Filtering

This part of the workflow reads in each of the samples, analyses the quality, performs the quality filter and merges the data. Both merged unfiltered and merged filtered data are then used for plotting the quality

# Using the for loop to reads the samples list
for(i in samples)
{
  # Remove any old variables
  rm(onesampledataFilt)
  rm(onesampledata)
  rm(onesample)
  # Print the current sample name
  print(i)
  # generate a clean sample name
  filesample=unlist(file_list[i])
  # Read the data file
  onesample <- Read10X_h5(file.path(datapath,paste0(i,"/"),"cellranger_output/filtered_feature_bc_matrix.h5"), use.names = T)
  # Print column names
  colnames(onesample$`Gene Expression`)
  # Add gene names to columnnames
  colnames(onesample$`Gene Expression`) <-     paste(sapply(strsplit(colnames(onesample$`Gene Expression`),split="-"),'[[',1L),i,sep="-")
  # Print column names
  colnames(onesample$`Gene Expression`)
  colnames(onesample$`Antibody Capture`) <- paste(sapply(strsplit(colnames(onesample$`Antibody Capture`),split="-"),'[[',1L),i,sep="-")

  # Create a Seurat data object from the gex matrix, keeping all the features, and only cells with at least 300 genes
  onesampledata <- CreateSeuratObject(counts = onesample[["Gene Expression"]], min.cells = 0,min.features = 300,names.field = 2,names.delim = "\\-")
  # Add the ADT assay to the seurate gex object created above
  onesampledata[["ADT"]] <- CreateAssayObject(onesample[["Antibody Capture"]][, colnames(x = onesampledata)])
  # Normalize ADT data with standard CLR method
  onesampledata <- NormalizeData(onesampledata, assay = "ADT", normalization.method = "CLR")

  # Check genes names
  head(rownames(onesampledata))
  # Check data rows and columns
  head(onesampledata)

  # Generate Percent mito for each of the cells
  onesampledata$percent.mito <- PercentageFeatureSet(onesampledata, pattern = "^MT-")
  summary(onesampledata$percent.mito)
  head(onesampledata)

  # Generate Percent ribo for each of the cells
  onesampledata$percent.ribo <- PercentageFeatureSet(onesampledata, pattern = "^RP[SL]")
  summary(onesampledata$percent.ribo)
  head(onesampledata)

  # Generate Percent hemoglobin for each of the cells
  onesampledata$percent.hb <- PercentageFeatureSet(onesampledata, pattern = "^HB[^(P)]")
  summary(onesampledata$percent.hb)
  head(onesampledata)

  # Check the total number of cells in this sample
  table(onesampledata$orig.ident)
  # Filter data and keep only cells that has percent.mito less than or equal to 8
  onesampledataFilt <- subset(onesampledata, percent.mito <= 8)
  # Check the number of cells after mito filtering
  table(onesampledataFilt$orig.ident)
  # Filter for nCount RNA (total rna molecules sequenced)
  onesampledataFilt <- subset(onesampledataFilt, nCount_RNA >= 1000 & nCount_RNA <= 12000)
  # Check number of cells after ncount filtering
  table(onesampledataFilt$orig.ident)
  # Filter for nfeature (total number of genes expressed)
  onesampledataFilt <- subset(onesampledataFilt, nFeature_RNA >= 700)
  # Filter for percent ribo
  onesampledataFilt <- subset(onesampledataFilt, percent.ribo > 10 & percent.ribo < 45)

  # Counts after filtering
  table(onesampledataFilt$orig.ident)

  ################################ Doublet Prediction
  ###################### Normalize Data
  # Feature counts for each cell are divided by the total counts for that cell and multiplied by the scale.factor. This is then natural-log transformed
  # Cell level normalization - accounts for sequencing depth
  onesampledataFilt <- NormalizeData(
    object = onesampledataFilt,
    normalization.method = "LogNormalize",
    scale.factor = 10000)

  ###################### Find variable genes
  # Returns top 2000 variable genes
  # VST is uses LOESS method
  # FindVariableFeatures needs normalization
  onesampledataFilt <- FindVariableFeatures(
    object = onesampledataFilt,
    selection.method = "vst")

  ##################### SCALE DATA
  # PCA needs scaled data
  # Zero centers and scales (mean/sd) gene/feature data in each cell (for across sample comparison), so extreme ranges in expression do not affect clustering (done for making cells with similar expression cluster together)
  onesampledataFilt <- ScaleData(
    object = onesampledataFilt,
    features=VariableFeatures(onesampledataFilt))

  ##################### PCA and UMAP
  # Run PCA with 50 pcs
  onesampledataFilt <- RunPCA(
    object = onesampledataFilt, npcs=50, verbose = T)
  onesampledataFilt <- RunUMAP(
    onesampledataFilt, dims = 1:10, verbose = F)

  # Set expected 7% doublets
  nExp <- round(ncol(onesampledataFilt) * 0.07)
  # Find doublets
  onesampledataFilt <- doubletFinder_v3(onesampledataFilt, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)

  # Name of the DF prediction can change, so extract the correct column name.
  DF.name = colnames(onesampledataFilt@meta.data)[grepl("DF.classification", colnames(onesampledataFilt@meta.data))]
  # Filter out doublets, save singlet data on to the list
  onesampledataFilt = onesampledataFilt[, onesampledataFilt@meta.data[, DF.name] == "Singlet"]

  # Total cells count after filtering
  table(Idents(onesampledataFilt))

  # Add filtered and unfiltered data to a list
  allsamplestomerge[[i]]=onesampledataFilt
  allsamplestomergeprefilter[[i]]=onesampledata
}

## [1] "NS3R189BTS"
## [1] "Creating 7036 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "NS7R65BBTS"
## [1] "Creating 3785 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."

4. Merge samples

# Create one and the rest of the samples as a list, for filtered data
one=allsamplestomerge[[1]]
one

## An object of class Seurat
## 36686 features across 19630 samples within 2 assays
## Active assay: RNA (36601 features, 2000 variable features)
##  1 other assay present: ADT
##  2 dimensional reductions calculated: pca, umap

rest=allsamplestomerge[2:length(allsamplestomerge)]
rest

## $NS7R65BBTS
## An object of class Seurat
## 36686 features across 10559 samples within 2 assays
## Active assay: RNA (36601 features, 2000 variable features)
##  1 other assay present: ADT
##  2 dimensional reductions calculated: pca, umap

# Create one and the rest of the samples as a list, for prefiltered data
onepre=allsamplestomergeprefilter[[1]]
onepre

## An object of class Seurat
## 36686 features across 24781 samples within 2 assays
## Active assay: RNA (36601 features, 0 variable features)
##  1 other assay present: ADT

restpre=allsamplestomergeprefilter[2:length(allsamplestomergeprefilter)]
restpre

## $NS7R65BBTS
## An object of class Seurat
## 36686 features across 14223 samples within 2 assays
## Active assay: RNA (36601 features, 0 variable features)
##  1 other assay present: ADT

# Merge filtered seurat objects
allsamples <- merge(one, y = rest, project = "UV")
allsamples

## An object of class Seurat
## 36686 features across 30189 samples within 2 assays
## Active assay: RNA (36601 features, 0 variable features)
##  1 other assay present: ADT

head(allsamples)

##                             orig.ident nCount_RNA nFeature_RNA nCount_ADT
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS       2676         1078       1191
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS       2329         1271       1135
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS       7926         2660       2034
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS       2339         1291       1058
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS       3537         1608        936
## AAACCCATCAGCTGAT-NS3R189BTS NS3R189BTS       1758          948       1375
## AAACCCATCATTCACT-NS3R189BTS NS3R189BTS       1478          930        186
## AAACCCATCCGGACGT-NS3R189BTS NS3R189BTS       2045         1003        883
## AAACCCATCCGTGGTG-NS3R189BTS NS3R189BTS       5150         2003       2014
## AAACCCATCCTCTCTT-NS3R189BTS NS3R189BTS       2287         1306        645
##                             nFeature_ADT percent.mito percent.ribo percent.hb
## AAACCCACACTGGAAG-NS3R189BTS           81    0.8221226     42.52616 0.00000000
## AAACCCACATGACGTT-NS3R189BTS           84    4.3795620     18.97810 0.04293688
## AAACCCAGTCCAAATC-NS3R189BTS           81    4.6177139     30.05299 0.02523341
## AAACCCAGTGAGTTTC-NS3R189BTS           79    5.3014109     21.67593 0.00000000
## AAACCCAGTTCTTAGG-NS3R189BTS           78    2.2900763     23.35312 0.00000000
## AAACCCATCAGCTGAT-NS3R189BTS           84    5.1194539     26.67804 0.00000000
## AAACCCATCATTCACT-NS3R189BTS           48    2.6387009     24.56022 0.00000000
## AAACCCATCCGGACGT-NS3R189BTS           78    3.4229829     30.66015 0.00000000
## AAACCCATCCGTGGTG-NS3R189BTS           82    0.6019417     35.28155 0.00000000
## AAACCCATCCTCTCTT-NS3R189BTS           83    2.9296021     19.45780 0.00000000
##                             pANN_0.25_0.09_1477
## AAACCCACACTGGAAG-NS3R189BTS          0.04303198
## AAACCCACATGACGTT-NS3R189BTS          0.33004343
## AAACCCAGTCCAAATC-NS3R189BTS          0.17094355
## AAACCCAGTGAGTTTC-NS3R189BTS          0.31346230
## AAACCCAGTTCTTAGG-NS3R189BTS          0.30990920
## AAACCCATCAGCTGAT-NS3R189BTS          0.16818002
## AAACCCATCATTCACT-NS3R189BTS          0.23884722
## AAACCCATCCGGACGT-NS3R189BTS          0.05132254
## AAACCCATCCGTGGTG-NS3R189BTS          0.13896565
## AAACCCATCCTCTCTT-NS3R189BTS          0.29411765
##                             DF.classifications_0.25_0.09_1477
## AAACCCACACTGGAAG-NS3R189BTS                           Singlet
## AAACCCACATGACGTT-NS3R189BTS                           Singlet
## AAACCCAGTCCAAATC-NS3R189BTS                           Singlet
## AAACCCAGTGAGTTTC-NS3R189BTS                           Singlet
## AAACCCAGTTCTTAGG-NS3R189BTS                           Singlet
## AAACCCATCAGCTGAT-NS3R189BTS                           Singlet
## AAACCCATCATTCACT-NS3R189BTS                           Singlet
## AAACCCATCCGGACGT-NS3R189BTS                           Singlet
## AAACCCATCCGTGGTG-NS3R189BTS                           Singlet
## AAACCCATCCTCTCTT-NS3R189BTS                           Singlet
##                             pANN_0.25_0.09_795 DF.classifications_0.25_0.09_795
## AAACCCACACTGGAAG-NS3R189BTS                 NA                             <NA>
## AAACCCACATGACGTT-NS3R189BTS                 NA                             <NA>
## AAACCCAGTCCAAATC-NS3R189BTS                 NA                             <NA>
## AAACCCAGTGAGTTTC-NS3R189BTS                 NA                             <NA>
## AAACCCAGTTCTTAGG-NS3R189BTS                 NA                             <NA>
## AAACCCATCAGCTGAT-NS3R189BTS                 NA                             <NA>
## AAACCCATCATTCACT-NS3R189BTS                 NA                             <NA>
## AAACCCATCCGGACGT-NS3R189BTS                 NA                             <NA>
## AAACCCATCCGTGGTG-NS3R189BTS                 NA                             <NA>
## AAACCCATCCTCTCTT-NS3R189BTS                 NA                             <NA>

table(Idents(allsamples))

##
## NS3R189BTS NS7R65BBTS
##      19630      10559

# Merge prefiltered seurat objects
allsamplespre <- merge(onepre, y = restpre, project = "UV")
allsamplespre

## An object of class Seurat
## 36686 features across 39004 samples within 2 assays
## Active assay: RNA (36601 features, 0 variable features)
##  1 other assay present: ADT

head(allsamplespre)

##                             orig.ident nCount_RNA nFeature_RNA nCount_ADT
## AAACCCACAAGTCATC-NS3R189BTS NS3R189BTS       2431         1431        611
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS       2676         1078       1191
## AAACCCACAGCACACC-NS3R189BTS NS3R189BTS       3394         1760        669
## AAACCCACAGGAGGAG-NS3R189BTS NS3R189BTS       2160         1279       1274
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS       2329         1271       1135
## AAACCCAGTAGTTACC-NS3R189BTS NS3R189BTS       5362         2261       2976
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS       7926         2660       2034
## AAACCCAGTCCATACA-NS3R189BTS NS3R189BTS       2239         1283        545
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS       2339         1291       1058
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS       3537         1608        936
##                             nFeature_ADT percent.mito percent.ribo percent.hb
## AAACCCACAAGTCATC-NS3R189BTS           77    4.1135335    18.510901 0.00000000
## AAACCCACACTGGAAG-NS3R189BTS           81    0.8221226    42.526158 0.00000000
## AAACCCACAGCACACC-NS3R189BTS           81    5.1856217    19.151444 0.00000000
## AAACCCACAGGAGGAG-NS3R189BTS           81    6.5277778     8.842593 0.00000000
## AAACCCACATGACGTT-NS3R189BTS           84    4.3795620    18.978102 0.04293688
## AAACCCAGTAGTTACC-NS3R189BTS           85    3.8418501    18.985453 0.01864976
## AAACCCAGTCCAAATC-NS3R189BTS           81    4.6177139    30.052990 0.02523341
## AAACCCAGTCCATACA-NS3R189BTS           81    5.0468959    19.830281 0.04466280
## AAACCCAGTGAGTTTC-NS3R189BTS           79    5.3014109    21.675930 0.00000000
## AAACCCAGTTCTTAGG-NS3R189BTS           78    2.2900763    23.353124 0.00000000

table(Idents(allsamplespre))

##
## NS3R189BTS NS7R65BBTS
##      24781      14223

5. Add metadata

# Assign batch ids to samples
allsamples$batch <- plyr::mapvalues(
  x = allsamples$orig.ident,
  from = sampleinformation$submitted_name,
  to = sampleinformation$SampleBatch
)
head(allsamples)

##                             orig.ident nCount_RNA nFeature_RNA nCount_ADT
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS       2676         1078       1191
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS       2329         1271       1135
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS       7926         2660       2034
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS       2339         1291       1058
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS       3537         1608        936
## AAACCCATCAGCTGAT-NS3R189BTS NS3R189BTS       1758          948       1375
## AAACCCATCATTCACT-NS3R189BTS NS3R189BTS       1478          930        186
## AAACCCATCCGGACGT-NS3R189BTS NS3R189BTS       2045         1003        883
## AAACCCATCCGTGGTG-NS3R189BTS NS3R189BTS       5150         2003       2014
## AAACCCATCCTCTCTT-NS3R189BTS NS3R189BTS       2287         1306        645
##                             nFeature_ADT percent.mito percent.ribo percent.hb
## AAACCCACACTGGAAG-NS3R189BTS           81    0.8221226     42.52616 0.00000000
## AAACCCACATGACGTT-NS3R189BTS           84    4.3795620     18.97810 0.04293688
## AAACCCAGTCCAAATC-NS3R189BTS           81    4.6177139     30.05299 0.02523341
## AAACCCAGTGAGTTTC-NS3R189BTS           79    5.3014109     21.67593 0.00000000
## AAACCCAGTTCTTAGG-NS3R189BTS           78    2.2900763     23.35312 0.00000000
## AAACCCATCAGCTGAT-NS3R189BTS           84    5.1194539     26.67804 0.00000000
## AAACCCATCATTCACT-NS3R189BTS           48    2.6387009     24.56022 0.00000000
## AAACCCATCCGGACGT-NS3R189BTS           78    3.4229829     30.66015 0.00000000
## AAACCCATCCGTGGTG-NS3R189BTS           82    0.6019417     35.28155 0.00000000
## AAACCCATCCTCTCTT-NS3R189BTS           83    2.9296021     19.45780 0.00000000
##                             pANN_0.25_0.09_1477
## AAACCCACACTGGAAG-NS3R189BTS          0.04303198
## AAACCCACATGACGTT-NS3R189BTS          0.33004343
## AAACCCAGTCCAAATC-NS3R189BTS          0.17094355
## AAACCCAGTGAGTTTC-NS3R189BTS          0.31346230
## AAACCCAGTTCTTAGG-NS3R189BTS          0.30990920
## AAACCCATCAGCTGAT-NS3R189BTS          0.16818002
## AAACCCATCATTCACT-NS3R189BTS          0.23884722
## AAACCCATCCGGACGT-NS3R189BTS          0.05132254
## AAACCCATCCGTGGTG-NS3R189BTS          0.13896565
## AAACCCATCCTCTCTT-NS3R189BTS          0.29411765
##                             DF.classifications_0.25_0.09_1477
## AAACCCACACTGGAAG-NS3R189BTS                           Singlet
## AAACCCACATGACGTT-NS3R189BTS                           Singlet
## AAACCCAGTCCAAATC-NS3R189BTS                           Singlet
## AAACCCAGTGAGTTTC-NS3R189BTS                           Singlet
## AAACCCAGTTCTTAGG-NS3R189BTS                           Singlet
## AAACCCATCAGCTGAT-NS3R189BTS                           Singlet
## AAACCCATCATTCACT-NS3R189BTS                           Singlet
## AAACCCATCCGGACGT-NS3R189BTS                           Singlet
## AAACCCATCCGTGGTG-NS3R189BTS                           Singlet
## AAACCCATCCTCTCTT-NS3R189BTS                           Singlet
##                             pANN_0.25_0.09_795 DF.classifications_0.25_0.09_795
## AAACCCACACTGGAAG-NS3R189BTS                 NA                             <NA>
## AAACCCACATGACGTT-NS3R189BTS                 NA                             <NA>
## AAACCCAGTCCAAATC-NS3R189BTS                 NA                             <NA>
## AAACCCAGTGAGTTTC-NS3R189BTS                 NA                             <NA>
## AAACCCAGTTCTTAGG-NS3R189BTS                 NA                             <NA>
## AAACCCATCAGCTGAT-NS3R189BTS                 NA                             <NA>
## AAACCCATCATTCACT-NS3R189BTS                 NA                             <NA>
## AAACCCATCCGGACGT-NS3R189BTS                 NA                             <NA>
## AAACCCATCCGTGGTG-NS3R189BTS                 NA                             <NA>
## AAACCCATCCTCTCTT-NS3R189BTS                 NA                             <NA>
##                             batch
## AAACCCACACTGGAAG-NS3R189BTS     1
## AAACCCACATGACGTT-NS3R189BTS     1
## AAACCCAGTCCAAATC-NS3R189BTS     1
## AAACCCAGTGAGTTTC-NS3R189BTS     1
## AAACCCAGTTCTTAGG-NS3R189BTS     1
## AAACCCATCAGCTGAT-NS3R189BTS     1
## AAACCCATCATTCACT-NS3R189BTS     1
## AAACCCATCCGGACGT-NS3R189BTS     1
## AAACCCATCCGTGGTG-NS3R189BTS     1
## AAACCCATCCTCTCTT-NS3R189BTS     1

allsamplespre$batch <- plyr::mapvalues(
  x = allsamplespre$orig.ident,
  from = sampleinformation$submitted_name,
  to = sampleinformation$SampleBatch
)
head(allsamplespre)

##                             orig.ident nCount_RNA nFeature_RNA nCount_ADT
## AAACCCACAAGTCATC-NS3R189BTS NS3R189BTS       2431         1431        611
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS       2676         1078       1191
## AAACCCACAGCACACC-NS3R189BTS NS3R189BTS       3394         1760        669
## AAACCCACAGGAGGAG-NS3R189BTS NS3R189BTS       2160         1279       1274
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS       2329         1271       1135
## AAACCCAGTAGTTACC-NS3R189BTS NS3R189BTS       5362         2261       2976
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS       7926         2660       2034
## AAACCCAGTCCATACA-NS3R189BTS NS3R189BTS       2239         1283        545
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS       2339         1291       1058
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS       3537         1608        936
##                             nFeature_ADT percent.mito percent.ribo percent.hb
## AAACCCACAAGTCATC-NS3R189BTS           77    4.1135335    18.510901 0.00000000
## AAACCCACACTGGAAG-NS3R189BTS           81    0.8221226    42.526158 0.00000000
## AAACCCACAGCACACC-NS3R189BTS           81    5.1856217    19.151444 0.00000000
## AAACCCACAGGAGGAG-NS3R189BTS           81    6.5277778     8.842593 0.00000000
## AAACCCACATGACGTT-NS3R189BTS           84    4.3795620    18.978102 0.04293688
## AAACCCAGTAGTTACC-NS3R189BTS           85    3.8418501    18.985453 0.01864976
## AAACCCAGTCCAAATC-NS3R189BTS           81    4.6177139    30.052990 0.02523341
## AAACCCAGTCCATACA-NS3R189BTS           81    5.0468959    19.830281 0.04466280
## AAACCCAGTGAGTTTC-NS3R189BTS           79    5.3014109    21.675930 0.00000000
## AAACCCAGTTCTTAGG-NS3R189BTS           78    2.2900763    23.353124 0.00000000
##                             batch
## AAACCCACAAGTCATC-NS3R189BTS     1
## AAACCCACACTGGAAG-NS3R189BTS     1
## AAACCCACAGCACACC-NS3R189BTS     1
## AAACCCACAGGAGGAG-NS3R189BTS     1
## AAACCCACATGACGTT-NS3R189BTS     1
## AAACCCAGTAGTTACC-NS3R189BTS     1
## AAACCCAGTCCAAATC-NS3R189BTS     1
## AAACCCAGTCCATACA-NS3R189BTS     1
## AAACCCAGTGAGTTTC-NS3R189BTS     1
## AAACCCAGTTCTTAGG-NS3R189BTS     1

# Assign sample types
allsamples$type <- plyr::mapvalues(
  x = allsamples$orig.ident,
  from = sampleinformation$submitted_name,
  to = sampleinformation$SampleType
)
head(allsamples)

##                             orig.ident nCount_RNA nFeature_RNA nCount_ADT
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS       2676         1078       1191
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS       2329         1271       1135
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS       7926         2660       2034
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS       2339         1291       1058
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS       3537         1608        936
## AAACCCATCAGCTGAT-NS3R189BTS NS3R189BTS       1758          948       1375
## AAACCCATCATTCACT-NS3R189BTS NS3R189BTS       1478          930        186
## AAACCCATCCGGACGT-NS3R189BTS NS3R189BTS       2045         1003        883
## AAACCCATCCGTGGTG-NS3R189BTS NS3R189BTS       5150         2003       2014
## AAACCCATCCTCTCTT-NS3R189BTS NS3R189BTS       2287         1306        645
##                             nFeature_ADT percent.mito percent.ribo percent.hb
## AAACCCACACTGGAAG-NS3R189BTS           81    0.8221226     42.52616 0.00000000
## AAACCCACATGACGTT-NS3R189BTS           84    4.3795620     18.97810 0.04293688
## AAACCCAGTCCAAATC-NS3R189BTS           81    4.6177139     30.05299 0.02523341
## AAACCCAGTGAGTTTC-NS3R189BTS           79    5.3014109     21.67593 0.00000000
## AAACCCAGTTCTTAGG-NS3R189BTS           78    2.2900763     23.35312 0.00000000
## AAACCCATCAGCTGAT-NS3R189BTS           84    5.1194539     26.67804 0.00000000
## AAACCCATCATTCACT-NS3R189BTS           48    2.6387009     24.56022 0.00000000
## AAACCCATCCGGACGT-NS3R189BTS           78    3.4229829     30.66015 0.00000000
## AAACCCATCCGTGGTG-NS3R189BTS           82    0.6019417     35.28155 0.00000000
## AAACCCATCCTCTCTT-NS3R189BTS           83    2.9296021     19.45780 0.00000000
##                             pANN_0.25_0.09_1477
## AAACCCACACTGGAAG-NS3R189BTS          0.04303198
## AAACCCACATGACGTT-NS3R189BTS          0.33004343
## AAACCCAGTCCAAATC-NS3R189BTS          0.17094355
## AAACCCAGTGAGTTTC-NS3R189BTS          0.31346230
## AAACCCAGTTCTTAGG-NS3R189BTS          0.30990920
## AAACCCATCAGCTGAT-NS3R189BTS          0.16818002
## AAACCCATCATTCACT-NS3R189BTS          0.23884722
## AAACCCATCCGGACGT-NS3R189BTS          0.05132254
## AAACCCATCCGTGGTG-NS3R189BTS          0.13896565
## AAACCCATCCTCTCTT-NS3R189BTS          0.29411765
##                             DF.classifications_0.25_0.09_1477
## AAACCCACACTGGAAG-NS3R189BTS                           Singlet
## AAACCCACATGACGTT-NS3R189BTS                           Singlet
## AAACCCAGTCCAAATC-NS3R189BTS                           Singlet
## AAACCCAGTGAGTTTC-NS3R189BTS                           Singlet
## AAACCCAGTTCTTAGG-NS3R189BTS                           Singlet
## AAACCCATCAGCTGAT-NS3R189BTS                           Singlet
## AAACCCATCATTCACT-NS3R189BTS                           Singlet
## AAACCCATCCGGACGT-NS3R189BTS                           Singlet
## AAACCCATCCGTGGTG-NS3R189BTS                           Singlet
## AAACCCATCCTCTCTT-NS3R189BTS                           Singlet
##                             pANN_0.25_0.09_795 DF.classifications_0.25_0.09_795
## AAACCCACACTGGAAG-NS3R189BTS                 NA                             <NA>
## AAACCCACATGACGTT-NS3R189BTS                 NA                             <NA>
## AAACCCAGTCCAAATC-NS3R189BTS                 NA                             <NA>
## AAACCCAGTGAGTTTC-NS3R189BTS                 NA                             <NA>
## AAACCCAGTTCTTAGG-NS3R189BTS                 NA                             <NA>
## AAACCCATCAGCTGAT-NS3R189BTS                 NA                             <NA>
## AAACCCATCATTCACT-NS3R189BTS                 NA                             <NA>
## AAACCCATCCGGACGT-NS3R189BTS                 NA                             <NA>
## AAACCCATCCGTGGTG-NS3R189BTS                 NA                             <NA>
## AAACCCATCCTCTCTT-NS3R189BTS                 NA                             <NA>
##                             batch type
## AAACCCACACTGGAAG-NS3R189BTS     1   HC
## AAACCCACATGACGTT-NS3R189BTS     1   HC
## AAACCCAGTCCAAATC-NS3R189BTS     1   HC
## AAACCCAGTGAGTTTC-NS3R189BTS     1   HC
## AAACCCAGTTCTTAGG-NS3R189BTS     1   HC
## AAACCCATCAGCTGAT-NS3R189BTS     1   HC
## AAACCCATCATTCACT-NS3R189BTS     1   HC
## AAACCCATCCGGACGT-NS3R189BTS     1   HC
## AAACCCATCCGTGGTG-NS3R189BTS     1   HC
## AAACCCATCCTCTCTT-NS3R189BTS     1   HC

allsamplespre$type <- plyr::mapvalues(
  x = allsamplespre$orig.ident,
  from = sampleinformation$submitted_name,
  to = sampleinformation$SampleType
)
head(allsamplespre)

##                             orig.ident nCount_RNA nFeature_RNA nCount_ADT
## AAACCCACAAGTCATC-NS3R189BTS NS3R189BTS       2431         1431        611
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS       2676         1078       1191
## AAACCCACAGCACACC-NS3R189BTS NS3R189BTS       3394         1760        669
## AAACCCACAGGAGGAG-NS3R189BTS NS3R189BTS       2160         1279       1274
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS       2329         1271       1135
## AAACCCAGTAGTTACC-NS3R189BTS NS3R189BTS       5362         2261       2976
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS       7926         2660       2034
## AAACCCAGTCCATACA-NS3R189BTS NS3R189BTS       2239         1283        545
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS       2339         1291       1058
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS       3537         1608        936
##                             nFeature_ADT percent.mito percent.ribo percent.hb
## AAACCCACAAGTCATC-NS3R189BTS           77    4.1135335    18.510901 0.00000000
## AAACCCACACTGGAAG-NS3R189BTS           81    0.8221226    42.526158 0.00000000
## AAACCCACAGCACACC-NS3R189BTS           81    5.1856217    19.151444 0.00000000
## AAACCCACAGGAGGAG-NS3R189BTS           81    6.5277778     8.842593 0.00000000
## AAACCCACATGACGTT-NS3R189BTS           84    4.3795620    18.978102 0.04293688
## AAACCCAGTAGTTACC-NS3R189BTS           85    3.8418501    18.985453 0.01864976
## AAACCCAGTCCAAATC-NS3R189BTS           81    4.6177139    30.052990 0.02523341
## AAACCCAGTCCATACA-NS3R189BTS           81    5.0468959    19.830281 0.04466280
## AAACCCAGTGAGTTTC-NS3R189BTS           79    5.3014109    21.675930 0.00000000
## AAACCCAGTTCTTAGG-NS3R189BTS           78    2.2900763    23.353124 0.00000000
##                             batch type
## AAACCCACAAGTCATC-NS3R189BTS     1   HC
## AAACCCACACTGGAAG-NS3R189BTS     1   HC
## AAACCCACAGCACACC-NS3R189BTS     1   HC
## AAACCCACAGGAGGAG-NS3R189BTS     1   HC
## AAACCCACATGACGTT-NS3R189BTS     1   HC
## AAACCCAGTAGTTACC-NS3R189BTS     1   HC
## AAACCCAGTCCAAATC-NS3R189BTS     1   HC
## AAACCCAGTCCATACA-NS3R189BTS     1   HC
## AAACCCAGTGAGTTTC-NS3R189BTS     1   HC
## AAACCCAGTTCTTAGG-NS3R189BTS     1   HC

# View type counts
table(allsamples$type)

##
## BCR-UV     HC
##  10559  19630

table(allsamplespre$type)

##
## BCR-UV     HC
##  14223  24781

# View batch counts
table(allsamples$batch)

##
##     1     4
## 19630 10559

table(allsamplespre$batch)

##
##     1     4
## 24781 14223

# View available slots
slotNames(allsamples)

##  [1] "assays"       "meta.data"    "active.assay" "active.ident" "graphs"
##  [6] "neighbors"    "reductions"   "images"       "project.name" "misc"
## [11] "version"      "commands"     "tools"

# View data
head(allsamples)

##                             orig.ident nCount_RNA nFeature_RNA nCount_ADT
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS       2676         1078       1191
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS       2329         1271       1135
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS       7926         2660       2034
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS       2339         1291       1058
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS       3537         1608        936
## AAACCCATCAGCTGAT-NS3R189BTS NS3R189BTS       1758          948       1375
## AAACCCATCATTCACT-NS3R189BTS NS3R189BTS       1478          930        186
## AAACCCATCCGGACGT-NS3R189BTS NS3R189BTS       2045         1003        883
## AAACCCATCCGTGGTG-NS3R189BTS NS3R189BTS       5150         2003       2014
## AAACCCATCCTCTCTT-NS3R189BTS NS3R189BTS       2287         1306        645
##                             nFeature_ADT percent.mito percent.ribo percent.hb
## AAACCCACACTGGAAG-NS3R189BTS           81    0.8221226     42.52616 0.00000000
## AAACCCACATGACGTT-NS3R189BTS           84    4.3795620     18.97810 0.04293688
## AAACCCAGTCCAAATC-NS3R189BTS           81    4.6177139     30.05299 0.02523341
## AAACCCAGTGAGTTTC-NS3R189BTS           79    5.3014109     21.67593 0.00000000
## AAACCCAGTTCTTAGG-NS3R189BTS           78    2.2900763     23.35312 0.00000000
## AAACCCATCAGCTGAT-NS3R189BTS           84    5.1194539     26.67804 0.00000000
## AAACCCATCATTCACT-NS3R189BTS           48    2.6387009     24.56022 0.00000000
## AAACCCATCCGGACGT-NS3R189BTS           78    3.4229829     30.66015 0.00000000
## AAACCCATCCGTGGTG-NS3R189BTS           82    0.6019417     35.28155 0.00000000
## AAACCCATCCTCTCTT-NS3R189BTS           83    2.9296021     19.45780 0.00000000
##                             pANN_0.25_0.09_1477
## AAACCCACACTGGAAG-NS3R189BTS          0.04303198
## AAACCCACATGACGTT-NS3R189BTS          0.33004343
## AAACCCAGTCCAAATC-NS3R189BTS          0.17094355
## AAACCCAGTGAGTTTC-NS3R189BTS          0.31346230
## AAACCCAGTTCTTAGG-NS3R189BTS          0.30990920
## AAACCCATCAGCTGAT-NS3R189BTS          0.16818002
## AAACCCATCATTCACT-NS3R189BTS          0.23884722
## AAACCCATCCGGACGT-NS3R189BTS          0.05132254
## AAACCCATCCGTGGTG-NS3R189BTS          0.13896565
## AAACCCATCCTCTCTT-NS3R189BTS          0.29411765
##                             DF.classifications_0.25_0.09_1477
## AAACCCACACTGGAAG-NS3R189BTS                           Singlet
## AAACCCACATGACGTT-NS3R189BTS                           Singlet
## AAACCCAGTCCAAATC-NS3R189BTS                           Singlet
## AAACCCAGTGAGTTTC-NS3R189BTS                           Singlet
## AAACCCAGTTCTTAGG-NS3R189BTS                           Singlet
## AAACCCATCAGCTGAT-NS3R189BTS                           Singlet
## AAACCCATCATTCACT-NS3R189BTS                           Singlet
## AAACCCATCCGGACGT-NS3R189BTS                           Singlet
## AAACCCATCCGTGGTG-NS3R189BTS                           Singlet
## AAACCCATCCTCTCTT-NS3R189BTS                           Singlet
##                             pANN_0.25_0.09_795 DF.classifications_0.25_0.09_795
## AAACCCACACTGGAAG-NS3R189BTS                 NA                             <NA>
## AAACCCACATGACGTT-NS3R189BTS                 NA                             <NA>
## AAACCCAGTCCAAATC-NS3R189BTS                 NA                             <NA>
## AAACCCAGTGAGTTTC-NS3R189BTS                 NA                             <NA>
## AAACCCAGTTCTTAGG-NS3R189BTS                 NA                             <NA>
## AAACCCATCAGCTGAT-NS3R189BTS                 NA                             <NA>
## AAACCCATCATTCACT-NS3R189BTS                 NA                             <NA>
## AAACCCATCCGGACGT-NS3R189BTS                 NA                             <NA>
## AAACCCATCCGTGGTG-NS3R189BTS                 NA                             <NA>
## AAACCCATCCTCTCTT-NS3R189BTS                 NA                             <NA>
##                             batch type
## AAACCCACACTGGAAG-NS3R189BTS     1   HC
## AAACCCACATGACGTT-NS3R189BTS     1   HC
## AAACCCAGTCCAAATC-NS3R189BTS     1   HC
## AAACCCAGTGAGTTTC-NS3R189BTS     1   HC
## AAACCCAGTTCTTAGG-NS3R189BTS     1   HC
## AAACCCATCAGCTGAT-NS3R189BTS     1   HC
## AAACCCATCATTCACT-NS3R189BTS     1   HC
## AAACCCATCCGGACGT-NS3R189BTS     1   HC
## AAACCCATCCGTGGTG-NS3R189BTS     1   HC
## AAACCCATCCTCTCTT-NS3R189BTS     1   HC

# Cleanup any old metadata columns from doublet finder
pANN.name = as.vector(colnames(allsamples@meta.data)[grepl("pANN", colnames(allsamples@meta.data))])
for (i in 1:length(pANN.name))
{
  allsamples@meta.data[, pANN.name[i]]<-NULL
}

DFclass.name = as.vector(colnames(allsamples@meta.data)[grepl("DF.classification", colnames(allsamples@meta.data))])
for (i in 1:length(DFclass.name))
{
  allsamples@meta.data[, DFclass.name[i]]<-NULL
}

6. Generate before and after QC plots

# Generate before filtering quality plots
qcbefore=VlnPlot(allsamplespre,features = c("nFeature_RNA", "nCount_RNA","percent.mito","percent.ribo"),ncol = 2, pt.size = 0) +
  NoLegend()
qcbefore

# Save the plot in current working directory
ggsave(paste(outputpath,"before-qc-violineplot.pdf",sep=""), plot = qcbefore, width = 15, height = 20, units = "cm")

# Generate after filtering quality plots
qcafter=VlnPlot(allsamples,features = c("nFeature_RNA", "nCount_RNA","percent.mito","percent.ribo"),ncol = 2, pt.size = 0) +
  NoLegend()
qcafter

# Save the plot in current working directory
ggsave(paste(outputpath,"after-qc-violineplot.pdf",sep=""), plot = qcafter, width = 15, height = 20, units = "cm")

# Save QC session data in the working directory
save.image(file=paste(outputpath,"SeuratObjectAfterQc.RData",sep=""))

# Reference
# Doublet prediction
# https://nbisweden.github.io/workshop-scRNAseq/labs/compiled/seurat/seurat_01_qc.html#Predict_doublets
sessionInfo()

## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /usr/local/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_rt.so
##
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base
##
## other attached packages:
##  [1] KernSmooth_2.23-20  fields_14.1         viridis_0.6.2
##  [4] viridisLite_0.4.1   spam_2.9-1          plotly_4.10.1
##  [7] ggplot2_3.4.0       SeuratObject_4.1.3  Seurat_4.2.1
## [10] DoubletFinder_2.0.3
##
## loaded via a namespace (and not attached):
##   [1] systemfonts_1.0.4      plyr_1.8.7             igraph_1.3.5
##   [4] lazyeval_0.2.2         sp_1.5-1               splines_4.2.2
##   [7] listenv_0.8.0          scattermore_0.8        digest_0.6.30
##  [10] htmltools_0.5.3        fansi_1.0.3            magrittr_2.0.3
##  [13] tensor_1.5             cluster_2.1.4          ROCR_1.0-11
##  [16] remotes_2.4.2          globals_0.16.1         matrixStats_0.62.0
##  [19] spatstat.sparse_3.0-0  colorspace_2.0-3       ggrepel_0.9.2
##  [22] textshaping_0.3.6      xfun_0.34              dplyr_1.0.10
##  [25] crayon_1.5.2           jsonlite_1.8.3         progressr_0.11.0
##  [28] spatstat.data_3.0-0    survival_3.4-0         zoo_1.8-11
##  [31] glue_1.6.2             polyclip_1.10-4        gtable_0.3.1
##  [34] leiden_0.4.3           future.apply_1.10.0    maps_3.4.1
##  [37] abind_1.4-5            scales_1.2.1           DBI_1.1.3
##  [40] spatstat.random_3.0-1  miniUI_0.1.1.1         Rcpp_1.0.9
##  [43] xtable_1.8-4           reticulate_1.26        bit_4.0.4
##  [46] dotCall64_1.0-2        htmlwidgets_1.5.4      httr_1.4.4
##  [49] RColorBrewer_1.1-3     ellipsis_0.3.2         ica_1.0-3
##  [52] pkgconfig_2.0.3        farver_2.1.1           sass_0.4.2
##  [55] uwot_0.1.14            deldir_1.0-6           utf8_1.2.2
##  [58] tidyselect_1.2.0       labeling_0.4.2         rlang_1.0.6
##  [61] reshape2_1.4.4         later_1.3.0            munsell_0.5.0
##  [64] tools_4.2.2            cachem_1.0.6           cli_3.4.1
##  [67] generics_0.1.3         ggridges_0.5.4         evaluate_0.18
##  [70] stringr_1.4.1          fastmap_1.1.0          yaml_2.3.6
##  [73] ragg_1.2.4             goftest_1.2-3          knitr_1.40
##  [76] bit64_4.0.5            fitdistrplus_1.1-8     purrr_0.3.5
##  [79] RANN_2.6.1             pbapply_1.5-0          future_1.29.0
##  [82] nlme_3.1-160           mime_0.12              ggrastr_1.0.1
##  [85] hdf5r_1.3.7            compiler_4.2.2         beeswarm_0.4.0
##  [88] curl_4.3.3             png_0.1-7              spatstat.utils_3.0-1
##  [91] tibble_3.1.8           bslib_0.4.1            stringi_1.7.8
##  [94] lattice_0.20-45        Matrix_1.5-3           vctrs_0.5.0
##  [97] pillar_1.8.1           lifecycle_1.0.3        spatstat.geom_3.0-3
## [100] lmtest_0.9-40          jquerylib_0.1.4        RcppAnnoy_0.0.20
## [103] data.table_1.14.4      cowplot_1.1.1          irlba_2.3.5.1
## [106] httpuv_1.6.6           patchwork_1.1.2        R6_2.5.1
## [109] promises_1.2.0.1       gridExtra_2.3          vipor_0.4.5
## [112] parallelly_1.32.1      codetools_0.2-18       MASS_7.3-58.1
## [115] assertthat_0.2.1       withr_2.5.0            sctransform_0.3.5
## [118] parallel_4.2.2         grid_4.2.2             tidyr_1.2.1
## [121] rmarkdown_2.18         Rtsne_0.16             spatstat.explore_3.0-5
## [124] shiny_1.7.3            ggbeeswarm_0.6.0

7. Generate R script

knitr::purl(input="/data/../TotalSeq/Tools/qualityControl.Rmd", output="/data/../TotalSeq/Tools/qualityControl.R")

##
  |
  |                                                                      |   0%
  |
  |....                                                                  |   6%
  |
  |.........                                                             |  12%
  |
  |.............                                                         |  19%
  |
  |..................                                                    |  25%
  |
  |......................                                                |  31%
  |
  |..........................                                            |  38%
  |
  |...............................                                       |  44%
  |
  |...................................                                   |  50%
  |
  |.......................................                               |  56%
  |
  |............................................                          |  62%
  |
  |................................................                      |  69%
  |
  |....................................................                  |  75%
  |
  |.........................................................             |  81%
  |
  |.............................................................         |  88%
  |
  |..................................................................    |  94%
  |
  |......................................................................| 100%

## [1] "/data/../TotalSeq/Tools/qualityControl.R"

file.copy(from = "/data/../TotalSeq/Tools/qualityControl.html",
          to   = "/data/../TotalSeq/Reports/site/quality-control.html")

## [1] FALSE

file.remove("/data/../TotalSeq/Tools/qualityControl.html")

## [1] FALSE