Hi thanks for the reply.
Since I'm new to this and need solution as soon ass possible I will be sharing a demo app and linking the R code of publically available repo.
server.R
sourceCpp("src/util.cpp")
source("r/filtering_dropClust2.R")
source("r/hvgs_dropClust2.R")
library(shiny)
library(Seurat)
library(SingleCellExperiment)
shinyServer(function(input,output,session){
session$onSessionEnded(stopApp)
output$dropclust<-renderPlot({
if(input$dropclust1==0){
return()
}
pbmc.data <- Read10X(data.dir = "../data/pbmc3k/filtered_gene_bc_matrices/hg19/")
x <- CreateSeuratObject(counts = pbmc.data , min.cells = 3, min.features = 200,project = "scRNAseq")
pb<<- as.SingleCellExperiment(x)
rm(x)
pb
pb<<-FilterCells(pb)
pb<<-FilterGenes(pb)
pb<<-CountNormalize(pb)
pb<<-RankGenes(pb, ngenes_keep = 1000)
pb<<-Sampling(pb)
pb<<-RankPCAGenes(pb)
})
})
ui.R
shinyUI(fluidPage(
theme=shinytheme("cyborg"),
tags$style(HTML("
.tabs-above > .nav > li[class=active] > a {
background-color: #000;
color: #FFFF;
}")),
actionButton("dropclust1","One-Click Analysis",style="color: #fff; background-color: #337ab7; border-color: #2e6da4",icon = icon("paper-plane")),
withLoader( plotOutput("dropclust"))
))
filtering.R
FilterCells <- function(object, min_count=3, ql_th = 0.001, qh_th = 1) {
try(if(qh_th < ql_th) stop("Lower limit must be smaller than upper limit."))
if (is(object, "SingleCellExperiment")) {
cS <- Matrix::colSums(SingleCellExperiment::counts(object)>=min_count)
} else {
cS <- Matrix::colSums(object>=min_count)
}
l1 = stats::quantile(cS,probs = ql_th)
l2 = stats::quantile(cS,probs = qh_th)
keep_cells = intersect(which(cS>=l1), which(cS<=l2))
cat(length(cS)-length(keep_cells), "bad cells removed.\n")
return(object[, keep_cells])
}
#' Filter out poor quality genes
#'
#' Genes (rows) with count greater than \code{min_count} and not expressed in atleast \code{min_cell} cells are removed.
#'
#' @param object the SingleCellExperiment to filter genes
#' @param min_count integer threshold for expression count
#' @param min_cell integer threshold for number of cells expressing a particular gene
#' @return SingleCellExperiment object with the bad genes removed
#' @export
#' @examples
#' library(SingleCellExperiment)
#' counts <- matrix(rpois(100, lambda = 10), ncol=10, nrow=10)
#' object <- SingleCellExperiment(assays = list(counts = counts))
#' FilterGenes(object)
FilterGenes <- function(object, min_count=2, min_cell=3) {
mat = SingleCellExperiment::counts(object)
rS <- Matrix::rowSums(mat > min_count)
keep_genes = which(rS > min_cell)
cat(length(rS)-length(keep_genes), "genes filtered out,", length(keep_genes),"genes remaining.\n")
# object@metadata[["dropClust"]] = c(unlist(object@metadata[["dropClust"]]), "FilterGenes")
return(object[keep_genes, ])
}
hvgs.R
RankPCAGenes<-function(object,top=200){
if(is.null(rowData(object)$HVG) || is.null(colData(object)$Sampling)){
stop("SingleCellExperiment object does not contain high variable genes or Sampling() is not called prior.")
}
subsamples = SingleCellExperiment::colData(object)$Sampling
hvg_genes = SingleCellExperiment::rowData(object)$HVG
mat = Log2Normalize(normcounts(object)[hvg_genes,subsamples], return.sparse = F)
mat = t(mat)
cat("Find best PCA components...")
# mat = assay(object,i = "logcounts")[rowData(object)$HVG, colData(object)$Sampling]
nc = 50
invisible(print(dim(mat)))
PRR <- irlba::prcomp_irlba(mat, nc)
S = c()
for(i in 1:nc)
{
K = PRR$x[,i]
# if(prev){
# # L = mclust::Mclust(K)
# # S = c(S,L$G)
# peaks= NbClust::NbClust(K, distance = "euclidean",
# min.nc = 2, max.nc = 10,
# method = "kmeans", index ="ball")$Best.nc[1]
# }
# else{
# peaks = all_peaks(K)
# }
peaks = all_peaks(K)
S = c(S,peaks)
# cat(i,"\t")
}
# graphics::barplot(S)
df<-as.data.frame(cbind("MOD" = S,
"VAR" = apply(PRR$x[,1:nc],2, stats::var)))
rownames(df)<-paste0("PC",seq(nc))
#gene_pca <-abs(PRR$v[,which(S>=3)])
best_components = utils::head(with(df, order(-MOD, -VAR)), 10)
gene_pca <-abs(PRR$rotation[,best_components])
gene_max<-apply(gene_pca, 1, max)
rank_gene <- order(gene_max, decreasing=TRUE)
top_pc_genes = utils::head(rank_gene,top)
# gene.ids = which(rowData(object)$HVG==TRUE)
SummarizedExperiment::rowData(object)$PCAGenes <- rep(FALSE, nrow(object))
SummarizedExperiment::rowData(object)$PCAGenes[which(hvg_genes==T)[top_pc_genes]] <- TRUE
cat(top,"genes selected.\n")
object@metadata[["dropClust"]] = c(unlist(object@metadata[["dropClust"]]), "RankPCAGenes")
return(object)
}
all_peaks <- function(x) {
den.fit <- stats::density(x, kernel=c("gaussian"))
den.spline <- stats::smooth.spline(den.fit$x, den.fit$y,
all.knots=TRUE, spar = 0.5)
d1 <- stats::predict(den.spline, den.spline$x, deriv=1)
peak.count <- length(rle(den.sign <- sign(d1$y))$values)/2
if (is.na(peak.count) == TRUE) { peak.count <- 0 }
return(peak.count)
}
Cpp file
#include <cmath>
#include <set>
#include <utility>
#include <algorithm>
#include <vector>
using namespace Rcpp;
using namespace std;
Eigen::VectorXd ColDispersion_sp(Eigen::SparseMatrix<double> mat){
int nrows = mat.rows();
Eigen::VectorXd dispersion(mat.cols());
for (int i=0; i<mat.outerSize(); ++i){
double mean = 0;
double var = 0;
int counter = 0;
for (Eigen::SparseMatrix<double>::InnerIterator iter(mat,i); iter; ++iter){
mean += iter.value();
}
mean = mean / nrows;
for (Eigen::SparseMatrix<double>::InnerIterator iter(mat,i); iter; ++iter){
var += pow(iter.value() - mean, 2);
counter += 1;
}
var = (var + (nrows - counter) * pow(mean, 2)) / (nrows - 1);
dispersion[i] = var/mean;
}
return(dispersion);
}
Eigen::VectorXd ColDispersion_dense(Eigen::MatrixXd mat){
int nrows = mat.rows();
Eigen::VectorXd dispersion(mat.cols());
for (int i=0; i<mat.outerSize(); ++i){
double mean = 0;
double var = 0;
int counter = 0;
for (Eigen::MatrixXd::InnerIterator iter(mat,i); iter; ++iter){
mean += iter.value();
}
mean = mean / nrows;
for (Eigen::MatrixXd::InnerIterator iter(mat,i); iter; ++iter){
var += pow(iter.value() - mean, 2);
counter += 1;
}
var = (var + (nrows - counter) * pow(mean, 2)) / (nrows - 1);
dispersion[i] = var/mean;
}
return(dispersion);
}
//[[Rcpp::export]]
Eigen::VectorXd ColDispersion(SEXP mat) {
if (Rf_isS4(mat)) {
if(Rf_inherits(mat, "dgCMatrix")) {
return ColDispersion_sp(as<Eigen::SparseMatrix<double>>(mat)) ;
} ;
stop("unsupported matrix class.") ;
} else {
return ColDispersion_dense(as<Eigen::MatrixXd>(mat)) ;
}
}
inline int check_close(double v1, double v2, double close_thresh) {
if(v1 > v2){
swap(v1, v2);
}
if(abs(v1) < 0.001){
return 0;
}
if((v2 - v1)/v1 < close_thresh){
return 1;
} else {
return 0;
}
}
int too_close_total(NumericVector & gval1, NumericVector & gval2, double close_thresh) {
int n = gval1.length();
int result = 0;
for ( int i = 0; i < n; ++i ) {
result += check_close(gval1(i), gval2(i), close_thresh);
}
return result;
}
// [[Rcpp::export]]
RcppExport SEXP reduce_genes_cpp(List & l) {
NumericMatrix mat = as<NumericMatrix>(l["matrix"]);
double close_thresh = as<double>(l["close_threshold"]);
double cells_thresh = as<double>(l["cells_threshold"]);
double nreduce = as<int>(l["nreduce"]);
int ncells = mat.nrow();
int ngenes = mat.ncol();
set<pair<int, int> > check_pairs;
for(int i = 0; i < ncells; i ++) {
vector<pair<double, int> > temp_vec;
for(int j = 0; j < ngenes; j ++) {
temp_vec.push_back(make_pair(mat(i, j), j));
}
sort(temp_vec.begin(), temp_vec.end());
for(int j = 1; j < ngenes; j ++) {
check_pairs.insert(make_pair(min(temp_vec[j - 1].second, temp_vec[j].second), max(temp_vec[j - 1].second, temp_vec[j].second)));
}
}
int total_removed = 0;
vector<int> removed_genes(ngenes, 0);
for(pair<int, int> pa : check_pairs) {
int gene1 = pa.first, gene2 = pa.second;
if(!removed_genes[gene1] and !removed_genes[gene2]) {
NumericVector col1 = mat(_, gene1), col2 = mat(_, gene2);
if(too_close_total(col1, col2, close_thresh) > cells_thresh * ncells) {
removed_genes[gene1] = 1;
total_removed ++;
}
}
if(total_removed >= nreduce) break;
}
NumericVector ret(removed_genes.begin(), removed_genes.end());
return ret;
}
it's the reproducible code from a new member of the community hopefully you will be able to decode this.