# Many fantastic pieces of free and open-source software can be used as key components to enable single cell analysis
# using python notebook. This scripts showed downstream analysis of 10X Genomic v3 RNAseq samples and import result
# into MongoDB for single cell explorer. The analytic codes were modified from scanpy tutorial "Clustering 3K PBMCs". We
#gratefully acknowledge all authors for Suerat and Scanpy and their contribution.  
# we use 10K healthy donor's PBMCs data obtained from 10x Genomics, data can be downloaed from
# https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0/pbmc_10k_protein_v3

# Creat data folder and download demo data from 10X genomics
!mkdir data
!wget http://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_10k_v3/pbmc_10k_v3_filtered_feature_bc_matrix.tar.gz -O data/pbmc_10k_v3_filtered_feature_bc_matrix.tar.gz
!cd data; tar -xzf pbmc_10k_v3_filtered_feature_bc_matrix.tar.gz

import scpipeline

import os,sys,csv,json,datetime,time,math,scipy.stats,collections,re;
from sklearn import preprocessing;
import numpy as np;
import pandas as pd;
import os.path;
import scanpy;
import scanpy.api as sc
sc.settings.set_figure_params(dpi=80)

# read counts data from 10X V3 data (cell ranger V3 output) 
p =  scpipeline.ProcessPipline();
dataPath='./data/filtered_feature_bc_matrix/';  # the directory with the `.mtx` file
p.readData(dataPath)                            # read 10X '.mtx'data, compute mitochondra fraction, and create p.data 
p.data                                          # p.data: this is the data object we will use for QC

--> This might be very slow. Consider passing `cache=True`, which enables much faster reading from a cache file.
filtered out 10502 genes that are detected in less than 1 cells

AnnData object with n_obs × n_vars = 11769 × 23036 
    obs: 'n_genes', 'percent_mito', 'n_counts'
    var: 'gene_ids', 'n_cells'

### p.data is the data object for use 
sc.pl.highest_expr_genes(p.data, n_top=10)

# QC function
# def QC(self,max_n_genes="" ,min_n_genes="",min_n_cells="",max_percent_mito="")
# scanpy tutorial QC(self,max_n_genes=2500 ,min_n_genes=200,min_n_cells=3,max_percent_mito=0.05)

p.QC(min_n_genes=200,min_n_cells=3)

filter cells
filtered out 232 cells that have less than 200 genes expressed
filter genes
filtered out 2684 genes that are detected in less than 3 counts

## plot percentage of mitochondria 
sc.pl.violin(p.data, ['n_genes', 'n_counts', 'percent_mito'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(p.data, x='n_counts', y='n_genes')
sc.pl.scatter(p.data, x='n_counts', y='percent_mito')

# QC using percentage of mitochondria gene
p.QC(max_n_genes=5000, max_percent_mito=0.12)
"""
# for those who are more famaliar with scanpy: 
p.data = p.data[p.data.obs['n_genes'] < 5000, :]
p.data = p.data[p.data.obs['percent_mito'] < 0.12, :]
"""

filter n_genes < 5000
filter percent_mito < 0.12

"\n# for those who are more famaliar with scanpy: \np.data = p.data[p.data.obs['n_genes'] < 5000, :]\np.data = p.data[p.data.obs['percent_mito'] < 0.12, :]\n"

# QC process in scanpy package will remove cell barcodes. However, for database loading, we adata should keep same number of barcodes as original one. 
# We copy data from p.data to adata, which will be loaded into database   

adata = p.data.copy()

adata

AnnData object with n_obs × n_vars = 9571 × 20352 
    obs: 'n_genes', 'percent_mito', 'n_counts'
    var: 'gene_ids', 'n_cells', 'n_counts'

# normalization (library-size correct) the data matrix to 10,000 reads per cell
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)

#Logarithmize the data
sc.pp.log1p(adata)

adata.raw = adata

# highly variable genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

sc.pl.highly_variable_genes(adata)

adata = adata[:, adata.var['highly_variable']]
# regress out effects of total counts per cell and the percentage of mitochondrial genes.
sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])
# Scale each gene to unit variance. Clip values exceeding standard deviation 10.
sc.pp.scale(adata, max_value=10)

regressing out ['n_counts', 'percent_mito']
    sparse input is densified and may lead to high memory use
    finished (0:00:18.01)

# Dimention reduction: PCA as a first step
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata, color=['CD3D','CD19'])

adata

AnnData object with n_obs × n_vars = 9571 × 2049 
    obs: 'n_genes', 'percent_mito', 'n_counts'
    var: 'gene_ids', 'n_cells', 'n_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

sc.pl.pca_variance_ratio(adata, log=True)

sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

computing neighbors
    using 'X_pca' with n_pcs = 40
    finished (0:00:09.69) --> added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix

sc.tl.umap(adata)

computing UMAP
    finished (0:00:21.09) --> added
    'X_umap', UMAP coordinates (adata.obsm)

## check cel marker for T cell, B cells, Monocytes, and Dendritic Cells
sc.pl.umap(adata, color=['CD3D','CD19','FCER1A','CD8A','NKG7','CD14','FCGR3A','IL3RA','PPBP'])

## compute cluster or cluster cells into subgroups using
## default resolution=1
sc.tl.leiden(adata,resolution=0.2)

running Leiden clustering
    finished (0:00:01.46) --> found 11 clusters and added
    'leiden', the cluster labels (adata.obs, categorical)

sc.pl.umap(adata, color=['leiden'],legend_loc='on data',title= "umap")

## optional marker gene identification
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

ranking genes
    finished (0:00:19.48) --> added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids

### t-SNE has better seperation among cell clusters, easy for single cell explorer users to lasso select cell clusters  
sc.tl.tsne(adata,n_pcs=40)
sc.pl.tsne(adata, color='leiden', legend_loc='on data', title='tSNE')

computing tSNE
    using 'X_pca' with n_pcs = 40
WARNING: Consider installing the package MulticoreTSNE (https://github.com/DmitryUlyanov/Multicore-TSNE). Even for n_jobs=1 this speeds up the computation considerably and might yield better converged results.
    using sklearn.manifold.TSNE with a fix by D. DeTomaso
    finished (0:01:31.48) --> added
    'X_tsne', tSNE coordinates (adata.obsm)

# to save umap result into MongoDB, you need to specify the database name, the Mongo server IP and port
p.insertToDB(dbname= 'scDB',dbport= 27017,dbhost='localhost',
                   adata=adata,mapType="umap", # choose embedding type: umap or tsne 
                   mapName='pmbc10k_umap', # this is the title of the map, you can label details info 
                   study="Demo", 
                   subjectid="",
                   disease="Healthy",
                   source="10XGenomic",
                   sample="Blood",
                   comment="",
                   author=""
              );

# save tsne result
p.insertToDB(dbname= 'scDB',dbport= 27017,dbhost='localhost',
                   adata=adata,mapType="tsne",
                   mapName='pmbc10k_tsne',
                   study="Demo",
                   subjectid="",
                   disease="Healthy",
                   source="10XGenomic",
                   sample="Blood",
                   comment="",
                   author=""
              );

PBMC 3k Analysis