# Many fantastic pieces of free and open-source software can be used as key components to enable single cell analysis
# using python notebook. This scripts showed downstream analysis of 10X Genomic v3 RNAseq samples and import result
# into MongoDB for single cell explorer. The analytic codes were modified from scanpy tutorial "Clustering 3K PBMCs". We
#gratefully acknowledge all authors for Suerat and Scanpy and their contribution.
# we use 10K healthy donor's PBMCs data obtained from 10x Genomics, data can be downloaed from
# https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0/pbmc_10k_protein_v3
# Creat data folder and download demo data from 10X genomics
!mkdir data
!wget http://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_10k_v3/pbmc_10k_v3_filtered_feature_bc_matrix.tar.gz -O data/pbmc_10k_v3_filtered_feature_bc_matrix.tar.gz
!cd data; tar -xzf pbmc_10k_v3_filtered_feature_bc_matrix.tar.gz
import scpipeline
import os,sys,csv,json,datetime,time,math,scipy.stats,collections,re;
from sklearn import preprocessing;
import numpy as np;
import pandas as pd;
import os.path;
import scanpy;
import scanpy.api as sc
sc.settings.set_figure_params(dpi=80)
# read counts data from 10X V3 data (cell ranger V3 output)
p = scpipeline.ProcessPipline();
dataPath='./data/filtered_feature_bc_matrix/'; # the directory with the `.mtx` file
p.readData(dataPath) # read 10X '.mtx'data, compute mitochondra fraction, and create p.data
p.data # p.data: this is the data object we will use for QC
### p.data is the data object for use
sc.pl.highest_expr_genes(p.data, n_top=10)
# QC function
# def QC(self,max_n_genes="" ,min_n_genes="",min_n_cells="",max_percent_mito="")
# scanpy tutorial QC(self,max_n_genes=2500 ,min_n_genes=200,min_n_cells=3,max_percent_mito=0.05)
p.QC(min_n_genes=200,min_n_cells=3)
## plot percentage of mitochondria
sc.pl.violin(p.data, ['n_genes', 'n_counts', 'percent_mito'],
jitter=0.4, multi_panel=True)
sc.pl.scatter(p.data, x='n_counts', y='n_genes')
sc.pl.scatter(p.data, x='n_counts', y='percent_mito')
# QC using percentage of mitochondria gene
p.QC(max_n_genes=5000, max_percent_mito=0.12)
"""
# for those who are more famaliar with scanpy:
p.data = p.data[p.data.obs['n_genes'] < 5000, :]
p.data = p.data[p.data.obs['percent_mito'] < 0.12, :]
"""
# QC process in scanpy package will remove cell barcodes. However, for database loading, we adata should keep same number of barcodes as original one.
# We copy data from p.data to adata, which will be loaded into database
adata = p.data.copy()
adata
# normalization (library-size correct) the data matrix to 10,000 reads per cell
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
#Logarithmize the data
sc.pp.log1p(adata)
adata.raw = adata
# highly variable genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)
adata = adata[:, adata.var['highly_variable']]
# regress out effects of total counts per cell and the percentage of mitochondrial genes.
sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])
# Scale each gene to unit variance. Clip values exceeding standard deviation 10.
sc.pp.scale(adata, max_value=10)
# Dimention reduction: PCA as a first step
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata, color=['CD3D','CD19'])
adata
sc.pl.pca_variance_ratio(adata, log=True)
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)
## check cel marker for T cell, B cells, Monocytes, and Dendritic Cells
sc.pl.umap(adata, color=['CD3D','CD19','FCER1A','CD8A','NKG7','CD14','FCGR3A','IL3RA','PPBP'])