CAMEX facilitates the discovery of new populations and markers in Primate dlPFC
This tutorial demonstrates that CAMEX could facilitates the discovery of new populations and markers in Primate dlPFC.
Here, we use collected dlPFC scRNA-seq data from four species human, rhesus, marmoset, and chimpanzee. Processed h5ad files can be downloaded from https://drive.google.com/drive/folders/1rwdjEvWFEFw82a0x2JzMi2jXICbUc5eb?usp=sharing
[1]:
import warnings
warnings.filterwarnings("ignore")
[2]:
import os
import time
import torch
import shutil
import warnings
import argparse
import importlib
import scanpy as sc
import pandas as pd
import numpy as mp
from CAMEX.base import Dataset
from CAMEX.trainer import Trainer
[2]:
from params import PARAMS
[3]:
t1 = time.time()
make log dir
[4]:
time_start = time.strftime("%Y-%m-%d-%H-%M-%S")
log_path = f'./log/{time_start}/'
for k, v in PARAMS.items():
v['time_start'] = time_start
v['log_path'] = log_path
print(log_path)
./log/2024-04-12-18-09-02/
[5]:
os.makedirs(log_path, exist_ok=True)
shutil.copy('params.py', log_path + 'params_current.py')
print(f'time: {time_start}')
time: 2024-04-12-18-09-02
preprocess scRNA_seq data to construct a heterogeneous graph of cells and genes
[6]:
# —————————————————————————————————— 1 preprocess
print('start preprocess')
dataset = Dataset(**PARAMS['preprocess'])
# torch.save(dataset, log_path + 'dataset_preprocessed.pt')
# dataset = torch.load(f'{args.path}/log/2023-06-06-09-02-45/dataset_preprocessed.pt')
adata_CAMEX = dataset.adata_whole
dgl_data = dataset.dgl_data
start preprocess
raw-brain-Human-ma: reference raw-brain-Chimpanzee-ma: query raw-brain-Marmoset-ma: query raw-brain-Rhesus-ma: query
Micro P2RY12 APBB1IP 6811 5679.0 4626.0 8058.0
Micro P2RY12 GLDN 435 69.0 NaN NaN
Micro P2RY12 CCL3 310 NaN NaN NaN
[7]:
print('start train')
trainer = Trainer(adata_CAMEX, dgl_data, **PARAMS['train'])
start train
annotation
[8]:
trainer.annotation()
--------------------------------------------- annotation ---------------------------------------------
epoch: 0, loss: 25.45038467645645
train_acc: {'raw-brain-Human-macell_acc': 0.9242}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.988, 'raw-brain-Marmoset-macell_acc': 1.0, 'raw-brain-Rhesus-macell_acc': 1.0}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0, 'raw-brain-Human-macell_ami': 0.0373, 'raw-brain-Marmoset-macell_ami': 0.0, 'raw-brain-Rhesus-macell_ami': 0.0}, best_epoch: 0
epoch: 1, loss: 19.897223353385925
train_acc: {'raw-brain-Human-macell_acc': 0.9512}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9897, 'raw-brain-Marmoset-macell_acc': 1.0, 'raw-brain-Rhesus-macell_acc': 0.9999}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0012, 'raw-brain-Human-macell_ami': 0.0836, 'raw-brain-Marmoset-macell_ami': 0.0, 'raw-brain-Rhesus-macell_ami': 0.0}, best_epoch: 1
epoch: 2, loss: 17.70285016298294
train_acc: {'raw-brain-Human-macell_acc': 0.9708}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9899, 'raw-brain-Marmoset-macell_acc': 0.9976, 'raw-brain-Rhesus-macell_acc': 0.9995}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0043, 'raw-brain-Human-macell_ami': 0.1067, 'raw-brain-Marmoset-macell_ami': 0.0028, 'raw-brain-Rhesus-macell_ami': 0.0}, best_epoch: 2
epoch: 3, loss: 16.376332879066467
train_acc: {'raw-brain-Human-macell_acc': 0.9749}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9899, 'raw-brain-Marmoset-macell_acc': 0.997, 'raw-brain-Rhesus-macell_acc': 0.999}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0055, 'raw-brain-Human-macell_ami': 0.0928, 'raw-brain-Marmoset-macell_ami': 0.0026, 'raw-brain-Rhesus-macell_ami': 0.0004}, best_epoch: 2
epoch: 4, loss: 15.54094684123993
train_acc: {'raw-brain-Human-macell_acc': 0.9933}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.985, 'raw-brain-Marmoset-macell_acc': 0.9911, 'raw-brain-Rhesus-macell_acc': 0.9964}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0062, 'raw-brain-Human-macell_ami': 0.0898, 'raw-brain-Marmoset-macell_ami': 0.002, 'raw-brain-Rhesus-macell_ami': 0.0013}, best_epoch: 2
epoch: 5, loss: 15.03785303235054
train_acc: {'raw-brain-Human-macell_acc': 0.9967}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9805, 'raw-brain-Marmoset-macell_acc': 0.9879, 'raw-brain-Rhesus-macell_acc': 0.9935}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0076, 'raw-brain-Human-macell_ami': 0.0895, 'raw-brain-Marmoset-macell_ami': 0.0015, 'raw-brain-Rhesus-macell_ami': 0.0009}, best_epoch: 2
epoch: 6, loss: 16.603510677814484
train_acc: {'raw-brain-Human-macell_acc': 0.9825}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9873, 'raw-brain-Marmoset-macell_acc': 0.9952, 'raw-brain-Rhesus-macell_acc': 0.998}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0075, 'raw-brain-Human-macell_ami': 0.0988, 'raw-brain-Marmoset-macell_ami': 0.0055, 'raw-brain-Rhesus-macell_ami': 0.0014}, best_epoch: 5
epoch: 7, loss: 15.57164877653122
train_acc: {'raw-brain-Human-macell_acc': 0.9929}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9828, 'raw-brain-Marmoset-macell_acc': 0.9894, 'raw-brain-Rhesus-macell_acc': 0.9955}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.008, 'raw-brain-Human-macell_ami': 0.0911, 'raw-brain-Marmoset-macell_ami': 0.0029, 'raw-brain-Rhesus-macell_ami': 0.0008}, best_epoch: 5
epoch: 8, loss: 14.987527966499329
train_acc: {'raw-brain-Human-macell_acc': 0.9976}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9791, 'raw-brain-Marmoset-macell_acc': 0.9857, 'raw-brain-Rhesus-macell_acc': 0.9924}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0081, 'raw-brain-Human-macell_ami': 0.0899, 'raw-brain-Marmoset-macell_ami': 0.0036, 'raw-brain-Rhesus-macell_ami': 0.0014}, best_epoch: 5
epoch: 9, loss: 16.539766669273376
train_acc: {'raw-brain-Human-macell_acc': 0.9759}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9896, 'raw-brain-Marmoset-macell_acc': 0.9963, 'raw-brain-Rhesus-macell_acc': 0.999}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0043, 'raw-brain-Human-macell_ami': 0.0977, 'raw-brain-Marmoset-macell_ami': 0.0034, 'raw-brain-Rhesus-macell_ami': 0.0008}, best_epoch: 8
[9]:
adata_CAMEX.write_h5ad(log_path + 'adata_CAMEX.h5ad', compression='gzip')
[10]:
t2 = time.time()
[11]:
print(f'time usage: {round(t2-t1)} seconds')
time usage: 383 seconds
analysis
[13]:
adata_CAMEX = sc.read_h5ad(log_path + 'adata_CAMEX.h5ad')
adata_CAMEX
[13]:
AnnData object with n_obs × n_vars = 25988 × 2000
obs: 'cell_name', 'subtype', 'subclass', 'class', 'samplename', 'tech_rep', 'species', 'cell_ontology_class', 'batch', 'n_genes_by_counts', 'total_counts', 'cell_ontology_class_num', 'cell_class', 'cell_class_num'
var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'mean', 'std'
uns: 'cell_type', 'data_order', 'dataset_description', 'dataset_type', 'hvg', 'log1p', 'neighbors', 'pca'
obsm: 'X_CAMEX_Annotation', 'X_CAMEX_Annotation_eval', 'X_pca', 'cell_train_class'
varm: 'PCs'
layers: 'counts'
obsp: 'connectivities', 'distances'
[14]:
sc.pp.neighbors(adata_CAMEX, use_rep='X_CAMEX_Annotation')
[15]:
sc.tl.umap(adata_CAMEX)
[16]:
sc.pl.umap(adata_CAMEX, color=['batch', 'cell_ontology_class'], wspace=0.6)