CAMEX facilitates the discovery of new populations and markers in Primate dlPFC

This tutorial demonstrates that CAMEX can facilitate the discovery of new populations and markers in primate dlPFC.

Here, we use dlPFC scRNA-seq data collected from four species: human, rhesus, marmoset, and chimpanzee. Processed h5ad files can be downloaded from https://drive.google.com/drive/folders/1rwdjEvWFEFw82a0x2JzMi2jXICbUc5eb?usp=sharing.

[1]:
import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")
[2]:
import os
import time
import torch
import shutil
import warnings  # NOTE(review): already imported in the first cell; harmless duplicate
import argparse
import importlib
import scanpy as sc

import pandas as pd
import numpy as mp  # NOTE(review): alias is `mp`, not the conventional `np` — confirm this is intended

from CAMEX.base import Dataset
from CAMEX.trainer import Trainer
[2]:
from params import PARAMS
[3]:
# Record the start time; the elapsed time is printed after training.
t1 = time.time()

Make the log directory

[4]:
# A single timestamp identifies this run and names its log directory.
time_start = time.strftime("%Y-%m-%d-%H-%M-%S")
log_path = f'./log/{time_start}/'

# Stamp every parameter group with the run's timestamp and log path.
# The group names are not needed here, so iterate over the values only.
for group in PARAMS.values():
    group['time_start'] = time_start
    group['log_path'] = log_path

print(log_path)
./log/2024-04-12-18-09-02/
[5]:
# Create the run's log directory; an already-existing directory is not an error.
os.makedirs(log_path, exist_ok=True)

# Keep a snapshot of the parameter file next to the logs of this run.
params_snapshot = os.path.join(log_path, 'params_current.py')
shutil.copy('params.py', params_snapshot)

print(f'time: {time_start}')
time: 2024-04-12-18-09-02

Preprocess scRNA-seq data to construct a heterogeneous graph of cells and genes

[6]:
#  —————————————————————————————————— 1 preprocess
print('start preprocess')
# Build the Dataset from the 'preprocess' parameter group of PARAMS.
dataset = Dataset(**PARAMS['preprocess'])
# Optional caching of the preprocessed dataset (disabled; kept for reference):
# torch.save(dataset, log_path + 'dataset_preprocessed.pt')
# dataset = torch.load(f'{args.path}/log/2023-06-06-09-02-45/dataset_preprocessed.pt')
# Both attributes below are handed to the Trainer in the next step.
adata_CAMEX = dataset.adata_whole
dgl_data = dataset.dgl_data
start preprocess
                      raw-brain-Human-ma: reference  raw-brain-Chimpanzee-ma: query  raw-brain-Marmoset-ma: query  raw-brain-Rhesus-ma: query
Micro P2RY12 APBB1IP                           6811                          5679.0                        4626.0                      8058.0
Micro P2RY12 GLDN                               435                            69.0                           NaN                         NaN
Micro P2RY12 CCL3                               310                             NaN                           NaN                         NaN

[7]:
print('start train')
# Construct the Trainer from the preprocessed data and the 'train' parameter group.
trainer = Trainer(adata_CAMEX, dgl_data, **PARAMS['train'])
start train

annotation

[8]:
# Run the annotation stage; the log below reports per-epoch loss, accuracy and AMI.
trainer.annotation()
--------------------------------------------- annotation ---------------------------------------------
epoch: 0, loss: 25.45038467645645
train_acc: {'raw-brain-Human-macell_acc': 0.9242}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.988, 'raw-brain-Marmoset-macell_acc': 1.0, 'raw-brain-Rhesus-macell_acc': 1.0}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0, 'raw-brain-Human-macell_ami': 0.0373, 'raw-brain-Marmoset-macell_ami': 0.0, 'raw-brain-Rhesus-macell_ami': 0.0}, best_epoch: 0
epoch: 1, loss: 19.897223353385925
train_acc: {'raw-brain-Human-macell_acc': 0.9512}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9897, 'raw-brain-Marmoset-macell_acc': 1.0, 'raw-brain-Rhesus-macell_acc': 0.9999}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0012, 'raw-brain-Human-macell_ami': 0.0836, 'raw-brain-Marmoset-macell_ami': 0.0, 'raw-brain-Rhesus-macell_ami': 0.0}, best_epoch: 1
epoch: 2, loss: 17.70285016298294
train_acc: {'raw-brain-Human-macell_acc': 0.9708}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9899, 'raw-brain-Marmoset-macell_acc': 0.9976, 'raw-brain-Rhesus-macell_acc': 0.9995}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0043, 'raw-brain-Human-macell_ami': 0.1067, 'raw-brain-Marmoset-macell_ami': 0.0028, 'raw-brain-Rhesus-macell_ami': 0.0}, best_epoch: 2
epoch: 3, loss: 16.376332879066467
train_acc: {'raw-brain-Human-macell_acc': 0.9749}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9899, 'raw-brain-Marmoset-macell_acc': 0.997, 'raw-brain-Rhesus-macell_acc': 0.999}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0055, 'raw-brain-Human-macell_ami': 0.0928, 'raw-brain-Marmoset-macell_ami': 0.0026, 'raw-brain-Rhesus-macell_ami': 0.0004}, best_epoch: 2
epoch: 4, loss: 15.54094684123993
train_acc: {'raw-brain-Human-macell_acc': 0.9933}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.985, 'raw-brain-Marmoset-macell_acc': 0.9911, 'raw-brain-Rhesus-macell_acc': 0.9964}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0062, 'raw-brain-Human-macell_ami': 0.0898, 'raw-brain-Marmoset-macell_ami': 0.002, 'raw-brain-Rhesus-macell_ami': 0.0013}, best_epoch: 2
epoch: 5, loss: 15.03785303235054
train_acc: {'raw-brain-Human-macell_acc': 0.9967}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9805, 'raw-brain-Marmoset-macell_acc': 0.9879, 'raw-brain-Rhesus-macell_acc': 0.9935}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0076, 'raw-brain-Human-macell_ami': 0.0895, 'raw-brain-Marmoset-macell_ami': 0.0015, 'raw-brain-Rhesus-macell_ami': 0.0009}, best_epoch: 2
epoch: 6, loss: 16.603510677814484
train_acc: {'raw-brain-Human-macell_acc': 0.9825}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9873, 'raw-brain-Marmoset-macell_acc': 0.9952, 'raw-brain-Rhesus-macell_acc': 0.998}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0075, 'raw-brain-Human-macell_ami': 0.0988, 'raw-brain-Marmoset-macell_ami': 0.0055, 'raw-brain-Rhesus-macell_ami': 0.0014}, best_epoch: 5
epoch: 7, loss: 15.57164877653122
train_acc: {'raw-brain-Human-macell_acc': 0.9929}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9828, 'raw-brain-Marmoset-macell_acc': 0.9894, 'raw-brain-Rhesus-macell_acc': 0.9955}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.008, 'raw-brain-Human-macell_ami': 0.0911, 'raw-brain-Marmoset-macell_ami': 0.0029, 'raw-brain-Rhesus-macell_ami': 0.0008}, best_epoch: 5
epoch: 8, loss: 14.987527966499329
train_acc: {'raw-brain-Human-macell_acc': 0.9976}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9791, 'raw-brain-Marmoset-macell_acc': 0.9857, 'raw-brain-Rhesus-macell_acc': 0.9924}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0081, 'raw-brain-Human-macell_ami': 0.0899, 'raw-brain-Marmoset-macell_ami': 0.0036, 'raw-brain-Rhesus-macell_ami': 0.0014}, best_epoch: 5
epoch: 9, loss: 16.539766669273376
train_acc: {'raw-brain-Human-macell_acc': 0.9759}, test_acc: {'raw-brain-Chimpanzee-macell_acc': 0.9896, 'raw-brain-Marmoset-macell_acc': 0.9963, 'raw-brain-Rhesus-macell_acc': 0.999}, train_ami:{'raw-brain-Chimpanzee-macell_ami': 0.0043, 'raw-brain-Human-macell_ami': 0.0977, 'raw-brain-Marmoset-macell_ami': 0.0034, 'raw-brain-Rhesus-macell_ami': 0.0008}, best_epoch: 8
[9]:
# Save the AnnData object to this run's log directory as a gzip-compressed h5ad file.
adata_CAMEX.write_h5ad(log_path + 'adata_CAMEX.h5ad', compression='gzip')
[10]:
# Record the end time for the elapsed-time report.
t2 = time.time()
[11]:
# Report the elapsed wall-clock time between t1 and t2, rounded to whole seconds.
print(f'time usage: {round(t2-t1)} seconds')
time usage: 383 seconds

analysis

[13]:
# Reload the AnnData object written earlier to this run's log directory.
adata_CAMEX = sc.read_h5ad(log_path + 'adata_CAMEX.h5ad')
# Bare expression as the last statement: the notebook displays the object's summary.
adata_CAMEX
[13]:
AnnData object with n_obs × n_vars = 25988 × 2000
    obs: 'cell_name', 'subtype', 'subclass', 'class', 'samplename', 'tech_rep', 'species', 'cell_ontology_class', 'batch', 'n_genes_by_counts', 'total_counts', 'cell_ontology_class_num', 'cell_class', 'cell_class_num'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'mean', 'std'
    uns: 'cell_type', 'data_order', 'dataset_description', 'dataset_type', 'hvg', 'log1p', 'neighbors', 'pca'
    obsm: 'X_CAMEX_Annotation', 'X_CAMEX_Annotation_eval', 'X_pca', 'cell_train_class'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'
[14]:
# Compute the neighbor graph from the 'X_CAMEX_Annotation' representation stored in obsm.
sc.pp.neighbors(adata_CAMEX, use_rep='X_CAMEX_Annotation')
[15]:
# Compute the UMAP embedding (uses the neighbor graph computed above).
sc.tl.umap(adata_CAMEX)
[16]:
# Plot the UMAP twice, colored by 'batch' and by 'cell_ontology_class'; wspace widens the gap between panels.
sc.pl.umap(adata_CAMEX, color=['batch', 'cell_ontology_class'], wspace=0.6)
_images/discovery_new_populations_markers_20_0.png