{ "cells": [ { "cell_type": "markdown", "id": "1eb84700-b6c0-41c2-bb67-de4c786cc7fd", "metadata": {}, "source": [ "# CAMEX can achieve more accurate integration and annotation performance in both closely related and distantly related species\n", "\n", "\n", "This tutorial demonstrates that CAMEX can achieve more accurate integration and annotation performance in both closely related and distantly related species. \n", "\n", "Here, we use scRNA-seq data collected from four species: adult human (visual cortex, frontal cortex, and cerebellum), mouse (neocortex), and lizard and turtle (pallium). The processed h5ad files can be downloaded from https://drive.google.com/drive/folders/1rwdjEvWFEFw82a0x2JzMi2jXICbUc5eb?usp=sharing" ] }, { "cell_type": "code", "execution_count": 1, "id": "e10bb31b-771b-4f36-8e19-afb7bb1b458d", "metadata": {}, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "488c6775-65e3-415c-8c0f-c9b4bc2a187f", "metadata": {}, "outputs": [], "source": [ "import os\n", "import time\n", "import torch\n", "import shutil\n", "import warnings\n", "import argparse\n", "import importlib\n", "import scanpy as sc\n", "\n", "import pandas as pd\n", "import numpy as mp\n", "\n", "from CAMEX.base import Dataset\n", "from CAMEX.trainer import Trainer" ] }, { "cell_type": "code", "execution_count": 4, "id": "5eb0b318-946a-4ffd-bbe7-920ba7353252", "metadata": {}, "outputs": [], "source": [ "from params import PARAMS" ] }, { "cell_type": "code", "execution_count": 5, "id": "a5e3468e-57c8-4f2d-9a68-d414de2e67d9", "metadata": {}, "outputs": [], "source": [ "t1 = time.time()" ] }, { "cell_type": "markdown", "id": "d4f0cfc7-9c5b-47e1-9246-570044e4efc6", "metadata": {}, "source": [ "## make log dir" ] }, { "cell_type": "code", "execution_count": 6, "id": "e3defd9a-5ab8-440b-9d5d-1e5e7f4f2dbf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "./log/2025-06-14-10-39-23/\n" ] } ], 
"source": [ "time_start = time.strftime(\"%Y-%m-%d-%H-%M-%S\")\n", "log_path = f'./log/{time_start}/'\n", "for k, v in PARAMS.items():\n", " v['time_start'] = time_start\n", " v['log_path'] = log_path\n", "print(log_path)" ] }, { "cell_type": "code", "execution_count": 7, "id": "3c474362-1bcd-4fec-9857-83d5a9841174", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "time: 2025-06-14-10-39-23\n" ] } ], "source": [ "os.makedirs(log_path, exist_ok=True)\n", "shutil.copy('params.py', log_path + 'params_current.py')\n", "print(f'time: {time_start}')" ] }, { "cell_type": "markdown", "id": "9f195c9d-e49e-42d4-a71f-77bab282b612", "metadata": {}, "source": [ "## preprocess scRNA_seq data to construct a heterogeneous graph of cells and genes" ] }, { "cell_type": "code", "execution_count": 8, "id": "e1e3d59e-8d0b-45e4-8332-287d8f9f481b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "start preprocess\n", " raw-brain-human-Lake: reference raw-brain-mouse-Chen: query raw-brain-lizard-Tosches: query raw-brain-turtle-Tosches: query\n", "excitatory neuron 14747.0 906.0 1910.0 7151.0\n", "inhibitory neuron 6808.0 1392.0 242.0 1490.0\n", "oligodendrocyte 4369.0 3724.0 551.0 155.0\n", "cerebellar granule cell 3298.0 NaN NaN NaN\n", "astrocyte 2524.0 1757.0 520.0 6514.0\n", "oligodendrocyte precursor cell 1358.0 1792.0 398.0 1862.0\n", "Purkinje cell 1001.0 NaN NaN NaN\n", "microglial cell 756.0 724.0 278.0 589.0\n", "endothelial cell 219.0 1197.0 NaN NaN\n", "brain pericyte 209.0 NaN NaN 114.0\n", "ependymal cell NaN 413.0 NaN NaN\n", "macrophage NaN 167.0 NaN NaN\n", "neural progenitor cell NaN NaN 133.0 717.0\n", "\n" ] } ], "source": [ "# —————————————————————————————————— 1 preprocess\n", "print('start preprocess')\n", "dataset = Dataset(**PARAMS['preprocess'])\n", "# torch.save(dataset, log_path + 'dataset_preprocessed.pt')\n", "# dataset = 
torch.load(f'{args.path}/log/2023-06-06-09-02-45/dataset_preprocessed.pt')\n", "adata_CAMEX = dataset.adata_whole\n", "dgl_data = dataset.dgl_data" ] }, { "cell_type": "code", "execution_count": 9, "id": "5a318493-b102-489e-8168-01b77cf7f39e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "start train\n" ] } ], "source": [ "print('start train')\n", "trainer = Trainer(adata_CAMEX, dgl_data, **PARAMS['train'])" ] }, { "cell_type": "markdown", "id": "55a0c11f-d705-4a59-b793-92a767ec6d85", "metadata": {}, "source": [ "## integration" ] }, { "cell_type": "code", "execution_count": 10, "id": "c1619926-7aab-4c6b-9d7a-5a909beae4c6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--------------------------------------------- integration ---------------------------------------------\n", "epoch: 0, loss: 88.19321090792432\n", "epoch: 1, loss: 26.90171910509651\n", "epoch: 2, loss: 26.61919125215507\n", "epoch: 3, loss: 26.456716372642987\n", "epoch: 4, loss: 26.36408836458936\n", "epoch: 5, loss: 26.273979987627193\n", "epoch: 6, loss: 26.22034063456971\n", "epoch: 7, loss: 26.136304831799166\n", "epoch: 8, loss: 26.09897893740807\n", "epoch: 9, loss: 26.08717859527211\n" ] } ], "source": [ "trainer.integration()" ] }, { "cell_type": "markdown", "id": "294570a8-591e-46e9-9a57-069630559cfc", "metadata": {}, "source": [ "## annotation" ] }, { "cell_type": "code", "execution_count": 11, "id": "84757b08-cc5c-4758-a5d2-47fb9b75f2bf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--------------------------------------------- annotation ---------------------------------------------\n", "epoch: 0, loss: 88.01263552904129\n", "train_acc: {'raw-brain-human-Lakecell_acc': 0.9388}, test_acc: {'raw-brain-lizard-Toschescell_acc': 0.933, 'raw-brain-mouse-Chencell_acc': 0.6207, 'raw-brain-turtle-Toschescell_acc': 0.9275}, train_ami:{'raw-brain-human-Lakecell_ami': 0.6674, 
'raw-brain-lizard-Toschescell_ami': 0.7326, 'raw-brain-mouse-Chencell_ami': 0.5929, 'raw-brain-turtle-Toschescell_ami': 0.6558}, best_epoch: 0\n", "epoch: 1, loss: 55.156198382377625\n", "train_acc: {'raw-brain-human-Lakecell_acc': 0.9442}, test_acc: {'raw-brain-lizard-Toschescell_acc': 0.9368, 'raw-brain-mouse-Chencell_acc': 0.7285, 'raw-brain-turtle-Toschescell_acc': 0.9235}, train_ami:{'raw-brain-human-Lakecell_ami': 0.6667, 'raw-brain-lizard-Toschescell_ami': 0.7578, 'raw-brain-mouse-Chencell_ami': 0.6837, 'raw-brain-turtle-Toschescell_ami': 0.6535}, best_epoch: 1\n", "epoch: 2, loss: 53.2356573343277\n", "train_acc: {'raw-brain-human-Lakecell_acc': 0.9434}, test_acc: {'raw-brain-lizard-Toschescell_acc': 0.9301, 'raw-brain-mouse-Chencell_acc': 0.7244, 'raw-brain-turtle-Toschescell_acc': 0.9161}, train_ami:{'raw-brain-human-Lakecell_ami': 0.6631, 'raw-brain-lizard-Toschescell_ami': 0.7533, 'raw-brain-mouse-Chencell_ami': 0.6903, 'raw-brain-turtle-Toschescell_ami': 0.6396}, best_epoch: 1\n", "epoch: 3, loss: 52.655654072761536\n", "train_acc: {'raw-brain-human-Lakecell_acc': 0.9479}, test_acc: {'raw-brain-lizard-Toschescell_acc': 0.9201, 'raw-brain-mouse-Chencell_acc': 0.7352, 'raw-brain-turtle-Toschescell_acc': 0.9208}, train_ami:{'raw-brain-human-Lakecell_ami': 0.6667, 'raw-brain-lizard-Toschescell_ami': 0.7362, 'raw-brain-mouse-Chencell_ami': 0.6919, 'raw-brain-turtle-Toschescell_ami': 0.6366}, best_epoch: 1\n", "epoch: 4, loss: 53.30202889442444\n", "train_acc: {'raw-brain-human-Lakecell_acc': 0.9453}, test_acc: {'raw-brain-lizard-Toschescell_acc': 0.9263, 'raw-brain-mouse-Chencell_acc': 0.737, 'raw-brain-turtle-Toschescell_acc': 0.9172}, train_ami:{'raw-brain-human-Lakecell_ami': 0.6666, 'raw-brain-lizard-Toschescell_ami': 0.7458, 'raw-brain-mouse-Chencell_ami': 0.7004, 'raw-brain-turtle-Toschescell_ami': 0.644}, best_epoch: 3\n", "epoch: 5, loss: 52.66183912754059\n", "train_acc: {'raw-brain-human-Lakecell_acc': 0.9491}, test_acc: 
{'raw-brain-lizard-Toschescell_acc': 0.9209, 'raw-brain-mouse-Chencell_acc': 0.7464, 'raw-brain-turtle-Toschescell_acc': 0.9215}, train_ami:{'raw-brain-human-Lakecell_ami': 0.6694, 'raw-brain-lizard-Toschescell_ami': 0.7351, 'raw-brain-mouse-Chencell_ami': 0.7069, 'raw-brain-turtle-Toschescell_ami': 0.6424}, best_epoch: 3\n", "epoch: 6, loss: 53.20870327949524\n", "train_acc: {'raw-brain-human-Lakecell_acc': 0.95}, test_acc: {'raw-brain-lizard-Toschescell_acc': 0.9221, 'raw-brain-mouse-Chencell_acc': 0.7778, 'raw-brain-turtle-Toschescell_acc': 0.9246}, train_ami:{'raw-brain-human-Lakecell_ami': 0.671, 'raw-brain-lizard-Toschescell_ami': 0.7481, 'raw-brain-mouse-Chencell_ami': 0.7196, 'raw-brain-turtle-Toschescell_ami': 0.6532}, best_epoch: 6\n", "epoch: 7, loss: 52.54552209377289\n", "train_acc: {'raw-brain-human-Lakecell_acc': 0.9479}, test_acc: {'raw-brain-lizard-Toschescell_acc': 0.9244, 'raw-brain-mouse-Chencell_acc': 0.7353, 'raw-brain-turtle-Toschescell_acc': 0.9203}, train_ami:{'raw-brain-human-Lakecell_ami': 0.6663, 'raw-brain-lizard-Toschescell_ami': 0.742, 'raw-brain-mouse-Chencell_ami': 0.6937, 'raw-brain-turtle-Toschescell_ami': 0.637}, best_epoch: 6\n", "epoch: 8, loss: 52.27011561393738\n", "train_acc: {'raw-brain-human-Lakecell_acc': 0.946}, test_acc: {'raw-brain-lizard-Toschescell_acc': 0.9164, 'raw-brain-mouse-Chencell_acc': 0.6963, 'raw-brain-turtle-Toschescell_acc': 0.9093}, train_ami:{'raw-brain-human-Lakecell_ami': 0.6634, 'raw-brain-lizard-Toschescell_ami': 0.7229, 'raw-brain-mouse-Chencell_ami': 0.6655, 'raw-brain-turtle-Toschescell_ami': 0.6217}, best_epoch: 6\n", "epoch: 9, loss: 52.543911933898926\n", "train_acc: {'raw-brain-human-Lakecell_acc': 0.9468}, test_acc: {'raw-brain-lizard-Toschescell_acc': 0.9216, 'raw-brain-mouse-Chencell_acc': 0.7345, 'raw-brain-turtle-Toschescell_acc': 0.9213}, train_ami:{'raw-brain-human-Lakecell_ami': 0.6657, 'raw-brain-lizard-Toschescell_ami': 0.7396, 'raw-brain-mouse-Chencell_ami': 0.701, 
'raw-brain-turtle-Toschescell_ami': 0.6424}, best_epoch: 8\n" ] } ], "source": [ "trainer.annotation()" ] }, { "cell_type": "code", "execution_count": 12, "id": "0c0bdc16-fe85-4f25-a559-0294c1d564b1", "metadata": {}, "outputs": [], "source": [ "adata_CAMEX.write_h5ad(log_path + 'adata_CAMEX.h5ad', compression='gzip')" ] }, { "cell_type": "code", "execution_count": 13, "id": "7d878577-7ed8-48ea-af22-dce4d56a1845", "metadata": {}, "outputs": [], "source": [ "t2 = time.time()" ] }, { "cell_type": "code", "execution_count": 14, "id": "27e6cd2c-e0a2-43ec-940c-9f1994d38034", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "time usage: 1841 seconds\n" ] } ], "source": [ "print(f'time usage: {round(t2-t1)} seconds')" ] }, { "cell_type": "markdown", "id": "7676623b-eb73-4d29-aaa2-553d1ea12f0d", "metadata": {}, "source": [ "## analysis" ] }, { "cell_type": "code", "execution_count": 15, "id": "4923ccb3-363e-4cb8-8e6b-0ef046e98b96", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'./log/2025-06-14-10-39-23/'" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "log_path" ] }, { "cell_type": "code", "execution_count": 16, "id": "ee3857fe-77f1-4fad-84ef-1a926735ed5d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 69985 × 2000\n", " obs: 'cell_ontology_class', 'cell_ontology_id', 'cell_type1', 'dataset_name', 'donor', 'organ', 'organism', 'platform', 'region', 'tSNE1', 'tSNE2', 'batch', 'n_genes_by_counts', 'total_counts', 'cell_ontology_class_num', 'cell_class', 'cell_class_num'\n", " var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'mean', 'std'\n", " uns: 'cell_type', 'data_order', 'dataset_description', 'dataset_type', 'hvg', 'log1p', 'neighbors', 'pca'\n", " obsm: 'X_CAMEX_Annotation', 
'X_CAMEX_Annotation_eval', 'X_CAMEX_Integration', 'X_pca', 'cell_train_class'\n", " varm: 'PCs'\n", " layers: 'counts'\n", " obsp: 'connectivities', 'distances'" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata_CAMEX = sc.read_h5ad(log_path + 'adata_CAMEX.h5ad')\n", "adata_CAMEX" ] }, { "cell_type": "markdown", "id": "f0059c01-f4a4-4746-81d1-65b57721facc", "metadata": {}, "source": [ "## integration" ] }, { "cell_type": "code", "execution_count": 47, "id": "f806da21-eaf1-4c7f-b1f5-a377b7827f8b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | cell_ontology_class | \n", "cell_ontology_id | \n", "cell_type1 | \n", "dataset_name | \n", "donor | \n", "organ | \n", "organism | \n", "platform | \n", "region | \n", "tSNE1 | \n", "tSNE2 | \n", "batch | \n", "n_genes_by_counts | \n", "total_counts | \n", "cell_ontology_class_num | \n", "cell_class | \n", "cell_class_num | \n", "cell_type_pred | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| index | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
| Gran_cbm1_TTAATCAGTCGC | \n", "cerebellar granule cell | \n", "CL:0001031 | \n", "Gran | \n", "Lake_2018 | \n", "1 | \n", "Brain | \n", "Homo sapiens | \n", "snDrop-seq | \n", "cbm | \n", "-26.725456 | \n", "-30.189707 | \n", "raw-brain-human-Lake | \n", "708 | \n", "1090.0 | \n", "3 | \n", "cerebellar granule cell | \n", "3 | \n", "cerebellar granule cell | \n", "
| Gran_cbm1_ACAACGACATCC | \n", "cerebellar granule cell | \n", "CL:0001031 | \n", "Gran | \n", "Lake_2018 | \n", "1 | \n", "Brain | \n", "Homo sapiens | \n", "snDrop-seq | \n", "cbm | \n", "-31.022097 | \n", "-22.272980 | \n", "raw-brain-human-Lake | \n", "646 | \n", "915.0 | \n", "3 | \n", "cerebellar granule cell | \n", "3 | \n", "cerebellar granule cell | \n", "
| Gran_cbm1_TATGTCTATATG | \n", "cerebellar granule cell | \n", "CL:0001031 | \n", "Gran | \n", "Lake_2018 | \n", "1 | \n", "Brain | \n", "Homo sapiens | \n", "snDrop-seq | \n", "cbm | \n", "-35.684959 | \n", "-24.031462 | \n", "raw-brain-human-Lake | \n", "701 | \n", "1053.0 | \n", "3 | \n", "cerebellar granule cell | \n", "3 | \n", "cerebellar granule cell | \n", "
| Gran_cbm1_TAATGGAAAATA | \n", "cerebellar granule cell | \n", "CL:0001031 | \n", "Gran | \n", "Lake_2018 | \n", "1 | \n", "Brain | \n", "Homo sapiens | \n", "snDrop-seq | \n", "cbm | \n", "-25.886816 | \n", "-33.706535 | \n", "raw-brain-human-Lake | \n", "715 | \n", "1115.0 | \n", "3 | \n", "cerebellar granule cell | \n", "3 | \n", "cerebellar granule cell | \n", "
| Gran_cbm1_CTGGACTACAGC | \n", "cerebellar granule cell | \n", "CL:0001031 | \n", "Gran | \n", "Lake_2018 | \n", "1 | \n", "Brain | \n", "Homo sapiens | \n", "snDrop-seq | \n", "cbm | \n", "-25.068712 | \n", "-36.672504 | \n", "raw-brain-human-Lake | \n", "651 | \n", "960.0 | \n", "3 | \n", "cerebellar granule cell | \n", "3 | \n", "cerebellar granule cell | \n", "