Label Projection

Automated cell type annotation from rich, labeled reference data

6 datasets · 16 methods · 3 control methods · 4 metrics

Info

Task info Method info Metric info Dataset info Results

A major challenge for integrating single cell datasets is creating matching cell type annotations for each cell. One of the most common strategies for annotating cell types is referred to as “cluster-then-annotate” whereby cells are aggregated into clusters based on feature similarity and then manually characterized based on differential gene expression or previously identified marker genes. Recently, methods have emerged to build on this strategy and annotate cells using known marker genes. However, these strategies pose a difficulty for integrating atlas-scale datasets as the particular annotations may not match.

To ensure that the cell type labels in newly generated datasets match existing reference datasets, some methods align cells to a previously annotated reference dataset and then project labels from the reference to the new dataset.

Here, we compare methods for annotation based on a reference dataset. The datasets consist of two or more samples of single cell profiles that have been manually annotated with matching labels. These datasets are then split into training and test batches, and the task of each method is to train a cell type classifier on the training set and project those labels onto the test set.

Summary

/**
 * Average the `score` field of a list of records.
 *
 * Each score is clamped to the interval [0, 1] before averaging. Records whose
 * score is undefined or not a number contribute 0 rather than being skipped.
 *
 * @param {Array<{score: *}>} obj - records carrying a `score` property
 * @returns {number|undefined} whatever `d3.mean` yields for the clamped scores
 */
function aggregate_scores(obj) {
  const clamped = [];
  for (const entry of obj) {
    const raw = entry.score;
    const unusable = raw === undefined || isNaN(raw);
    clamped.push(unusable ? 0 : Math.min(1, Math.max(0, raw)));
  }
  return d3.mean(clamped);
}

/**
 * Turn a list of row objects into an object of column arrays.
 *
 * The column names are taken from the keys of the first row only, so keys that
 * appear solely in later rows are dropped, and rows lacking a key contribute
 * `undefined` to that column.
 *
 * @param {Array<Object>} list - row objects
 * @returns {Object<string, Array>} one array per key of the first row; an empty
 *   object when `list` is empty
 */
function transpose_list_of_objects(list) {
  // An empty list has no first row to read the column names from.
  if (list.length === 0) return {};

  const columns = Object.keys(list[0]);
  return Object.fromEntries(
    columns.map(key => [key, list.map(row => row[key])])
  );
}

/**
 * Format a duration as a short label such as "0s", "<1s", "42s", "5m", "3h" or "2d".
 *
 * The value is compared against increasing upper bounds and rendered, rounded
 * down, in the unit of the first bound it falls below. Anything that is below
 * none of the bounds is labelled ">7d"; that includes NaN and undefined, since
 * every comparison with them is false (missing values are assumed to be
 * encoded as NaN).
 *
 * @param {number} time - duration; the labels treat it as seconds
 * @returns {string} the label
 */
function label_time(time) {
  const steps = [
    [1e-5, () => "0s"],
    [1, () => "<1s"],
    [60, t => `${Math.floor(t)}s`],
    [3600, t => `${Math.floor(t / 60)}m`],
    [3600 * 24, t => `${Math.floor(t / 3600)}h`],
    [3600 * 24 * 7, t => `${Math.floor(t / 3600 / 24)}d`]
  ];

  for (const [upper_bound, render] of steps) {
    if (time < upper_bound) return render(time);
  }
  return ">7d";
}

/**
 * Format an amount of memory or disk traffic as a short label with an M, G or T suffix.
 *
 * @param {number} x_mb - amount to label; the suffixes treat it as megabytes
 * @param {boolean} [include_mb=true] - when falsy, every amount below 1e3 is
 *   collapsed into "<1G" instead of being given an "M" label
 * @returns {string} "<1M", "<1G", a rounded "…M" / "…G" / "…T" label, or ">1P"
 *   for anything not below 1e9 (which includes NaN and undefined)
 */
function label_memory(x_mb, include_mb = true) {
  if (x_mb < 1e3) {
    if (!include_mb) return "<1G";
    return x_mb < 1 ? "<1M" : `${Math.round(x_mb)}M`;
  }

  const larger_units = [
    [1e6, 1e3, "G"],
    [1e9, 1e6, "T"]
  ];
  for (const [upper_bound, divisor, suffix] of larger_units) {
    if (x_mb < upper_bound) return `${Math.round(x_mb / divisor)}${suffix}`;
  }
  return ">1P";
}

/**
 * Mean of an array after discarding every entry for which `isNaN` is true
 * (NaN, undefined, and anything else that does not coerce to a number).
 *
 * @param {Array} x - values to average
 * @returns {number|undefined} result of `d3.mean` on the remaining entries
 */
function mean_na_rm(x) {
  const present = x.filter(value => isNaN(value) === false);
  return d3.mean(present);
}

// Ids that are selectable in the filters: those listed in the corresponding
// *_info table that also occur in at least one entry of `results`. The order
// of the *_info table is kept.
poss_dataset_ids = dataset_info
  .filter(({dataset_id}) => results.map(r => r.dataset_id).includes(dataset_id))
  .map(({dataset_id}) => dataset_id)
poss_method_ids = method_info
  .filter(({method_id}) => results.map(r => r.method_id).includes(method_id))
  .map(({method_id}) => method_id)
// A metric counts as present when it is a key of some result's scaled_scores.
poss_metric_ids = metric_info
  .filter(({metric_id}) => results.flatMap(r => Object.keys(r.scaled_scores)).includes(metric_id))
  .map(({metric_id}) => metric_id)

// Feature flags derived from the first result only: whether it carries its own
// "resources" and "exit_codes" properties.
has_resources = Object.prototype.hasOwnProperty.call(results[0], "resources")
has_exit_codes = Object.prototype.hasOwnProperty.call(results[0], "exit_codes")

// Long-format scores: one row per (method, dataset, metric) taken from each
// result's scaled_scores, restricted to the methods, metrics and datasets that
// are currently ticked in the filters.
results_long = results
  .flatMap(({method_id, dataset_id, scaled_scores}) =>
    Object.entries(scaled_scores).map(([metric_id, score]) => ({
      method_id,
      dataset_id,
      metric_id,
      score
    }))
  )
  .filter(row =>
    method_ids.includes(row.method_id) &&
    metric_ids.includes(row.metric_id) &&
    dataset_ids.includes(row.dataset_id)
  )

// One row per method holding the aggregate of all of its long-format scores.
overall = d3.groups(results_long, row => row.method_id)
  .map(group => {
    const [method_id, rows] = group
    return {method_id, mean_score: aggregate_scores(rows)}
  })

// One row per method with a "dataset_<id>" property for every dataset the
// method has scores on, each holding the aggregate score on that dataset.
per_dataset = d3.groups(results_long, row => row.method_id)
  .map(([method_id, rows]) => {
    const dataset_scores = Object.fromEntries(
      d3.groups(rows, row => row.dataset_id)
        .map(([dataset_id, subset]) => ["dataset_" + dataset_id, aggregate_scores(subset)])
    )
    return {method_id, ...dataset_scores}
  })

// One row per method with a "metric_<id>" property for every metric the method
// has scores for, each holding the aggregate score for that metric.
per_metric = d3.groups(results_long, row => row.method_id)
  .map(([method_id, rows]) => {
    const metric_scores = Object.fromEntries(
      d3.groups(rows, row => row.metric_id)
        .map(([metric_id, subset]) => ["metric_" + metric_id, aggregate_scores(subset)])
    )
    return {method_id, ...metric_scores}
  })

results_resources = {
  let results_resources = null

  if (has_resources) {
    results_resources = results.flatMap(d => {
      return ({
        method_id: d.method_id,
        dataset_id: d.dataset_id,
        ...d.resources
      })
    }).filter(d => method_ids.includes(d.method_id) && dataset_ids.includes(d.dataset_id))
  }

  return results_resources
}

resources = {
  let resources = null

  if (has_resources) {
    resources = d3.groups(results_resources, d => d.method_id)
      .map(([method_id, values]) => {
        const mean_peak_memory_mb = mean_na_rm(values.map(d => d.peak_memory_mb))
        const mean_disk_read_mb = mean_na_rm(values.map(d => d.disk_read_mb))
        const mean_disk_write_mb = mean_na_rm(values.map(d => d.disk_write_mb))
        const mean_duration_sec = mean_na_rm(values.map(d => d.duration_sec))

        return ({
          method_id,
          mean_cpu_pct: mean_na_rm(values.map(d => d.cpu_pct)),
          mean_peak_memory_mb,
          mean_peak_memory_log: -Math.log10(mean_peak_memory_mb),
          mean_peak_memory_str: " " + label_memory(mean_peak_memory_mb) + " ",
          mean_disk_read_mb: mean_na_rm(values.map(d => d.disk_read_mb)),
          mean_disk_read_log: -Math.log10(mean_disk_read_mb),
          mean_disk_read_str: " " + label_memory(mean_disk_read_mb) + " ",
          mean_disk_write_mb: mean_na_rm(values.map(d => d.disk_write_mb)),
          mean_disk_write_log: -Math.log10(mean_disk_write_mb),
          mean_disk_write_str: " " + label_memory(mean_disk_write_mb) + " ",
          mean_duration_sec,
          mean_duration_log: -Math.log10(mean_duration_sec),
          mean_duration_str: " " + label_time(mean_duration_sec) + " "
        })
      })
  }

  return resources
}

exit_codes = {
  let exit_codes = null

  if (has_exit_codes) {
    exit_codes = results.flatMap(d => {
      return ({
        method_id: d.method_id,
        dataset_id: d.dataset_id,
        exit_codes: Object.values(d.exit_codes)
      })
    }).filter(d => method_ids.includes(d.method_id) && dataset_ids.includes(d.dataset_id))
  } else {
    exit_codes = results_resources.flatMap(d => {
      let exit_code = d.exit_code
      if (exit_code === undefined) {
        // If there is not exit code, assume the method ran successfully
        exit_code = 0
      }

      return ({
        method_id: d.method_id,
        dataset_id: d.dataset_id,
        exit_codes: [exit_code]
      })
    }).filter(d => method_ids.includes(d.method_id) && dataset_ids.includes(d.dataset_id))
  }

  return exit_codes
}

// Per-method breakdown of exit codes as fractions of all recorded codes.
// `error_reason` lists the fractions in the same order as the names of the
// error_reason palette: code 137, code 143, any other positive code, negative
// codes, code 99, and code 0. A method without recorded codes gets an empty list.
error_reasons = d3.groups(exit_codes, row => row.method_id)
  .map(([method_id, rows]) => {
    const codes = rows.flatMap(row => row.exit_codes)
    if (codes.length === 0) return {method_id, error_reason: []}

    // The mean of a boolean predicate is the fraction of codes matching it.
    const fraction = matches => d3.mean(codes, matches)

    const oom = fraction(code => code === 137)
    const timeout = fraction(code => code === 143)
    const not_applicable = fraction(code => code === 99)
    const unknown = fraction(code => code < 0)
    const ok = fraction(code => code === 0)
    // Positive codes that are not one of the three dedicated codes above.
    const other_error = fraction(code => code > 0) - oom - timeout - not_applicable

    return {
      method_id,
      error_reason: [oom, timeout, other_error, unknown, not_applicable, ok]
    }
  })

// Rows of the summary heatmap: one object per displayed method combining its
// name, overall score, per-dataset and per-metric scores, error breakdown and
// (when available) resource summary, sorted by overall score, best first.
summary_all = method_info
  .filter(d => show_con || !d.is_baseline)
  .filter(d => method_ids.includes(d.method_id))
  // A ticked method can end up without any scores when the dataset/metric
  // filters exclude all of its results. Such a method has no entry in
  // `overall`, so skip it rather than reading mean_score from undefined.
  .filter(d => overall.some(o => o.method_id === d.method_id))
  .map(method => {
    const method_id = method.method_id
    const method_name = method.method_name
    const mean_score = overall.find(d => d.method_id === method_id).mean_score
    const datasets = per_dataset.find(d => d.method_id === method_id)
    const metrics = per_metric.find(d => d.method_id === method_id)
    const error_reasons_ = error_reasons.find(d => d.method_id === method_id)

    // Spreading an undefined lookup result adds no properties.
    let summary = {
      method_id,
      method_name,
      mean_score,
      ...datasets,
      ...metrics,
      ...error_reasons_
    }

    if (has_resources) {
      const resources_ = resources.find(d => d.method_id === method_id)
      summary = {...summary, ...resources_}
    }
    return summary
  })
  .sort((a, b) => b.mean_score - a.mean_score)

// make sure the first entry contains all columns
column_info = {
  let column_info = [
    {
      id: "method_name",
      name: "Name",
      label: null,
      group: "method",
      geom: "text",
      palette: null
    },
    {
      id: "mean_score",
      name: "Score",
      group: "overall",
      geom: "bar",
      palette: "overall"
    },
    {
      id: "error_reason",
      name: "Error reason",
      group: "overall",
      geom: "pie",
      palette: "error_reason"
    },
    ...dataset_info
      .filter(d => dataset_ids.includes(d.dataset_id))
      .map(
        d => ({
          id: "dataset_" + d.dataset_id,
          name: d.dataset_name,
          group: "dataset",
          geom: "funkyrect",
          palette: "dataset"
        })
      )
      .sort((a, b) => a.name.localeCompare(b.name)),
    ...metric_info
      .filter(d => metric_ids.includes(d.metric_id))
      .map(
        d => ({
          id: "metric_" + d.metric_id,
          name: d.metric_name,
          group: "metric",
          geom: "funkyrect",
          palette: "metric"
        })
      )
      .sort((a, b) => a.name.localeCompare(b.name)),
  ]

  if (has_resources) {
    column_info.push(
      {
        id: "mean_cpu_pct",
        name: "%CPU",
        group: "resources",
        geom: "funkyrect",
        palette: "resources"
      },
      {
        id: "mean_peak_memory_log",
        name: "Peak memory",
        label: "mean_peak_memory_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      },
      {
        id: "mean_disk_read_log",
        name: "Disk read",
        label: "mean_disk_read_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      },
      {
        id: "mean_disk_write_log",
        name: "Disk write",
        label: "mean_disk_write_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      },
      {
        id: "mean_duration_log",
        name: "Duration",
        label: "mean_duration_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      }
    )
  }

  column_info = column_info.map(d => {
    if (d.id === "method_name") {
      return {...d, options: {width: 15, hjust: 0}}
    } else if (d.id === "is_baseline") {
      return {...d, options: {width: 1}}
    } else if (d.geom === "bar") {
      return {...d, options: {width: 4}}
    } else {
      return d
    }
  })

  return column_info
}

column_groups = {
  let column_groups = [
    {
      group: "method",
      palette: null,
      level1: ""
    },
    {
      group: "overall",
      palette: "overall",
      level1: "Overall"
    },
    {
      group: "error_reason",
      palette: "error_reason",
      level1: "Error reason"
    },
    {
      group: "dataset",
      palette: "dataset",
      level1: dataset_info.length >= 3 ? "Datasets" : ""
    },
    {
      group: "metric",
      palette: "metric",
      level1: metric_info.length >= 3 ? "Metrics" : ""
    }
  ]

  if (has_resources) {
    column_groups.push(
      {group: "resources", palette: "resources", level1: "Resources"}
    )
  }

  return column_groups
}

// Palette per column group. The error_reason palette pairs each legend name
// with its colour; both lists are emitted in the same order.
palettes = (() => {
  const error_reason_legend = [
    ["Memory limit exceeded", "#8DD3C7"],
    ["Time limit exceeded", "#FFFFB3"],
    ["Execution error", "#BEBADA"],
    ["Unknown error", "#fdb462"],
    ["Not applicable", "#999999"],
    ["No error", "#FFFFFF"]
  ]

  return {
    overall: "Greys",
    dataset: "Blues",
    metric: "Reds",
    resources: "YlOrBr",
    error_reason: {
      colors: error_reason_legend.map(([, color]) => color),
      names: error_reason_legend.map(([name]) => name)
    }
  }
})()

// Render the summary table with funkyheatmap. The row-oriented cells
// (summary_all, column_info, column_groups) are converted to objects of column
// arrays with transpose_list_of_objects before being handed over.
// NOTE(review): the meaning of each positional argument, in particular the two
// empty arrays, is defined by the funkyheatmapjs API and not visible here —
// presumably row info and row groups; confirm against the library docs.
funkyheatmap(
    transpose_list_of_objects(summary_all),
    transpose_list_of_objects(column_info),
    [],
    transpose_list_of_objects(column_groups),
    [],
    palettes,
    {
        fontSize: 14,
        rowHeight: 26,
        rootStyle: 'max-width: none',
        // Bound to the "Color by rank:" toggle.
        colorByRank: color_by_rank,
        // Colours are taken from CSS custom properties rather than fixed values.
        theme: {
            oddRowBackground: 'var(--bs-body-bg)',
            evenRowBackground: 'var(--bs-button-hover)',
            textColor: 'var(--bs-body-color)',
            strokeColor: 'var(--bs-body-color)',
            headerColor: 'var(--bs-body-color)',
            hoverColor: 'var(--bs-body-color)'
        }
    },
    // Bound to the "Minmax column:" toggle.
    scale_column
);

Figure 1: Overview of the results per method. This figure shows the mean of the scaled scores (group Overall), the mean scores per dataset (group Dataset) and the mean scores per metric (group Metric).

Display settings

// Display toggles. color_by_rank is passed as the colorByRank option of the
// funkyheatmap call and scale_column as its last argument; show_con decides
// whether methods flagged is_baseline are kept when building summary_all.
viewof color_by_rank = Inputs.toggle({label: "Color by rank:", value: true})
viewof scale_column = Inputs.toggle({label: "Minmax column:", value: false})
viewof show_con = Inputs.toggle({label: "Show control methods:", value: true})

Filter datasets

viewof dataset_ids = Inputs.checkbox(
  dataset_info.filter(d => poss_dataset_ids.includes(d.dataset_id)),
  {
    keyof: d => d.dataset_name,
    valueof: d => d.dataset_id,
    value: dataset_info.map(d => d.dataset_id),
    label: "Datasets:"
  }
)

Filter methods

viewof method_ids = Inputs.checkbox(
  method_info.filter(d => poss_method_ids.includes(d.method_id)),
  {
    keyof: d => d.method_name,
    valueof: d => d.method_id,
    value: method_info.map(d => d.method_id),
    label: "Methods:"
  }
)

Filter metrics

viewof metric_ids = Inputs.checkbox(
  metric_info.filter(d => poss_metric_ids.includes(d.metric_id)),
  {
    keyof: d => d.metric_name,
    valueof: d => d.metric_id,
    value: metric_info.map(d => d.metric_id),
    label: "Metrics:"
  }
)

// Load d3 v7, expose it and `_` on `window`, then import funkyheatmapjs and
// keep its default export.
// NOTE(review): presumably funkyheatmapjs expects both as globals — confirm.
funkyheatmap = (await require('d3@7')
  .then(d3_module => {
    window.d3 = d3_module;
    window._ = _;
  })
  .then(() => import('https://unpkg.com/funkyheatmapjs@0.2.5'))
).default;

Results

Results table of the scores per method, dataset and metric (after scaling). Use the filters to make a custom subselection of methods and datasets. The “Overall mean” dataset is the mean value across all datasets.

Dataset info

Show

GTEX v9

Source dataset · Data source · 23-01-2025 · 196.56 MiB

Single-nucleus cross-tissue molecular reference maps to decipher disease gene function (Eraslan et al. 2022).

Understanding the function of genes and their regulation in tissue homeostasis and disease requires knowing the cellular context in which genes are expressed in tissues across the body. Single cell genomics allows the generation of detailed cellular atlases in human tissues, but most efforts are focused on single tissue types. Here, we establish a framework for profiling multiple tissues across the human body at single-cell resolution using single nucleus RNA-Seq (snRNA-seq), and apply it to 8 diverse, archived, frozen tissue types (three donors per tissue). We apply four snRNA-seq methods to each of 25 samples from 16 donors, generating a cross-tissue atlas of 209,126 nuclei profiles, and benchmark them vs. scRNA-seq of comparable fresh tissues. We use a conditional variational autoencoder (cVAE) to integrate an atlas across tissues, donors, and laboratory methods. We highlight shared and tissue-specific features of tissue-resident immune cells, identifying tissue-restricted and non-restricted resident myeloid populations. These include a cross-tissue conserved dichotomy between LYVE1- and HLA class II-expressing macrophages, and the broad presence of LAM-like macrophages across healthy tissues that is also observed in disease. For rare, monogenic muscle diseases, we identify cell types that likely underlie the neuromuscular, metabolic, and immune components of these diseases, and biological processes involved in their pathology. For common complex diseases and traits analyzed by GWAS, we identify the cell types and gene modules that potentially underlie disease mechanisms. The experimental and analytical frameworks we describe will enable the generation of large-scale studies of how cellular and molecular processes vary across individuals and populations.

Tabula Sapiens

Source dataset · Data source · 23-01-2025 · 23.61 MiB

A multiple-organ, single-cell transcriptomic atlas of humans (Jones et al. 2022).

Tabula Sapiens is a benchmark, first-draft human cell atlas of nearly 500,000 cells from 24 organs of 15 normal human subjects. This work is the product of the Tabula Sapiens Consortium. Taking the organs from the same individual controls for genetic background, age, environment, and epigenetic effects and allows detailed analysis and comparison of cell types that are shared between tissues. Our work creates a detailed portrait of cell types as well as their distribution and variation in gene expression across tissues and within the endothelial, epithelial, stromal and immune compartments.

Mouse Pancreatic Islet Atlas

Source dataset · Data source · 23-01-2025 · 432.04 MiB

Mouse pancreatic islet scRNA-seq atlas across sexes, ages, and stress conditions including diabetes (Hrovatin et al. 2023).

To better understand pancreatic β-cell heterogeneity we generated a mouse pancreatic islet atlas capturing a wide range of biological conditions. The atlas contains scRNA-seq datasets of over 300,000 mouse pancreatic islet cells, of which more than 100,000 are β-cells, from nine datasets with 56 samples, including two previously unpublished datasets. The samples vary in sex, age (ranging from embryonic to aged), chemical stress, and disease status (including T1D NOD model development and two T2D models, mSTZ and db/db) together with different diabetes treatments. Additional information about data fields is available in anndata uns field ‘field_descriptions’ and on https://github.com/theislab/mm_pancreas_atlas_rep/blob/main/resources/cellxgene.md.

HypoMap

Source dataset · Data source · 23-01-2025 · 33.72 MiB

A unified single cell gene expression atlas of the murine hypothalamus (Steuernagel et al. 2022).

The hypothalamus plays a key role in coordinating fundamental body functions. Despite recent progress in single-cell technologies, a unified catalogue and molecular characterization of the heterogeneous cell types and, specifically, neuronal subtypes in this brain region are still lacking. Here we present an integrated reference atlas “HypoMap” of the murine hypothalamus consisting of 384,925 cells, with the ability to incorporate new additional experiments. We validate HypoMap by comparing data collected from SmartSeq2 and bulk RNA sequencing of selected neuronal cell types with different degrees of cellular heterogeneity.

Immune Cell Atlas

Source dataset · Data source · 23-01-2025 · 117.72 MiB

Cross-tissue immune cell analysis reveals tissue-specific features in humans (Domínguez Conde et al. 2022).

Despite their crucial role in health and disease, our knowledge of immune cells within human tissues remains limited. We surveyed the immune compartment of 16 tissues from 12 adult donors by single-cell RNA sequencing and VDJ sequencing generating a dataset of ~360,000 cells. To systematically resolve immune cell heterogeneity across tissues, we developed CellTypist, a machine learning tool for rapid and precise cell type annotation. Using this approach, combined with detailed curation, we determined the tissue distribution of finely phenotyped immune cell types, revealing hitherto unappreciated tissue-specific features and clonal architecture of T and B cells. Our multitissue approach lays the foundation for identifying highly resolved immune cell types by leveraging a common reference dataset, tissue-integrated expression analysis, and antigen receptor sequencing.

Diabetic Kidney Disease

Source dataset · Data source · 23-01-2025 · 151.83 MiB

Multimodal single cell sequencing implicates chromatin accessibility and genetic background in diabetic kidney disease progression (Wilson et al. 2022).

Multimodal single cell sequencing is a powerful tool for interrogating cell-specific changes in transcription and chromatin accessibility. We performed single nucleus RNA (snRNA-seq) and assay for transposase accessible chromatin sequencing (snATAC-seq) on human kidney cortex from donors with and without diabetic kidney disease (DKD) to identify altered signaling pathways and transcription factors associated with DKD. Both snRNA-seq and snATAC-seq had an increased proportion of VCAM1+ injured proximal tubule cells (PT_VCAM1) in DKD samples. PT_VCAM1 has a pro-inflammatory expression signature and transcription factor motif enrichment implicated NFkB signaling. We used stratified linkage disequilibrium score regression to partition heritability of kidney-function-related traits using publicly-available GWAS summary statistics. Cell-specific PT_VCAM1 peaks were enriched for heritability of chronic kidney disease (CKD), suggesting that genetic background may regulate chromatin accessibility and DKD progression. snATAC-seq found cell-specific differentially accessible regions (DAR) throughout the nephron that change accessibility in DKD and these regions were enriched for glucocorticoid receptor (GR) motifs. Changes in chromatin accessibility were associated with decreased expression of insulin receptor, increased gluconeogenesis, and decreased expression of the GR cytosolic chaperone, FKBP5, in the diabetic proximal tubule. Cleavage under targets and release using nuclease (CUT&RUN) profiling of GR binding in bulk kidney cortex and an in vitro model of the proximal tubule (RPTEC) showed that DAR co-localize with GR binding sites. CRISPRi silencing of GR response elements (GRE) in the FKBP5 gene body reduced FKBP5 expression in RPTEC, suggesting that reduced FKBP5 chromatin accessibility in DKD may alter cellular response to GR. 
We developed an open-source tool for single cell allele specific analysis (SALSA) to model the effect of genetic background on gene expression. Heterozygous germline single nucleotide variants (SNV) in proximal tubule ATAC peaks were associated with allele-specific chromatin accessibility and differential expression of target genes within cis-coaccessibility networks. Partitioned heritability of proximal tubule ATAC peaks with a predicted allele-specific effect was enriched for eGFR, suggesting that genetic background may modify DKD progression in a cell-specific manner.

Method info

Show

Geneformer

Documentation · Repository · Source Code · Container · build_main

Geneformer is a foundational transformer model pretrained on a large-scale corpus of single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology (Theodoris et al. 2023) (H. Chen et al. 2024)

Geneformer is a context-aware, attention-based deep learning model pretrained on a large-scale corpus of single-cell transcriptomes to enable context-specific predictions in settings with limited data in network biology. Here, a pre-trained Geneformer model is fine-tuned and used to predict cell type labels for an unlabelled dataset.

KNN

Documentation · Repository · Source Code · Container · build_main

Assumes cells with similar gene expression belong to the same cell type, and assigns an unlabelled cell the most common cell type among its k nearest neighbors in PCA space (Cover and Hart 1967)

Using the “k-nearest neighbours” approach, which is a popular machine learning algorithm for classification and regression tasks. The assumption underlying KNN in this context is that cells with similar gene expression profiles tend to belong to the same cell type. For each unlabelled cell, this method computes the k labelled cells (in this case, 5) with the smallest distance in PCA space, and assigns that cell the most common cell type among its k nearest neighbors.

Logistic Regression

Documentation · Repository · Source Code · Container · build_main

Logistic Regression with 100-dimensional PCA coordinates estimates parameters for multivariate classification by minimizing cross entropy loss over cell type classes (Hosmer Jr, Lemeshow, and Sturdivant 2013)

Logistic Regression estimates parameters of a logistic function for multivariate classification tasks. Here, we use 100-dimensional whitened PCA coordinates as independent variables, and the model minimises the cross entropy loss over all cell type classes.

Multilayer perceptron

Documentation · Repository · Source Code · Container · build_main

A neural network with 100-dimensional PCA input, two hidden layers, and gradient descent weight updates to minimize cross entropy loss (Hinton 1989)

Multi-Layer Perceptron is a type of artificial neural network that consists of multiple layers of interconnected neurons. Each neuron computes a weighted sum of all neurons in the previous layer and transforms it with a nonlinear activation function. The output layer provides the final prediction, and network weights are updated by gradient descent to minimize the cross entropy loss. Here, the input data is 100-dimensional whitened PCA coordinates for each cell, and we use two hidden layers of 100 neurons each.

Naive Bayesian Classifier

Documentation · Repository · Source Code · Container · build_main

Naive Bayes classification using feature probabilities to project cell type labels from a reference dataset (Hosmer Jr, Lemeshow, and Sturdivant 2013)

Naive Bayes classification leverages probabilistic models based on Bayes’ theorem to classify cells into different types. In the context of single-cell datasets, this method utilizes the probabilities of features to project cell type labels from a reference dataset to new datasets. The algorithm assumes independence between features, making it computationally efficient and well-suited for high-dimensional data. It is particularly useful for annotating cells in atlas-scale datasets, ensuring consistency and alignment with existing reference annotations.

scANVI

Documentation · Repository · Source Code · Container · build_main

scANVI predicts cell type labels for unlabelled test data by leveraging cell type labels, modelling uncertainty and using deep neural networks with stochastic optimization (Lotfollahi et al. 2020)

single-cell ANnotation using Variational Inference is a semi-supervised variant of the scVI (Lopez et al. 2018) algorithm. Like scVI, scANVI uses deep neural networks and stochastic optimization to model uncertainty caused by technical noise and bias in single-cell transcriptomics measurements. However, scANVI also leverages cell type labels in the generative modelling. In this approach, scANVI is used to predict the cell type labels of the unlabelled test data.

scANVI+scArches

Documentation · Repository · Source Code · Container · build_main

Query to reference single-cell integration with transfer learning with scANVI and scArches (Lotfollahi et al. 2020)

scArches+scANVI or “Single-cell architecture surgery” is a deep learning method for mapping new datasets onto a pre-existing reference model, using transfer learning and parameter optimization. It first uses scANVI to build a reference model from the training data, and then applies scArches to map the test data onto the reference model and make predictions.

scGPT (fine-tuned)

Documentation · Repository · Source Code · Container · build_main

Cell-type annotation by fine-tuning on a pre-trained scGPT model (Cui et al. 2024)

scGPT is a foundation model for single-cell biology based on a generative pre-trained transformer and trained on a repository of over 33 million cells. Here, we fine-tune a pre-trained model on a reference dataset using the hyper-parameter recommendations for the cell-type task and then infer cell-types for the unlabelled cells in a query dataset.

scGPT (zero shot)

Documentation · Repository · Source Code · Container · build_main

Reference mapping using cell embedding by pretrained scGPT model (Cui et al. 2024)

scGPT is a foundation model for single-cell biology based on a generative pre-trained transformer and trained on a repository of over 33 million cells. Following the zero-shot approach, a pre-trained scGPT model is used to embed cells and map unlabelled cells in a query set to the reference dataset with provided annotations based on a nearest neighbor similarity search.

SCimilarity

Documentation · Repository · Source Code · Container · build_main

SCimilarity provides unifying representation of single cell expression profiles (Heimberg et al. 2023)

SCimilarity is a unifying representation of single cell expression profiles that quantifies similarity between expression states and generalizes to represent new studies without additional training.

This method uses the SCimilarity cell annotation module. As labels in the SCimilarity model are likely to be different to those in the dataset we use a combination of exact string matching and iterative linear sum assignment to generate a mapping between them using the training data.

SCimilarity (kNN)

Documentation · Repository · Source Code · Container · build_main

SCimilarity provides unifying representation of single cell expression profiles (Heimberg et al. 2023)

SCimilarity is a unifying representation of single cell expression profiles that quantifies similarity between expression states and generalizes to represent new studies without additional training.

This method trains a kNN classifier using cell embeddings from SCimilarity. The classifier is trained on embeddings for the training data and used to predict labels for the test data. This does not use the SCimilarity cell annotation model but avoids needing to match SCimilarity labels to dataset labels.

scPRINT

Documentation · Repository · Source Code · Container · build_main

scPRINT is a large transformer model built for the inference of gene networks (Kalfon et al. 2024)

scPRINT is a large transformer model built for the inference of gene networks (connections between genes explaining the cell’s expression profile) from scRNAseq data.

It uses novel encoding and decoding of the cell expression profile and new pre-training methodologies to learn a cell model.

scPRINT can be used to perform the following analyses: - expression denoising: increase the resolution of your scRNAseq data - cell embedding: generate a low-dimensional representation of your dataset - label prediction: predict the cell type, disease, sequencer, sex, and ethnicity of your cells - gene network inference: generate a gene network from any cell or cell cluster in your scRNAseq dataset

This method uses the zero-shot ability of scPRINT for cell type prediction. As some predicted labels are likely to be different to those in the reference dataset, we use a combination of exact string matching and iterative linear sum assignment to generate a mapping between them using the training data.

Seurat TransferData

Documentation · Repository · Source Code · Container · build_main

Seurat reference mapping predicts cell types for unlabelled cells using PCA distances, labelled anchors, and transfer anchors from Seurat, with SCTransform normalization (Hao et al. 2021)

Seurat reference mapping is a cell type label transfer method provided by the Seurat package. Gene expression counts are first normalised by SCTransform before computing PCA. Then it finds mutual nearest neighbours, known as transfer anchors, between the labelled and unlabelled part of the data in PCA space, and computes each cell’s distance to each of the anchor pairs. Finally, it uses the labelled anchors to predict cell types for unlabelled cells based on these distances.

SingleR

Documentation · Repository · Source Code · Container · build_main

Reference-Based Single-Cell RNA-Seq Annotation (Aran et al. 2019)

Performs unbiased cell type recognition from single-cell RNA sequencing data, by leveraging reference transcriptomic datasets of pure cell types to infer the cell of origin of each single cell independently.

UCE

Documentation · Repository · Source Code · Container · build_main

UCE offers a unified biological latent space that can represent any cell (Rosen et al. 2023)

Universal Cell Embedding (UCE) is a single-cell foundation model that offers a unified biological latent space that can represent any cell, regardless of tissue or species.

This method trains a logistic regression classifier on the UCE embedding space.

XGBoost

Documentation · Repository · Source Code · Container · build_main

XGBoost is a decision tree model that averages multiple trees with gradient boosting (T. Chen and Guestrin 2016)

XGBoost is a gradient boosting decision tree model that learns multiple tree structures in the form of a series of input features and their values, leading to a prediction decision, and averages predictions from all its trees. Here, input features are normalised gene expression values.

Control method info

Show

Majority Vote

Documentation · Repository · Source Code · Container · build_main

A control-type method that predicts all cells to belong to the most abundant cell type in the dataset

Random Labels

Documentation · Repository · Source Code · Container · build_main

A negative control, where the labels are randomly predicted

A negative control, where the labels are randomly predicted without training on the data.

True labels

Documentation · Repository · Source Code · Container · build_main

A positive control, where the solution labels are copied 1 to 1 to the predicted data

A positive control, where the solution labels are copied 1 to 1 to the predicted data.

Metric info

Show

Accuracy

Source code · Container

The percentage of correctly predicted labels (Grandini, Bagli, and Visani 2020).

The percentage of correctly predicted labels.

F1 weighted

Source code · Container

Average of each label’s F1 score, weighted by support (Grandini, Bagli, and Visani 2020).

Calculates the F1 score for each label, and finds their average weighted by support (the number of true instances for each label). This alters ‘macro’ to account for label imbalance; it can result in an F-score that is not between precision and recall.

F1 macro

Source code · Container

Unweighted mean of each label’s F1 score (Grandini, Bagli, and Visani 2020).

Calculates the F1 score for each label, and finds their unweighted mean. This does not take label imbalance into account.

F1 micro

Source code · Container

F1 score computed globally from the total counts of true positives (TP), false negatives (FN) and false positives (FP) (Grandini, Bagli, and Visani 2020).

Calculates the F1 score globally by counting the total true positives, false negatives and false positives.

Quality control results

Show

Category	Name	Value	Condition	Severity
Raw results	Method 'geneformer' %missing	1.0000000	pct_missing <= .1	✗✗✗
Raw results	Method 'scgpt_finetuned' %missing	1.0000000	pct_missing <= .1	✗✗✗
Raw results	Method 'scprint' %missing	1.0000000	pct_missing <= .1	✗✗✗
Raw results	Dataset 'cellxgene_census/mouse_pancreas_atlas' %missing	0.3684211	pct_missing <= .1	✗✗✗
Raw results	Method 'scgpt_zeroshot' %missing	0.3333333	pct_missing <= .1	✗✗✗
Raw results	Method 'scimilarity' %missing	0.3333333	pct_missing <= .1	✗✗✗
Raw results	Method 'scimilarity_knn' %missing	0.3333333	pct_missing <= .1	✗✗✗
Raw results	Method 'singler' %missing	0.3333333	pct_missing <= .1	✗✗✗
Raw results	Dataset 'cellxgene_census/hypomap' %missing	0.3157895	pct_missing <= .1	✗✗✗
Dataset info	Pct 'task_id' missing	1.0000000	percent_missing(dataset_info, field)	✗✗
Method info	Pct 'paper_reference' missing	0.8421053	percent_missing(method_info, field)	✗✗
Metric info	Pct 'paper_reference' missing	1.0000000	percent_missing(metric_info, field)	✗✗
Raw results	Metric 'accuracy' %missing	0.2280702	pct_missing <= .1	✗✗
Raw results	Metric 'f1_macro' %missing	0.2280702	pct_missing <= .1	✗✗
Raw results	Metric 'f1_micro' %missing	0.2280702	pct_missing <= .1	✗✗
Raw results	Metric 'f1_weighted' %missing	0.2280702	pct_missing <= .1	✗✗
Raw results	Dataset 'cellxgene_census/immune_cell_atlas' %missing	0.2105263	pct_missing <= .1	✗✗
Raw results	Dataset 'cellxgene_census/dkd' %missing	0.1578947	pct_missing <= .1	✗
Raw results	Dataset 'cellxgene_census/gtex_v9' %missing	0.1578947	pct_missing <= .1	✗
Raw results	Dataset 'cellxgene_census/tabula_sapiens' %missing	0.1578947	pct_missing <= .1	✗

Normalisation visualisation

Show

Authors

Nikolay Markov (author, maintainer)
Scott Gigante (author)
Robrecht Cannoodt (author)

References

Aran, Dvir, Agnieszka P. Looney, Leqian Liu, Esther Wu, Valerie Fong, Austin Hsu, Suzanna Chak, et al. 2019. “Reference-Based Analysis of Lung Single-Cell Sequencing Reveals a Transitional Profibrotic Macrophage.” Nature Immunology 20 (2): 163–72. https://doi.org/10.1038/s41590-018-0276-y.

Chen, Han, Madhavan S Venkatesh, Javier Gomez Ortega, Siddharth V Mahesh, Tarak N Nandi, Ravi K Madduri, Karin Pelka, and Christina V Theodoris. 2024. “Quantized Multi-Task Learning for Context-Specific Representations of Gene Network Dynamics,” August. https://doi.org/10.1101/2024.08.16.608180.

Chen, Tianqi, and Carlos Guestrin. 2016. “XGBoost: A Scalable Tree Boosting System.” In Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, 785–94. KDD ’16. ACM. https://doi.org/10.1145/2939672.2939785.

Cover, T., and P. Hart. 1967. “Nearest Neighbor Pattern Classification.” IEEE Transactions on Information Theory 13 (1): 21–27. https://doi.org/10.1109/tit.1967.1053964.

Cui, Haotian, Chloe Wang, Hassaan Maan, Kuan Pang, Fengning Luo, Nan Duan, and Bo Wang. 2024. “scGPT: Toward Building a Foundation Model for Single-Cell Multi-Omics Using Generative AI.” Nature Methods 21 (8): 1470–80. https://doi.org/10.1038/s41592-024-02201-0.

Domínguez Conde, C., C. Xu, L. B. Jarvis, D. B. Rainbow, S. B. Wells, T. Gomes, S. K. Howlett, et al. 2022. “Cross-Tissue Immune Cell Analysis Reveals Tissue-Specific Features in Humans.” Science 376 (6594). https://doi.org/10.1126/science.abl5197.

Eraslan, Gökcen, Eugene Drokhlyansky, Shankara Anand, Evgenij Fiskin, Ayshwarya Subramanian, Michal Slyper, Jiali Wang, et al. 2022. “Single-Nucleus Cross-Tissue Molecular Reference Maps Toward Understanding Disease Gene Function.” Science 376 (6594). https://doi.org/10.1126/science.abl4290.

Grandini, Margherita, Enrico Bagli, and Giorgio Visani. 2020. “Metrics for Multi-Class Classification: An Overview.” arXiv. https://doi.org/10.48550/ARXIV.2008.05756.

Hao, Yuhan, Stephanie Hao, Erica Andersen-Nissen, William M. Mauck, Shiwei Zheng, Andrew Butler, Maddie J. Lee, et al. 2021. “Integrated Analysis of Multimodal Single-Cell Data.” Cell 184 (13): 3573–3587.e29. https://doi.org/10.1016/j.cell.2021.04.048.

Heimberg, Graham, Tony Kuo, Daryle DePianto, Tobias Heigl, Nathaniel Diamant, Omar Salem, Gabriele Scalia, et al. 2023. “Scalable Querying of Human Cell Atlases via a Foundational Model Reveals Commonalities Across Fibrosis-Associated Macrophages,” July. https://doi.org/10.1101/2023.07.18.549537.

Hinton, Geoffrey E. 1989. “Connectionist Learning Procedures.” Artificial Intelligence 40 (1–3): 185–234. https://doi.org/10.1016/0004-3702(89)90049-0.

Hosmer Jr, D. W., S. Lemeshow, and R. X. Sturdivant. 2013. Applied Logistic Regression. Vol. 398. John Wiley & Sons.

Hrovatin, Karin, Aimée Bastidas-Ponce, Mostafa Bakhti, Luke Zappia, Maren Büttner, Ciro Sallino, Michael Sterr, et al. 2023. “Delineating Mouse β-Cell Identity During Lifetime and in Diabetes with a Single Cell Atlas.” bioRxiv. https://doi.org/10.1101/2022.12.22.521557.

Jones, Robert C., Jim Karkanias, Mark A. Krasnow, Angela Oliveira Pisco, Stephen R. Quake, Julia Salzman, Nir Yosef, et al. 2022. “The Tabula Sapiens: A Multiple-Organ, Single-Cell Transcriptomic Atlas of Humans.” Science 376 (6594). https://doi.org/10.1126/science.abl4896.

Kalfon, Jérémie, Jules Samaran, Gabriel Peyré, and Laura Cantini. 2024. “scPRINT: Pre-Training on 50 Million Cells Allows Robust Gene Network Predictions,” July. https://doi.org/10.1101/2024.07.29.605556.

Lotfollahi, Mohammad, Mohsen Naghipourfar, Malte D. Luecken, Matin Khajavi, Maren Büttner, Ziga Avsec, Alexander V. Misharin, and Fabian J. Theis. 2020. “Query to Reference Single-Cell Integration with Transfer Learning,” July. https://doi.org/10.1101/2020.07.16.205997.

Rosen, Yanay, Yusuf Roohani, Ayush Agrawal, Leon Samotorcan, Tabula Sapiens Consortium, Stephen R. Quake, and Jure Leskovec. 2023. “Universal Cell Embeddings: A Foundation Model for Cell Biology,” November. https://doi.org/10.1101/2023.11.28.568918.

Steuernagel, Lukas, Brian Y. H. Lam, Paul Klemm, Georgina K. C. Dowsett, Corinna A. Bauder, John A. Tadross, Tamara Sotelo Hitschfeld, et al. 2022. “HypoMap—a Unified Single-Cell Gene Expression Atlas of the Murine Hypothalamus.” Nature Metabolism 4 (10): 1402–19. https://doi.org/10.1038/s42255-022-00657-y.

Theodoris, Christina V., Ling Xiao, Anant Chopra, Mark D. Chaffin, Zeina R. Al Sayed, Matthew C. Hill, Helene Mantineo, et al. 2023. “Transfer Learning Enables Predictions in Network Biology.” Nature 618 (7965): 616–24. https://doi.org/10.1038/s41586-023-06139-9.

Wilson, Parker C., Yoshiharu Muto, Haojia Wu, Anil Karihaloo, Sushrut S. Waikar, and Benjamin D. Humphreys. 2022. “Multimodal Single Cell Sequencing Implicates Chromatin Accessibility and Genetic Background in Diabetic Kidney Disease Progression.” Nature Communications 13 (1). https://doi.org/10.1038/s41467-022-32972-z.