Predict Modality

Predicting the profiles of one modality (e.g. protein abundance) from another (e.g. mRNA expression).

6 datasets · 4 methods · 4 control methods · 8 metrics

Info

Task info Method info Metric info Dataset info Results

Experimental techniques to measure multiple modalities within the same single cell are increasingly becoming available. The demand for these measurements is driven by the promise to provide a deeper insight into the state of a cell. Yet, the modalities are also intrinsically linked. We know that DNA must be accessible (ATAC data) to produce mRNA (expression data), and mRNA in turn is used as a template to produce protein (protein abundance). These processes are regulated often by the same molecules that they produce: for example, a protein may bind DNA to prevent the production of more mRNA. Understanding these regulatory processes would be transformative for synthetic biology and drug target discovery. Any method that can predict a modality from another must have accounted for these regulatory processes, but the demand for multi-modal data shows that this is not trivial.

Summary

function aggregate_scores(obj) {
  return d3.mean(obj.map(val => {
    if (val.score === undefined || isNaN(val.score)) return 0;
    return Math.min(1, Math.max(0, val.score))
  }));
}

function transpose_list_of_objects(list) {
  return Object.fromEntries(Object.keys(list[0]).map(key => [key, list.map(d => d[key])]))
}

function label_time(time) {
  if (time < 1e-5) return "0s";
  if (time < 1) return "<1s";
  if (time < 60) return `${Math.floor(time)}s`;
  if (time < 3600) return `${Math.floor(time / 60)}m`;
  if (time < 3600 * 24) return `${Math.floor(time / 3600)}h`;
  if (time < 3600 * 24 * 7) return `${Math.floor(time / 3600 / 24)}d`;
  return ">7d"; // Assuming missing values are encoded as NaN
}

function label_memory(x_mb, include_mb = true) {
  if (!include_mb && x_mb < 1e3) return "<1G";
  if (x_mb < 1) return "<1M";
  if (x_mb < 1e3) return `${Math.round(x_mb)}M`;
  if (x_mb < 1e6) return `${Math.round(x_mb / 1e3)}G`;
  if (x_mb < 1e9) return `${Math.round(x_mb / 1e6)}T`;
  return ">1P";
}

function mean_na_rm(x) {
  return d3.mean(x.filter(d => !isNaN(d)));
}

poss_dataset_ids = dataset_info
  .map(d => d.dataset_id)
  .filter(d => results.map(r => r.dataset_id).includes(d))
poss_method_ids = method_info
  .map(d => d.method_id)
  .filter(d => results.map(r => r.method_id).includes(d))
poss_metric_ids = metric_info
  .map(d => d.metric_id)
  .filter(d => results.map(r => Object.keys(r.scaled_scores)).flat().includes(d))

has_resources = results[0].hasOwnProperty("resources")
has_exit_codes = results[0].hasOwnProperty("exit_codes")

results_long = results.flatMap(d => {
  return Object.entries(d.scaled_scores).map(([metric_id, value]) =>
    ({
      method_id: d.method_id,
      dataset_id: d.dataset_id,
      metric_id: metric_id,
      score: value
    })
  )
}).filter(d => method_ids.includes(d.method_id) && metric_ids.includes(d.metric_id) && dataset_ids.includes(d.dataset_id))

overall = d3.groups(results_long, d => d.method_id)
  .map(([method_id, values]) => ({method_id, mean_score: aggregate_scores(values)}))

per_dataset = d3.groups(results_long, d => d.method_id)
  .map(([method_id, values]) => {
    const datasets = d3.groups(values, d => d.dataset_id)
      .map(([dataset_id, values]) => ({["dataset_" + dataset_id]: aggregate_scores(values)}))
      .reduce((a, b) => ({...a, ...b}), {})
    return {method_id, ...datasets}
  })

per_metric = d3.groups(results_long, d => d.method_id)
  .map(([method_id, values]) => {
    const metrics = d3.groups(values, d => d.metric_id)
      .map(([metric_id, values]) => ({["metric_" + metric_id]: aggregate_scores(values)}))
      .reduce((a, b) => ({...a, ...b}), {})
    return {method_id, ...metrics}
  })

results_resources = {
  let results_resources = null

  if (has_resources) {
    results_resources = results.flatMap(d => {
      return ({
        method_id: d.method_id,
        dataset_id: d.dataset_id,
        ...d.resources
      })
    }).filter(d => method_ids.includes(d.method_id) && dataset_ids.includes(d.dataset_id))
  }

  return results_resources
}

resources = {
  let resources = null

  if (has_resources) {
    resources = d3.groups(results_resources, d => d.method_id)
      .map(([method_id, values]) => {
        const mean_peak_memory_mb = mean_na_rm(values.map(d => d.peak_memory_mb))
        const mean_disk_read_mb = mean_na_rm(values.map(d => d.disk_read_mb))
        const mean_disk_write_mb = mean_na_rm(values.map(d => d.disk_write_mb))
        const mean_duration_sec = mean_na_rm(values.map(d => d.duration_sec))

        return ({
          method_id,
          mean_cpu_pct: mean_na_rm(values.map(d => d.cpu_pct)),
          mean_peak_memory_mb,
          mean_peak_memory_log: -Math.log10(mean_peak_memory_mb),
          mean_peak_memory_str: " " + label_memory(mean_peak_memory_mb) + " ",
          mean_disk_read_mb: mean_na_rm(values.map(d => d.disk_read_mb)),
          mean_disk_read_log: -Math.log10(mean_disk_read_mb),
          mean_disk_read_str: " " + label_memory(mean_disk_read_mb) + " ",
          mean_disk_write_mb: mean_na_rm(values.map(d => d.disk_write_mb)),
          mean_disk_write_log: -Math.log10(mean_disk_write_mb),
          mean_disk_write_str: " " + label_memory(mean_disk_write_mb) + " ",
          mean_duration_sec,
          mean_duration_log: -Math.log10(mean_duration_sec),
          mean_duration_str: " " + label_time(mean_duration_sec) + " "
        })
      })
  }

  return resources
}

exit_codes = {
  let exit_codes = null

  if (has_exit_codes) {
    exit_codes = results.flatMap(d => {
      return ({
        method_id: d.method_id,
        dataset_id: d.dataset_id,
        exit_codes: Object.values(d.exit_codes)
      })
    }).filter(d => method_ids.includes(d.method_id) && dataset_ids.includes(d.dataset_id))
  } else {
    exit_codes = results_resources.flatMap(d => {
      let exit_code = d.exit_code
      if (exit_code === undefined) {
        // If there is not exit code, assume the method ran successfully
        exit_code = 0
      }

      return ({
        method_id: d.method_id,
        dataset_id: d.dataset_id,
        exit_codes: [exit_code]
      })
    }).filter(d => method_ids.includes(d.method_id) && dataset_ids.includes(d.dataset_id))
  }

  return exit_codes
}

error_reasons = d3.groups(exit_codes, d => d.method_id)
  .map(([method_id, values]) => {
    const all_codes = values.flatMap(d => d.exit_codes)

    if (all_codes.length === 0) {
      return {method_id, error_reason: []}
    }

    const error_pct_oom = d3.mean(all_codes, d => d === 137)
    const error_pct_timeout = d3.mean(all_codes, d => d === 143)
    const error_pct_na = d3.mean(all_codes, d => d === 99)
    const error_pct_error = d3.mean(all_codes, d => d > 0) - error_pct_oom - error_pct_timeout - error_pct_na
    const error_pct_unknown = d3.mean(all_codes, d => d < 0)
    const error_pct_ok = d3.mean(all_codes, d => d === 0)
    return ({
      method_id,
      error_reason: [
        error_pct_oom,
        error_pct_timeout,
        error_pct_error,
        error_pct_unknown,
        error_pct_na,
        error_pct_ok
      ],
    })
  })

summary_all = method_info
  .filter(d => show_con || !d.is_baseline)
  .filter(d => method_ids.includes(d.method_id))
  .map(method => {
    const method_id = method.method_id
    const method_name = method.method_name
    const mean_score = overall.find(d => d.method_id === method_id).mean_score
    const datasets = per_dataset.find(d => d.method_id === method_id)
    const metrics = per_metric.find(d => d.method_id === method_id)
    const error_reasons_ = error_reasons.find(d => d.method_id === method_id)

    let summary = {
      method_id,
      method_name,
      mean_score,
      ...datasets,
      ...metrics,
      ...error_reasons_
    }

    if (has_resources) {
      const resources_ = resources.find(d => d.method_id === method_id)
      summary = {...summary, ...resources_}
    }
    return summary
  })
  .sort((a, b) => b.mean_score - a.mean_score)

// make sure the first entry contains all columns
column_info = {
  let column_info = [
    {
      id: "method_name",
      name: "Name",
      label: null,
      group: "method",
      geom: "text",
      palette: null
    },
    {
      id: "mean_score",
      name: "Score",
      group: "overall",
      geom: "bar",
      palette: "overall"
    },
    {
      id: "error_reason",
      name: "Error reason",
      group: "overall",
      geom: "pie",
      palette: "error_reason"
    },
    ...dataset_info
      .filter(d => dataset_ids.includes(d.dataset_id))
      .map(
        d => ({
          id: "dataset_" + d.dataset_id,
          name: d.dataset_name,
          group: "dataset",
          geom: "funkyrect",
          palette: "dataset"
        })
      )
      .sort((a, b) => a.name.localeCompare(b.name)),
    ...metric_info
      .filter(d => metric_ids.includes(d.metric_id))
      .map(
        d => ({
          id: "metric_" + d.metric_id,
          name: d.metric_name,
          group: "metric",
          geom: "funkyrect",
          palette: "metric"
        })
      )
      .sort((a, b) => a.name.localeCompare(b.name)),
  ]

  if (has_resources) {
    column_info.push(
      {
        id: "mean_cpu_pct",
        name: "%CPU",
        group: "resources",
        geom: "funkyrect",
        palette: "resources"
      },
      {
        id: "mean_peak_memory_log",
        name: "Peak memory",
        label: "mean_peak_memory_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      },
      {
        id: "mean_disk_read_log",
        name: "Disk read",
        label: "mean_disk_read_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      },
      {
        id: "mean_disk_write_log",
        name: "Disk write",
        label: "mean_disk_write_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      },
      {
        id: "mean_duration_log",
        name: "Duration",
        label: "mean_duration_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      }
    )
  }

  column_info = column_info.map(d => {
    if (d.id === "method_name") {
      return {...d, options: {width: 15, hjust: 0}}
    } else if (d.id === "is_baseline") {
      return {...d, options: {width: 1}}
    } else if (d.geom === "bar") {
      return {...d, options: {width: 4}}
    } else {
      return d
    }
  })

  return column_info
}

column_groups = {
  let column_groups = [
    {
      group: "method",
      palette: null,
      level1: ""
    },
    {
      group: "overall",
      palette: "overall",
      level1: "Overall"
    },
    {
      group: "error_reason",
      palette: "error_reason",
      level1: "Error reason"
    },
    {
      group: "dataset",
      palette: "dataset",
      level1: dataset_info.length >= 3 ? "Datasets" : ""
    },
    {
      group: "metric",
      palette: "metric",
      level1: metric_info.length >= 3 ? "Metrics" : ""
    }
  ]

  if (has_resources) {
    column_groups.push(
      {group: "resources", palette: "resources", level1: "Resources"}
    )
  }

  return column_groups
}

palettes = [
  {
    overall: "Greys",
    dataset: "Blues",
    metric: "Reds",
    resources: "YlOrBr",
    error_reason: {
      colors: ["#8DD3C7", "#FFFFB3", "#BEBADA", "#fdb462", "#999999", "#FFFFFF"],
      names: [
        "Memory limit exceeded",
        "Time limit exceeded",
        "Execution error",
        "Unknown error",
        "Not applicable",
        "No error"
      ]
    }
  }
][0]

funkyheatmap(
    transpose_list_of_objects(summary_all),
    transpose_list_of_objects(column_info),
    [],
    transpose_list_of_objects(column_groups),
    [],
    palettes,
    {
        fontSize: 14,
        rowHeight: 26,
        rootStyle: 'max-width: none',
        colorByRank: color_by_rank,
        theme: {
            oddRowBackground: 'var(--bs-body-bg)',
            evenRowBackground: 'var(--bs-button-hover)',
            textColor: 'var(--bs-body-color)',
            strokeColor: 'var(--bs-body-color)',
            headerColor: 'var(--bs-body-color)',
            hoverColor: 'var(--bs-body-color)'
        }
    },
    scale_column
);

Figure 1: Overview of the results per method. This figures shows the mean of the scaled scores (group Overall), the mean scores per dataset (group Dataset) and the mean scores per metric (group Metric).

Display settings

viewof color_by_rank = Inputs.toggle({label: "Color by rank:", value: true})
viewof scale_column = Inputs.toggle({label: "Minmax column:", value: false})
viewof show_con = Inputs.toggle({label: "Show control methods:", value: true})

Filter datasets

viewof dataset_ids = Inputs.checkbox(
  dataset_info.filter(d => poss_dataset_ids.includes(d.dataset_id)),
  {
    keyof: d => d.dataset_name,
    valueof: d => d.dataset_id,
    value: dataset_info.map(d => d.dataset_id),
    label: "Datasets:"
  }
)

Filter methods

viewof method_ids = Inputs.checkbox(
  method_info.filter(d => poss_method_ids.includes(d.method_id)),
  {
    keyof: d => d.method_name,
    valueof: d => d.method_id,
    value: method_info.map(d => d.method_id),
    label: "Methods:"
  }
)

Filter metrics

viewof metric_ids = Inputs.checkbox(
  metric_info.filter(d => poss_metric_ids.includes(d.metric_id)),
  {
    keyof: d => d.metric_name,
    valueof: d => d.metric_id,
    value: metric_info.map(d => d.metric_id),
    label: "Metrics:"
  }
)

funkyheatmap = (await require('d3@7').then(d3 => {
  window.d3 = d3;
  window._ = _;
  return import('https://unpkg.com/funkyheatmapjs@0.2.5');
})).default;

Results

Results table of the scores per method, dataset and metric (after scaling). Use the filters to make a custom subselection of methods and datasets. The “Overall mean” dataset is the mean value across all datasets.

Dataset info

Show

NeurIPS2021 CITE-Seq (GEX2ADT)

Data source · 25-11-2024 · 688.47 KiB

Single-cell CITE-Seq (GEX+ADT) data collected from bone marrow mononuclear cells of 12 healthy human donors (Luecken et al. 2021).

Single-cell CITE-Seq data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X 3 prime Single-Cell Gene Expression kit with Feature Barcoding in combination with the BioLegend TotalSeq B Universal Human Panel v1.0. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site.

NeurIPS2021 Multiome (GEX2ATAC)

Data source · 25-11-2024 · 29.64 MiB

Single-cell Multiome (GEX+ATAC) data collected from bone marrow mononuclear cells of 12 healthy human donors (Luecken et al. 2021).

Single-cell CITE-Seq data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X Multiome Gene Expression and Chromatin Accessibility kit. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site.

NeurIPS2021 Multiome (ATAC2GEX)

Data source · 25-11-2024 · 7.52 MiB

Single-cell Multiome (GEX+ATAC) data collected from bone marrow mononuclear cells of 12 healthy human donors (Luecken et al. 2021).

OpenProblems NeurIPS2022 CITE-Seq (GEX2ADT)

Data source · 25-11-2024 · 578.01 KiB

Single-cell CITE-Seq (GEX+ADT) data collected from bone marrow mononuclear cells of 12 healthy human donors (... 2024).

Single-cell CITE-Seq data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X 3 prime Single-Cell Gene Expression kit with Feature Barcoding in combination with the BioLegend TotalSeq B Universal Human Panel v1.0. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2022. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site.

OpenProblems NeurIPS2022 CITE-Seq (ADT2GEX)

Data source · 25-11-2024 · 31.04 MiB

Single-cell CITE-Seq (GEX+ADT) data collected from bone marrow mononuclear cells of 12 healthy human donors (... 2024).

Single-cell CITE-Seq data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X 3 prime Single-Cell Gene Expression kit with Feature Barcoding in combination with the BioLegend TotalSeq B Universal Human Panel v1.0. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2022. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site.

NeurIPS2021 CITE-Seq (ADT2GEX)

Data source · 25-11-2024 · 12.84 MiB

Single-cell CITE-Seq (GEX+ADT) data collected from bone marrow mononuclear cells of 12 healthy human donors (Luecken et al. 2021).

Method info

Show

KNNR (Py)

Documentation · Repository · Source Code · Container · build_main

K-nearest neighbor regression in Python (Fix and Hodges 1989)

K-nearest neighbor regression in Python.

KNNR (R)

Documentation · Repository · Source Code · Container · build_main

K-nearest neighbor regression in R (Fix and Hodges 1989)

K-nearest neighbor regression in R.

Linear Model

Documentation · Repository · Source Code · Container · build_main

Linear model regression (Wilkinson and Rogers 1973)

A linear model regression method.

Guanlab-dengkw

Documentation · Repository · Source Code · Container · build_main

A kernel ridge regression method with RBF kernel (Lance et al. 2022)

This is a solution developed by Team Guanlab - dengkw in the Neurips 2021 competition to predict one modality from another using kernel ridge regression (KRR) with RBF kernel. Truncated SVD is applied on the combined training and test data from modality 1 followed by row-wise z-score normalization on the reduced matrix. The truncated SVD of modality 2 is predicted by training a KRR model on the normalized training matrix of modality 1. Predictions on the normalized test matrix are then re-mapped to the modality 2 feature space via the right singular vectors.

Control method info

Show

Mean per gene

Documentation · Repository · Source Code · Container · build_main

Returns the mean expression value per gene

Returns the mean expression value per gene.

Random predictions

Documentation · Repository · Source Code · Container · build_main

Returns random training profiles

Returns random training profiles.

Zeros

Documentation · Repository · Source Code · Container · build_main

Returns a prediction consisting of all zeros

Returns a prediction consisting of all zeros.

Solution

Documentation · Repository · Source Code · Container · build_main

Returns the ground-truth solution

Returns the ground-truth solution.

Metric info

Show

Mean pearson per cell

Source code · Container

The mean of the pearson values of per-cell expression value vectors (1895).

The mean of the pearson values of per-cell expression value vectors.

Mean spearman per cell

Source code · Container

The mean of the spearman values of per-cell expression value vectors (KENDALL 1938).

The mean of the spearman values of per-cell expression value vectors.

Mean pearson per gene

Source code · Container

The mean of the pearson values of per-gene expression value vectors (1895).

The mean of the pearson values of per-gene expression value vectors.

Mean spearman per gene

Source code · Container

The mean of the spearman values of per-gene expression value vectors (KENDALL 1938).

The mean of the spearman values of per-gene expression value vectors.

Overall pearson

Source code · Container

The mean of the pearson values of vectorized expression matrices (1895).

The mean of the pearson values of vectorized expression matrices.

Overall spearman

Source code · Container

The mean of the spearman values of vectorized expression matrices (KENDALL 1938).

The mean of the spearman values of vectorized expression matrices.

RMSE

Source code · Container

The root mean squared error (Chai and Draxler 2014).

The square root of the mean of the square of all of the error.

MAE

Source code · Container

The mean absolute error (Chai and Draxler 2014).

The average difference between the expression values and the predicted expression values.

Quality control results

Show

Category	Name	Value	Condition	Severity
Raw results	Dataset 'openproblems_neurips2022/pbmc_multiome/swap' %missing	0.3611111	pct_missing <= .1	✗✗✗
Dataset info	Pct 'task_id' missing	1.0000000	percent_missing(dataset_info, field)	✗✗
Method info	Pct 'paper_reference' missing	0.5555556	percent_missing(method_info, field)	✗✗
Metric info	Pct 'paper_reference' missing	1.0000000	percent_missing(metric_info, field)	✗✗
Raw results	Method 'guanlab_dengkw_pm' %missing	0.2500000	pct_missing <= .1	✗✗
Raw results	Method 'zeros' %missing	0.2500000	pct_missing <= .1	✗✗
Scaling	Worst score lmds_irlba_rf overall_pearson	-2.4102000	worst_score >= -1	✗✗
Raw results	Metric 'overall_pearson' %missing	0.1666667	pct_missing <= .1	✗
Raw results	Metric 'overall_spearman' %missing	0.1666667	pct_missing <= .1	✗
Raw results	Dataset 'openproblems_neurips2022/pbmc_multiome/normal' %missing	0.1388889	pct_missing <= .1	✗
Raw results	Method 'knnr_py' %missing	0.1250000	pct_missing <= .1	✗
Raw results	Method 'lm' %missing	0.1250000	pct_missing <= .1	✗

Normalisation visualisation

Show

Authors

Alejandro Granados (author)
Alex Tong (author)
Bastian Rieck (author)
Daniel Burkhardt (author)
Kai Waldrant (contributor) ,
Kaiwen Deng (contributor)
Louise Deconinck (author)
Robrecht Cannoodt (author, maintainer) ,

References

1895. Proceedings of the Royal Society of London 58 (347–352): 240–42. https://doi.org/10.1098/rspl.1895.0041.

... 2024. “Predicting Cellular Profiles Across Modalities in Longitudinal Single-Cell Data: An Open Problems Competition.” In Preparation.

Chai, T., and R. R. Draxler. 2014. “Root Mean Square Error (RMSE) or Mean Absolute Error (MAE)?” February. https://doi.org/10.5194/gmdd-7-1525-2014.

Fix, Evelyn, and J. L. Hodges. 1989. “Discriminatory Analysis. Nonparametric Discrimination: Consistency Properties.” International Statistical Review / Revue Internationale de Statistique 57 (3): 238. https://doi.org/10.2307/1403797.

KENDALL, M. G. 1938. “A NEW MEASURE OF RANK CORRELATION.” Biometrika 30 (1–2): 81–93. https://doi.org/10.1093/biomet/30.1-2.81.

Lance, Christopher, Malte D. Luecken, Daniel B. Burkhardt, Robrecht Cannoodt, Pia Rautenstrauch, Anna Laddach, Aidyn Ubingazhibov, et al. 2022. “Multimodal Single Cell Data Integration Challenge: Results and Lessons Learned,” April. https://doi.org/10.1101/2022.04.11.487796.

Luecken, Malte, Daniel Burkhardt, Robrecht Cannoodt, Christopher Lance, Aditi Agrawal, Hananeh Aliee, Ann Chen, et al. 2021. “A Sandbox for Prediction and Integration of DNA, RNA, and Proteins in Single Cells.” In Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks, edited by J. Vanschoren and S. Yeung. Vol. 1. Curran. https://datasets-benchmarks-proceedings.neurips.cc/paper_files/paper/2021/file/158f3069a435b314a80bdcb024f8e422-Paper-round2.pdf.

Wilkinson, G. N., and C. E. Rogers. 1973. “Symbolic Description of Factorial Models for Analysis of Variance.” Applied Statistics 22 (3): 392. https://doi.org/10.2307/2346786.