Source code for maxfuse.metrics

"""
Functions for metric calculation
"""
import numpy as np
from collections.abc import Iterable

from . import match_utils


def get_matching_acc(matching, labels1, labels2, order=None):
    """
    Compute the cluster level matching accuracy.

    A matched pair (i, j) counts as correct when labels1[i] == labels2[j];
    the accuracy is the fraction of correct pairs.

    Parameters
    ----------
    matching: a list of length three.
        The matched pairs are (matching[0][i], matching[1][i]),
        and its score (the higher, the better) is matching[2][i].
    labels1: np.array of shape (n_samples1,)
        The first label vector.
    labels2: np.array of shape (n_samples2,)
        The second label vector.
    order: None or (1, 2) or (2, 1), default=None
        If None, then directly use matching without addressing any redundancy.
        Otherwise matching is first passed through
        match_utils.address_matching_redundancy with this order:
        if (1, 2), find one-to-one matching from the first dataset to the second dataset;
        if (2, 1), do the other way around.

    Returns
    -------
    Matching accuracy. With no matched pairs this is the mean of an empty
    list, i.e. nan (numpy emits a RuntimeWarning).
    """
    if order is None:
        rows, cols = matching[0], matching[1]
    else:
        # Resolve duplicated matches before scoring.
        matching = match_utils.address_matching_redundancy(matching=matching, order=order)
        rows, cols, _ = matching
    return np.mean([labels1[i] == labels2[j] for i, j in zip(rows, cols)])


def get_foscttm(dist, true_matching='identity'):
    """
    Compute the fraction of samples closer than true match.

    For every row i, compute the fraction of columns j with
    dist[i, j] < dist[i, true_matching[i]], then average over rows.

    Parameters
    ----------
    dist: np.ndarray of shape (n1, n2)
        Distance matrix.
    true_matching: 'identity' or Iterable of length n1, default='identity'
        If is an Iterable (list, np.ndarray, ...), then the ground truth
        matched pairs are (i, true_matching[i]).
        If is 'identity', then true_matching = [0, 1, ..., n1 - 1].

    Returns
    -------
    The fraction of samples closer than true match.

    Raises
    ------
    NotImplementedError
        If true_matching is neither 'identity' nor a non-string Iterable.
    """
    n1, _ = dist.shape
    # Check for str first: comparing an np.ndarray with == 'identity' is
    # elementwise and cannot be used as an `if` condition.
    if isinstance(true_matching, str):
        if true_matching != 'identity':
            raise NotImplementedError('true_matching must be \'identity\' or Iterable of length dist.shape[0].')
        true_matching = np.arange(n1)
    elif isinstance(true_matching, Iterable):
        true_matching = list(true_matching)
    else:
        raise NotImplementedError('true_matching must be \'identity\' or Iterable of length dist.shape[0].')

    # Distance from each row to its ground-truth match, shape (n1,).
    true_dist = dist[np.arange(n1), true_matching]
    # mask[i, j] = True iff dist[i, j] < dist[i, true_matching[i]]
    mask = dist < true_dist[:, None]
    return np.mean(np.mean(mask, axis=1))


def get_matching_alignment_score(estimated_matching, n_samples, true_matching='identity'):
    """
    Compute the alignment between the estimated matching and the true_matching
    according to the metric in https://openproblems.bio/neurips_docs/about_tasks/task2_modality_matching/.

    For every index idx1 of the first dataset that appears in
    estimated_matching, the scores of its matched candidates are normalized
    to sum to one; the normalized score placed on the true match is then
    accumulated and the total is divided by the number of distinct idx1
    that appear in estimated_matching.

    Parameters
    ----------
    estimated_matching: a list of length three.
        The matched pairs are (estimated_matching[0][i], estimated_matching[1][i]),
        and its score (the higher, the better) is estimated_matching[2][i].
    n_samples: int
        The sample size for the first dataset.
    true_matching: 'identity' or Iterable of length n_samples, default='identity'
        If is an Iterable (list, np.ndarray, ...), then the ground truth
        matched pairs are (i, true_matching[i]).
        If is 'identity', then true_matching = [0, 1, ..., n_samples - 1].

    Returns
    -------
    The alignment score.

    Raises
    ------
    NotImplementedError
        If true_matching is neither 'identity' nor a non-string Iterable.
    ZeroDivisionError
        If estimated_matching contains no matched pairs.
    """
    # Check for str first: comparing an np.ndarray with == 'identity' is
    # elementwise and cannot be used as an `if` condition.
    if isinstance(true_matching, str):
        if true_matching != 'identity':
            raise NotImplementedError('true_matching must be \'identity\' or Iterable of length n_samples.')
        true_matching = np.arange(n_samples)
    elif isinstance(true_matching, Iterable):
        true_matching = list(true_matching)
    else:
        raise NotImplementedError('true_matching must be \'identity\' or Iterable of length n_samples.')

    # Group the estimated matches by their index in the first dataset:
    # idx1 -> [(idx2, score), ...]
    idx1_to_candidates = dict()
    for i, j, score in zip(estimated_matching[0], estimated_matching[1], estimated_matching[2]):
        idx1_to_candidates.setdefault(i, []).append((j, score))

    res = 0
    for idx1, idx2 in enumerate(true_matching):
        candidates = idx1_to_candidates.get(idx1)
        if candidates is None:
            # idx1 has no estimated match, so it contributes nothing
            continue
        # normalize the scores of this idx1 so that they sum to one
        total = np.sum([score for _, score in candidates])
        for candidate_idx2, score in candidates:
            if idx2 == candidate_idx2:
                res += score / total
    return res / len(idx1_to_candidates)


def get_knn_alignment_score(dist, k_max, true_matching='identity'):
    """
    For each 1 <= k <= k_max, obtain knn matching from dist,
    and compute its matching proximity with the true matching.
    The proximity is calculated by:
    for each cell in arr1, claim it is successfully matched when the true match is in the k-nearest-neighborhood;
    then calculate the average success rate.

    Parameters
    ----------
    dist: np.ndarray of shape (n1, n2)
        Distance matrix.
    k_max: int
        Maximum k for knn matching. Must satisfy k_max <= n2.
    true_matching: 'identity' or Iterable of length n1, default='identity'
        If is an Iterable (list, np.ndarray, ...), then the ground truth
        matched pairs are (i, true_matching[i]).
        If is 'identity', then true_matching = [0, 1, ..., n1 - 1].

    Returns
    -------
    np.ndarray of shape (k_max,) representing the score for each 1<=k<=k_max
    (entry k - 1 holds the score of the k-nn matching).

    Raises
    ------
    AssertionError
        If k_max > n2.
    NotImplementedError
        If true_matching is neither 'identity' nor a non-string Iterable.
    """
    n1, n2 = dist.shape
    assert k_max <= n2, 'k_max must not exceed dist.shape[1].'
    # knn_indices[i, :] are the columns of the k_max smallest distances in row i,
    # ordered from nearest to farthest
    knn_indices = np.argsort(dist, axis=1)[:, :k_max]

    # Check for str first: comparing an np.ndarray with == 'identity' is
    # elementwise and cannot be used as an `if` condition.
    if isinstance(true_matching, str):
        if true_matching != 'identity':
            raise NotImplementedError('true_matching must be \'identity\' or Iterable of length dist.shape[0].')
        true_matching = np.arange(n1)
    elif isinstance(true_matching, Iterable):
        true_matching = list(true_matching)
    else:
        raise NotImplementedError('true_matching must be \'identity\' or Iterable of length dist.shape[0].')

    res = np.zeros(k_max)
    for idx1, idx2 in enumerate(true_matching):
        idx2_location = np.where(knn_indices[idx1, :] == idx2)[0]
        if len(idx2_location) == 0:
            # even k_max-nn matching does not contain the true match
            continue
        # idx2_location[0] is the 0-based rank of the true match among the neighbors,
        # so every knn matching with k >= idx2_location[0] + 1 contains it;
        # those k correspond to res[idx2_location[0]:]
        res[idx2_location[0]:] += 1
    return res / n1