"""
Functions for metric calculation
"""
import numpy as np
from collections.abc import Iterable
from . import match_utils
def get_matching_acc(matching, labels1, labels2, order=None):
    """
    Compute the cluster level matching accuracy.

    Parameters
    ----------
    matching: a list of length three.
        The matched pairs are (matching[0][i], matching[1][i]),
        and its score (the higher, the better) is matching[2][i].
    labels1: np.array of shape (n_samples1,)
        The first label vector, indexed by matching[0].
    labels2: np.array of shape (n_samples2,)
        The second label vector, indexed by matching[1].
    order: None or (1, 2) or (2, 1), default=None
        If None, then directly use matching without addressing any redundancy.
        If (1, 2), find one-to-one matching from the first dataset to the second dataset;
        if (2, 1), do the other way around.
        The redundancy handling is delegated to
        match_utils.address_matching_redundancy.

    Returns
    -------
    Matching accuracy: the fraction of matched pairs whose labels agree.
    """
    if order is not None:
        # Reduce the matching before scoring it; the helper returns the same
        # (rows, cols, scores) triple layout as the input.
        matching = match_utils.address_matching_redundancy(matching=matching, order=order)
    rows, cols = matching[0], matching[1]
    return np.mean([labels1[i] == labels2[j] for i, j in zip(rows, cols)])
def get_foscttm(dist, true_matching='identity'):
    """
    Compute the fraction of samples closer than true match.

    Parameters
    ----------
    dist: np.ndarray of shape (n1, n2)
        Distance matrix.
    true_matching: 'identity' or Iterable of length n1, default='identity'
        If is an Iterable (list, np.ndarray, ...), then the ground truth
        matched pairs are (i, true_matching[i]).
        If is 'identity', then true_matching = [0, 1, ..., n1 - 1].

    Returns
    -------
    The fraction of samples closer than true match, averaged over the n1 rows.

    Raises
    ------
    NotImplementedError
        If true_matching is neither 'identity' nor an Iterable.
    """
    n1, _ = dist.shape
    # Check for str first: comparing an np.ndarray against 'identity' with ==
    # is elementwise on recent NumPy and cannot be used as an `if` condition.
    if isinstance(true_matching, str):
        if true_matching != 'identity':
            raise NotImplementedError(
                "true_matching must be 'identity' or Iterable of length dist.shape[0]."
            )
        true_matching = np.arange(n1)
    elif isinstance(true_matching, Iterable):
        true_matching = list(true_matching)
    else:
        raise NotImplementedError(
            "true_matching must be 'identity' or Iterable of length dist.shape[0]."
        )

    # Distance from every row i to its ground-truth partner.
    true_dist = dist[np.arange(n1), true_matching]
    # closer[i, j] = True iff dist[i, j] < dist[i, true_matching[i]]
    closer = dist < true_dist[:, None]
    return np.mean(np.mean(closer, axis=1))
def get_matching_alignment_score(estimated_matching, n_samples, true_matching='identity'):
    """
    Compute the alignment between the estimated matching and the true_matching
    according to the metric in https://openproblems.bio/neurips_docs/about_tasks/task2_modality_matching/.

    Parameters
    ----------
    estimated_matching: a list of length three.
        The matched pairs are (estimated_matching[0][i], estimated_matching[1][i]),
        and its score (the higher, the better) is estimated_matching[2][i].
    n_samples: int
        The sample size for the first dataset.
    true_matching: 'identity' or Iterable of length n_samples, default='identity'
        If is an Iterable (list, np.ndarray, ...), then the ground truth
        matched pairs are (i, true_matching[i]).
        If is 'identity', then true_matching = [0, 1, ..., n_samples - 1].

    Returns
    -------
    The alignment score: for every first-dataset index, the scores of its
    candidates are normalized to sum to one and the weight placed on the true
    match is accumulated; the total is divided by the number of distinct
    first-dataset indices present in estimated_matching.

    Raises
    ------
    NotImplementedError
        If true_matching is neither 'identity' nor an Iterable.
    ZeroDivisionError
        If estimated_matching contains no pairs.
    """
    # Check for str first: comparing an np.ndarray against 'identity' with ==
    # is elementwise on recent NumPy and cannot be used as an `if` condition.
    if isinstance(true_matching, str):
        if true_matching != 'identity':
            raise NotImplementedError(
                "true_matching must be 'identity' or Iterable of length n_samples."
            )
        true_matching = np.arange(n_samples)
    elif isinstance(true_matching, Iterable):
        true_matching = list(true_matching)
    else:
        raise NotImplementedError(
            "true_matching must be 'identity' or Iterable of length n_samples."
        )

    # Group the candidate matches (and their scores) by first-dataset index.
    candidates_by_idx1 = {}
    for i, j, score in zip(estimated_matching[0], estimated_matching[1], estimated_matching[2]):
        indices2, scores = candidates_by_idx1.setdefault(i, ([], []))
        indices2.append(j)
        scores.append(score)

    res = 0
    for idx1, idx2 in enumerate(true_matching):
        if idx1 not in candidates_by_idx1:
            continue
        indices2, scores = candidates_by_idx1[idx1]
        # Normalize the scores of this index so they sum to one.
        weights = np.array(scores) / np.sum(scores)
        for candidate_idx2, weight in zip(indices2, weights):
            if idx2 == candidate_idx2:
                res += weight
    return res / len(candidates_by_idx1)
def get_knn_alignment_score(dist, k_max, true_matching='identity'):
    """
    For each 1 <= k <= k_max, obtain knn matching from dist,
    and compute its matching proximity with the true matching.
    The proximity is calculated by:
    for each cell in arr1, claim it is successfully matched when the true match
    is in the k-nearest-neighborhood; then calculate the average success rate.

    Parameters
    ----------
    dist: np.ndarray of shape (n1, n2)
        Distance matrix.
    k_max: int
        Maximum k for knn matching; must not exceed n2.
    true_matching: 'identity' or Iterable of length n1, default='identity'
        If is an Iterable (list, np.ndarray, ...), then the ground truth
        matched pairs are (i, true_matching[i]).
        If is 'identity', then true_matching = [0, 1, ..., n1 - 1].

    Returns
    -------
    np.ndarray of shape (k_max,) representing the score for each 1<=k<=k_max.

    Raises
    ------
    NotImplementedError
        If true_matching is neither 'identity' nor an Iterable.
    """
    n1, n2 = dist.shape
    assert k_max <= n2, 'k_max must not exceed dist.shape[1].'

    # Check for str first: comparing an np.ndarray against 'identity' with ==
    # is elementwise on recent NumPy and cannot be used as an `if` condition.
    if isinstance(true_matching, str):
        if true_matching != 'identity':
            raise NotImplementedError(
                "true_matching must be 'identity' or Iterable of length dist.shape[0]."
            )
        true_matching = np.arange(n1)
    elif isinstance(true_matching, Iterable):
        true_matching = list(true_matching)
    else:
        raise NotImplementedError(
            "true_matching must be 'identity' or Iterable of length dist.shape[0]."
        )
    true_matching = np.asarray(true_matching)

    # Column indices of the k_max nearest neighbors of every row, nearest first.
    knn_indices = np.argsort(dist, axis=1)[:, :k_max]

    # hits[i, r] is True iff the r-th nearest neighbor of row i is its true match.
    # Each row of knn_indices holds distinct column indices, so a row has at most
    # one hit, located at the rank of its true match.
    hits = knn_indices == true_matching[:, None]

    # A row whose true match sits at rank r counts as a success for every
    # k > r, hence the cumulative sum of the per-rank hit counts.
    return np.cumsum(hits.sum(axis=0)) / n1