Source code for maxfuse.match_utils

"""
Utility functions for matching
"""
import numpy as np
from scipy.optimize import linear_sum_assignment
from . import utils


[docs]def address_matching_redundancy(matching, order=(1, 2)):
    """
    Make a potentially multiple-to-multiple matching to an one-to-one matching according to order.

    Parameters
    ----------
    matching: list of length three
        rows, cols, vals = matching: list
        Each matched pair of rows[i], cols[i], their score (the larger, the better) is vals[i]
    order: None or (1, 2) or (2, 1)
        If None, do nothing;
        If (1, 2), then the redundancy is addressed by making matching
        an injective map from the first dataset to the second;
        if (2, 1), do the other way around.

    Returns
    -------
    rows, cols, vals: list
        Each matched pair of rows[i], cols[i], their score is vals[i].
    """
    if order is None:
        return matching
    res = [[], [], []]
    if order == (1, 2):
        idx1_to_idx2 = dict()
        idx1_to_score = dict()
        for i, j, score in zip(matching[0], matching[1], matching[2]):
            if i not in idx1_to_idx2:
                idx1_to_idx2[i] = j
                idx1_to_score[i] = score
            elif score > idx1_to_score[i]:
                idx1_to_idx2[i] = j
                idx1_to_score[i] = score
        for idx1, idx2 in idx1_to_idx2.items():
            res[0].append(idx1)
            res[1].append(idx2)
            res[2].append(idx1_to_score[idx1])
    elif order == (2, 1):
        idx2_to_idx1 = dict()
        idx2_to_score = dict()
        for i, j, score in zip(matching[0], matching[1], matching[2]):
            if j not in idx2_to_idx1:
                idx2_to_idx1[j] = i
                idx2_to_score[j] = score
            elif score > idx2_to_score[j]:
                idx2_to_idx1[j] = i
                idx2_to_score[j] = score
        for idx2, idx1 in idx2_to_idx1.items():
            res[0].append(idx1)
            res[1].append(idx2)
            res[2].append(idx2_to_score[idx2])
    else:
        raise NotImplementedError('order must be in {None, (1, 2), (2, 1)}.')

    return res


[docs]def match_cells(arr1, arr2, base_dist=None, wt_on_base_dist=0, verbose=True):
    """
    Get matching between arr1 and arr2 using linear assignment, the distance is 1 - Pearson correlation.

    Parameters
    ----------
    arr1: np.array of shape (n_samples1, n_features)
        The first data matrix
    arr2: np.array of shape (n_samples2, n_features)
        The second data matrix
    base_dist: None or np.ndarray of shape (n_samples1, n_samples2)
        Baseline distance matrix
    wt_on_base_dist: float between 0 and 1
        The final distance matrix to use is (1-wt_on_base_dist) * dist[arr1, arr2] + wt_on_base_dist * base_dist
    verbose: bool, default=True
        Whether to print the progress

    Returns
    -------
    rows, cols, vals: list
        Each matched pair of rows[i], cols[i], their distance is vals[i]
    """
    if verbose:
        print('Start the matching process...', flush=True)
        print('Computing the distance matrix...', flush=True)
    dist = utils.cdist_correlation(arr1, arr2)
    if base_dist is not None:
        if verbose:
            print(
                f'Interpolating {1-wt_on_base_dist} * dist[arr1, arr2] + {wt_on_base_dist} * base_dist...',
                flush=True
            )
        dist = (1-wt_on_base_dist) * dist + wt_on_base_dist * base_dist
    if verbose:
        print('Solving linear assignment...', flush=True)
    rows, cols = linear_sum_assignment(dist)
    if verbose:
        print('Linear assignment completed!', flush=True)

    return rows, cols, np.array([dist[i, j] for i, j in zip(rows, cols)])


[docs]def get_initial_matching(
        arr1, arr2,
        clust_labels1=None, clust_labels2=None,
        edges1=None, edges2=None,
        wt1=0.3, wt2=0.3,
        randomized_svd=True,
        svd_runs=1,
        svd_components1=None, svd_components2=None,
        verbose=True
):
    """
    Assume the features of arr1 and arr2 are column-wise directly comparable,
    obtain a matching by minimizing the correlation distance between arr1 and arr2.

    Parameters
    ----------
    arr1: np.array of shape (n_samples1, n_features1)
        The first data matrix.
    arr2: np.array of shape (n_samples2, n_features2)
        The second data matrix.
    clust_labels1: None or np.array of shape (n_samples1, )
        If not None, then it is the clustering label of the first data matrix,
        and the smoothing of this matrix will be done via cluster centroid shrinkage.
    clust_labels2: None or np.array of shape (n_samples2, )
        Same as clust_labels1 but for the second data matrix.
    edges1: None or list of length two or three
        If not None, then each edge in the graph is (edges[0][i], edges[1][i]) with weight edges[2][i] (if exists)
        and the smoothing of this matrix will be done via graph smoothing.
    edges2: None or scipy.sparse.csr_matrix of shape (n_samples2, n_samples2)
        Same as edges1 but for the second data matrix.
    wt1: float, default=0.3
        The smoothing of the first data matrix will be wt1 * arr1 + (1-wt1) * shrinkage_targets,
        where the shrinkage_targets will be either the cluster centroids or the average of graph neighbors.
    wt2: float, default=0.3
        Same as wt1 but for the second data matrix.
    randomized_svd: bool, default=False
        Whether to use randomized svd.
    svd_runs: int, default=1
        Randomized SVD will result in different runs,
        so if randomized_svd=True, perform svd_runs many randomized SVDs, and pick the one with the
        smallest Frobenious reconstruction error.
        If randomized_svd=False, svd_runs is forced to be 1.
    svd_components1: None or int
        If None, then do not do SVD,
        else, number of components to keep when doing SVD de-noising for the first data matrix.
    svd_components2: None or int
        Same as svd_components1 but for the second data matrix.
    verbose: bool, default=True
        Whether to print the progress.

    Returns
    -------
    matching: list of length 3
        rows, cols, vals = matching,
        Each matched pair is rows[i], cols[i], their distance is vals[i].
    """
    assert arr1.shape[1] == arr2.shape[1]
    # labels and edges cannot be specified simultaneously
    assert (clust_labels1 is None) or (edges1 is None)
    assert (clust_labels2 is None) or (edges2 is None)

    arr1, arr2 = utils.drop_zero_variability_columns(arr_lst=[arr1, arr2])

    # smoothing and denoising
    if verbose:
        print("Denoising the data...", flush=True)

    if clust_labels1 is not None:
        arr1 = utils.shrink_towards_centroids(arr=arr1, labels=clust_labels1, wt=wt1)
    elif edges1 is not None:
        arr1 = utils.graph_smoothing(arr=arr1, edges=edges1, wt=wt1)
    arr1 = utils.svd_denoise(
        arr=arr1, n_components=svd_components1, randomized=randomized_svd,
        n_runs=svd_runs
    )

    if clust_labels2 is not None:
        arr2 = utils.shrink_towards_centroids(arr=arr2, labels=clust_labels2, wt=wt2)
    elif edges2 is not None:
        arr2 = utils.graph_smoothing(arr=arr2, edges=edges2, wt=wt2)
    arr2 = utils.svd_denoise(
        arr=arr2, n_components=svd_components2, randomized=randomized_svd,
        n_runs=svd_runs
    )

    res = match_cells(arr1=arr1, arr2=arr2, verbose=verbose)
    if verbose:
        print('Initial matching completed!', flush=True)

    return res


[docs]def get_refined_matching_one_iter(
        init_matching, arr1, arr2,
        clust_labels1=None, clust_labels2=None,
        edges1=None, edges2=None,
        wt1=0.5, wt2=0.5,
        filter_prop=0,
        cca_components=15,
        cca_max_iter=2000,
        verbose=True
):
    """
    Run one iteration of CCA refinement.

    Parameters
    ----------
    init_matching: list
        init_matching[0][i], init_matching[1][i] is a matched pair,
        and init_matching[2][i] is the distance for this pair
    arr1: np.array of shape (n_samples1, n_features1)
        The first data matrix.
    arr2: np.array of shape (n_samples2, n_features2)
        The second data matrix.
    clust_labels1: None or np.array of shape (n_samples1, )
        If not None, then it is the clustering label of the first data matrix,
        and the smoothing of this matrix will be done via cluster centroid shrinkage.
    clust_labels2: None or np.array of shape (n_samples2, )
        Same as clust_labels1 but for the second data matrix.
    edges1: None or list of length two or three
        If not None, then each edge in the graph is (edges[0][i], edges[1][i]) with weight edges[2][i] (if exists)
        and the smoothing of this matrix will be done via graph smoothing.
    edges2: None or scipy.sparse.csr_matrix of shape (n_samples2, n_samples2)
        Same as edges1 but for the second data matrix.
    wt1: float, default=0.5
        The smoothing of the first data matrix will be wt1 * (cca embedding of arr1) + (1-wt1) * shrinkage_targets,
        where the shrinkage_targets will be either the cluster centroids or the average of graph neighbors.
    wt2: float, default=0.5
        Same as wt1 but for the second data matrix.
    filter_prop: float, default=0
        Proportion of matched pairs to discard before feeding into refinement iterations.
    cca_components: int, default=15
        Number of CCA components.
    cca_max_iter: int, default=2000
        Maximum number of CCA iterations.
    verbose: bool, default=True
        Whether to print the

    Returns
    -------
    rows, cols, vals: list
        Each matched pair of rows[i], cols[i], their distance is vals[i]
    """
    if verbose:
        print('Fitting CCA...', flush=True)
    arr1_cca, arr2_cca, _ = utils.cca_embedding(
        arr1=arr1, arr2=arr2,
        init_matching=init_matching, filter_prop=filter_prop, n_components=cca_components, max_iter=cca_max_iter
    )
    if verbose:
        print('Denoising the data...', flush=True)
    if clust_labels1 is not None:
        arr1_cca = utils.shrink_towards_centroids(arr=arr1_cca, labels=clust_labels1, wt=wt1)
    elif edges1 is not None:
        arr1_cca = utils.graph_smoothing(arr=arr1_cca, edges=edges1, wt=wt1)
    if clust_labels2 is not None:
        arr2_cca = utils.shrink_towards_centroids(arr=arr2_cca, labels=clust_labels2, wt=wt2)
    elif edges2 is not None:
        arr2_cca = utils.graph_smoothing(arr=arr2_cca, edges=edges2, wt=wt2)

    return match_cells(
        arr1=arr1_cca, arr2=arr2_cca, verbose=verbose
    )


[docs]def get_refined_matching(
        init_matching, arr1, arr2,
        randomized_svd=False, svd_runs=1,
        svd_components1=None, svd_components2=None,
        clust_labels1=None, clust_labels2=None,
        edges1=None, edges2=None,
        wt1=0.5, wt2=0.5,
        n_iters=3, filter_prop=0,
        cca_components=15,
        cca_max_iter=2000,
        verbose=True
):
    """
    Refinement of init_matching.

    Parameters
    ----------
    init_matching: list
        init_matching[0][i], init_matching[1][i] is a matched pair,
        and init_matching[2][i] is the distance for this pair.
    arr1: np.array of shape (n_samples1, n_features1)
        The first data matrix.
    arr2: np.array of shape (n_samples2, n_features2)
        The second data matrix.
    randomized_svd: bool, default=False
        Whether to use randomized SVD
    svd_runs: int, default=1
        Randomized SVD will result in different runs,
        so if randomized_svd=True, perform svd_runs many randomized SVDs, and pick the one with the
        smallest Frobenious reconstruction error.
        If randomized_svd=False, svd_runs is forced to be 1.
    svd_components1: None or int
        If None, then do not do SVD,
        else, number of components to keep when doing SVD de-noising for the first data matrix
        before feeding into CCA.
    svd_components2: None or int
        Same as svd_components1 but for the second data matrix.
    clust_labels1: None or np.array of shape (n_samples1, )
        If not None, then it is the clustering label of the first data matrix,
        and the smoothing of this matrix will be done via cluster centroid shrinkage.
    clust_labels2: None or np.array of shape (n_samples2, )
        Same as clust_labels1 but for the second data matrix.
    edges1: None or list of length two or three
        If not None, then each edge in the graph is (edges[0][i], edges[1][i]) with weight edges[2][i] (if exists)
        and the smoothing of this matrix will be done via graph smoothing.
    edges2: None or scipy.sparse.csr_matrix of shape (n_samples2, n_samples2)
        Same as edges1 but for the second data matrix.
    wt1: float, default=0.5
        The smoothing of the first data matrix will be wt1 * (cca embedding of arr1) + (1-wt1) * shrinkage_targets,
        where the shrinkage_targets will be either the cluster centroids or the average of graph neighbors.
    wt2: float, default=0.5
        Same as wt1 but for the second data matrix.
    n_iters: int, default=3
        Number of refinement iterations.
    filter_prop: float, default=0
        Proportion of matched pairs to discard before feeding into refinement iterations.
    cca_components: int, default=15
        Number of CCA components.
    cca_max_iter: int, default=2000,
        Maximum number of CCA iterations.
    verbose: bool, default=True
        Whether to print the progress.

    Returns
    -------
    matching: list of length 3
        rows, cols, vals = matching,
        Each matched pair is rows[i], cols[i], their distance is vals[i].
    """
    ns = [len(x) for x in init_matching]
    assert ns[0] == ns[1] == ns[2]
    # labels and edges can not be specified simultaneously
    assert (clust_labels1 is None) or (edges1 is None)
    assert (clust_labels2 is None) or (edges2 is None)
    assert isinstance(n_iters, int) and n_iters >= 1
    assert 0 <= int(ns[0] * filter_prop) < ns[0]

    assert 1 <= cca_components <= min(arr1.shape[1], arr2.shape[1])

    arr1 = utils.drop_zero_variability_columns(arr_lst=[arr1])[0]
    arr2 = utils.drop_zero_variability_columns(arr_lst=[arr2])[0]

    if verbose:
        print('Normalizing and reducing the dimension of the data...', flush=True)
    arr1_svd = utils.svd_embedding(
        arr=arr1, n_components=svd_components1,
        randomized=randomized_svd, n_runs=svd_runs
    )
    arr2_svd = utils.svd_embedding(
        arr=arr2, n_components=svd_components2,
        randomized=randomized_svd, n_runs=svd_runs
    )

    # prepare the distance matrix used in the initial matching
    cca_matching = init_matching
    # iterative refinement
    for it in range(n_iters):
        if verbose:
            print('Now at iteration {}:'.format(it), flush=True)
        cca_matching = get_refined_matching_one_iter(
            init_matching=cca_matching,
            arr1=arr1_svd, arr2=arr2_svd,
            clust_labels1=clust_labels1,
            clust_labels2=clust_labels2, edges1=edges1, edges2=edges2,
            wt1=wt1, wt2=wt2, filter_prop=filter_prop,
            cca_components=cca_components, cca_max_iter=cca_max_iter, verbose=verbose
        )

    arr1_cca, arr2_cca, _ = utils.cca_embedding(
        arr1=arr1_svd, arr2=arr2_svd,
        init_matching=cca_matching,
        filter_prop=filter_prop,
        n_components=cca_components,
        max_iter=cca_max_iter)

    if verbose:
        print('Refined matching completed!', flush=True)
    return cca_matching