Contribution | PUMC_CHINA

UMAP - Uniform Manifold Approximation and Projection

Uniform Manifold Approximation and Projection is a powerful tool for measuring manifold structure of high dimensional data and performing dimension reduction an visulization. However, the current python implemention of UMAP did not open the API for precomputed distances as input for dimension reduction.

Here, we provided a simplfied python implemention of umap which enables distances as input. This implemention is integrated in our project's source code, or you can also copy the code below in order to use it.

After copying the source code, you can simply type The following codes to use it, which is almost the same as the UMAP package:

umap = UMAP(n_components=2, distance="precomputed")
X_lowdim = umap.fit_transform(distance_matrix)

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.manifold import SpectralEmbedding
from scipy import optimize


class UMAP(object):
    def __init__(self, n_neighbors: int = 15, n_components: int = 2, n_epochs: int = 200,
                 learning_rate: float = 1.0, distance: str = "euclidean", min_dist: float = 0.25, random_state=None):
        self.n_neighbors = n_neighbors
        self. n_components = n_components
        self.n_epochs = n_epochs
        self.learning_rate = learning_rate
        self.distence_mrtric = distance
        self.min_dist = min_dist
        self.random_state = random_state
        self.distance = None
        self.rho = None
        self.sigma = None
        self.connectivity = None
        self.a = None
        self.b = None

    def fit_transform(self, X):
        if self.distence_mrtric == "precomputed":
            dist = np.copy(X)
            self.distance = dist
        elif self.distence_mrtric == 'euclidean':
            dist = np.square(euclidean_distances(X, X))
            self.distance = dist
        self.rho = [sorted(dist[i])[1] for i in range(dist.shape[0])]
        n = X.shape[0]
        prob = np.zeros((n, n))
        sigma_array = []
        self.sigma = sigma_array
        for dist_row in range(n):
            func = lambda sigma: self._k(self._prob_high_dim(sigma, dist_row))
            binary_search_result = self._sigma_binary_search(func, self.n_neighbors)
            prob[dist_row] = self._prob_high_dim(binary_search_result, dist_row)
            sigma_array.append(binary_search_result)
        P = (prob + np.transpose(prob)) / 2
        self.connectivity = P
        x = np.linspace(0, 3, 300)
        dist_low_dim = lambda x, a, b: 1 / (1 + a * x ** (2 * b))
        p, _ = optimize.curve_fit(dist_low_dim, x, self._f(x))
        self.a = p[0]
        self.b = p[1]

        np.random.seed(self.random_state)
        model = SpectralEmbedding(n_components=self.n_components, n_neighbors=50)
        y = model.fit_transform(np.log(X + 1))

        CE_array = []
        for i in range(self.n_epochs):
            y = y - self.learning_rate * self._CE_gradient(P, y)
            CE_current = np.sum(self._CE(P, y)) / 1e+5
            CE_array.append(CE_current)
        return y

    def _prob_high_dim(self, sigma, dist_row):
        """
        For each row of Euclidean distance matrix (dist_row) compute
        probability in high dimensions (1D array)
        """
        d = self.distance[dist_row] - self.rho[dist_row]
        d[d < 0] = 0
        return np.exp(- d / sigma)

    def _k(self, prob):
        """
        Compute n_neighbor = k (scalar) for each 1D array of high-dimensional probability
        """
        return np.power(2, np.sum(prob))

    def _sigma_binary_search(self, k_of_sigma, fixed_k):
        """
        Solve equation k_of_sigma(sigma) = fixed_k
        with respect to sigma by the binary search algorithm
        """
        sigma_lower_limit = 0; sigma_upper_limit = 1000
        for i in range(20):
            approx_sigma = (sigma_lower_limit + sigma_upper_limit) / 2
            if k_of_sigma(approx_sigma) < fixed_k:
                sigma_lower_limit = approx_sigma
            else:
                sigma_upper_limit = approx_sigma
            if np.abs(fixed_k - k_of_sigma(approx_sigma)) <= 1e-5:
                break
        return approx_sigma

    def _f(self, x):
        y = []
        for i in range(len(x)):
            if(x[i] <= self.min_dist):
                y.append(1)
            else:
                y.append(np.exp(- x[i] + self.min_dist))
        return y

    def _prob_low_dim(self, Y):
        """
        Compute matrix of probabilities q_ij in low-dimensional space
        """
        inv_distances = np.power(1 + self.a * np.square(euclidean_distances(Y, Y))**self.b, -1)
        return inv_distances

    def _CE(self, P, Y):
        """
        Compute Cross-Entropy (CE) from matrix of high-dimensional probabilities
        and coordinates of low-dimensional embeddings
        """
        Q = self._prob_low_dim(Y)
        return - P * np.log(Q + 0.01) - (1 - P) * np.log(1 - Q + 0.01)

    def _CE_gradient(self, P, Y):
        """
        Compute the gradient of Cross-Entropy (CE)
        """
        y_diff = np.expand_dims(Y, 1) - np.expand_dims(Y, 0)
        inv_dist = np.power(1 + self.a * np.square(euclidean_distances(Y, Y))**self.b, -1)
        Q = np.dot(1 - P, np.power(0.001 + np.square(euclidean_distances(Y, Y)), -1))
        np.fill_diagonal(Q, 0)
        Q = Q / np.sum(Q, axis=1, keepdims=True)
        fact=np.expand_dims(self.a*P*(1e-8 + np.square(euclidean_distances(Y, Y)))**(self.b-1) - Q, 2)
        return 2 * self.b * np.sum(fact * y_diff * np.expand_dims(inv_dist, 2), axis = 1)

Reference

1. UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction link

2. How to Program UMAP from Scratch link