Here is a simplfied python implemention of UMAP - Uniform Manifold Approximation and Projection metric.
Uniform Manifold Approximation and Projection is a powerful tool for measuring manifold structure of high dimensional data and performing dimension reduction an visulization. However, the current python implemention of UMAP did not open the API for precomputed distances as input for dimension reduction.
Here, we provided a simplfied python implemention of umap which enables distances as input. This implemention is integrated in our project's source code, or you can also copy the code below in order to use it.
After copying the source code, you can simply type The following codes to use it, which is almost the same as the UMAP package:
umap = UMAP(n_components=2, distance="precomputed") X_lowdim = umap.fit_transform(distance_matrix)
import numpy as np from sklearn.metrics.pairwise import euclidean_distances from sklearn.manifold import SpectralEmbedding from scipy import optimize class UMAP(object): def __init__(self, n_neighbors: int = 15, n_components: int = 2, n_epochs: int = 200, learning_rate: float = 1.0, distance: str = "euclidean", min_dist: float = 0.25, random_state=None): self.n_neighbors = n_neighbors self. n_components = n_components self.n_epochs = n_epochs self.learning_rate = learning_rate self.distence_mrtric = distance self.min_dist = min_dist self.random_state = random_state self.distance = None self.rho = None self.sigma = None self.connectivity = None self.a = None self.b = None def fit_transform(self, X): if self.distence_mrtric == "precomputed": dist = np.copy(X) self.distance = dist elif self.distence_mrtric == 'euclidean': dist = np.square(euclidean_distances(X, X)) self.distance = dist self.rho = [sorted(dist[i])[1] for i in range(dist.shape[0])] n = X.shape[0] prob = np.zeros((n, n)) sigma_array = [] self.sigma = sigma_array for dist_row in range(n): func = lambda sigma: self._k(self._prob_high_dim(sigma, dist_row)) binary_search_result = self._sigma_binary_search(func, self.n_neighbors) prob[dist_row] = self._prob_high_dim(binary_search_result, dist_row) sigma_array.append(binary_search_result) P = (prob + np.transpose(prob)) / 2 self.connectivity = P x = np.linspace(0, 3, 300) dist_low_dim = lambda x, a, b: 1 / (1 + a * x ** (2 * b)) p, _ = optimize.curve_fit(dist_low_dim, x, self._f(x)) self.a = p[0] self.b = p[1] np.random.seed(self.random_state) model = SpectralEmbedding(n_components=self.n_components, n_neighbors=50) y = model.fit_transform(np.log(X + 1)) CE_array = [] for i in range(self.n_epochs): y = y - self.learning_rate * self._CE_gradient(P, y) CE_current = np.sum(self._CE(P, y)) / 1e+5 CE_array.append(CE_current) return y def _prob_high_dim(self, sigma, dist_row): """ For each row of Euclidean distance matrix (dist_row) compute probability in high dimensions (1D array) """ d = self.distance[dist_row] - self.rho[dist_row] d[d < 0] = 0 return np.exp(- d / sigma) def _k(self, prob): """ Compute n_neighbor = k (scalar) for each 1D array of high-dimensional probability """ return np.power(2, np.sum(prob)) def _sigma_binary_search(self, k_of_sigma, fixed_k): """ Solve equation k_of_sigma(sigma) = fixed_k with respect to sigma by the binary search algorithm """ sigma_lower_limit = 0; sigma_upper_limit = 1000 for i in range(20): approx_sigma = (sigma_lower_limit + sigma_upper_limit) / 2 if k_of_sigma(approx_sigma) < fixed_k: sigma_lower_limit = approx_sigma else: sigma_upper_limit = approx_sigma if np.abs(fixed_k - k_of_sigma(approx_sigma)) <= 1e-5: break return approx_sigma def _f(self, x): y = [] for i in range(len(x)): if(x[i] <= self.min_dist): y.append(1) else: y.append(np.exp(- x[i] + self.min_dist)) return y def _prob_low_dim(self, Y): """ Compute matrix of probabilities q_ij in low-dimensional space """ inv_distances = np.power(1 + self.a * np.square(euclidean_distances(Y, Y))**self.b, -1) return inv_distances def _CE(self, P, Y): """ Compute Cross-Entropy (CE) from matrix of high-dimensional probabilities and coordinates of low-dimensional embeddings """ Q = self._prob_low_dim(Y) return - P * np.log(Q + 0.01) - (1 - P) * np.log(1 - Q + 0.01) def _CE_gradient(self, P, Y): """ Compute the gradient of Cross-Entropy (CE) """ y_diff = np.expand_dims(Y, 1) - np.expand_dims(Y, 0) inv_dist = np.power(1 + self.a * np.square(euclidean_distances(Y, Y))**self.b, -1) Q = np.dot(1 - P, np.power(0.001 + np.square(euclidean_distances(Y, Y)), -1)) np.fill_diagonal(Q, 0) Q = Q / np.sum(Q, axis=1, keepdims=True) fact=np.expand_dims(self.a*P*(1e-8 + np.square(euclidean_distances(Y, Y)))**(self.b-1) - Q, 2) return 2 * self.b * np.sum(fact * y_diff * np.expand_dims(inv_dist, 2), axis = 1)
1. UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction link
2. How to Program UMAP from Scratch link