Here is a simplfied python implemention of UMAP - Uniform Manifold Approximation and Projection metric.
Uniform Manifold Approximation and Projection is a powerful tool for measuring manifold structure of high dimensional data and performing dimension reduction an visulization. However, the current python implemention of UMAP did not open the API for precomputed distances as input for dimension reduction.
Here, we provided a simplfied python implemention of umap which enables distances as input. This implemention is integrated in our project's source code, or you can also copy the code below in order to use it.
After copying the source code, you can simply type The following codes to use it, which is almost the same as the UMAP package:
umap = UMAP(n_components=2, distance="precomputed") X_lowdim = umap.fit_transform(distance_matrix)
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.manifold import SpectralEmbedding
from scipy import optimize
class UMAP(object):
def __init__(self, n_neighbors: int = 15, n_components: int = 2, n_epochs: int = 200,
learning_rate: float = 1.0, distance: str = "euclidean", min_dist: float = 0.25, random_state=None):
self.n_neighbors = n_neighbors
self. n_components = n_components
self.n_epochs = n_epochs
self.learning_rate = learning_rate
self.distence_mrtric = distance
self.min_dist = min_dist
self.random_state = random_state
self.distance = None
self.rho = None
self.sigma = None
self.connectivity = None
self.a = None
self.b = None
def fit_transform(self, X):
if self.distence_mrtric == "precomputed":
dist = np.copy(X)
self.distance = dist
elif self.distence_mrtric == 'euclidean':
dist = np.square(euclidean_distances(X, X))
self.distance = dist
self.rho = [sorted(dist[i])[1] for i in range(dist.shape[0])]
n = X.shape[0]
prob = np.zeros((n, n))
sigma_array = []
self.sigma = sigma_array
for dist_row in range(n):
func = lambda sigma: self._k(self._prob_high_dim(sigma, dist_row))
binary_search_result = self._sigma_binary_search(func, self.n_neighbors)
prob[dist_row] = self._prob_high_dim(binary_search_result, dist_row)
sigma_array.append(binary_search_result)
P = (prob + np.transpose(prob)) / 2
self.connectivity = P
x = np.linspace(0, 3, 300)
dist_low_dim = lambda x, a, b: 1 / (1 + a * x ** (2 * b))
p, _ = optimize.curve_fit(dist_low_dim, x, self._f(x))
self.a = p[0]
self.b = p[1]
np.random.seed(self.random_state)
model = SpectralEmbedding(n_components=self.n_components, n_neighbors=50)
y = model.fit_transform(np.log(X + 1))
CE_array = []
for i in range(self.n_epochs):
y = y - self.learning_rate * self._CE_gradient(P, y)
CE_current = np.sum(self._CE(P, y)) / 1e+5
CE_array.append(CE_current)
return y
def _prob_high_dim(self, sigma, dist_row):
"""
For each row of Euclidean distance matrix (dist_row) compute
probability in high dimensions (1D array)
"""
d = self.distance[dist_row] - self.rho[dist_row]
d[d < 0] = 0
return np.exp(- d / sigma)
def _k(self, prob):
"""
Compute n_neighbor = k (scalar) for each 1D array of high-dimensional probability
"""
return np.power(2, np.sum(prob))
def _sigma_binary_search(self, k_of_sigma, fixed_k):
"""
Solve equation k_of_sigma(sigma) = fixed_k
with respect to sigma by the binary search algorithm
"""
sigma_lower_limit = 0; sigma_upper_limit = 1000
for i in range(20):
approx_sigma = (sigma_lower_limit + sigma_upper_limit) / 2
if k_of_sigma(approx_sigma) < fixed_k:
sigma_lower_limit = approx_sigma
else:
sigma_upper_limit = approx_sigma
if np.abs(fixed_k - k_of_sigma(approx_sigma)) <= 1e-5:
break
return approx_sigma
def _f(self, x):
y = []
for i in range(len(x)):
if(x[i] <= self.min_dist):
y.append(1)
else:
y.append(np.exp(- x[i] + self.min_dist))
return y
def _prob_low_dim(self, Y):
"""
Compute matrix of probabilities q_ij in low-dimensional space
"""
inv_distances = np.power(1 + self.a * np.square(euclidean_distances(Y, Y))**self.b, -1)
return inv_distances
def _CE(self, P, Y):
"""
Compute Cross-Entropy (CE) from matrix of high-dimensional probabilities
and coordinates of low-dimensional embeddings
"""
Q = self._prob_low_dim(Y)
return - P * np.log(Q + 0.01) - (1 - P) * np.log(1 - Q + 0.01)
def _CE_gradient(self, P, Y):
"""
Compute the gradient of Cross-Entropy (CE)
"""
y_diff = np.expand_dims(Y, 1) - np.expand_dims(Y, 0)
inv_dist = np.power(1 + self.a * np.square(euclidean_distances(Y, Y))**self.b, -1)
Q = np.dot(1 - P, np.power(0.001 + np.square(euclidean_distances(Y, Y)), -1))
np.fill_diagonal(Q, 0)
Q = Q / np.sum(Q, axis=1, keepdims=True)
fact=np.expand_dims(self.a*P*(1e-8 + np.square(euclidean_distances(Y, Y)))**(self.b-1) - Q, 2)
return 2 * self.b * np.sum(fact * y_diff * np.expand_dims(inv_dist, 2), axis = 1)
1. UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction link
2. How to Program UMAP from Scratch link