How can I implement EM-GMM in Python?

后端 未结 2 1316
面向向阳花
面向向阳花 2020-12-06 23:06

I tried to implement the EM algorithm for a GMM by following the post "GMMs and Maximum Likelihood Optimization Using NumPy", but without success. My code is as follows:

import numpy as np

de         


        
2条回答
  •  死守一世寂寞
    2020-12-06 23:11

    # Expectation step
    likelihood = PDF(data, means, np.sqrt(variances))
    
    • Why are we passing the square root of the variances? The pdf function accepts variances, so this should be PDF(data, means, variances).

    Another problem,

    # Maximization step
    b = likelihood * weights # shape=(k, n)
    b /= np.sum(b, axis=1)[:, np.newaxis] + eps
    
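    • The second line above should be b /= np.sum(b, axis=0)[np.newaxis, :] + eps. The responsibilities must be normalized over the k components for each data point, and since b has shape (k, n) the per-point sums (shape (n,)) have to be broadcast along the first axis.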

    Also in the initialization of variances,

    variances = np.random.random_sample(size=k)[:, np.newaxis] # shape=(k, 1)
    
    • Why are we randomly initializing the variances? We have the data and the means, so why not compute the current estimated variances, as in vars = np.expand_dims(np.mean(np.square(data - means), axis=1), -1)?

    With these changes, here is my implementation,

    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    plt.style.use('seaborn')
    
    # Small constant added to denominators (and to the variance inside the
    # exponent) so that divisions never hit exactly zero.
    eps=1e-8
    
    
    def pdf(data, means, vars):
        """Evaluate a numerically guarded Gaussian density.

        Computes ``exp(-(data - means)^2 / (2 * (vars + eps)))`` divided by
        ``sqrt(2 * pi * vars) + eps``. The arguments are combined with
        ordinary NumPy broadcasting, so they may be scalars or arrays of
        compatible shapes.

        Args:
            data: Points at which the density is evaluated.
            means: Mean(s) of the Gaussian(s).
            vars: Variance(s) of the Gaussian(s) (not standard deviations).

        Returns:
            The density values, with the broadcast shape of the inputs.
        """
        squared_dev = np.square(data - means)
        # eps keeps the division finite when a variance is zero.
        exponent = -0.5 * squared_dev / (vars + eps)
        normalizer = np.sqrt(2 * np.pi * vars) + eps
        return np.exp(exponent) / normalizer
    
    
    def em_gmm(data, k, n_iter, init_strategy='k_means'):
        """Fit a one-dimensional Gaussian mixture with ``k`` components by EM.

        Args:
            data: 1-D array-like of observations.
            k: Number of mixture components.
            n_iter: Number of EM iterations to run (no convergence check).
            init_strategy: ``'k_means'`` initializes the means from
                scikit-learn's ``KMeans`` cluster centers; any other value
                initializes them from randomly chosen data values.

        Returns:
            A tuple ``(means, vars)`` of arrays, each of shape ``(k, 1)``.
            The mixing weights are estimated internally but not returned.

        Raises:
            ValueError: If ``data`` is not one-dimensional or is empty.
        """
        data = np.asarray(data)
        if data.ndim != 1 or data.size == 0:
            raise ValueError("data must be a non-empty 1-D array")

        weights = np.full((k, 1), 1.0 / k)
        if init_strategy == 'k_means':
            from sklearn.cluster import KMeans
            km = KMeans(k).fit(data[:, None])
            means = km.cluster_centers_
        else:
            # Two components that start from the same mean also get the same
            # initial variance and weight, and the EM updates then keep them
            # identical forever. Draw distinct values whenever the data
            # offers at least k of them; otherwise fall back to sampling with
            # replacement.
            candidates = np.unique(data)
            if candidates.size >= k:
                picks = np.random.choice(candidates, k, replace=False)
            else:
                picks = np.random.choice(data, k)
            means = picks[:, np.newaxis]

        # Shape (1, n): broadcasts against the (k, 1) parameters to (k, n)
        # without materializing k copies of the data.
        samples = data[np.newaxis, :]
        # Initial variances: mean squared deviation of all data from each mean.
        variances = np.mean(np.square(samples - means), axis=1, keepdims=True)

        for _ in range(n_iter):
            # E-step: responsibilities, normalized over components (axis 0)
            # for every data point.
            resp = pdf(samples, means, variances) * weights
            resp = resp / (np.sum(resp, axis=0, keepdims=True) + eps)

            # M-step: responsibility-weighted mean, variance and weight of
            # each component.
            resp_total = np.sum(resp, axis=1, keepdims=True) + eps
            means = np.sum(resp * samples, axis=1, keepdims=True) / resp_total
            variances = (
                np.sum(resp * np.square(samples - means), axis=1, keepdims=True)
                / resp_total
            )
            weights = np.mean(resp, axis=1, keepdims=True)

        return means, variances
    
    
    def main():
        """Fit a 3-component mixture to the sample series and plot the result.

        Draws the raw samples as tick markers on the x-axis and overlays the
        density curve of each fitted component, then shows the figure.
        """
        s = np.array([25.31, 24.31, 24.12, 43.46, 41.48666667,
                      41.48666667, 37.54, 41.175, 44.81, 44.44571429,
                      44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
                      44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
                      44.44571429, 44.44571429, 39.71, 26.69, 34.15,
                      24.94, 24.75, 24.56, 24.38, 35.25,
                      44.62, 44.94, 44.815, 44.69, 42.31,
                      40.81, 44.38, 44.56, 44.44, 44.25,
                      43.66666667, 43.66666667, 43.66666667, 43.66666667, 43.66666667,
                      40.75, 32.31, 36.08, 30.135, 24.19])
        k = 3
        n_iter = 100

        means, variances = em_gmm(s, k, n_iter)
        colors = ['green', 'red', 'blue', 'yellow']
        bins = np.linspace(np.min(s) - 2, np.max(s) + 2, 100)
        plt.figure(figsize=(10, 7))
        plt.xlabel('$x$')
        plt.ylabel('pdf')
        # x and y are passed by keyword: from seaborn 0.12 on, `data` is the
        # only argument these functions accept positionally.
        sns.scatterplot(x=s, y=[0.0] * len(s), color='navy', s=40, marker=2, label='Series data')
        for i, (m, v) in enumerate(zip(means, variances)):
            # Unweighted density of each fitted component over the plot range.
            sns.lineplot(x=bins, y=pdf(bins, m, v), color=colors[i], label=f'Cluster {i + 1}')
        plt.legend()

        plt.show()
    

    And here is my result.

提交回复
热议问题