scikit-learn: test_k_means_fit_predict failing on some MacPython runs

KMeans fit_predict(X) != fit(X).predict(X) in several cases in the following Travis CI jobs:

https://travis-ci.org/MacPython/scikit-learn-wheels/jobs/458223437 https://travis-ci.org/MacPython/scikit-learn-wheels/jobs/458223439

_________ test_k_means_fit_predict[0-2-1e-07-csr_matrix-float64-full] __________
algo = 'full', dtype = <class 'numpy.float64'>
constructor = <class 'scipy.sparse.csr.csr_matrix'>, seed = 0, max_iter = 2
tol = 1e-07
    @pytest.mark.parametrize('algo', ['full', 'elkan'])
    @pytest.mark.parametrize('dtype', [np.float32, np.float64])
    @pytest.mark.parametrize('constructor', [np.asarray, sp.csr_matrix])
    @pytest.mark.parametrize('seed, max_iter, tol', [
        (0, 2, 1e-7),    # strict non-convergence
        (1, 2, 1e-1),    # loose non-convergence
        (3, 300, 1e-7),  # strict convergence
        (4, 300, 1e-1),  # loose convergence
    ])
    def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
        # check that fit.predict gives same result as fit_predict
        # There's a very small chance of failure with elkan on unstructured dataset
        # because predict method uses fast euclidean distances computation which
        # may cause small numerical instabilities.
        if not (algo == 'elkan' and constructor is sp.csr_matrix):
            rng = np.random.RandomState(seed)
    
            X = make_blobs(n_samples=1000, n_features=10, centers=10,
                           random_state=rng)[0].astype(dtype, copy=False)
            X = constructor(X)
    
            kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                            tol=tol, max_iter=max_iter, n_jobs=1)
    
            labels_1 = kmeans.fit(X).predict(X)
            labels_2 = kmeans.fit_predict(X)
    
>           assert_array_equal(labels_1, labels_2)
E           AssertionError: 
E           Arrays are not equal
E           
E           (mismatch 80.0%)
E            x: array([0, 5, 4, 1, 1, 2, 1, 2, 7, 4, 0, 3, 8, 9, 8, 3, 9, 4, 0, 5, 1, 1, 0,
E                  2, 5, 5, 9, 3, 2, 5, 7, 4, 1, 5, 0, 2, 0, 9, 1, 9, 4, 3, 1, 5, 4, 1,
E                  6, 3, 5, 9, 3, 9, 5, 4, 8, 2, 2, 0, 5, 7, 3, 7, 4, 9, 8, 6, 9, 0, 6,...
E            y: array([1, 0, 5, 2, 2, 3, 2, 3, 7, 5, 1, 4, 9, 8, 9, 4, 8, 5, 1, 0, 2, 2, 1,
E                  3, 0, 0, 8, 4, 3, 0, 7, 5, 2, 0, 1, 3, 1, 8, 2, 8, 5, 4, 2, 0, 5, 2,
E                  6, 4, 0, 8, 4, 8, 0, 5, 9, 3, 3, 1, 0, 7, 4, 7, 5, 8, 9, 6, 8, 1, 6,...
X          = <1000x10 sparse matrix of type '<class 'numpy.float64'>'
	with 10000 stored elements in Compressed Sparse Row format>
algo       = 'full'
constructor = <class 'scipy.sparse.csr.csr_matrix'>
dtype      = <class 'numpy.float64'>
kmeans     = KMeans(algorithm='full', copy_x=True, init='k-means++', max_iter=2,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=0, tol=1e-07, verbose=0)
labels_1   = array([0, 5, 4, 1, 1, 2, 1, 2, 7, 4, 0, 3, 8, 9, 8, 3, 9, 4, 0, 5, 1, 1, 0,
       2, 5, 5, 9, 3, 2, 5, 7, 4, 1, 5, 0,...3, 5, 1, 3, 3, 2, 3, 5, 4, 8, 8, 0, 8, 1, 7, 3, 6, 2, 2, 6, 3, 3,
       3, 3, 8, 3, 7, 9, 8, 9, 5, 4, 2], dtype=int32)
labels_2   = array([1, 0, 5, 2, 2, 3, 2, 3, 7, 5, 1, 4, 9, 8, 9, 4, 8, 5, 1, 0, 2, 2, 1,
       3, 0, 0, 8, 4, 3, 0, 7, 5, 2, 0, 1,...4, 0, 2, 4, 4, 3, 4, 0, 5, 9, 9, 1, 9, 2, 7, 4, 6, 3, 3, 6, 4, 4,
       4, 4, 9, 4, 7, 8, 9, 8, 0, 5, 3], dtype=int32)
max_iter   = 2
rng        = <mtrand.RandomState object at 0x114933ea0>
seed       = 0
tol        = 1e-07
../venv/lib/python3.6/site-packages/sklearn/cluster/tests/test_k_means.py:352: AssertionError
_________ test_k_means_fit_predict[4-300-0.1-csr_matrix-float64-full] __________
algo = 'full', dtype = <class 'numpy.float64'>
constructor = <class 'scipy.sparse.csr.csr_matrix'>, seed = 4, max_iter = 300
tol = 0.1
    @pytest.mark.parametrize('algo', ['full', 'elkan'])
    @pytest.mark.parametrize('dtype', [np.float32, np.float64])
    @pytest.mark.parametrize('constructor', [np.asarray, sp.csr_matrix])
    @pytest.mark.parametrize('seed, max_iter, tol', [
        (0, 2, 1e-7),    # strict non-convergence
        (1, 2, 1e-1),    # loose non-convergence
        (3, 300, 1e-7),  # strict convergence
        (4, 300, 1e-1),  # loose convergence
    ])
    def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
        # check that fit.predict gives same result as fit_predict
        # There's a very small chance of failure with elkan on unstructured dataset
        # because predict method uses fast euclidean distances computation which
        # may cause small numerical instabilities.
        if not (algo == 'elkan' and constructor is sp.csr_matrix):
            rng = np.random.RandomState(seed)
    
            X = make_blobs(n_samples=1000, n_features=10, centers=10,
                           random_state=rng)[0].astype(dtype, copy=False)
            X = constructor(X)
    
            kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                            tol=tol, max_iter=max_iter, n_jobs=1)
    
            labels_1 = kmeans.fit(X).predict(X)
            labels_2 = kmeans.fit_predict(X)
    
>           assert_array_equal(labels_1, labels_2)
E           AssertionError: 
E           Arrays are not equal
E           
E           (mismatch 100.0%)
E            x: array([7, 0, 7, 8, 8, 7, 8, 9, 2, 3, 6, 8, 9, 6, 9, 5, 1, 7, 0, 3, 9, 0, 6,
E                  3, 5, 5, 6, 3, 6, 1, 4, 7, 0, 4, 0, 6, 4, 6, 0, 4, 4, 9, 6, 1, 2, 0,
E                  2, 5, 1, 4, 9, 1, 5, 3, 9, 6, 6, 9, 9, 8, 7, 1, 6, 2, 7, 0, 9, 1, 3,...
E            y: array([9, 2, 9, 0, 0, 9, 0, 1, 8, 7, 4, 0, 1, 4, 1, 6, 5, 9, 2, 7, 1, 2, 4,
E                  7, 6, 6, 4, 7, 4, 5, 3, 9, 2, 3, 2, 4, 3, 4, 2, 3, 3, 1, 4, 5, 8, 2,
E                  8, 6, 5, 3, 1, 5, 6, 7, 1, 4, 4, 1, 1, 0, 9, 5, 4, 8, 9, 2, 1, 5, 7,...
X          = <1000x10 sparse matrix of type '<class 'numpy.float64'>'
	with 10000 stored elements in Compressed Sparse Row format>
algo       = 'full'
constructor = <class 'scipy.sparse.csr.csr_matrix'>
dtype      = <class 'numpy.float64'>
kmeans     = KMeans(algorithm='full', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=4, tol=0.1, verbose=0)
labels_1   = array([7, 0, 7, 8, 8, 7, 8, 9, 2, 3, 6, 8, 9, 6, 9, 5, 1, 7, 0, 3, 9, 0, 6,
       3, 5, 5, 6, 3, 6, 1, 4, 7, 0, 4, 0,...0, 0, 8, 5, 8, 2, 4, 7, 3, 3, 6, 8, 5, 7, 1, 2, 7, 1, 4, 9, 9, 5,
       4, 2, 2, 7, 5, 9, 8, 4, 9, 0, 1], dtype=int32)
labels_2   = array([9, 2, 9, 0, 0, 9, 0, 1, 8, 7, 4, 0, 1, 4, 1, 6, 5, 9, 2, 7, 1, 2, 4,
       7, 6, 6, 4, 7, 4, 5, 3, 9, 2, 3, 2,...2, 2, 0, 6, 0, 8, 3, 9, 7, 7, 4, 0, 6, 9, 5, 8, 9, 5, 3, 1, 1, 6,
       3, 8, 8, 9, 6, 1, 0, 3, 1, 2, 5], dtype=int32)
max_iter   = 300
rng        = <mtrand.RandomState object at 0x1141c9708>
seed       = 4
tol        = 0.1
../venv/lib/python3.6/site-packages/sklearn/cluster/tests/test_k_means.py:352: AssertionError

About this issue

Original URL
State: closed
Created 6 years ago
Comments: 22 (22 by maintainers)

Most upvoted comments

I don’t think this is the case: there is an explicit n_jobs=1.

I missed that… Another confirmation that this is not the cause is that the test only fails with algorithm="full", if I saw correctly.

The difference between “full” and “elkan” that can cause this issue is that “full” uses the fast method to compute Euclidean distances, the one with the precision issue. The inertia returned by the current implementation of k-means can strongly differ from the exact inertia. As a result, two clusterings that differ only by a permutation of the labels can end up with different computed inertia values.

jeremiedbb on Nov 22, 2018