scikit-learn: MemoryError in KNNImputer with california housing

I was running a simple example with the California housing dataset and the KNNImputer blew up in my face:

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Minimal reproducer: inject missing values into the California housing
# features, then fit a scale -> KNN-impute -> linear-regression pipeline.

calhousing = fetch_california_housing()

X = pd.DataFrame(calhousing.data, columns=calhousing.feature_names)
y = pd.Series(calhousing.target, name='house_value')

rng = np.random.RandomState(42)

density = 4  # one in 4 values will be NaN

# randint(density) draws integers in [0, density), so comparing with 0 marks
# each cell as missing with probability 1 / density.
mask = rng.randint(density, size=X.shape) == 0
# DataFrame.mask returns a new frame with NaN wherever `mask` is True. This
# avoids writing through `.values`, which only works when pandas hands back a
# writable view of the underlying data.
X_na = X.mask(mask)
X_na.head()

# Keep only rows whose target is below 4.9 and hold out 1000 samples for
# scoring.
X_train_na, X_test_na, y_train_na, y_test_na = train_test_split(
    X_na[y<4.9], y[y<4.9], test_size=1000, random_state=0)

model = make_pipeline(
    StandardScaler(),
    KNNImputer(add_indicator=True),
    LinearRegression()
)
# This is the call that raises the MemoryError reported in the traceback.
model.fit(X_train_na, y_train_na).score(X_test_na, y_test_na)

---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-71-ad8b65bc77f2> in <module>
      4     LinearRegression()
      5 )
----> 6 model.fit(X_train_na, y_train_na).score(X_test_na, y_test_na)

~/Documents/packages/scikit-learn/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    346             This estimator
    347         """
--> 348         Xt, fit_params = self._fit(X, y, **fit_params)
    349         with _print_elapsed_time('Pipeline',
    350                                  self._log_message(len(self.steps) - 1)):

~/Documents/packages/scikit-learn/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    311                 message_clsname='Pipeline',
    312                 message=self._log_message(step_idx),
--> 313                 **fit_params_steps[name])
    314             # Replace the transformer of the step with the fitted
    315             # transformer. This is necessary when loading the transformer

~/miniconda3/envs/dev/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    353 
    354     def __call__(self, *args, **kwargs):
--> 355         return self.func(*args, **kwargs)
    356 
    357     def call_and_shelve(self, *args, **kwargs):

~/Documents/packages/scikit-learn/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    724     with _print_elapsed_time(message_clsname, message):
    725         if hasattr(transformer, 'fit_transform'):
--> 726             res = transformer.fit_transform(X, y, **fit_params)
    727         else:
    728             res = transformer.fit(X, y, **fit_params).transform(X)

~/Documents/packages/scikit-learn/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    566         else:
    567             # fit method of arity 2 (supervised transformation)
--> 568             return self.fit(X, y, **fit_params).transform(X)
    569 
    570 

~/Documents/packages/scikit-learn/sklearn/impute/_knn.py in transform(self, X)
    230                                   metric=self.metric,
    231                                   missing_values=self.missing_values,
--> 232                                   force_all_finite=force_all_finite)
    233 
    234         # Maps from indices from X to indices in dist matrix

~/Documents/packages/scikit-learn/sklearn/metrics/pairwise.py in pairwise_distances(X, Y, metric, n_jobs, force_all_finite, **kwds)
   1742         func = partial(distance.cdist, metric=metric, **kwds)
   1743 
-> 1744     return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
   1745 
   1746 

~/Documents/packages/scikit-learn/sklearn/metrics/pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
   1341 
   1342     if effective_n_jobs(n_jobs) == 1:
-> 1343         return func(X, Y, **kwds)
   1344 
   1345     # enforce a threading backend to prevent data communication overhead

~/Documents/packages/scikit-learn/sklearn/metrics/pairwise.py in nan_euclidean_distances(X, Y, squared, missing_values, copy)
    409     present_coords_cnt = np.dot(1 - missing_X, 1 - missing_Y.T)
    410     present_mask = (present_coords_cnt != 0)
--> 411     distances[present_mask] *= (X.shape[1] / present_coords_cnt[present_mask])
    412 
    413     if X is Y:

MemoryError: Unable to allocate array with shape (311408488,) and data type float64

About this issue

Original URL
State: closed
Created 5 years ago
Comments: 22 (15 by maintainers)

Commits related to this issue

ENH Perform KNN imputation without O(n^2) memory cost Fixes #15604 This is more computationally expensive than the previous implementation, but should reduce memory costs substantially in common use... — committed to jnothman/scikit-learn by jnothman 4 years ago
ENH Perform KNN imputation without O(n^2) memory cost Fixes #15604 This is more computationally expensive than the previous implementation, but should reduce memory costs substantially in common use... — committed to thomasjpfan/scikit-learn by jnothman 4 years ago
ENH Perform KNN imputation without O(n^2) memory cost Fixes #15604 This is more computationally expensive than the previous implementation, but should reduce memory costs substantially in common use... — committed to panpiort8/scikit-learn by jnothman 4 years ago

Most upvoted comments

Thanks, the fact that we can’t apply KNNImputer on 100k samples is indeed problematic.

rth on Feb 3, 2020

I have the same problem. Unable to allocate 62.7 GiB for an array with shape (91686, 91713) and data type float64

ajing on Feb 6, 2020

Same problem here with 100k rows and 100 features. The problem arises in `metrics\pairwise.py`, at the line `distances = - 2 * safe_sparse_dot(X, Y.T, dense_output=True)`, which fails with `MemoryError: Unable to allocate 38.7 GiB for an array with shape (51896, 100000) and data type float64`.

mosari on Feb 3, 2020