scikit-learn: MemoryError in KNNImputer with california housing

I was running a simple example with the California housing dataset and the KNNImputer blew up in my face:

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Reproduction script: inject missing values into the California housing
# features, then fit a scale -> KNN-impute -> linear-regression pipeline.
calhousing = fetch_california_housing()

X = pd.DataFrame(calhousing.data, columns=calhousing.feature_names)
y = pd.Series(calhousing.target, name='house_value')

# Fixed seed so the positions of the injected NaNs are reproducible.
rng = np.random.RandomState(42)

density = 4  # one in 4 values will be NaN (randint(density) == 0)

# Boolean array with the same shape as X; True marks a value to blank out.
mask = rng.randint(density, size=X.shape) == 0
# DataFrame.mask returns a new frame with NaN wherever `mask` is True.
# Writing through `X_na.values[mask]` only works when `.values` happens to be
# a writable view of the frame's data, which pandas does not guarantee.
X_na = X.mask(mask)
X_na.head()

# Keep only rows whose target is below 4.9, then hold out 1000 test samples.
X_train_na, X_test_na, y_train_na, y_test_na = train_test_split(
    X_na[y < 4.9], y[y < 4.9], test_size=1000, random_state=0)

model = make_pipeline(
    StandardScaler(),
    KNNImputer(add_indicator=True),
    LinearRegression()
)
# This call raises the MemoryError shown in the traceback below: the
# imputer's transform step calls pairwise_distances on its input.
model.fit(X_train_na, y_train_na).score(X_test_na, y_test_na)
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-71-ad8b65bc77f2> in <module>
      4     LinearRegression()
      5 )
----> 6 model.fit(X_train_na, y_train_na).score(X_test_na, y_test_na)

~/Documents/packages/scikit-learn/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    346             This estimator
    347         """
--> 348         Xt, fit_params = self._fit(X, y, **fit_params)
    349         with _print_elapsed_time('Pipeline',
    350                                  self._log_message(len(self.steps) - 1)):

~/Documents/packages/scikit-learn/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    311                 message_clsname='Pipeline',
    312                 message=self._log_message(step_idx),
--> 313                 **fit_params_steps[name])
    314             # Replace the transformer of the step with the fitted
    315             # transformer. This is necessary when loading the transformer

~/miniconda3/envs/dev/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    353 
    354     def __call__(self, *args, **kwargs):
--> 355         return self.func(*args, **kwargs)
    356 
    357     def call_and_shelve(self, *args, **kwargs):

~/Documents/packages/scikit-learn/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    724     with _print_elapsed_time(message_clsname, message):
    725         if hasattr(transformer, 'fit_transform'):
--> 726             res = transformer.fit_transform(X, y, **fit_params)
    727         else:
    728             res = transformer.fit(X, y, **fit_params).transform(X)

~/Documents/packages/scikit-learn/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    566         else:
    567             # fit method of arity 2 (supervised transformation)
--> 568             return self.fit(X, y, **fit_params).transform(X)
    569 
    570 

~/Documents/packages/scikit-learn/sklearn/impute/_knn.py in transform(self, X)
    230                                   metric=self.metric,
    231                                   missing_values=self.missing_values,
--> 232                                   force_all_finite=force_all_finite)
    233 
    234         # Maps from indices from X to indices in dist matrix

~/Documents/packages/scikit-learn/sklearn/metrics/pairwise.py in pairwise_distances(X, Y, metric, n_jobs, force_all_finite, **kwds)
   1742         func = partial(distance.cdist, metric=metric, **kwds)
   1743 
-> 1744     return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
   1745 
   1746 

~/Documents/packages/scikit-learn/sklearn/metrics/pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
   1341 
   1342     if effective_n_jobs(n_jobs) == 1:
-> 1343         return func(X, Y, **kwds)
   1344 
   1345     # enforce a threading backend to prevent data communication overhead

~/Documents/packages/scikit-learn/sklearn/metrics/pairwise.py in nan_euclidean_distances(X, Y, squared, missing_values, copy)
    409     present_coords_cnt = np.dot(1 - missing_X, 1 - missing_Y.T)
    410     present_mask = (present_coords_cnt != 0)
--> 411     distances[present_mask] *= (X.shape[1] / present_coords_cnt[present_mask])
    412 
    413     if X is Y:

MemoryError: Unable to allocate array with shape (311408488,) and data type float64

About this issue

  • Original URL
  • State: closed
  • Created 5 years ago
  • Comments: 22 (15 by maintainers)

Commits related to this issue

Most upvoted comments

Thanks, the fact that we can’t apply KNNImputer on 100k samples is indeed problematic.

I have the same problem. Unable to allocate 62.7 GiB for an array with shape (91686, 91713) and data type float64

Same problem here with 100k rows and 100 features. The problem arises in metrics\pairwise.py, at the line `distances = - 2 * safe_sparse_dot(X, Y.T, dense_output=True)`, which fails with: MemoryError: Unable to allocate 38.7 GiB for an array with shape (51896, 100000) and data type float64