scikit-learn: MemoryError in KNNImputer with california housing
I was running a simple example with the California housing dataset and the
KNNImputer blew up in my face:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Minimal reproduction: inject missing values into the California housing
# data, then fit a scale -> KNN-impute -> linear-regression pipeline on it.
calhousing = fetch_california_housing()
X = pd.DataFrame(calhousing.data, columns=calhousing.feature_names)
y = pd.Series(calhousing.target, name='house_value')

rng = np.random.RandomState(42)
density = 4  # one in `density` values (i.e. one in 4) will be NaN
mask = rng.randint(density, size=X.shape) == 0
# Build the NaN-injected frame with DataFrame.mask instead of assigning
# through `X_na.values`: that assignment only works when `.values` happens
# to be a writable view of the frame, which pandas does not guarantee.
X_na = X.mask(mask)
X_na.head()

# Keep only the rows whose target is below 4.9 and hold out 1000 of them
# for scoring.
X_train_na, X_test_na, y_train_na, y_test_na = train_test_split(
    X_na[y < 4.9], y[y < 4.9], test_size=1000, random_state=0)

model = make_pipeline(
    StandardScaler(),
    KNNImputer(add_indicator=True),
    LinearRegression(),
)
# This is the call that raised the reported MemoryError.
model.fit(X_train_na, y_train_na).score(X_test_na, y_test_na)
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-71-ad8b65bc77f2> in <module>
4 LinearRegression()
5 )
----> 6 model.fit(X_train_na, y_train_na).score(X_test_na, y_test_na)
~/Documents/packages/scikit-learn/sklearn/pipeline.py in fit(self, X, y, **fit_params)
346 This estimator
347 """
--> 348 Xt, fit_params = self._fit(X, y, **fit_params)
349 with _print_elapsed_time('Pipeline',
350 self._log_message(len(self.steps) - 1)):
~/Documents/packages/scikit-learn/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
311 message_clsname='Pipeline',
312 message=self._log_message(step_idx),
--> 313 **fit_params_steps[name])
314 # Replace the transformer of the step with the fitted
315 # transformer. This is necessary when loading the transformer
~/miniconda3/envs/dev/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
~/Documents/packages/scikit-learn/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
724 with _print_elapsed_time(message_clsname, message):
725 if hasattr(transformer, 'fit_transform'):
--> 726 res = transformer.fit_transform(X, y, **fit_params)
727 else:
728 res = transformer.fit(X, y, **fit_params).transform(X)
~/Documents/packages/scikit-learn/sklearn/base.py in fit_transform(self, X, y, **fit_params)
566 else:
567 # fit method of arity 2 (supervised transformation)
--> 568 return self.fit(X, y, **fit_params).transform(X)
569
570
~/Documents/packages/scikit-learn/sklearn/impute/_knn.py in transform(self, X)
230 metric=self.metric,
231 missing_values=self.missing_values,
--> 232 force_all_finite=force_all_finite)
233
234 # Maps from indices from X to indices in dist matrix
~/Documents/packages/scikit-learn/sklearn/metrics/pairwise.py in pairwise_distances(X, Y, metric, n_jobs, force_all_finite, **kwds)
1742 func = partial(distance.cdist, metric=metric, **kwds)
1743
-> 1744 return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1745
1746
~/Documents/packages/scikit-learn/sklearn/metrics/pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1341
1342 if effective_n_jobs(n_jobs) == 1:
-> 1343 return func(X, Y, **kwds)
1344
1345 # enforce a threading backend to prevent data communication overhead
~/Documents/packages/scikit-learn/sklearn/metrics/pairwise.py in nan_euclidean_distances(X, Y, squared, missing_values, copy)
409 present_coords_cnt = np.dot(1 - missing_X, 1 - missing_Y.T)
410 present_mask = (present_coords_cnt != 0)
--> 411 distances[present_mask] *= (X.shape[1] / present_coords_cnt[present_mask])
412
413 if X is Y:
MemoryError: Unable to allocate array with shape (311408488,) and data type float64
About this issue
- Original URL
- State: closed
- Created 5 years ago
- Comments: 22 (15 by maintainers)
Commits related to this issue
- ENH Perform KNN imputation without O(n^2) memory cost Fixes #15604 This is more computationally expensive than the previous implementation, but should reduce memory costs substantially in common use... — committed to jnothman/scikit-learn by jnothman 4 years ago
- ENH Perform KNN imputation without O(n^2) memory cost Fixes #15604 This is more computationally expensive than the previous implementation, but should reduce memory costs substantially in common use... — committed to thomasjpfan/scikit-learn by jnothman 4 years ago
- ENH Perform KNN imputation without O(n^2) memory cost Fixes #15604 This is more computationally expensive than the previous implementation, but should reduce memory costs substantially in common use... — committed to panpiort8/scikit-learn by jnothman 4 years ago
Thanks, the fact that we can’t apply KNNImputer on 100k samples is indeed problematic.
I have the same problem.
Unable to allocate 62.7 GiB for an array with shape (91686, 91713) and data type float64
Same problem here with 100k rows and 100 features. The problem arises in `metrics\pairwise.py`, at the line `distances = - 2 * safe_sparse_dot(X, Y.T, dense_output=True)`, which fails with `MemoryError: Unable to allocate 38.7 GiB for an array with shape (51896, 100000) and data type float64`.