skops: Bug: loading certain scipy functions fails
Certain scipy functions cannot be loaded correctly. Here is a test to reproduce:
from scipy import stats
def test_scipy_stats(tmp_path):
estimator = FunctionTransformer(func=stats.zipf)
save_load_round(estimator, tmp_path / "file.skops")
zipf
is only one of a few failing scipy functions that I tested. The error message is
AttributeError: module ‘builtins’ has no attribute ‘method’
with a really long, unhelpful stacktrace. However, the problem already occurs during the generation of the state
, which looks really weird:
Click to expand
{
"__class__": "FunctionTransformer",
"__module__": "sklearn.preprocessing._function_transformer",
"content": {
"__class__": "dict",
"__module__": "builtins",
"content": {
"func": {
"__class__": "zipf_gen",
"__module__": "scipy.stats._discrete_distns",
"content": {
"__class__": "dict",
"__module__": "builtins",
"content": {
"_stats_has_moments": {
"__class__": "str",
"__module__": "builtins",
"content": "true",
"is_json": true
},
"_random_state": {
"__class__": "RandomState",
"__module__": "numpy.random.mtrand",
"content": {
"__class__": "dict",
"__module__": "builtins",
"content": {
"bit_generator": {
"__class__": "str",
"__module__": "builtins",
"content": "\"MT19937\"",
"is_json": true
},
"state": {
"__class__": "dict",
"__module__": "builtins",
"content": {
"key": {
"__class__": "ndarray",
"__module__": "numpy",
"type": "numpy",
"file": "140209309201776.npy"
},
"pos": {
"__class__": "str",
"__module__": "builtins",
"content": "335",
"is_json": true
}
},
"key_types": {
"__class__": "list",
"__module__": "builtins",
"content": [...]
}
},
"has_gauss": {
"__class__": "str",
"__module__": "builtins",
"content": "0",
"is_json": true
},
"gauss": {
"__class__": "str",
"__module__": "builtins",
"content": "0.0",
"is_json": true
}
},
"key_types": {
"__class__": "list",
"__module__": "builtins",
"content": [...]
}
}
},
"_rvs_uses_size_attribute": {
"__class__": "str",
"__module__": "builtins",
"content": "false",
"is_json": true
},
"_rvs_size_warned": {
"__class__": "str",
"__module__": "builtins",
"content": "false",
"is_json": true
},
"_ctor_param": {
"__class__": "dict",
"__module__": "builtins",
"content": {
"a": {
"__class__": "str",
"__module__": "builtins",
"content": "1",
"is_json": true
},
"b": {
"__class__": "str",
"__module__": "builtins",
"content": "Infinity",
"is_json": true
},
"name": {
"__class__": "str",
"__module__": "builtins",
"content": "\"zipf\"",
"is_json": true
},
"badvalue": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
},
"moment_tol": {
"__class__": "str",
"__module__": "builtins",
"content": "1e-08",
"is_json": true
},
"values": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
},
"inc": {
"__class__": "str",
"__module__": "builtins",
"content": "1",
"is_json": true
},
"longname": {
"__class__": "str",
"__module__": "builtins",
"content": "\"A Zipf\"",
"is_json": true
},
"shapes": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
},
"extradoc": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
},
"seed": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
}
},
"key_types": {
"__class__": "list",
"__module__": "builtins",
"content": [...]
}
},
"badvalue": {
"__class__": "str",
"__module__": "builtins",
"content": "NaN",
"is_json": true
},
"a": {
"__class__": "str",
"__module__": "builtins",
"content": "1",
"is_json": true
},
"b": {
"__class__": "str",
"__module__": "builtins",
"content": "Infinity",
"is_json": true
},
"moment_tol": {
"__class__": "str",
"__module__": "builtins",
"content": "1e-08",
"is_json": true
},
"inc": {
"__class__": "str",
"__module__": "builtins",
"content": "1",
"is_json": true
},
"shapes": {
"__class__": "str",
"__module__": "builtins",
"content": "\"a\"",
"is_json": true
},
"_parse_arg_template": {
"__class__": "str",
"__module__": "builtins",
"content": "\"\\ndef _parse_args(self, a, loc=0):\\n return (a, ), loc, 1\\n\\ndef _parse_args_rvs(self, a, loc=0, size=None):\\n return self._argcheck_rvs(a, loc, 1, size=size)\\n\\ndef _parse_args_stats(self, a, loc=0, moments='mv'):\\n return (a, ), loc, 1, moments\\n\"",
"is_json": true
},
"numargs": {
"__class__": "str",
"__module__": "builtins",
"content": "1",
"is_json": true
},
"vecentropy": {
"__class__": "vectorize",
"__module__": "numpy",
"content": {
"__class__": "dict",
"__module__": "builtins",
"content": {
"pyfunc": {
"__class__": "method",
"__module__": "builtins",
"content": {
"__class__": "dict",
"__module__": "builtins",
"content": {},
"key_types": {
"__class__": "list",
"__module__": "builtins",
"content": []
}
}
},
"cache": {
"__class__": "str",
"__module__": "builtins",
"content": "false",
"is_json": true
},
"signature": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
},
"_ufunc": {
"__class__": "dict",
"__module__": "builtins",
"content": {},
"key_types": {
"__class__": "list",
"__module__": "builtins",
"content": []
}
},
"__doc__": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
},
"otypes": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
},
"excluded": {
"__class__": "set",
"__module__": "builtins"
},
"_in_and_out_core_dims": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
}
},
"key_types": {
"__class__": "list",
"__module__": "builtins",
"content": [...]
}
}
},
"name": {
"__class__": "str",
"__module__": "builtins",
"content": "\"zipf\"",
"is_json": true
},
"extradoc": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
},
"__doc__": {
"__class__": "str",
"__module__": "builtins",
"content": "\"A Zipf (Zeta) discrete random variable.\\n\\n As an instance of the `rv_discrete` class, `zipf` object inherits from it\\n a collection of generic methods (see below for the full list),\\n and completes them with details specific for this particular distribution.\\n \\n Methods\\n -------\\n rvs(a, loc=0, size=1, random_state=None)\\n Random variates.\\n pmf(k, a, loc=0)\\n Probability mass function.\\n logpmf(k, a, loc=0)\\n Log of the probability mass function.\\n cdf(k, a, loc=0)\\n Cumulative distribution function.\\n logcdf(k, a, loc=0)\\n Log of the cumulative distribution function.\\n sf(k, a, loc=0)\\n Survival function (also defined as ``1 - cdf``, but `sf` is sometimes more accurate).\\n logsf(k, a, loc=0)\\n Log of the survival function.\\n ppf(q, a, loc=0)\\n Percent point function (inverse of ``cdf`` --- percentiles).\\n isf(q, a, loc=0)\\n Inverse survival function (inverse of ``sf``).\\n stats(a, loc=0, moments='mv')\\n Mean('m'), variance('v'), skew('s'), and/or kurtosis('k').\\n entropy(a, loc=0)\\n (Differential) entropy of the RV.\\n expect(func, args=(a,), loc=0, lb=None, ub=None, conditional=False)\\n Expected value of a function (of one argument) with respect to the distribution.\\n median(a, loc=0)\\n Median of the distribution.\\n mean(a, loc=0)\\n Mean of the distribution.\\n var(a, loc=0)\\n Variance of the distribution.\\n std(a, loc=0)\\n Standard deviation of the distribution.\\n interval(alpha, a, loc=0)\\n Endpoints of the range that contains fraction alpha [0, 1] of the\\n distribution\\n\\n See Also\\n --------\\n zipfian\\n\\n Notes\\n -----\\n The probability mass function for `zipf` is:\\n\\n .. math::\\n\\n f(k, a) = \\\\frac{1}{\\\\zeta(a) k^a}\\n\\n for :math:`k \\\\ge 1`, :math:`a > 1`.\\n\\n `zipf` takes :math:`a > 1` as shape parameter. :math:`\\\\zeta` is the\\n Riemann zeta function (`scipy.special.zeta`)\\n\\n The Zipf distribution is also known as the zeta distribution, which is\\n a special case of the Zipfian distribution (`zipfian`).\\n\\n The probability mass function above is defined in the \\\"standardized\\\" form.\\n To shift distribution use the ``loc`` parameter.\\n Specifically, ``zipf.pmf(k, a, loc)`` is identically\\n equivalent to ``zipf.pmf(k - loc, a)``.\\n\\n References\\n ----------\\n .. [1] \\\"Zeta Distribution\\\", Wikipedia,\\n https://en.wikipedia.org/wiki/Zeta_distribution\\n\\n Examples\\n --------\\n >>> from scipy.stats import zipf\\n >>> import matplotlib.pyplot as plt\\n >>> fig, ax = plt.subplots(1, 1)\\n \\n Calculate the first four moments:\\n \\n >>> a = 6.5\\n >>> mean, var, skew, kurt = zipf.stats(a, moments='mvsk')\\n \\n Display the probability mass function (``pmf``):\\n \\n >>> x = np.arange(zipf.ppf(0.01, a),\\n ... zipf.ppf(0.99, a))\\n >>> ax.plot(x, zipf.pmf(x, a), 'bo', ms=8, label='zipf pmf')\\n >>> ax.vlines(x, 0, zipf.pmf(x, a), colors='b', lw=5, alpha=0.5)\\n \\n Alternatively, the distribution object can be called (as a function)\\n to fix the shape and location. This returns a \\\"frozen\\\" RV object holding\\n the given parameters fixed.\\n \\n Freeze the distribution and display the frozen ``pmf``:\\n \\n >>> rv = zipf(a)\\n >>> ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-', lw=1,\\n ... label='frozen pmf')\\n >>> ax.legend(loc='best', frameon=False)\\n >>> plt.show()\\n \\n Check accuracy of ``cdf`` and ``ppf``:\\n \\n >>> prob = zipf.cdf(x, a)\\n >>> np.allclose(x, zipf.ppf(prob, a))\\n True\\n \\n Generate random numbers:\\n \\n >>> r = zipf.rvs(a, size=1000)\\n\\n Confirm that `zipf` is the large `n` limit of `zipfian`.\\n\\n >>> from scipy.stats import zipfian\\n >>> k = np.arange(11)\\n >>> np.allclose(zipf.pmf(k, a), zipfian.pmf(k, a, n=10000000))\\n True\\n\\n \"",
"is_json": true
}
},
"key_types": {
"__class__": "list",
"__module__": "builtins",
"content": [...]
}
}
},
"inverse_func": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
},
"validate": {
"__class__": "str",
"__module__": "builtins",
"content": "false",
"is_json": true
},
"accept_sparse": {
"__class__": "str",
"__module__": "builtins",
"content": "false",
"is_json": true
},
"check_inverse": {
"__class__": "str",
"__module__": "builtins",
"content": "true",
"is_json": true
},
"feature_names_out": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
},
"kw_args": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
},
"inv_kw_args": {
"__class__": "str",
"__module__": "builtins",
"content": "null",
"is_json": true
},
"_sklearn_version": {
"__class__": "str",
"__module__": "builtins",
"content": "\"1.1.1\"",
"is_json": true
}
},
"key_types": {
"__class__": "list",
"__module__": "builtins",
"content": [...]
}
}
}
("key_types"
omitted for readability)
About this issue
- Original URL
- State: open
- Created 2 years ago
- Comments: 17 (4 by maintainers)
Reopening since this is not fixed IIUC.
Done
It’s probably a bug in the pasted code, where the variable
obj
is overridden by the pickle call. When you remove that, there is indeed an error (at least on main).Yes, we explicitly didn’t want to “re-invent” the whole pickle machinery, at that point we might as well use pickle 😃 So hopefully, for the much narrower scope we set ourselves for skops, we find a way to avoid this problem. Absolute worst case scenario would be special casing objects known to have that problem and giving custom code an escape hatch via
__getstate__
but I’m sure we can come up with something better.Besides the problem with methods, there is also a problem with circular references. Here are some simple examples that fail at the moment:
Fixing this might also fix
sklearn.cluster.Birch
.I can confirm that adding dummy
MethodType
dispatch functions can get the provided example working.I’ll raise a draft PR that has the code needed to get the above example working, but I think it’ll need more work to make sure it works in other cases of bound methods getting serialised.