cudf: [BUG] std on dask_cudf frame fails using the agg api
Describe the bug
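Computing `std` through the groupby `agg` API on a dask_cudf DataFrame fails: Dask's metadata inference raises `ValueError: Metadata inference failed in _groupby_apply_funcs`, and the underlying error is `AttributeError: 'SeriesGroupBy' object has no attribute 'apply'`.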
Steps/Code to reproduce bug
import cudf
import dask_cudf
df = cudf.DataFrame({'a': [1,1,2,2],'b': [4,5,6,10]})
ddf = dask_cudf.from_cudf(df, npartitions=2)
ddf.groupby('a').agg({'b':['mean','std']}).compute()
Trace:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/utils.py in raise_on_meta_error(funcname, udf)
171 try:
--> 172 yield
173 except Exception as e:
/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/core.py in _emulate(func, *args, **kwargs)
4937 with raise_on_meta_error(funcname(func), udf=kwargs.pop("udf", False)):
-> 4938 return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
4939
/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py in _groupby_apply_funcs(df, *index, **kwargs)
880 for result_column, func, func_kwargs in funcs:
--> 881 r = func(grouped, **func_kwargs)
882
/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py in _compute_sum_of_squares(grouped, column)
898 base = grouped[column] if column is not None else grouped
--> 899 return base.apply(lambda x: (x ** 2).sum())
900
AttributeError: 'SeriesGroupBy' object has no attribute 'apply'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-13-ae54f4074622> in <module>
5 ddf = dask_cudf.from_cudf(df, npartitions=2)
6
----> 7 ddf.groupby('a').agg({'b':['mean','std']}).compute()
/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py in agg(self, arg, split_every, split_out)
1756 @derived_from(pd.core.groupby.DataFrameGroupBy)
1757 def agg(self, arg, split_every=None, split_out=1):
-> 1758 return self.aggregate(arg, split_every=split_every, split_out=split_out)
1759
1760
/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py in aggregate(self, arg, split_every, split_out)
1751
1752 return super(DataFrameGroupBy, self).aggregate(
-> 1753 arg, split_every=split_every, split_out=split_out
1754 )
1755
/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py in aggregate(self, arg, split_every, split_out)
1532 split_out=split_out,
1533 split_out_setup=split_out_on_index,
-> 1534 sort=self.sort,
1535 )
1536
/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/core.py in apply_concat_apply(args, chunk, aggregate, combine, meta, token, chunk_kwargs, aggregate_kwargs, combine_kwargs, split_every, split_out, split_out_setup, split_out_setup_kwargs, sort, **kwargs)
4891
4892 if meta is no_default:
-> 4893 meta_chunk = _emulate(chunk, *args, udf=True, **chunk_kwargs)
4894 meta = _emulate(aggregate, _concat([meta_chunk]), udf=True, **aggregate_kwargs)
4895 meta = make_meta(
/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/core.py in _emulate(func, *args, **kwargs)
4936 """
4937 with raise_on_meta_error(funcname(func), udf=kwargs.pop("udf", False)):
-> 4938 return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
4939
4940
/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/contextlib.py in __exit__(self, type, value, traceback)
128 value = type()
129 try:
--> 130 self.gen.throw(type, value, traceback)
131 except StopIteration as exc:
132 # Suppress StopIteration *unless* it's the same exception that
/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/utils.py in raise_on_meta_error(funcname, udf)
191 )
192 msg = msg.format(" in `{0}`".format(funcname) if funcname else "", repr(e), tb)
--> 193 raise ValueError(msg)
194
195
ValueError: Metadata inference failed in `_groupby_apply_funcs`.
You have supplied a custom function and Dask is unable to
determine the type of output that that function returns.
To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.
Original error is below:
------------------------
AttributeError("'SeriesGroupBy' object has no attribute 'apply'")
Traceback:
---------
File "/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/utils.py", line 172, in raise_on_meta_error
yield
File "/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/core.py", line 4938, in _emulate
return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
File "/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py", line 881, in _groupby_apply_funcs
r = func(grouped, **func_kwargs)
File "/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py", line 899, in _compute_sum_of_squares
return base.apply(lambda x: (x ** 2).sum())
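Expected behaviour (the same aggregation succeeds once the frame is converted with `to_dask_dataframe()`; the dask_cudf frame should give the same result):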
import cudf
import dask_cudf
df = cudf.DataFrame({'a': [1,1,2,2],'b': [4,5,6,10]})
ddf = dask_cudf.from_cudf(df, npartitions=2).to_dask_dataframe()
ddf.groupby('a').agg({'b':['mean','std']}).compute()
b
mean std
a
1 4.5 0.707107
2 8.0 2.828427
Environment details
# packages in environment at /raid/vjawa/miniconda3/envs/cudf_march_9:
cudf 0.13.0a200309 py37_3294 rapidsai-nightly
dask-cudf 0.13.0a200309 py37_3294 rapidsai-nightly
libcudf 0.13.0a200309 cuda10.1_3294 rapidsai-nightly
dask 2.12.0 py_0 conda-forge
dask-core 2.12.0 py_0 conda-forge
dask-cuda 0.13.0b200309 py37_65 rapidsai-nightly
dask-cudf 0.13.0a200309 py37_3294 rapidsai-nightly
About this issue
- Original URL
- State: closed
- Created 4 years ago
- Comments: 16 (12 by maintainers)
Closing this since dask#6186 is merged.