cudf: [BUG] std on dask_cudf frame fails using the agg api

Describe the bug: computing `std` on a dask_cudf DataFrame groupby fails when it is requested through the `agg` API.

Steps/Code to reproduce bug

import cudf
import dask_cudf

df = cudf.DataFrame({'a': [1,1,2,2],'b': [4,5,6,10]})
ddf = dask_cudf.from_cudf(df, npartitions=2)

ddf.groupby('a').agg({'b':['mean','std']}).compute()

Trace:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/utils.py in raise_on_meta_error(funcname, udf)
    171     try:
--> 172         yield
    173     except Exception as e:

/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/core.py in _emulate(func, *args, **kwargs)
   4937     with raise_on_meta_error(funcname(func), udf=kwargs.pop("udf", False)):
-> 4938         return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
   4939 

/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py in _groupby_apply_funcs(df, *index, **kwargs)
    880     for result_column, func, func_kwargs in funcs:
--> 881         r = func(grouped, **func_kwargs)
    882 

/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py in _compute_sum_of_squares(grouped, column)
    898     base = grouped[column] if column is not None else grouped
--> 899     return base.apply(lambda x: (x ** 2).sum())
    900 

AttributeError: 'SeriesGroupBy' object has no attribute 'apply'

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-13-ae54f4074622> in <module>
      5 ddf = dask_cudf.from_cudf(df, npartitions=2)
      6 
----> 7 ddf.groupby('a').agg({'b':['mean','std']}).compute()

/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py in agg(self, arg, split_every, split_out)
   1756     @derived_from(pd.core.groupby.DataFrameGroupBy)
   1757     def agg(self, arg, split_every=None, split_out=1):
-> 1758         return self.aggregate(arg, split_every=split_every, split_out=split_out)
   1759 
   1760 

/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py in aggregate(self, arg, split_every, split_out)
   1751 
   1752         return super(DataFrameGroupBy, self).aggregate(
-> 1753             arg, split_every=split_every, split_out=split_out
   1754         )
   1755 

/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py in aggregate(self, arg, split_every, split_out)
   1532             split_out=split_out,
   1533             split_out_setup=split_out_on_index,
-> 1534             sort=self.sort,
   1535         )
   1536 

/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/core.py in apply_concat_apply(args, chunk, aggregate, combine, meta, token, chunk_kwargs, aggregate_kwargs, combine_kwargs, split_every, split_out, split_out_setup, split_out_setup_kwargs, sort, **kwargs)
   4891 
   4892     if meta is no_default:
-> 4893         meta_chunk = _emulate(chunk, *args, udf=True, **chunk_kwargs)
   4894         meta = _emulate(aggregate, _concat([meta_chunk]), udf=True, **aggregate_kwargs)
   4895     meta = make_meta(

/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/core.py in _emulate(func, *args, **kwargs)
   4936     """
   4937     with raise_on_meta_error(funcname(func), udf=kwargs.pop("udf", False)):
-> 4938         return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
   4939 
   4940 

/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/contextlib.py in __exit__(self, type, value, traceback)
    128                 value = type()
    129             try:
--> 130                 self.gen.throw(type, value, traceback)
    131             except StopIteration as exc:
    132                 # Suppress StopIteration *unless* it's the same exception that

/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/utils.py in raise_on_meta_error(funcname, udf)
    191         )
    192         msg = msg.format(" in `{0}`".format(funcname) if funcname else "", repr(e), tb)
--> 193         raise ValueError(msg)
    194 
    195 

ValueError: Metadata inference failed in `_groupby_apply_funcs`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
AttributeError("'SeriesGroupBy' object has no attribute 'apply'")

Traceback:
---------
  File "/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/utils.py", line 172, in raise_on_meta_error
    yield
  File "/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/core.py", line 4938, in _emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
  File "/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py", line 881, in _groupby_apply_funcs
    r = func(grouped, **func_kwargs)
  File "/raid/vjawa/miniconda3/envs/cudf_march_9/lib/python3.7/site-packages/dask/dataframe/groupby.py", line 899, in _compute_sum_of_squares
    return base.apply(lambda x: (x ** 2).sum())

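Expected behaviour: the same aggregation succeeds once the frame is converted with `to_dask_dataframe()`, and the dask_cudf version should return the same result: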

import cudf
import dask_cudf

df = cudf.DataFrame({'a': [1,1,2,2],'b': [4,5,6,10]})
ddf = dask_cudf.from_cudf(df, npartitions=2).to_dask_dataframe()

ddf.groupby('a').agg({'b':['mean','std']}).compute()
     b
  mean       std
a
1  4.5  0.707107
2  8.0  2.828427

Environment details

# packages in environment at /raid/vjawa/miniconda3/envs/cudf_march_9:
cudf                      0.13.0a200309         py37_3294    rapidsai-nightly
dask-cudf                 0.13.0a200309         py37_3294    rapidsai-nightly
libcudf                   0.13.0a200309     cuda10.1_3294    rapidsai-nightly
dask                      2.12.0                     py_0    conda-forge
dask-core                 2.12.0                     py_0    conda-forge
dask-cuda                 0.13.0b200309           py37_65    rapidsai-nightly
dask-cudf                 0.13.0a200309         py37_3294    rapidsai-nightly

About this issue

  • Original URL
  • State: closed
  • Created 4 years ago
  • Comments: 16 (12 by maintainers)

Most upvoted comments

Closing this since dask#6186 is merged.