pangeo-forge-recipes: Problem with cftime coordinates on `sequence_dim`

In #47, @naomi-henderson reported that cftime-based time coordinates did not work with her recipe. (Details in this notebook.)

The error occurs on prepare_target. Some relevant traceback is:

---------------------------------------------------------------------------
OutOfBoundsDatetime                       Traceback (most recent call last)
/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in decode_cf_datetime(num_dates, units, calendar, use_cftime)
    193         try:
--> 194             dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar)
    195         except (KeyError, OutOfBoundsDatetime, OverflowError):

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in _decode_datetime_with_pandas(flat_num_dates, units, calendar)
    141             "Cannot decode times from a non-standard calendar, {!r}, using "
--> 142             "pandas.".format(calendar)
    143         )

OutOfBoundsDatetime: Cannot decode times from a non-standard calendar, 'noleap', using pandas.

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in _decode_cf_datetime_dtype(data, units, calendar, use_cftime)
    112     try:
--> 113         result = decode_cf_datetime(example_value, units, calendar, use_cftime)
    114     except Exception:

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in decode_cf_datetime(num_dates, units, calendar, use_cftime)
    196             dates = _decode_datetime_with_cftime(
--> 197                 flat_num_dates.astype(float), units, calendar
    198             )

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in _decode_datetime_with_cftime(num_dates, units, calendar)
    133     return np.asarray(
--> 134         cftime.num2date(num_dates, units, calendar, only_use_cftime_datetimes=True)
    135     )

src/cftime/_cftime.pyx in cftime._cftime.num2date()

TypeError: unsupported operand type(s) for +: 'cftime._cftime.DatetimeNoLeap' and 'NoneType'

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-47-9bb7712f434d> in <module>
      1 # put basic info in target directory
----> 2 recipe.prepare_target()

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/pangeo_forge/recipe.py in _prepare_target()
    166 
    167             try:
--> 168                 ds = self.open_target()
    169                 logger.info("Found an existing dataset in target")
    170                 logger.debug(f"{ds}")

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/pangeo_forge/recipe.py in open_target(self)
    271     def open_target(self):
    272         target_mapper = self.target.get_mapper()
--> 273         return xr.open_zarr(target_mapper)
    274 
    275     def initialize_target(self, ds, **expand_dims):

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/backends/zarr.py in open_zarr(store, group, synchronizer, chunks, decode_cf, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, consolidated, overwrite_encoded_chunks, chunk_store, decode_timedelta, use_cftime, **kwargs)
    686         backend_kwargs=backend_kwargs,
    687         decode_timedelta=decode_timedelta,
--> 688         use_cftime=use_cftime,
    689     )
    690 

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/backends/api.py in open_dataset(filename_or_obj, group, decode_cf, mask_and_scale, decode_times, autoclose, concat_characters, decode_coords, engine, chunks, lock, cache, drop_variables, backend_kwargs, use_cftime, decode_timedelta)
    573 
    574     with close_on_error(store):
--> 575         ds = maybe_decode_store(store, chunks)
    576 
    577     # Ensure source filename always stored in dataset object (GH issue #2550)

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/backends/api.py in maybe_decode_store(store, chunks)
    477             drop_variables=drop_variables,
    478             use_cftime=use_cftime,
--> 479             decode_timedelta=decode_timedelta,
    480         )
    481 

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/conventions.py in decode_cf(obj, concat_characters, mask_and_scale, decode_times, decode_coords, drop_variables, use_cftime, decode_timedelta)
    596         drop_variables=drop_variables,
    597         use_cftime=use_cftime,
--> 598         decode_timedelta=decode_timedelta,
    599     )
    600     ds = Dataset(vars, attrs=attrs)

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/conventions.py in decode_cf_variables(variables, attributes, concat_characters, mask_and_scale, decode_times, decode_coords, drop_variables, use_cftime, decode_timedelta)
    498             stack_char_dim=stack_char_dim,
    499             use_cftime=use_cftime,
--> 500             decode_timedelta=decode_timedelta,
    501         )
    502         if decode_coords:

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/conventions.py in decode_cf_variable(name, var, concat_characters, mask_and_scale, decode_times, decode_endianness, stack_char_dim, use_cftime, decode_timedelta)
    338         var = times.CFTimedeltaCoder().decode(var, name=name)
    339     if decode_times:
--> 340         var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name)
    341 
    342     dimensions, data, attributes, encoding = variables.unpack_for_decoding(var)

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in decode(self, variable, name)
    461             units = pop_to(attrs, encoding, "units")
    462             calendar = pop_to(attrs, encoding, "calendar")
--> 463             dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
    464             transform = partial(
    465                 decode_cf_datetime,

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in _decode_cf_datetime_dtype(data, units, calendar, use_cftime)
    121             "if it is not installed."
    122         )
--> 123         raise ValueError(msg)
    124     else:
    125         dtype = getattr(result, "dtype", np.dtype("object"))

ValueError: unable to decode time units 'hours since 0001-01-16 12:00:00.000000' with "calendar 'noleap'". Try opening your dataset with decode_times=False or installing cftime if it is not installed.

Examining this closely, it looks like there is already a dataset in the target, but it can’t be opened. This makes it hard to debug. The notebook has also been run non-sequentially, which adds to the difficulty. @naomi-henderson, it would be great if you could turn this into a reproducible example we can use to get to the bottom of the cftime issue.

About this issue

  • Original URL
  • State: closed
  • Created 3 years ago
  • Comments: 21 (21 by maintainers)

Most upvoted comments

@rabernat, As usual, your comments are very clarifying, thanks! This old lady brain gets easily bogged down!

Yes, “outdated in many ways” was an exaggeration, of course - that first example is very helpful! Now that I know about context managers I will go through it again and make a pull request with my suggestions.

Okay, will use the new pangeo-forge environment - I had made a kernel with the old one and then just updated xarray, fsspec and pangeo-forge. I agree it is best to make a new kernel at this point. I will also give mamba a try because conda is taking way… too… long…

As for making a new tutorial example with CMIP6 - yes, I will give it a try. I am concerned about all of the moving parts, GFDL’s AWS collection included, but will try to create something robust.

For the sake of organization, perhaps this issue should be closed, and the 3 new issues @naomi-henderson discovered opened as distinct, new Issues?

Thanks for this suggestion @cisaacstern. I think 1&2 are related (see comment above). 3 is about stale documentation. Having dedicated issues for these would be useful; however, depending on @naomi-henderson’s response, they might be resolved very quickly, so possibly not needed…

Thanks so much @naomi-henderson for trying this out!

Issues 1 and 2 are very likely related to your environment. The intermittent hanging in 1 sounds a lot like https://github.com/intake/filesystem_spec/issues/565; this was a bug in filesystem spec that was surfaced in part by our work on Pangeo Forge. It has been fixed in the latest fsspec master. It would be great if you could verify this.

2 is because we are now dependent on an as-of-yet unmerged xarray PR (https://github.com/pydata/xarray/pull/5065) which adds the safe_chunks option. Hopefully that will go in soon.

For development, you’re probably best off creating a new environment that matches our CI, which should eliminate both problems:

https://github.com/pangeo-forge/pangeo-forge/blob/ba4dc7430137ae854b358698f0eb84fb4232c032/ci/py3.8.yml#L1-L39

(I have switched from conda to mamba and am never going back.)

3. example is outdated in many ways

Thanks for checking this. You’re absolutely right that I have not bothered to update the tutorials after some recent changes. However, I hope that “outdated in many ways” is an exaggeration; I have strived to keep the API the same. The biggest change, as you noted, is the use of context managers for all openers. This allows us to keep better track of open / closed file objects and is in line with python best practice. So instead of

ds = recipe.open_input(input_key)

you do

with recipe.open_input(input_key) as ds:
    # do something with ds
    display(ds)
    # If you want it in memory outside of the context manager, do
    ds.load()
# now the file is closed

(Same for open_chunk().) It would be fantastic if you could update the tutorials where needed. Going even further, perhaps you could turn your CMIP recipe into its own tutorial example notebook for the docs?

Thanks again for your helpful comments and real-world testing. Things are moving fast, so it’s great to have this input.

@cisaacstern, that notebook was in reference to a very old version of the NetCDFtoZarrSequentialRecipe. I haven’t looked at the problem since then and I don’t know if it is still an issue. I will try to get the latest recipe and see if I still have cftime issues and, if so, try to find a better example for you.

Naomi’s full traceback is

---------------------------------------------------------------------------
OutOfBoundsDatetime                       Traceback (most recent call last)
/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in decode_cf_datetime(num_dates, units, calendar, use_cftime)
    193         try:
--> 194             dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar)
    195         except (KeyError, OutOfBoundsDatetime, OverflowError):

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in _decode_datetime_with_pandas(flat_num_dates, units, calendar)
    141             "Cannot decode times from a non-standard calendar, {!r}, using "
--> 142             "pandas.".format(calendar)
    143         )

OutOfBoundsDatetime: Cannot decode times from a non-standard calendar, 'noleap', using pandas.

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in _decode_cf_datetime_dtype(data, units, calendar, use_cftime)
    112     try:
--> 113         result = decode_cf_datetime(example_value, units, calendar, use_cftime)
    114     except Exception:

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in decode_cf_datetime(num_dates, units, calendar, use_cftime)
    196             dates = _decode_datetime_with_cftime(
--> 197                 flat_num_dates.astype(float), units, calendar
    198             )

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in _decode_datetime_with_cftime(num_dates, units, calendar)
    133     return np.asarray(
--> 134         cftime.num2date(num_dates, units, calendar, only_use_cftime_datetimes=True)
    135     )

src/cftime/_cftime.pyx in cftime._cftime.num2date()

TypeError: unsupported operand type(s) for +: 'cftime._cftime.DatetimeNoLeap' and 'NoneType'

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-8-dd7bdaf49ef5> in <module>
      1 #from cftime import DatetimeNoLeap
      2 # store first chunk
----> 3 recipe.store_chunk(0)

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/pangeo_forge/recipe.py in _store_chunk(chunk_key)
    209             write_region = self.region_for_chunk(chunk_key)
    210             logger.info(f"Storing chunk '{chunk_key}' to Zarr region {write_region}")
--> 211             ds_chunk.to_zarr(target_mapper, region=write_region)
    212 
    213         return _store_chunk

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/core/dataset.py in to_zarr(self, store, chunk_store, mode, synchronizer, group, encoding, compute, consolidated, append_dim, region)
   1754             consolidated=consolidated,
   1755             append_dim=append_dim,
-> 1756             region=region,
   1757         )
   1758 

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/backends/api.py in to_zarr(dataset, store, chunk_store, mode, synchronizer, group, encoding, compute, consolidated, append_dim, region)
   1479     writer = ArrayWriter()
   1480     # TODO: figure out how to properly handle unlimited_dims
-> 1481     dump_to_store(dataset, zstore, writer, encoding=encoding)
   1482     writes = writer.sync(compute=compute)
   1483 

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/backends/api.py in dump_to_store(dataset, store, writer, encoder, encoding, unlimited_dims)
   1156         variables, attrs = encoder(variables, attrs)
   1157 
-> 1158     store.store(variables, attrs, check_encoding, writer, unlimited_dims=unlimited_dims)
   1159 
   1160 

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/backends/zarr.py in store(self, variables, attributes, check_encoding_set, writer, unlimited_dims)
    460             # there are variables to append
    461             # their encoding must be the same as in the store
--> 462             ds = open_zarr(self.ds.store, group=self.ds.path, chunks=None)
    463             variables_with_encoding = {}
    464             for vn in existing_variables:

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/backends/zarr.py in open_zarr(store, group, synchronizer, chunks, decode_cf, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, consolidated, overwrite_encoded_chunks, chunk_store, decode_timedelta, use_cftime, **kwargs)
    686         backend_kwargs=backend_kwargs,
    687         decode_timedelta=decode_timedelta,
--> 688         use_cftime=use_cftime,
    689     )
    690 

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/backends/api.py in open_dataset(filename_or_obj, group, decode_cf, mask_and_scale, decode_times, autoclose, concat_characters, decode_coords, engine, chunks, lock, cache, drop_variables, backend_kwargs, use_cftime, decode_timedelta)
    573 
    574     with close_on_error(store):
--> 575         ds = maybe_decode_store(store, chunks)
    576 
    577     # Ensure source filename always stored in dataset object (GH issue #2550)

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/backends/api.py in maybe_decode_store(store, chunks)
    477             drop_variables=drop_variables,
    478             use_cftime=use_cftime,
--> 479             decode_timedelta=decode_timedelta,
    480         )
    481 

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/conventions.py in decode_cf(obj, concat_characters, mask_and_scale, decode_times, decode_coords, drop_variables, use_cftime, decode_timedelta)
    596         drop_variables=drop_variables,
    597         use_cftime=use_cftime,
--> 598         decode_timedelta=decode_timedelta,
    599     )
    600     ds = Dataset(vars, attrs=attrs)

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/conventions.py in decode_cf_variables(variables, attributes, concat_characters, mask_and_scale, decode_times, decode_coords, drop_variables, use_cftime, decode_timedelta)
    498             stack_char_dim=stack_char_dim,
    499             use_cftime=use_cftime,
--> 500             decode_timedelta=decode_timedelta,
    501         )
    502         if decode_coords:

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/conventions.py in decode_cf_variable(name, var, concat_characters, mask_and_scale, decode_times, decode_endianness, stack_char_dim, use_cftime, decode_timedelta)
    338         var = times.CFTimedeltaCoder().decode(var, name=name)
    339     if decode_times:
--> 340         var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name)
    341 
    342     dimensions, data, attributes, encoding = variables.unpack_for_decoding(var)

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in decode(self, variable, name)
    461             units = pop_to(attrs, encoding, "units")
    462             calendar = pop_to(attrs, encoding, "calendar")
--> 463             dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
    464             transform = partial(
    465                 decode_cf_datetime,

/usr/local/python/anaconda3/envs/pangeo-forge/lib/python3.7/site-packages/xarray/coding/times.py in _decode_cf_datetime_dtype(data, units, calendar, use_cftime)
    121             "if it is not installed."
    122         )
--> 123         raise ValueError(msg)
    124     else:
    125         dtype = getattr(result, "dtype", np.dtype("object"))

ValueError: unable to decode time units 'hours since 1850-01-15 12:00:00.000000' with "calendar 'noleap'". Try opening your dataset with decode_times=False or installing cftime if it is not installed.