dask: Dask DataFrame unable to read remote file, but local (and pandas) work

import pandas as pd
from dask import dataframe as dd

# works: pandas reads the remote CSV directly
df0 = pd.read_csv('https://fred.stlouisfed.org/graph/fredgraph.csv?id=SP500')
# works: dask reads the same file from local disk
df1 = dd.read_csv('path/to/that/file/on/disk.csv')
# does not work: dask reading the same remote URL raises EmptyDataError
df2 = dd.read_csv('https://fred.stlouisfed.org/graph/fredgraph.csv?id=SP500')

The combination of dask and a remote read causes the following error:

---------------------------------------------------------------------------
EmptyDataError                            Traceback (most recent call last)
<ipython-input-75-fc142575840b> in <module>
      1 from dask import dataframe as dd
----> 2 df2 = dd.read_csv('https://fred.stlouisfed.org/graph/fredgraph.csv?id=SP500')

~/mc/lib/python3.7/site-packages/dask/dataframe/io/csv.py in read(urlpath, blocksize, collection, lineterminator, compression, sample, enforce, assume_missing, storage_options, include_path_column, **kwargs)
    576             storage_options=storage_options,
    577             include_path_column=include_path_column,
--> 578             **kwargs
    579         )
    580 

~/mc/lib/python3.7/site-packages/dask/dataframe/io/csv.py in read_pandas(reader, urlpath, blocksize, collection, lineterminator, compression, sample, enforce, assume_missing, storage_options, include_path_column, **kwargs)
    442 
    443     # Use sample to infer dtypes and check for presence of include_path_column
--> 444     head = reader(BytesIO(b_sample), **kwargs)
    445     if include_path_column and (include_path_column in head.columns):
    446         raise ValueError(

~/mc/lib/python3.7/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    683         )
    684 
--> 685         return _read(filepath_or_buffer, kwds)
    686 
    687     parser_f.__name__ = name

~/mc/lib/python3.7/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    455 
    456     # Create the parser.
--> 457     parser = TextFileReader(fp_or_buf, **kwds)
    458 
    459     if chunksize or iterator:

~/mc/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    893             self.options["has_index_names"] = kwds["has_index_names"]
    894 
--> 895         self._make_engine(self.engine)
    896 
    897     def close(self):

~/mc/lib/python3.7/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1133     def _make_engine(self, engine="c"):
   1134         if engine == "c":
-> 1135             self._engine = CParserWrapper(self.f, **self.options)
   1136         else:
   1137             if engine == "python":

~/mc/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1915         kwds["usecols"] = self.usecols
   1916 
-> 1917         self._reader = parsers.TextReader(src, **kwds)
   1918         self.unnamed_cols = self._reader.unnamed_cols
   1919 

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

EmptyDataError: No columns to parse from file

I’m not certain what is going on exactly.
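
In the meantime, a workaround that avoids dask's remote-read path entirely is to let pandas fetch the URL (which works, per above) and wrap the result. A minimal sketch using the public dd.from_pandas API, assuming a single partition is acceptable:

import pandas as pd
from dask import dataframe as dd

# Workaround sketch: pandas performs the HTTP fetch, and dask only wraps
# the in-memory frame, so dask never has to determine the remote file size.
url = 'https://fred.stlouisfed.org/graph/fredgraph.csv?id=SP500'
df2 = dd.from_pandas(pd.read_csv(url), npartitions=1)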

About this issue

  • State: closed
  • Created 5 years ago
  • Comments: 19 (19 by maintainers)

Most upvoted comments

OK, I’ll make a PR to fsspec with that.

On Wed, Oct 23, 2019 at 9:58 AM Martin Durant notifications@github.com wrote:

> Agreed, point 2 seems sensible when you put it that way!


@martindurant would know better, but that’s what we do if no size is provided. From what I can tell, though, the server is responding with a size; it’s just wrong (it says the length is 0):

(Pdb) pp dict(r.headers)
{'Accept-Ranges': 'bytes',
 'Connection': 'keep-alive',
 'Content-Disposition': 'attachment; filename="SP500.csv"',
 'Content-Encoding': 'gzip',
 'Content-Length': '0',
 'Content-Type': 'text/csv;charset=UTF-8',
 'Date': 'Mon, 21 Oct 2019 20:36:26 GMT',
 'Pragma': 'public',
 'Strict-Transport-Security': 'max-age=15768000; includeSubDomains',
 'Vary': 'Accept-Encoding,User-Agent,Origin',
 'X-Content-Type-Options': 'nosniff',
 'X-XSS-Protection': '1; mode=block'}
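
For what it's worth, the misleading header can be reproduced without dask at all. A minimal sketch, assuming the size probe is a HEAD request (roughly what a file system layer would issue) and using the requests library:

import requests

# Probe the same URL the way a size check might: per the header dump
# above, the server answers with 'Content-Length': '0' even though the
# (gzip-encoded) body is non-empty.
url = 'https://fred.stlouisfed.org/graph/fredgraph.csv?id=SP500'
r = requests.head(url, allow_redirects=True)
print(dict(r.headers))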

Ideally we would have a way to disable this size detection completely, but I’m not sure if or how to do that.
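
One heavily hedged possibility, not verified here: dd.read_csv already accepts blocksize and storage_options (both visible in the traceback above), and fsspec file systems generally take a block_size open option. That block_size=0 makes the HTTP file stream without trusting the reported size is an assumption:

from dask import dataframe as dd

# Untested sketch: blocksize=None asks dask for a single partition (no
# byte-range splitting); storage_options={'block_size': 0} is assumed to
# make fsspec stream the HTTP file instead of relying on Content-Length.
df = dd.read_csv(
    'https://fred.stlouisfed.org/graph/fredgraph.csv?id=SP500',
    blocksize=None,
    storage_options={'block_size': 0},
)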