dask: Dask Dataframe not able to read remote file, but local (and pandas) work
import pandas as pd
from dask import dataframe as dd
# works
df0 = pd.read_csv('https://fred.stlouisfed.org/graph/fredgraph.csv?id=SP500')
# works
df1 = dd.read_csv('path/to/that/file/on/disk.csv')
# does not work
df2 = dd.read_csv('https://fred.stlouisfed.org/graph/fredgraph.csv?id=SP500')
The combination of dask and a remote read causes the following error:
---------------------------------------------------------------------------
EmptyDataError Traceback (most recent call last)
<ipython-input-75-fc142575840b> in <module>
1 from dask import dataframe as dd
----> 2 df2 = dd.read_csv('https://fred.stlouisfed.org/graph/fredgraph.csv?id=SP500')
~/mc/lib/python3.7/site-packages/dask/dataframe/io/csv.py in read(urlpath, blocksize, collection, lineterminator, compression, sample, enforce, assume_missing, storage_options, include_path_column, **kwargs)
576 storage_options=storage_options,
577 include_path_column=include_path_column,
--> 578 **kwargs
579 )
580
~/mc/lib/python3.7/site-packages/dask/dataframe/io/csv.py in read_pandas(reader, urlpath, blocksize, collection, lineterminator, compression, sample, enforce, assume_missing, storage_options, include_path_column, **kwargs)
442
443 # Use sample to infer dtypes and check for presence of include_path_column
--> 444 head = reader(BytesIO(b_sample), **kwargs)
445 if include_path_column and (include_path_column in head.columns):
446 raise ValueError(
~/mc/lib/python3.7/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
683 )
684
--> 685 return _read(filepath_or_buffer, kwds)
686
687 parser_f.__name__ = name
~/mc/lib/python3.7/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
455
456 # Create the parser.
--> 457 parser = TextFileReader(fp_or_buf, **kwds)
458
459 if chunksize or iterator:
~/mc/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
893 self.options["has_index_names"] = kwds["has_index_names"]
894
--> 895 self._make_engine(self.engine)
896
897 def close(self):
~/mc/lib/python3.7/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
1133 def _make_engine(self, engine="c"):
1134 if engine == "c":
-> 1135 self._engine = CParserWrapper(self.f, **self.options)
1136 else:
1137 if engine == "python":
~/mc/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
1915 kwds["usecols"] = self.usecols
1916
-> 1917 self._reader = parsers.TextReader(src, **kwds)
1918 self.unnamed_cols = self._reader.unnamed_cols
1919
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
EmptyDataError: No columns to parse from file
I’m not certain what is going on exactly.
About this issue
- Original URL
- State: closed
- Created 5 years ago
- Comments: 19 (19 by maintainers)
OK, I’ll make a PR to fsspec with that.
On Wed, Oct 23, 2019 at 9:58 AM Martin Durant notifications@github.com wrote:
@martindurant would know better, but that's what we do if no size is provided. From what I can tell, though, the server is responding with a size — it's just wrong (it says the size is 0).
Ideally we would have a way to disable this completely, but I'm not sure if/how to do that.