pandas: to_parquet can't handle mixed type columns
Code Sample, a copy-pastable example if possible
import pandas as pd
data = pd.read_excel('pandas_example.xlsx', sheet_name = 0)
data = data.astype({'A': 'int32', 'B': 'object'})
data.to_parquet('example.parquet')
Problem description
to_parquet tries to convert an object column to int64. This happens with either engine, but it is most clearly seen when using data.to_parquet('example.parquet', engine='fastparquet'):
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write_column(f, data, selement, compression)
447 if selement.type == parquet_thrift.Type.INT64:
--> 448 data = data.astype(int)
449 elif selement.type == parquet_thrift.Type.BOOLEAN:
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
176 kwargs[new_arg_name] = new_arg_value
--> 177 return func(*args, **kwargs)
178 return wrapper
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
4996 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 4997 **kwargs)
4998 return self._constructor(new_data).__finalize__(self)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in astype(self, dtype, **kwargs)
3713 def astype(self, dtype, **kwargs):
-> 3714 return self.apply('astype', dtype=dtype, **kwargs)
3715
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3580 kwargs['mgr'] = self
-> 3581 applied = getattr(b, f)(**kwargs)
3582 result_blocks = _extend_blocks(applied, result_blocks)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in astype(self, dtype, copy, errors, values, **kwargs)
574 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 575 **kwargs)
576
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
663 # _astype_nansafe works fine with 1-d only
--> 664 values = astype_nansafe(values.ravel(), dtype, copy=True)
665 values = values.reshape(self.shape)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy)
708 if np.issubdtype(dtype.type, np.integer):
--> 709 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
710
pandas\_libs\lib.pyx in pandas._libs.lib.astype_intsafe()
pandas/_libs/src\util.pxd in util.set_value_at_unsafe()
ValueError: invalid literal for int() with base 10: 'Z31'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-17-6bc14a88da64> in <module>()
----> 1 data.to_parquet('example.parquet', engine='fastparquet')
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\frame.py in to_parquet(self, fname, engine, compression, **kwargs)
1940 from pandas.io.parquet import to_parquet
1941 to_parquet(self, fname, engine,
-> 1942 compression=compression, **kwargs)
1943
1944 @Substitution(header='Write out the column names. If a list of strings '
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\io\parquet.py in to_parquet(df, path, engine, compression, **kwargs)
255 """
256 impl = get_engine(engine)
--> 257 return impl.write(df, path, compression=compression, **kwargs)
258
259
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\io\parquet.py in write(self, df, path, compression, **kwargs)
216 with catch_warnings(record=True):
217 self.api.write(path, df,
--> 218 compression=compression, **kwargs)
219
220 def read(self, path, columns=None, **kwargs):
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write(filename, data, row_group_offsets, compression, file_scheme, open_with, mkdirs, has_nulls, write_index, partition_on, fixed_text, append, object_encoding, times)
846 if file_scheme == 'simple':
847 write_simple(filename, data, fmd, row_group_offsets,
--> 848 compression, open_with, has_nulls, append)
849 elif file_scheme in ['hive', 'drill']:
850 if append:
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write_simple(fn, data, fmd, row_group_offsets, compression, open_with, has_nulls, append)
715 else None)
716 rg = make_row_group(f, data[start:end], fmd.schema,
--> 717 compression=compression)
718 if rg is not None:
719 fmd.row_groups.append(rg)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in make_row_group(f, data, schema, compression)
612 comp = compression
613 chunk = write_column(f, data[column.name], column,
--> 614 compression=comp)
615 rg.columns.append(chunk)
616 rg.total_byte_size = sum([c.meta_data.total_uncompressed_size for c in
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write_column(f, data, selement, compression)
453 raise ValueError('Error converting column "%s" to primitive '
454 'type %s. Original error: '
--> 455 '%s' % (data.name, t, e))
456
457 else:
ValueError: Error converting column "B" to primitive type INT64. Original error: invalid literal for int() with base 10: 'Z31'
You can see that it is a mixed-type column issue if you use to_csv and read_csv to load the data from a CSV file instead; you get the following warning on import:
C:\Users\I347500\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
Specifying the dtype option on import solves the issue, but it is inconvenient that there is no way to set column types after loading the data. It is also strange that to_parquet tries to infer column types instead of using the dtypes reported by .dtypes or .info().
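For the example above, one way to force the dtype on import looks roughly like this (a minimal sketch; it reuses the file and column names from the example and assumes 'B' is the mixed column):

import pandas as pd

# Assumption: column 'B' is the mixed string/integer column from the traceback.
# Forcing it to str on import means to_parquet sees a clean object/string column.
data = pd.read_excel('pandas_example.xlsx', sheet_name=0, dtype={'B': str})
data = data.astype({'A': 'int32'})
data.to_parquet('example.parquet')

The same idea applies to read_csv via its dtype= (or low_memory=False) options, as the DtypeWarning above suggests.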
Expected Output
to_parquet writes the parquet file using the dtypes as specified.
Output of pd.show_versions()
INSTALLED VERSIONS
commit: None
python: 3.6.5.final.0
python-bits: 64
OS: Windows
OS-release: 10
machine: AMD64
processor: Intel64 Family 6 Model 61 Stepping 4, GenuineIntel
byteorder: little
LC_ALL: None
LANG: None
LOCALE: None.None

pandas: 0.23.0
pytest: 3.5.1
pip: 10.0.1
setuptools: 39.1.0
Cython: None
numpy: 1.14.3
scipy: 1.1.0
pyarrow: 0.9.0
xarray: None
IPython: 6.4.0
sphinx: None
patsy: 0.5.0
dateutil: 2.7.3
pytz: 2018.4
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 2.2.2
openpyxl: None
xlrd: 1.1.0
xlwt: 1.3.0
xlsxwriter: 1.0.4
lxml: None
bs4: None
html5lib: 0.9999999
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: 2.10
s3fs: None
fastparquet: 0.1.5
pandas_gbq: None
pandas_datareader: None
About this issue
- Original URL
- State: closed
- Created 6 years ago
- Reactions: 1
- Comments: 16 (6 by maintainers)
Re: "you have partly strings, partly integer values. What would be the expected type when writing this column?" I would expect it to be a string.
I know this is a closed issue, but in case someone looks for a patch, here is what worked for me:
I needed this as I was dealing with a large dataframe (coming from openfoodfacts: https://world.openfoodfacts.org/data ), containing 1M lines and 177 columns of various types, and I simply could not manually cast each column.
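For a frame that wide, one possible approach (an illustration only, not necessarily the poster's exact patch; the input file name is a placeholder) is to cast every object-dtype column to string in bulk:

import pandas as pd

# Placeholder path: the openfoodfacts dump is distributed as a large delimited file.
df = pd.read_csv('openfoodfacts_products.csv', sep='\t', low_memory=False)

# Cast every object-dtype column to str so none of them keeps mixed int/str content.
# Note: plain .astype(str) turns missing values into the literal string 'nan';
# see the NA handling discussed further down the thread.
obj_cols = df.select_dtypes(include='object').columns
df[obj_cols] = df[obj_cols].astype(str)

df.to_parquet('openfoodfacts.parquet')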
In my case, I had read in multiple CSVs and done pandas.concat(). Some read in as float and others as string. pandas.concat() stuck them together without any warnings, and the problem became apparent when to_parquet() complained. So in that case at least, it may be more an issue with concat() than with to_parquet().

We could have some mechanism to indicate "this column should have a string type in the final parquet file", like we have a dtype argument for to_sql (you can actually already do something like this manually by passing the schema argument). However, the problem is that the arrow functions that convert numpy arrays to arrow arrays still give errors for mixed string / integer types, even if you indicate that it should be strings (see the sketch below for the kind of error). So unless that is something arrow would want to change (but personally I would not do that), this would not help for the specific example case in this issue.
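The kind of error being referred to looks roughly like this (a sketch, not the original snippet; the exact exception class and message depend on the pyarrow version):

import numpy as np
import pyarrow as pa

# Object array with partly strings, partly integers, as in the issue
mixed = np.array(['Z31', 1, 2], dtype=object)

# Asking arrow for a string column does not silently stringify the integers;
# this raises an error such as ArrowTypeError/ArrowInvalid about the int values.
pa.array(mixed, type=pa.string())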
We could of course still do a conversion on the pandas side, but that would need to be rather custom logic (and a user can do df.astype({'col': str}).to_parquet(..) themselves before writing to parquet). So I think we can close this issue.

@xhochy It is a string type column that unfortunately has a lot of integer-like values, but the expected type is definitely string.
IMHO, there should be an option to write a column with a string type even if all the values inside are integers - for example, to maintain consistency of column types among multiple files. This is not the case for my example - column B can’t have integer type.
I solved this by:
First, find the mixed-type columns and convert them to string. Then find the list-type columns and convert them to string as well; otherwise you may get pyarrow.lib.ArrowInvalid: Nested column branch had multiple children (a sketch of both steps follows the references).

References:
https://stackoverflow.com/questions/29376026/whats-a-good-strategy-to-find-mixed-types-in-pandas-columns
https://stackoverflow.com/questions/50876505/does-any-python-library-support-writing-arrays-of-structs-to-parquet-files
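A sketch of that two-step approach (an illustration with placeholder data and file name; pd.api.types.infer_dtype is one way, among others, to spot the mixed columns):

import pandas as pd

# Placeholder frame: 'B' mixes strings and ints, 'C' holds lists
df = pd.DataFrame({'A': [1, 2], 'B': ['Z31', 7], 'C': [[1, 2], [3]]})

# Columns whose values are a mix of types (e.g. 'mixed', 'mixed-integer')
mixed_cols = [c for c in df.columns
              if pd.api.types.infer_dtype(df[c], skipna=True).startswith('mixed')]

# Columns containing list values, which also trip up the parquet writers
list_cols = [c for c in df.columns
             if df[c].apply(lambda v: isinstance(v, list)).any()]

# Convert both kinds of offending columns to plain strings before writing
for c in dict.fromkeys(mixed_cols + list_cols):
    df[c] = df[c].astype(str)

df.to_parquet('example_clean.parquet')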
I know this issue is closed, but I found a quick fix: pass low_memory=False when reading the data (it is a read_csv argument, as in the DtypeWarning above, rather than a to_parquet() one). This handles the mixed-type columns error automatically.
IMHO we should close this since it is giving people the wrong impression that parquet "can't handle mixed type columns", e.g. "hey, they have an open issue with this title" (without a clear resolution at the end of the thread).
As @jorisvandenbossche mentioned, the OP's problem is type inference when doing pd.read_excel(). It has nothing to do with to_parquet, and as he pointed out, the user can always do df.astype({'col': str}).to_parquet(..) to manage and mix types as needed.

I realize that this has been closed for a while now, but as I'm revisiting this error, I wanted to share a possible hack around it (not that it's an ideal approach):
As @catawbasam mentioned, I cast all my categorical columns into 'str' before writing as parquet (instead of specifying each column by name, which can get cumbersome for 500 columns). When I load it back into pandas, the type of the str column is object again.

Edit: If you happen to hit an error with NA's being hardcoded into 'None' after you convert your object columns into str, make sure to convert these NA's into np.nan before converting into str (stackoverflow link).
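A sketch of that idea (an illustration; as a variant of the np.nan suggestion, it casts only the non-missing values so they stay real nulls in the parquet file):

import numpy as np
import pandas as pd

# Placeholder frame: an object column with mixed values and a missing entry
df = pd.DataFrame({'B': ['Z31', 7, np.nan]})

obj_cols = df.select_dtypes(include='object').columns
for col in obj_cols:
    # Cast only the non-missing values to str; missing entries stay missing,
    # so they are written as parquet nulls instead of the strings 'None'/'nan'.
    df[col] = df[col].where(df[col].isna(), df[col].astype(str))

df.to_parquet('example.parquet')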