dask: CI test failures with pyarrow and hdfs
Seeing this on CI. It actually looks like a packaging issue, since boost is missing. Possibly related to this conda-forge PR: https://github.com/conda-forge/parquet-cpp-feedstock/pull/43.
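For reference, dask only reports a generic RuntimeError here; importing pyarrow directly surfaces the real failure. A minimal check (a sketch; the library paths match the CI image in the traceback below):

try:
    import pyarrow  # noqa: F401
except ImportError as e:
    # On the affected CI images this prints the underlying linker error:
    #   libboost_regex.so.1.66.0: cannot open shared object file: ...
    print('pyarrow import failed:', e)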
=================================== FAILURES ===================================
_________________________ test_parquet_pyarrow[hdfs3] __________________________
mod_name = 'pyarrow', error_msg = '`pyarrow` not installed'
    def import_required(mod_name, error_msg):
        """Attempt to import a required dependency.
        Raises a RuntimeError if the requested module is not available.
        """
        try:
>           return import_module(mod_name)
dask/utils.py:92:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
name = 'pyarrow', package = None
    def import_module(name, package=None):
        """Import a module.
        The 'package' argument is required when performing a relative import. It
        specifies the package to use as the anchor point from which to resolve the
        relative import to an absolute import.
        """
        level = 0
        if name.startswith('.'):
            if not package:
                msg = ("the 'package' argument is required to perform a relative "
                       "import for {!r}")
                raise TypeError(msg.format(name))
            for character in name:
                if character != '.':
                    break
                level += 1
>       return _bootstrap._gcd_import(name[level:], package, level)
/opt/conda/lib/python3.6/importlib/__init__.py:126:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
name = 'pyarrow', package = None, level = 0
> ???
<frozen importlib._bootstrap>:994:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
name = 'pyarrow', import_ = <function _gcd_import at 0x7f105ba66e18>
> ???
<frozen importlib._bootstrap>:971:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
name = 'pyarrow', import_ = <function _gcd_import at 0x7f105ba66e18>
> ???
<frozen importlib._bootstrap>:955:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
spec = ModuleSpec(name='pyarrow', loader=<_frozen_importlib_external.SourceFileLoader object at 0x7f102ff02c50>, origin='/opt...n3.6/site-packages/pyarrow/__init__.py', submodule_search_locations=['/opt/conda/lib/python3.6/site-packages/pyarrow'])
> ???
<frozen importlib._bootstrap>:665:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <_frozen_importlib_external.SourceFileLoader object at 0x7f102ff02c50>
module = <module 'pyarrow' from '/opt/conda/lib/python3.6/site-packages/pyarrow/__init__.py'>
> ???
<frozen importlib._bootstrap_external>:678:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
f = <built-in function exec>
args = (<code object <module> at 0x7f102fef2c00, file "/opt/conda/lib/python3.6/site-packages/pyarrow/__init__.py", line 20>,...cached__': '/opt/conda/lib/python3.6/site-packages/pyarrow/__pycache__/__init__.cpython-36.pyc', '__doc__': None, ...})
kwds = {}
> ???
<frozen importlib._bootstrap>:219:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
    from pkg_resources import get_distribution, DistributionNotFound
    try:
        __version__ = get_distribution(__name__).version
    except DistributionNotFound:
        # package is not installed
        try:
            # This code is duplicated from setup.py to avoid a dependency on each
            # other.
            def parse_version(root):
                from setuptools_scm import version_from_scm
                import setuptools_scm.git
                describe = (setuptools_scm.git.DEFAULT_DESCRIBE +
                            " --match 'apache-arrow-[0-9]*'")
                # Strip catchall from the commandline
                describe = describe.replace("--match *.*", "")
                version = setuptools_scm.git.parse(root, describe)
                if not version:
                    return version_from_scm(root)
                else:
                    return version
            import setuptools_scm
            __version__ = setuptools_scm.get_version('../', parse=parse_version)
        except (ImportError, LookupError):
            __version__ = None
>   from pyarrow.lib import cpu_count, set_cpu_count
E   ImportError: libboost_regex.so.1.66.0: cannot open shared object file: No such file or directory
/opt/conda/lib/python3.6/site-packages/pyarrow/__init__.py:47: ImportError
During handling of the above exception, another exception occurred:
hdfs = hdfs://localhost:8020, Connected
    @require_pyarrow
    def test_parquet_pyarrow(hdfs):
        dd = pytest.importorskip('dask.dataframe')
        import pandas as pd
        import numpy as np
        fn = '%s/test.parquet' % basedir
        hdfs_fn = 'hdfs://%s' % fn
        df = pd.DataFrame(np.random.normal(size=(1000, 4)),
                          columns=list('abcd'))
        ddf = dd.from_pandas(df, npartitions=4)
>       ddf.to_parquet(hdfs_fn, engine='pyarrow')
dask/bytes/tests/test_hdfs.py:245:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
dask/dataframe/core.py:1045: in to_parquet
return to_parquet(self, path, *args, **kwargs)
dask/dataframe/io/parquet.py:892: in to_parquet
write = get_engine(engine)['write']
dask/dataframe/io/parquet.py:749: in get_engine
import_required('pyarrow', "`pyarrow` not installed")
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
mod_name = 'pyarrow', error_msg = '`pyarrow` not installed'
    def import_required(mod_name, error_msg):
        """Attempt to import a required dependency.
        Raises a RuntimeError if the requested module is not available.
        """
        try:
            return import_module(mod_name)
        except ImportError:
>           raise RuntimeError(error_msg)
E           RuntimeError: `pyarrow` not installed
dask/utils.py:94: RuntimeError
------------------------------ Captured log setup ------------------------------
core.py 137 DEBUG Connect to handle 94361929179216
=============== 1 failed, 10 passed, 13 skipped in 11.03 seconds ===============
The command "if [[ $TEST_HDFS == 'true' ]]; then source continuous_integration/hdfs/run_tests.sh; fi" exited with 1.
About this issue
- Original URL
- State: closed
- Created 6 years ago
- Comments: 35 (34 by maintainers)
The shared-object (.so) version wasn't being set previously, and it is now as of this PR. That may be related, but I'm not sure how it explains conda wanting to install mismatched versions.
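One way to check whether this is a boost version mismatch is to compare what the environment actually ships with what pyarrow was linked against. A minimal diagnostic sketch, assuming the /opt/conda prefix from the CI logs; on the broken images the CDLL call fails the same way the pyarrow import does:

import ctypes
import glob

# Boost regex libraries actually present in the environment.
print(glob.glob('/opt/conda/lib/libboost_regex.so*'))

# The soversion pyarrow was linked against; raises OSError if it is absent.
ctypes.CDLL('libboost_regex.so.1.66.0')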