dask: CI test failures with pyarrow and hdfs

Seeing this on CI. It actually looks like a packaging issue, since pyarrow fails to load a missing Boost shared library (`libboost_regex`). Maybe related to PR ( https://github.com/conda-forge/parquet-cpp-feedstock/pull/43 ).

=================================== FAILURES ===================================
_________________________ test_parquet_pyarrow[hdfs3] __________________________
mod_name = 'pyarrow', error_msg = '`pyarrow` not installed'
    def import_required(mod_name, error_msg):
        """Attempt to import a required dependency.
    
        Raises a RuntimeError if the requested module is not available.
        """
        try:
>           return import_module(mod_name)
dask/utils.py:92: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
name = 'pyarrow', package = None
    def import_module(name, package=None):
        """Import a module.
    
        The 'package' argument is required when performing a relative import. It
        specifies the package to use as the anchor point from which to resolve the
        relative import to an absolute import.
    
        """
        level = 0
        if name.startswith('.'):
            if not package:
                msg = ("the 'package' argument is required to perform a relative "
                       "import for {!r}")
                raise TypeError(msg.format(name))
            for character in name:
                if character != '.':
                    break
                level += 1
>       return _bootstrap._gcd_import(name[level:], package, level)
/opt/conda/lib/python3.6/importlib/__init__.py:126: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
name = 'pyarrow', package = None, level = 0
>   ???
<frozen importlib._bootstrap>:994: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
name = 'pyarrow', import_ = <function _gcd_import at 0x7f105ba66e18>
>   ???
<frozen importlib._bootstrap>:971: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
name = 'pyarrow', import_ = <function _gcd_import at 0x7f105ba66e18>
>   ???
<frozen importlib._bootstrap>:955: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
spec = ModuleSpec(name='pyarrow', loader=<_frozen_importlib_external.SourceFileLoader object at 0x7f102ff02c50>, origin='/opt...n3.6/site-packages/pyarrow/__init__.py', submodule_search_locations=['/opt/conda/lib/python3.6/site-packages/pyarrow'])
>   ???
<frozen importlib._bootstrap>:665: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
self = <_frozen_importlib_external.SourceFileLoader object at 0x7f102ff02c50>
module = <module 'pyarrow' from '/opt/conda/lib/python3.6/site-packages/pyarrow/__init__.py'>
>   ???
<frozen importlib._bootstrap_external>:678: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
f = <built-in function exec>
args = (<code object <module> at 0x7f102fef2c00, file "/opt/conda/lib/python3.6/site-packages/pyarrow/__init__.py", line 20>,...cached__': '/opt/conda/lib/python3.6/site-packages/pyarrow/__pycache__/__init__.cpython-36.pyc', '__doc__': None, ...})
kwds = {}
>   ???
<frozen importlib._bootstrap>:219: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
    from pkg_resources import get_distribution, DistributionNotFound
    try:
        __version__ = get_distribution(__name__).version
    except DistributionNotFound:
       # package is not installed
        try:
            # This code is duplicated from setup.py to avoid a dependency on each
            # other.
            def parse_version(root):
                from setuptools_scm import version_from_scm
                import setuptools_scm.git
                describe = (setuptools_scm.git.DEFAULT_DESCRIBE +
                            " --match 'apache-arrow-[0-9]*'")
                # Strip catchall from the commandline
                describe = describe.replace("--match *.*", "")
                version = setuptools_scm.git.parse(root, describe)
                if not version:
                    return version_from_scm(root)
                else:
                    return version
    
            import setuptools_scm
            __version__ = setuptools_scm.get_version('../', parse=parse_version)
        except (ImportError, LookupError):
            __version__ = None
    
    
>   from pyarrow.lib import cpu_count, set_cpu_count
E   ImportError: libboost_regex.so.1.66.0: cannot open shared object file: No such file or directory
/opt/conda/lib/python3.6/site-packages/pyarrow/__init__.py:47: ImportError
During handling of the above exception, another exception occurred:
hdfs = hdfs://localhost:8020, Connected
    @require_pyarrow
    def test_parquet_pyarrow(hdfs):
        dd = pytest.importorskip('dask.dataframe')
        import pandas as pd
        import numpy as np
    
        fn = '%s/test.parquet' % basedir
        hdfs_fn = 'hdfs://%s' % fn
        df = pd.DataFrame(np.random.normal(size=(1000, 4)),
                          columns=list('abcd'))
        ddf = dd.from_pandas(df, npartitions=4)
    
>       ddf.to_parquet(hdfs_fn, engine='pyarrow')
dask/bytes/tests/test_hdfs.py:245: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
dask/dataframe/core.py:1045: in to_parquet
    return to_parquet(self, path, *args, **kwargs)
dask/dataframe/io/parquet.py:892: in to_parquet
    write = get_engine(engine)['write']
dask/dataframe/io/parquet.py:749: in get_engine
    import_required('pyarrow', "`pyarrow` not installed")
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
mod_name = 'pyarrow', error_msg = '`pyarrow` not installed'
    def import_required(mod_name, error_msg):
        """Attempt to import a required dependency.
    
        Raises a RuntimeError if the requested module is not available.
        """
        try:
            return import_module(mod_name)
        except ImportError:
>           raise RuntimeError(error_msg)
E           RuntimeError: `pyarrow` not installed
dask/utils.py:94: RuntimeError
------------------------------ Captured log setup ------------------------------
core.py                    137 DEBUG    Connect to handle 94361929179216
=============== 1 failed, 10 passed, 13 skipped in 11.03 seconds ===============
The command "if [[ $TEST_HDFS == 'true' ]]; then source continuous_integration/hdfs/run_tests.sh; fi" exited with 1.

ref: https://travis-ci.org/dask/dask/jobs/359059325

About this issue

  • Original URL
  • State: closed
  • Created 6 years ago
  • Comments: 35 (34 by maintainers)

Most upvoted comments

The shared-object (SO) version wasn’t being set previously, and now it is as of this PR. Maybe that’s related, but I’m not sure how that explains conda wanting to install mismatched versions.