dvc: dvc.api.open fails on pickle serialisation files absent from a dvc tracked directory

Potential Bug Report

dvc.api.open (from the Python API) fails with an UnpicklingError when trying to open a pickle-serialised file that should be contained in a dvc-tracked directory but that is missing from the cache. Note that dvc.api.read works well in the exact same situation; I assume that this might be related to the streaming aspect provided by dvc.api.open but not present in dvc.api.read. Is this a known / expected failure?

Setup

$ dvc version
DVC version: 1.8.1 (pip)
---------------------------------
Platform: Python 3.6.10 on Darwin-19.6.0-x86_64-i386-64bit
Supports: http, https, s3, ssh
Cache types: reflink, hardlink, symlink
Cache directory: apfs on /dev/disk1s5
Workspace directory: apfs on /dev/disk1s5
Repo: dvc, git

Steps to reproduce

  • Within an initialised and configured (especially the remotes) git and dvc repository; create a test directory containing one file test/foo.pickle that results from a pickle.dump call;
  • dvc add test && dvc push && git commit -m "test" && git push
  • Pull a fresh copy of the repository (or remove both the original file (or link) and the cache) and then run:
import pickle

from dvc.api import open

with open('test/foo.pickle', mode='rb') as file:
    _ = pickle.load(file)

which results in the following uncaught exception:

---------------------------------------------------------------------------
GitCommandError                           Traceback (most recent call last)
.../python3.6/site-packages/dvc/scm/git.py in clone(url, to_path, rev, shallow_branch)
    126                 if shallow_branch is None:
--> 127                     tmp_repo = clone_from()
    128                 else:

.../python3.6/site-packages/git/repo/base.py in clone_from(cls, url, to_path, progress, env, multi_options, **kwargs)
   1031             git.update_environment(**env)
-> 1032         return cls._clone(git, url, to_path, GitCmdObjectDB, progress, multi_options, **kwargs)
   1033

.../python3.6/site-packages/git/repo/base.py in _clone(cls, git, url, path, odb_default_type, progress, multi_options, **kwargs)
    968         if progress:
--> 969             handle_process_output(proc, None, progress.new_message_handler(), finalize_process, decode_streams=False)
    970         else:

.../python3.6/site-packages/git/cmd.py in handle_process_output(process, stdout_handler, stderr_handler, finalizer, decode_streams)
    114     if finalizer:
--> 115         return finalizer(process)
    116

.../python3.6/site-packages/git/util.py in finalize_process(proc, **kwargs)
    328     ## TODO: No close proc-streams??
--> 329     proc.wait(**kwargs)
    330

.../python3.6/site-packages/git/cmd.py in wait(self, stderr)
    407                 log.debug('AutoInterrupt wait stderr: %r' % (errstr,))
--> 408                 raise GitCommandError(self.args, status, errstr)
    409             # END status handling

GitCommandError: Cmd('git') failed due to: exit code(128)
  cmdline: git clone --no-single-branch --progress -v rb /var/folders/cs/gpn2tgnx6g5_89hg13ss77sh0000gp/T/tmpwpifp5kxdvc-clone

The above exception was the direct cause of the following exception:

CloneError                                Traceback (most recent call last)
<ipython-input-x-xxxxxxxxxxxx> in <module>
----> 1 with open('test/foo.pickle', mode='rb') as file:
      2     _ = pickle.load(file)

.../python3.6/contextlib.py in __enter__(self)
     80         try:
---> 81             return next(self.gen)
     82         except StopIteration:

.../python3.6/site-packages/dvc/api.py in _open(path, repo, rev, remote, mode, encoding)
     81 def _open(path, repo=None, rev=None, remote=None, mode="r", encoding=None):
---> 82     with _make_repo(repo, rev=rev) as _repo:
     83         with _repo.open_by_relpath(

.../python3.6/contextlib.py in __enter__(self)
     80         try:
---> 81             return next(self.gen)
     82         except StopIteration:

.../python3.6/site-packages/dvc/api.py in _make_repo(repo_url, rev)
    109             pass  # fallthrough to external_repo
--> 110     with external_repo(url=repo_url, rev=rev) as repo:
    111         yield repo

.../python3.6/contextlib.py in __enter__(self)
     80         try:
---> 81             return next(self.gen)
     82         except StopIteration:

.../python3.6/site-packages/dvc/external_repo.py in external_repo(url, rev, for_write, **kwargs)
     40     logger.debug("Creating external repo %s@%s", url, rev)
---> 41     path = _cached_clone(url, rev, for_write=for_write)
     42     # Local HEAD points to the tip of whatever branch we first cloned from

.../python3.6/site-packages/dvc/external_repo.py in _cached_clone(url, rev, for_write)
    353     # fetch/fast-forward to get specified rev
--> 354     clone_path, shallow = _clone_default_branch(url, rev, for_write=for_write)
    355

.../python3.6/site-packages/funcy/decorators.py in wrapper(*args, **kwargs)
     38             call = Call(func, args, kwargs)
---> 39             return deco(call, *dargs, **dkwargs)
     40         return wraps(func)(wrapper)

.../python3.6/site-packages/funcy/flow.py in wrap_with(call, ctx)
    246     with ctx:
--> 247         return call()

.../python3.6/site-packages/funcy/decorators.py in __call__(self, *a, **kw)
     59         if not a and not kw:
---> 60             return self._func(*self._args, **self._kwargs)
     61         else:

.../python3.6/site-packages/dvc/external_repo.py in _clone_default_branch(url, rev, for_write)
    413             if not git:
--> 414                 git = Git.clone(url, clone_path)
    415                 shallow = False

.../python3.6/site-packages/dvc/scm/git.py in clone(url, to_path, rev, shallow_branch)
    131         except git.exc.GitCommandError as exc:  # pylint: disable=no-member
--> 132             raise CloneError(url, to_path) from exc
    133

CloneError: Failed to clone repo 'rb' to '/var/folders/cs/gpn2tgnx6g5_89hg13ss77sh0000gp/T/tmpwpifp5kxdvc-clone'

During handling of the above exception, another exception occurred:

UnpicklingError                           Traceback (most recent call last)
<ipython-input-x-xxxxxxxxxxxx> in <module>
      1 with open('test/foo.pickle', mode='rb') as file:
----> 2     _ = pickle.load(file)

UnpicklingError: pickle data was truncated

In the meantime, using dvc.api.read works fine (in the exact same setup):

import pickle

from dvc.api import read

_ = pickle.loads(read('test/foo.pickle', mode='rb'))

and obviously, calling dvc pull test/foo.pickle before bypasses the issue as well.

About this issue

  • Original URL
  • State: closed
  • Created 4 years ago
  • Reactions: 1
  • Comments: 26 (14 by maintainers)

Most upvoted comments

DEBUG: fetched: [(0,)]

This just means that, according to the id in DVC’s state, the 1 item has been fetched.

With the target file missing from both desired location and local cache:

It means that it’s not in your workspace nor in the cache, but exists in the remote, right?

UnpicklingError: pickle data was truncated

This might be a bug in the DVC. It could happen if the network gets closed prematurely, or something similar happens. It could also happen if the pickled file is itself corrupted.

Could you try the following script? And, please modify the pickle file, so that it could be easily reproducible.

#! /bin/bash

set -e

# initialize
cd "$(mktemp -d)"
git init && dvc init

# install and run moto server locally
pip install moto[server]
moto_server s3 &> /dev/null &
trap "echo killing moto server && kill $!" EXIT  # cleanup

# create a bucket and add remote
export AWS_ACCESS_KEY_ID='testing' AWS_SECRET_ACCESS_KEY='testing'
python -c '
import boto3

session = boto3.session.Session()
s3 = session.client("s3", endpoint_url="http://localhost:5000")
s3.create_bucket(Bucket="dvc-temp")
'
dvc remote add -d moto s3://dvc-temp
dvc remote modify moto endpointurl http://localhost:5000

# create file
python -c '
import pickle
with open("model.pickle", mode="wb") as f:
    pickle.dump("hello world", f)
'

# track file
dvc add model.pickle
dvc push
git add -A && git commit -m "initial"

# clear from cache and workspace
rm model.pickle .dvc/cache/** -rf

# try to read
python -c '
import dvc.api, pickle

import logging
logger = logging.getLogger("dvc")
logger.setLevel(5)


with dvc.api.open("model.pickle", mode="rb") as f:
  msg = pickle.load(f)

print(msg)
'

Also, it’d be great if you could try out a few more things, and see if it works: 1)

import dvc.api, io, pickle

with dvc.api.open("model.pickle", mode="rb") as f:
  stream = io.BytesIO(f.read())

msg = pickle.load(stream)
print(msg)

and, 2)

import dvc.api, io, pickle

msg = pickle.load(io.BytesIO(dvc.api.read("model.pickle", mode="rb")))

Sorry that this is taking quite long to figure out, but we haven’t been able to figure out what’s wrong.

The above script should be as close to your environment with an s3 remote. Please try it out, and see if you could manage to reproduce.

@efiop issue is gone with dvc==2.0.3.

@pared the issue is not critical for me as using read is an acceptable workaround. However, as read breaks the streaming aspect, and requires loading the entire archive in RAM (as I understood from the doc, open behaves differently), there is an almost doubled peak memory consumption on model loading that might become a severe issue in the future (at least for me). Thus, I would not categorise this issue as urgent but still, I think it should be fixed in a decent timeframe.

@hugo-ricateau-tiime, can confirm. Thanks for the repro.

I finally succeeded in figuring out the key aspect of the issue: the archive size; it looks like large archives are truncated when streamed from an s3 storage (maybe when the buffer gets full?).

The following diff on the suggested script:

30c30
<     pickle.dump("hello world"*1000, f)
---
>     pickle.dump("hello world", f)
39c39
< rm -rf model.pickle .dvc/cache/**
---
> rm model.pickle .dvc/cache/** -rf

results in

Initialized empty Git repository in .../.git/

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>
Requirement already satisfied: [...]
Setting 'moto' as a default remote.
100% Add|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████|1/1 [00:00,  2.94file/s]

To track the changes with git, run:

	git add model.pickle.dvc .gitignore
1 file pushed
[master (root-commit) 501ad01] initial
 11 files changed, 525 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvc/plots/confusion.json
 create mode 100644 .dvc/plots/confusion_normalized.json
 create mode 100644 .dvc/plots/default.json
 create mode 100644 .dvc/plots/linear.json
 create mode 100644 .dvc/plots/scatter.json
 create mode 100644 .dvc/plots/smooth.json
 create mode 100644 .dvcignore
 create mode 100644 .gitignore
 create mode 100644 model.pickle.dvc
2020-12-09 09:31:04,689 TRACE: PRAGMA user_version;
2020-12-09 09:31:04,689 DEBUG: fetched: [(3,)]
2020-12-09 09:31:04,689 TRACE: CREATE TABLE IF NOT EXISTS state (inode INTEGER PRIMARY KEY, mtime TEXT NOT NULL, size TEXT NOT NULL, md5 TEXT NOT NULL, timestamp TEXT NOT NULL)
2020-12-09 09:31:04,690 TRACE: CREATE TABLE IF NOT EXISTS state_info (count INTEGER)
2020-12-09 09:31:04,690 TRACE: CREATE TABLE IF NOT EXISTS link_state (path TEXT PRIMARY KEY, inode INTEGER NOT NULL, mtime TEXT NOT NULL)
2020-12-09 09:31:04,690 TRACE: INSERT OR IGNORE INTO state_info (count) SELECT 0 WHERE NOT EXISTS (SELECT * FROM state_info)
2020-12-09 09:31:04,690 TRACE: PRAGMA user_version = 3;
2020-12-09 09:31:04,695 DEBUG: cache '.../.dvc/cache/87/61b028bd29d9fa506a25513d0c10b9' expected 'HashInfo(name='md5', value='8761b028bd29d9fa506a25513d0c10b9', dir_info=None, size=11010, nfiles=None)' actual 'None'
2020-12-09 09:31:04,935 TRACE: SELECT count from state_info WHERE rowid=?
2020-12-09 09:31:04,935 DEBUG: fetched: [(2,)]
2020-12-09 09:31:04,935 TRACE: UPDATE state_info SET count = ? WHERE rowid = ?
Traceback (most recent call last):
  File "<string>", line 10, in <module>
_pickle.UnpicklingError: pickle data was truncated
killing moto server

Note that applying any of the following diffs on top of the previous one works around the issue:

43c43
< import dvc.api, io, pickle
---
> import dvc.api, pickle
51c51
<   msg = pickle.load(io.BytesIO(f.read()))
---
>   msg = pickle.load(f)
43c43
< import dvc.api, io, pickle
---
> import dvc.api, pickle
50c50,51
< msg = pickle.load(io.BytesIO(dvc.api.read("model.pickle", mode="rb")))
---
> with dvc.api.open("model.pickle", mode="rb") as f:
>   msg = pickle.load(f)
50c50,51
< msg = pickle.loads(dvc.api.read("model.pickle", mode="rb"))
---
> with dvc.api.open("model.pickle", mode="rb") as f:
>   msg = pickle.load(f)

Also note that there is no problem with a local storage (i.e. using this script with "hello world"*1000).