aws-sdk-js: S3 sometimes does not complete downloads

I put together a test case for you:

var AWS = require('aws-sdk');
var fs = require('fs');
var crypto = require('crypto');

var client = new AWS.S3({
    accessKeyId: process.env.S3_KEY,
    secretAccessKey: process.env.S3_SECRET,
});

var s3Path = "1tmp1/node_modules/chem-cli/node_modules/watchify2/node_modules/browserify/node_modules/browser-builtins/node_modules/http-browserify/example/json-stream/node_modules/JSONStream/test/fixtures/all_npm.json";
var localFile = "test_out.json";
var s3Params = {
  Bucket: process.env.S3_BUCKET,
  Key: s3Path,
};
var count = 1;
downloadOnce();
function downloadOnce() {
  doTheDownload(function(err) {
    if (err) throw err;
    console.log("downloaded", count++);
    downloadOnce();
  });
}

function doTheDownload(cb) {
  var request = client.getObject(s3Params);
  var response = request.createReadStream();
  var outStream = fs.createWriteStream(localFile);
  var hash = crypto.createHash('md5');
  var errorOccurred = false;
  var eTag = "";

  response.on('error', handleError);
  outStream.on('error', handleError);

  // Grab the ETag from the response headers so we can verify the body later.
  request.on('httpHeaders', function(statusCode, headers) {
    if (statusCode < 300) {
      eTag = headers.etag || "";
    } else {
      handleError(new Error("http status code " + statusCode));
    }
  });

  // The Hash transform emits a single 'data' event carrying the digest once
  // the response stream ends.
  hash.on('data', function(digest) {
    if (!compareETag(eTag, digest)) {
      console.log("eTag", eTag, "digest", digest.toString('hex'), "path:", localFile);
      handleError(new Error("ETag does not match MD5 checksum"));
    }
  });

  outStream.on('close', function() {
    if (errorOccurred) return;
    cb();
  });

  // Stream the body to disk and through the MD5 hasher in parallel.
  response.pipe(outStream);
  response.pipe(hash);

  function handleError(err) {
    if (errorOccurred) return;
    errorOccurred = true;
    cb(err);
  }
}

// S3 wraps the ETag value in quotes; strip them before comparing it with the
// hex digest.
function compareETag(eTag, md5Buffer) {
  eTag = eTag.replace(/^\s*'?\s*"?\s*(.*?)\s*"?\s*'?\s*$/, "$1");
  var hex = md5Buffer.toString('hex');
  return eTag === hex;
}

Edit the s3Path variable to point at some file in your S3 bucket. My example file is 362KB.

Here’s an example run:

$ S3_BUCKET='examplebucket' S3_KEY='examplekey' S3_SECRET='examplesecret' node test.js
downloaded 1
downloaded 2
downloaded 3
downloaded 4
downloaded 5
downloaded 6
downloaded 7
downloaded 8
downloaded 9
downloaded 10
downloaded 11
downloaded 12
downloaded 13
downloaded 14
downloaded 15
downloaded 16
downloaded 17
downloaded 18
downloaded 19
downloaded 20
downloaded 21
downloaded 22
downloaded 23
downloaded 24
downloaded 25
downloaded 26
downloaded 27
downloaded 28
downloaded 29
downloaded 30
downloaded 31
downloaded 32
downloaded 33
downloaded 34
downloaded 35
downloaded 36
downloaded 37
eTag "abac71913645791ed7a98e4461ee1a71" digest 70e838acf6649dc50bbac3e40272a674 path: test_out.json

/home/andy/tmp/npmtest/test.js:21
    if (err) throw err;
                   ^
Error: ETag does not match MD5 checksum
    at Hash.<anonymous> (/home/andy/tmp/npmtest/test.js:50:19)
    at Hash.emit (events.js:95:17)
    at Hash.<anonymous> (_stream_readable.js:748:14)
    at Hash.emit (events.js:92:17)
    at emitReadable_ (_stream_readable.js:410:10)
    at emitReadable (_stream_readable.js:406:5)
    at readableAddChunk (_stream_readable.js:168:9)
    at Hash.Readable.push (_stream_readable.js:130:10)
    at Hash.Transform.push (_stream_transform.js:140:32)
    at Hash._flush (crypto.js:201:8)

So the first 37 downloads completed fine, but on the 38th the AWS SDK said the file was complete when it really wasn't. If I go look at test_out.json on my computer, the first 356KB are downloaded correctly and then the file is truncated.

I am seeing this problem frequently.
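Incidentally, the ETag comparison above only works because this object was uploaded in a single PUT; objects uploaded via multipart upload have ETags that are not a plain MD5 of the body. A byte-count check against Content-Length catches the same truncation without that assumption. A minimal sketch, reusing the request, response, and handleError names from doTheDownload above:

var expectedLength = -1;
var receivedLength = 0;

// Remember how many bytes S3 says the object contains.
request.on('httpHeaders', function(statusCode, headers) {
  if (statusCode < 300) {
    expectedLength = parseInt(headers['content-length'], 10);
  }
});

// Count the bytes that actually arrive.
response.on('data', function(chunk) {
  receivedLength += chunk.length;
});

// On 'end', flag the download as truncated if the counts disagree.
response.on('end', function() {
  if (receivedLength !== expectedLength) {
    handleError(new Error("expected " + expectedLength +
        " bytes but received " + receivedLength));
  }
});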

Most upvoted comments

What’s the best number of parallel requests to maximize bandwidth?

This is not an issue with the SDK but rather with general usage of S3. If you send many parallel requests, eventually S3 will throttle your access and kill your connection.
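As for the best number of parallel requests: there is no single magic number; cap your in-flight requests and tune the cap empirically. Here is a minimal limiter sketch; the cap of 10 is an arbitrary starting point, and keys/downloadKey are hypothetical placeholders for your own key list and download function:

function makeLimiter(maxConcurrent) {
  var active = 0;
  var queue = [];
  function next() {
    if (active >= maxConcurrent || queue.length === 0) return;
    active += 1;
    var task = queue.shift();
    // Each task receives a done() callback that frees its slot.
    task(function() {
      active -= 1;
      next();
    });
  }
  return function(task) {
    queue.push(task);
    next();
  };
}

var limit = makeLimiter(10);
keys.forEach(function(key) {
  limit(function(done) {
    downloadKey(key, done); // your download function; must call done() when finished
  });
});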

One way you can mitigate the failures is by using HTTP’s Expect 100-continue header, which S3 supports:

request.on('build', function() {
  request.httpRequest.headers['Expect'] = '100-continue';
});

This sends the headers and waits for a server response before attempting to transfer any data, allowing S3 to return any errors before accepting the connection. If this header is not sent, S3's only choice is to cut your connection off in the middle of the transfer, causing the behavior you are seeing.

That said, you should generally expect that you may encounter abrupt disconnections for a variety of reasons (bad internet connection, routing issues, etc.), so you should design your application to recover from any download failures. The SDK has retry logic for most operations out of the box, which is why you generally don’t see these failures, but retry logic is disabled for the raw createReadStream() operation, since it is just that: a raw low-level socket operation.
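For example, here is a sketch of application-level retry with exponential backoff around the doTheDownload function from the test case above; the attempt count and delays are arbitrary starting points:

function downloadWithRetry(maxAttempts, cb) {
  var attempt = 0;
  tryOnce();
  function tryOnce() {
    attempt += 1;
    doTheDownload(function(err) {
      if (!err) return cb();
      if (attempt >= maxAttempts) return cb(err);
      // Back off exponentially: 1s, 2s, 4s, ...
      var delayMs = 1000 * Math.pow(2, attempt - 1);
      setTimeout(tryOnce, delayMs);
    });
  }
}

downloadWithRetry(3, function(err) {
  if (err) throw err;
  console.log("download succeeded (possibly after retries)");
});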

Hope that helps.