rethinkdbdash: High memory usage with pool and rethink feeds

I discovered that one of my services has a rather high memory usage which grows over time. Did a bit of testing and found that when using rethinkdbdash pooling the closing changefeeds to rethinkdb doesn’t seem to release the related memory. For example when using pooling vs no pooling:

Both start of with 1000 rethinkdb feeds which evaluates to ~100mb memory usage.
I trigger some updates for the feed which increases mem usage by another ~100mb.
After that I GC clean up small amounts ~0-20mb. It doesn’t seem to clean up any more however long I wait.
After that I close all the cursors. cursor.close() for each one:
POOLING: memory usage stays the same as with feeds running, nothing is released
NO POOL: memory usage drops to ~0-20mb over the baseline.

For no pooling I create a new connection for each feed and after closing the cursor I also close the connections with connection.close(). I tried looking at the amount of connections with r.getPoolMaster().getLength(); and r.getPoolMaster().getAvailableLength();, the latter was showing -110 after closing all the feeds.

Would it be possible to purge the connections somehow? Also long-running feeds memory usage will eventually go rather high as no memory seems to be released and every message from rethink seems to increase mem usage. Could this be cleaned up somehow as well?

About this issue

Original URL
State: closed
Created 8 years ago
Reactions: 1
Comments: 19 (11 by maintainers)

Commits related to this issue

Cut call stack in asyncEach - Second attempt for #263 — committed to neumino/rethinkdbdash by neumino 8 years ago
Another stab at #263 — committed to neumino/rethinkdbdash by neumino 8 years ago

Most upvoted comments

Thanks @analytik for digging a bit more into it. Rethinkdbdash used to call the callstack in each before, but for some reason it’s not doing it anymore - I somehow can’t remember when I stopped doing that. I just pushed 2.3.13 with a fix.

Script

var Promise = require('bluebird');
var util = require('util');
var heapdump = require('heapdump');

function log(message) {
    console.log(new Date().toISOString() + ' - ' + message);
}

function gc() {
    log('-- Forced GC');
    global.gc();
}
var countOps = 0;
var countOpsSuccess = 0;
var countOpsErrors = 0;
var feedChanges = 0;

var initialMemoryUsage = process.memoryUsage();
log(util.inspect(initialMemoryUsage));
function printMemoryUsage() {
    var diff = process.memoryUsage();
    var result = {};
    result.diff_rss = diff.rss - initialMemoryUsage.rss;
    result.diff_heapTotal = diff.heapTotal - initialMemoryUsage.heapTotal;
    result.diff_heapUsed = diff.heapUsed - initialMemoryUsage.heapUsed;
    log(`${countOps} tried, ${countOpsSuccess} done, ${countOpsErrors} errors, ${feedChanges} feed changes; ${parseInt(result.diff_heapUsed / (1024 * 1024 ))} MiB`);
}

var NUM_DOCS = 1000;
var UPDATE_TIMERS = 1000;
var TIME_END = parseFloat(process.argv[2] || 15) * 60 * 1000;


function startTest() {
    log('-- Start test');
    var r = require('./lib/index.js')({
        max: 1000,
        buffer: 50,
        host: 'localhost',
        port: 28015,
        // timeoutGb: 1000 * 10
    });

    setup(r).then(function () {
        var feeds = [];
        openFeeds(r, feeds);
        var timers = [];
        fireUpdates(r, timers);
        setTimeout(function () {
            tearDown(r, feeds, timers)
        }, TIME_END);
        timers.push(setInterval(function() {
          gc()
          printMemoryUsage()
        }, 5000));
        return null;
    }).catch(function (err) {
        log('Could not setup the test. ' + err);
    });
}

function setup(r) {
    log('-- Setup test');
    return r.dbCreate('test').run().then(function () {
        log('-- db created');
    }).catch(function () {
        log('-- db not created');
    }).finally(function () {
        return r.db('test').tableDrop('test').run()
    }).then(function () {
        log('-- table dropped');
    }).catch(function () {
        log('-- table not dropped');
    }).finally(function () {
        return r.db('test').tableCreate('test').run()
    }).then(function () {
        log('-- table created');
    }).catch(function () {
        log('-- table not created');
    }).finally(function () {
        return Promise.resolve(true);
    });
}

function openFeeds(r, feeds) {
    log('-- Open feeds');
    r.db('test').table('test').changes().run().then(function (feed) {
        feeds.push(feed);
        feed.eachAsync(function () {
            feedChanges++;
        }).catch(function (e) {
            log('Caught error?');
            log(e);
        });
    });
}

function fireUpdates(r, timers) {
    log('-- Fire updates');
    // Insert the first batch
    var promises = [];
    for (var i = 0; i < NUM_DOCS; i++) {
      promises.push(r.db('test').table('test').insert({}).run())
    }

    Promise.all(promises).then(function() {
      for (var i = 0; i < UPDATE_TIMERS; i++) {
          timers.push(setInterval(function () {
              countOps++;
              r.db('test').table('test').limit(1).update({
                  value: Math.random()
              }).run().then(function () {
                countOpsSuccess++;
              }).catch(function (e) {
                countOpsErrors++;
              });
          }, 1000));
      }
    }).catch(function(error) {
      console.log('Could not perform all inserts', error);
      console.log(error);
    });
}

function tearDown(r, feeds, timers) {
    log(`-- countOps: ${countOps}`);
    log(`-- countOpsSuccess: ${countOpsSuccess}`);
    log(`-- countOpsErrors: ${countOpsErrors}`);
    log('-- Tear down');
    printMemoryUsage();
    gc();
    printMemoryUsage();
    for (var i = 0; i < timers.length; i++) {
        clearInterval(timers[i]);
    }
    Promise.map(feeds, function (feed) {
        return feed.close().catch(function (e) {
            log('Caught error?');
            log(e);
        });
    }).then(function () {
        log('-- Feed closed');
        feeds = [];
        timers = [];
        printMemoryUsage();
        gc()
        printMemoryUsage();
        // return r.getPoolMaster().drain();
    }).then(function () {
        setTimeout(function () {
            log('-- Pool master drained');
            printMemoryUsage();
            gc()
            printMemoryUsage();
            r = null;
            printMemoryUsage();
    log(`-- countOps: ${countOps}`);
    log(`-- countOpsSuccess: ${countOpsSuccess}`);
    log(`-- countOpsErrors: ${countOpsErrors}`);
            gc()
            printMemoryUsage();
        }, 5000);
        log('-- Pool master drained');
        printMemoryUsage();
        gc()
        printMemoryUsage();
        // r = null;
        printMemoryUsage();
    log(`-- countOps: ${countOps}`);
    log(`-- countOpsSuccess: ${countOpsSuccess}`);
    log(`-- countOpsErrors: ${countOpsErrors}`);
        gc()
        printMemoryUsage();
        // heapdump.writeSnapshot('./' + Date.now() + '.heapsnapshot');
    });
}


startTest();

Result:

michel@h9:~/projects/github.com/neumino/rethinkdbdash$ node --expose-gc test.js 1000
2016-06-30T05:22:07.780Z - { rss: 25960448, heapTotal: 11530240, heapUsed: 5507488 }
2016-06-30T05:22:07.781Z - -- Start test
Creating a pool connected to localhost:28015
2016-06-30T05:22:07.822Z - -- Setup test
2016-06-30T05:22:07.885Z - -- db not created
2016-06-30T05:22:07.897Z - -- table dropped
2016-06-30T05:22:08.229Z - -- table created
2016-06-30T05:22:08.229Z - -- Open feeds
2016-06-30T05:22:08.230Z - -- Fire updates
2016-06-30T05:22:13.318Z - -- Forced GC
2016-06-30T05:22:13.344Z - 4000 tried, 3000 done, 0 errors, 3986 feed changes; 11 MiB
2016-06-30T05:22:18.372Z - -- Forced GC
2016-06-30T05:22:18.392Z - 9000 tried, 8000 done, 0 errors, 8987 feed changes; 11 MiB
2016-06-30T05:22:23.414Z - -- Forced GC
2016-06-30T05:22:23.434Z - 14000 tried, 13000 done, 0 errors, 13986 feed changes; 10 MiB
2016-06-30T05:22:28.455Z - -- Forced GC
2016-06-30T05:22:28.472Z - 18000 tried, 18000 done, 0 errors, 18986 feed changes; 8 MiB
2016-06-30T05:22:33.489Z - -- Forced GC
2016-06-30T05:22:33.507Z - 23000 tried, 22570 done, 0 errors, 23618 feed changes; 9 MiB
2016-06-30T05:22:38.525Z - -- Forced GC
2016-06-30T05:22:38.542Z - 28000 tried, 27409 done, 0 errors, 28459 feed changes; 9 MiB
2016-06-30T05:22:43.559Z - -- Forced GC
2016-06-30T05:22:43.585Z - 33000 tried, 29000 done, 0 errors, 30050 feed changes; 19 MiB
2016-06-30T05:22:48.611Z - -- Forced GC
2016-06-30T05:22:48.643Z - 38000 tried, 35394 done, 0 errors, 35766 feed changes; 15 MiB
2016-06-30T05:22:53.677Z - -- Forced GC
2016-06-30T05:22:53.699Z - 43000 tried, 41972 done, 0 errors, 42932 feed changes; 10 MiB
2016-06-30T05:22:58.721Z - -- Forced GC
2016-06-30T05:22:59.190Z - 48000 tried, 47000 done, 0 errors, 47507 feed changes; 10 MiB
2016-06-30T05:23:04.658Z - -- Forced GC
2016-06-30T05:23:04.682Z - 54000 tried, 53000 done, 0 errors, 53986 feed changes; 11 MiB
2016-06-30T05:23:09.708Z - -- Forced GC
2016-06-30T05:23:09.745Z - 59000 tried, 58000 done, 0 errors, 58986 feed changes; 11 MiB
2016-06-30T05:23:14.782Z - -- Forced GC
2016-06-30T05:23:14.804Z - 63000 tried, 62977 done, 0 errors, 63894 feed changes; 8 MiB
2016-06-30T05:23:19.826Z - -- Forced GC
2016-06-30T05:23:19.851Z - 68000 tried, 67806 done, 0 errors, 68698 feed changes; 9 MiB
2016-06-30T05:23:24.876Z - -- Forced GC
2016-06-30T05:23:24.902Z - 73000 tried, 72536 done, 0 errors, 73576 feed changes; 10 MiB
2016-06-30T05:23:29.928Z - -- Forced GC
2016-06-30T05:23:29.962Z - 78000 tried, 77000 done, 0 errors, 77988 feed changes; 11 MiB
2016-06-30T05:23:34.996Z - -- Forced GC
2016-06-30T05:23:35.029Z - 83000 tried, 82000 done, 0 errors, 82987 feed changes; 11 MiB
2016-06-30T05:23:40.064Z - -- Forced GC
2016-06-30T05:23:40.095Z - 88000 tried, 86911 done, 0 errors, 87593 feed changes; 11 MiB
2016-06-30T05:23:45.127Z - -- Forced GC
2016-06-30T05:23:45.161Z - 93000 tried, 92000 done, 0 errors, 92986 feed changes; 11 MiB
2016-06-30T05:23:50.195Z - -- Forced GC
2016-06-30T05:23:50.226Z - 98000 tried, 97000 done, 0 errors, 97986 feed changes; 11 MiB
2016-06-30T05:23:55.257Z - -- Forced GC
2016-06-30T05:23:55.290Z - 103000 tried, 102000 done, 0 errors, 102986 feed changes; 11 MiB
2016-06-30T05:24:00.323Z - -- Forced GC
2016-06-30T05:24:00.344Z - 107000 tried, 107000 done, 0 errors, 107986 feed changes; 8 MiB
2016-06-30T05:24:05.365Z - -- Forced GC
2016-06-30T05:24:05.385Z - 112000 tried, 111875 done, 0 errors, 112906 feed changes; 9 MiB
2016-06-30T05:24:10.405Z - -- Forced GC
2016-06-30T05:24:10.431Z - 117000 tried, 116330 done, 0 errors, 117380 feed changes; 10 MiB
2016-06-30T05:24:15.457Z - -- Forced GC
2016-06-30T05:24:15.479Z - 122000 tried, 121000 done, 0 errors, 121988 feed changes; 11 MiB
2016-06-30T05:24:20.502Z - -- Forced GC
2016-06-30T05:24:20.529Z - 127000 tried, 126000 done, 0 errors, 126986 feed changes; 12 MiB
2016-06-30T05:24:25.556Z - -- Forced GC
2016-06-30T05:24:25.585Z - 132000 tried, 131000 done, 0 errors, 131987 feed changes; 12 MiB
2016-06-30T05:24:30.614Z - -- Forced GC
2016-06-30T05:24:30.637Z - 136000 tried, 136000 done, 0 errors, 136986 feed changes; 8 MiB
2016-06-30T05:24:35.660Z - -- Forced GC
2016-06-30T05:24:35.679Z - 141000 tried, 140907 done, 0 errors, 141899 feed changes; 10 MiB

neumino on Jun 30, 2016

2.3.19 should fix the issue. I guess there’s really no way to build a long chain of promises…

neumino on Jul 24, 2016

Amazing, thank you for your patience with this ticket, and solving it so quickly 😃

Just for the joy of it, here are two services listening to the same changefeed, the climbing one with 2.3.12, the sawtooth one with 2.3.13 -

screenshot 2016-06-30 10 52 48

Again, thanks for rethinkdbdash! (Now only if you had a twin brother who would do the same for python 😄 )

analytik on Jun 30, 2016

Hi,

I’m sorry, I’ve realized I haven’t explained myself correctly previously. The memory leaks occurs when the changefeed is open a long time and gets a lot of changes and eachAsync is used - it just keeps ballooning up with every change. For me, it releases the memory correctly if the connection is closed (I haven’t tested closing and reopening the changefeed without closing the connection, but it seems to be the case based on this output)

I’ve modified the script slightly and made a gist here, and this is the output for 15 minutes -

2016-06-29T12:35:53.195Z - { rss: 21430272, heapTotal: 11530240, heapUsed: 5351752 }
2016-06-29T12:35:53.196Z - -- Start test
Creating a pool connected to localhost:28015
2016-06-29T12:35:53.240Z - -- Setup test
2016-06-29T12:35:53.315Z - -- db not created
2016-06-29T12:35:53.415Z - -- table dropped
2016-06-29T12:35:54.863Z - -- table created
2016-06-29T12:35:54.864Z - 0 tried, 0 done, 0 errors; 5 MiB
2016-06-29T12:35:54.865Z - -- Forced GC
2016-06-29T12:35:54.877Z - 0 tried, 0 done, 0 errors; 2 MiB
2016-06-29T12:35:54.878Z - -- Open feeds
2016-06-29T12:35:54.879Z - -- Fire updates
2016-06-29T12:35:59.882Z - 400 tried, 400 done, 0 errors; 10 MiB
2016-06-29T12:36:04.883Z - 900 tried, 900 done, 0 errors; 7 MiB
2016-06-29T12:36:09.885Z - 1400 tried, 1400 done, 0 errors; 19 MiB
2016-06-29T12:36:14.887Z - 1900 tried, 1900 done, 0 errors; 15 MiB
2016-06-29T12:36:19.889Z - 2400 tried, 2400 done, 0 errors; 13 MiB
2016-06-29T12:36:24.889Z - 2900 tried, 2900 done, 0 errors; 25 MiB
2016-06-29T12:36:29.888Z - 3400 tried, 3400 done, 0 errors; 21 MiB
2016-06-29T12:36:34.889Z - 3900 tried, 3900 done, 0 errors; 18 MiB
2016-06-29T12:36:39.888Z - 4400 tried, 4400 done, 0 errors; 15 MiB
2016-06-29T12:36:44.888Z - 4900 tried, 4900 done, 0 errors; 27 MiB
2016-06-29T12:36:49.889Z - 5400 tried, 5400 done, 0 errors; 23 MiB
2016-06-29T12:36:54.882Z - -- Forced GC
2016-06-29T12:36:54.907Z - 5900 tried, 5900 done, 0 errors; 10 MiB
2016-06-29T12:36:59.910Z - 6400 tried, 6400 done, 0 errors; 24 MiB

// *snip*

2016-06-29T12:49:50.147Z - 83400 tried, 82870 done, 0 errors; 72 MiB 
2016-06-29T12:49:55.148Z - 83900 tried, 83327 done, 0 errors; 76 MiB 
2016-06-29T12:49:56.353Z - -- Forced GC                              
2016-06-29T12:50:00.150Z - 84400 tried, 83829 done, 0 errors; 69 MiB 
2016-06-29T12:50:05.150Z - 84900 tried, 84429 done, 0 errors; 76 MiB 
2016-06-29T12:50:10.149Z - 85400 tried, 85040 done, 0 errors; 81 MiB 
2016-06-29T12:50:15.150Z - 85900 tried, 85536 done, 0 errors; 72 MiB 
2016-06-29T12:50:20.150Z - 86400 tried, 86086 done, 0 errors; 77 MiB 
2016-06-29T12:50:25.150Z - 86900 tried, 86647 done, 0 errors; 80 MiB 
2016-06-29T12:50:30.174Z - 87400 tried, 87183 done, 0 errors; 83 MiB 
2016-06-29T12:50:35.176Z - 87900 tried, 87745 done, 0 errors; 74 MiB 
2016-06-29T12:50:40.177Z - 88400 tried, 88261 done, 0 errors; 75 MiB 
2016-06-29T12:50:45.178Z - 88900 tried, 88744 done, 0 errors; 75 MiB 
2016-06-29T12:50:50.179Z - 89400 tried, 89201 done, 0 errors; 75 MiB 
2016-06-29T12:50:54.882Z - -- countOps: 89800                        
2016-06-29T12:50:54.882Z - -- countOpsSuccess: 89650                 
2016-06-29T12:50:54.883Z - -- countOpsErrors: 0                      
2016-06-29T12:50:54.883Z - -- Tear down                              
2016-06-29T12:50:54.883Z - 89800 tried, 89650 done, 0 errors; 68 MiB 
2016-06-29T12:50:54.884Z - -- Forced GC                              
2016-06-29T12:50:54.950Z - 89800 tried, 89650 done, 0 errors; 60 MiB 
2016-06-29T12:50:54.969Z - -- Feed closed                            
2016-06-29T12:50:54.977Z - 89800 tried, 89689 done, 0 errors; 60 MiB 
2016-06-29T12:50:54.977Z - -- Forced GC                              
2016-06-29T12:50:54.997Z - 89800 tried, 89689 done, 0 errors; 10 MiB 
2016-06-29T12:50:54.998Z - -- Pool master drained                    
2016-06-29T12:50:54.998Z - 89800 tried, 89689 done, 0 errors; 10 MiB 
2016-06-29T12:50:54.998Z - -- Forced GC                              
2016-06-29T12:50:55.021Z - 89800 tried, 89689 done, 0 errors; 10 MiB

Basically, for every 100k changes, I get ~100MB of leaked memory. The above was run using Node.js 6.2.1 on Windows 8.1 x64, but production servers are on Debian x64, I think it happened on both Node.js 4 and 6.

analytik on Jun 29, 2016

Thanks for the thorough response. Sorry for the delay in responding on my part. I see that you actually drain the pool. When I use this then all of the memory seemed to be released as expected. Though it seems a bit like a hack to drain the pool. Should using drain() be something done regularly on a production server then? And how would one come back from that? Create a new pool that would initialize new connections?

veikovx on Jun 28, 2016