neo4j-go-driver: Getting SIGSEGV: segmentation violation panic when using session.Run after Neo recovery

test.log

Neo4j Version: 3.5.23 Community
Neo4j Mode: HA cluster / bolt
Driver version: Go driver 5.5.0 (also happened on version 3)
Operating System: Ubuntu 22 locally, Ubuntu 14 on AWS

We noticed that every time a Neo4j instance restarts, our service panics. While investigating, I found that the panic doesn't happen while Neo4j is down, but rather when it recovers. I managed to reproduce this by running the service locally against Neo4j, bombarding it with requests to mimic production load, blocking access to Neo4j mid-run, and then restoring it.

This only happens under high load, when requests arrive from multiple goroutines after the connection was lost and is being re-established.

To simulate this behaviour in testing, I created a test that spawns 100 goroutines, each of which closes the connection to Neo4j and then tries to execute a query. We have an error wrapper that re-establishes a closed connection, so if session.Run returned an error we could handle it; instead it panics, apparently due to a race condition around a thread-unsafe buffer. The only workaround we have found is a mutex that lets one goroutine run at a time (see the sketch below), but that would heavily slow down our processing.
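
For completeness, the serializing workaround mentioned above would look roughly like the sketch below (it assumes the Driver wrapper shown further down; queryMu and ExecuteQuerySerialized are hypothetical names), and it is exactly the kind of bottleneck we want to avoid:

// queryMu is a hypothetical package-level mutex that serializes every query,
// trading away all concurrency for safety.
var queryMu sync.Mutex

// ExecuteQuerySerialized is a hypothetical wrapper that lets only one
// goroutine at a time run a query through the driver.
func (d *Driver) ExecuteQuerySerialized(ctx context.Context, query string, params map[string]interface{}, onResults ResultsHookFn) error {
	queryMu.Lock()
	defer queryMu.Unlock()
	return d.ExecuteQuery(ctx, query, params, onResults)
}

With 100 concurrent goroutines this turns query execution into a single-file queue, which is why we would much rather have session.Run return an error we can handle.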

The test function

func (s *DriverTestSuite) TestMultithreadedQueryRequestsWithConnectionRecovery() {
	s.driver.Close(s.ctx)
	count := 100
	wg := &sync.WaitGroup{}
	wg.Add(count)

	for i := 0; i < count; i++ {

		go func(wg *sync.WaitGroup, i int, s *DriverTestSuite) {
			defer wg.Done()
			// close the shared driver so this query is forced through the
			// reconnect path, then run a simple query against it
			s.driver.Close(s.ctx)
			err := s.executeSimpleQuery()
			s.Require().NoError(err)
		}(wg, i, s)
	}
	wg.Wait()

}

func executeSimpleQuery(ctx context.Context, driver *Driver) error {
	return driver.ExecuteQuery(ctx, "CREATE (test:Test) return true", map[string]interface{}{}, func(result neo4j.ResultWithContext) error {
		var record *neo4j.Record
		result.NextRecord(ctx, &record)
		if record == nil || len(record.Values) == 0 {
			return errors.New("no records")
		}
		_, ok := record.Values[0].(bool)
		if !ok {
			return errors.New("expected value to be bool")
		}
		return nil
	})
}


func (s *DriverTestSuite) executeSimpleQuery() error {
	return executeSimpleQuery(s.ctx, s.driver)
}

and the relevant functions in code:

// ResultsHookFn allows the caller to parse the query results safely
type ResultsHookFn func(result neo4j.ResultWithContext) error

// ExecuteQuery runs a query on an ensured-connected driver via Bolt. It is used with a hook over the original neo4j result object for convenient usage.
func (d *Driver) ExecuteQuery(ctx context.Context, query string, params map[string]interface{}, onResults ResultsHookFn) (err error) {
	accessLock.RLock()
	defer accessLock.RUnlock()
	return d.nonblockExecuteQuery(ctx, query, params, onResults)

}

// nonblockExecuteQuery makes sure that a recursive retry of a query doesn't acquire more mutexes and thus deadlock.
// Example: a query executes and acquires RLock, then the Close function is called and blocks trying to acquire Lock;
// the retry then calls this function again and would try to acquire RLock, which is blocked by the pending Lock that
// is itself blocked by the first RLock.
func (d *Driver) nonblockExecuteQuery(ctx context.Context, query string, params map[string]interface{}, onResults ResultsHookFn) (err error) {

	session, err := d.NewSession(ctx)
	if err != nil {
		return err
	}
	defer d.CloseSession(ctx, session)

	result, err := session.Run(ctx, query, params)
	if err != nil {
		queryExecutionFailureMeter.Mark(1)
		if err.Error() == "Trying to create session on closed driver" || strings.HasPrefix(err.Error(), "ConnectivityError") {
			err = d.reconnect(ctx)
			if err != nil {
				return err
			}
			return d.nonblockExecuteQuery(ctx, query, params, onResults)
		}
		return err
	}
	err = executeHook(onResults, result) //<-- reporting metrics inside
	if err != nil {
		return err
	}
	return nil
}

// NewSession returns a new *connected* session only after ensuring the underlying connection is alive.
// It ensures liveness by re-creating the driver in case of connectivity issues.
// It returns an error in case a connectivity issue could not be resolved even after re-creating the driver.
func (d *Driver) NewSession(ctx context.Context) (neo4j.SessionWithContext, error) {
	return d.driver.NewSession(ctx, neo4j.SessionConfig{AccessMode: neo4j.AccessModeWrite}), nil
}

// CloseSession closes any open resources and marks this session as unusable.
// It wraps the original neo4j.Session.Close() func with metrics and logs.
func (d *Driver) CloseSession(ctx context.Context, session neo4j.SessionWithContext) {
	defer recoverLogPanic()
	err := session.Close(ctx)
	if err != nil {
		closeSessionFailureMeter.Mark(1)
	}
}

// reconnect creates a new driver if the current one is not connected.
// It uses double verification, as two queries might both get an error and try to reconnect: the first one fixes the
// connection, so the second one doesn't need to reconnect.
func (d *Driver) reconnect(ctx context.Context) error {
	recoveryLock.Lock()
	defer recoveryLock.Unlock()
	if err := d.driver.VerifyConnectivity(ctx); err == nil {
		return nil

	}

	connectionLostMeter.Mark(1)

	data, err := d.enclavedPassword.Open()
	if err != nil {
		return errors.New("couldn't open neo4j password enclave")
	}

	driver, err := NewDriver(Settings{d.dbURI, d.user, data.String()})
	if err != nil {
		return err
	}
	d.nonblockClose(ctx) //close old driver
	d.driver = driver.driver
	return nil
}

func (d *Driver) nonblockClose(ctx context.Context) {
	defer recoverLogPanic()
	if d.driver == nil {
		return
	}
	if err := d.driver.Close(ctx); err != nil {
		failedToCloseMeter.Mark(1)
		log.WarnWithError("failed to close existing driver", err)
	}
}

// Close safely closes the underlying open connections to the DB.
func (d *Driver) Close(ctx context.Context) {
	accessLock.Lock()
	defer accessLock.Unlock()
	d.nonblockClose(ctx)
}
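
As an aside, the re-entrancy hazard that the nonblockExecuteQuery comment describes can be boiled down to the self-contained sketch below (hypothetical, not from our codebase): once a writer is queued on a sync.RWMutex, a goroutine that already holds a read lock cannot acquire a second one, which is exactly the ExecuteQuery-retry-while-Close-is-pending scenario.

package main

import (
	"sync"
	"time"
)

var lock sync.RWMutex // stands in for accessLock

func main() {
	lock.RLock() // ExecuteQuery: read lock held while the query runs

	go func() {
		lock.Lock() // Close: writer queued, waiting for the read lock above
		defer lock.Unlock()
	}()

	time.Sleep(100 * time.Millisecond) // give the writer time to queue

	// A retry that re-entered ExecuteQuery would take a second RLock here; it
	// waits behind the queued writer, which waits for the first RLock -> deadlock.
	lock.RLock()
}

This is why the retry goes through nonblockExecuteQuery instead of back through ExecuteQuery.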

About this issue

  • Original URL
  • State: closed
  • Created a year ago
  • Comments: 24 (16 by maintainers)

Most upvoted comments

For the sake of the test I removed the line in reconnect that closes the old driver, just in case, but I get the same error. The code only goes into reconnect in the specific case of a connectivity error or an already-closed driver. I use a mutex to make sure only one goroutine enters reconnect, plus double verification: if several goroutines get the same error, then even after entering the function I check again whether the connection is down; the first goroutine fixes the connection, and the following ones see that it is working and return. And when closing the connection in the test I use the blocking method, which waits for all the goroutines to finish before closing the connection.

I shared the full trace; it's at the very top, under the test.log button.