SqlClient: A transport-level error has occurred when receiving results from the server if query length exceeds 956 characters

In MAUI Android using Microsoft.Data.SqlClient, it’s not possible to run a query exceeding 956 characters.

Exception message: Stack trace: Microsoft.Data.SqlClient.SqlException (0x80131904): A transport-level error has occurred when receiving results from the server. (provider: TCP Provider, error: 0 - Success) at Microsoft.Data.SqlClient.SqlConnection.OnError(SqlException exception, Boolean breakConnection, Action1 wrapCloseInAction) in D:\a\_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\SqlConnection.cs:line 2010 at Microsoft.Data.SqlClient.SqlInternalConnection.OnError(SqlException exception, Boolean breakConnection, Action1 wrapCloseInAction) in D:\a_work\1\s\src\Microsoft.Data.SqlClient\src\Microsoft\Data\SqlClient\SqlInternalConnection.cs:line 770 at Microsoft.Data.SqlClient.TdsParser.ThrowExceptionAndWarning(TdsParserStateObject stateObj, Boolean callerHasConnectionLock, Boolean asyncClose) in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\TdsParser.cs:line 1404 at Microsoft.Data.SqlClient.TdsParserStateObject.ThrowExceptionAndWarning(Boolean callerHasConnectionLock, Boolean asyncClose) in D:\a_work\1\s\src\Microsoft.Data.SqlClient\src\Microsoft\Data\SqlClient\TdsParserStateObject.cs:line 734 at Microsoft.Data.SqlClient.TdsParserStateObject.ReadSniError(TdsParserStateObject stateObj, UInt32 error) in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\TdsParserStateObject.cs:line 1783 at Microsoft.Data.SqlClient.TdsParserStateObject.ReadSniSyncOverAsync() in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\TdsParserStateObject.cs:line 1256 at Microsoft.Data.SqlClient.TdsParserStateObject.TryReadNetworkPacket() in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\TdsParserStateObject.cs:line 1181 at Microsoft.Data.SqlClient.TdsParserStateObject.TryPrepareBuffer() in D:\a_work\1\s\src\Microsoft.Data.SqlClient\src\Microsoft\Data\SqlClient\TdsParserStateObject.cs:line 904 at Microsoft.Data.SqlClient.TdsParserStateObject.TryReadByte(Byte& value) in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\TdsParserStateObject.cs:line 446 at Microsoft.Data.SqlClient.TdsParser.TryRun(RunBehavior runBehavior, SqlCommand cmdHandler, SqlDataReader dataStream, BulkCopySimpleResultSet bulkCopyHandler, TdsParserStateObject stateObj, Boolean& dataReady) in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\TdsParser.cs:line 2019 at Microsoft.Data.SqlClient.SqlDataReader.TryConsumeMetaData() in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\SqlDataReader.cs:line 1142 at Microsoft.Data.SqlClient.SqlDataReader.get_MetaData() in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\SqlDataReader.cs:line 258 at Microsoft.Data.SqlClient.SqlCommand.FinishExecuteReader(SqlDataReader ds, RunBehavior runBehavior, String resetOptionsString, Boolean isInternal, Boolean forDescribeParameterEncryption, Boolean shouldCacheForAlwaysEncrypted) in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\SqlCommand.cs:line 5157 at Microsoft.Data.SqlClient.SqlCommand.RunExecuteReaderTds(CommandBehavior cmdBehavior, RunBehavior runBehavior, Boolean returnStream, Boolean isAsync, Int32 timeout, Task& task, Boolean asyncWrite, Boolean inRetry, SqlDataReader ds, Boolean describeParameterEncryptionRequest) in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\SqlCommand.cs:line 4951 at Microsoft.Data.SqlClient.SqlCommand.RunExecuteReader(CommandBehavior cmdBehavior, RunBehavior runBehavior, Boolean returnStream, TaskCompletionSource`1 completion, Int32 timeout, Task& task, Boolean& usedCache, Boolean asyncWrite, Boolean inRetry, String method) in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\SqlCommand.cs:line 4621 at Microsoft.Data.SqlClient.SqlCommand.RunExecuteReader(CommandBehavior cmdBehavior, RunBehavior runBehavior, Boolean returnStream, String method) in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\SqlCommand.cs:line 4491 at Microsoft.Data.SqlClient.SqlCommand.ExecuteReader(CommandBehavior behavior) in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\SqlCommand.cs:line 2050 at Microsoft.Data.SqlClient.SqlCommand.ExecuteDbDataReader(CommandBehavior behavior) in D:\a_work\1\s\src\Microsoft.Data.SqlClient\netcore\src\Microsoft\Data\SqlClient\SqlCommand.cs:line 2000 at System.Data.Common.DbCommand.System.Data.IDbCommand.ExecuteReader(CommandBehavior behavior) at System.Data.Common.DbDataAdapter.FillInternal(DataSet dataset, DataTable[] datatables, Int32 startRecord, Int32 maxRecords, String srcTable, IDbCommand command, CommandBehavior behavior) at System.Data.Common.DbDataAdapter.Fill(DataSet dataSet, Int32 startRecord, Int32 maxRecords, String srcTable, IDbCommand command, CommandBehavior behavior) at System.Data.Common.DbDataAdapter.Fill(DataSet dataSet) at MercatorApi.Api.Zselect(SqlCommand cmd, Boolean withSchema, MercatorSqlParam[] mercatorSqlParams) in M:\Mercator_dotnet\MercatorTunnel\MercatorTunnel\Api\MercatorApiZselectSqlClientCore.cs:line 133 ClientConnectionId:7f5e8352-c020-4aaa-9a88-42ca61cfaa28

To reproduce

sqlCommand.CommandText = new string(' ', 948) + "select 1"; works fine.

sqlCommand.CommandText = new string(' ', 949) + "select 1"; gives the error.


### Expected behavior
Queries exceeding 956 characters should work.

### Further technical details
Microsoft.Data.SqlClient version: 5.1.1,
.NET target: MAUI net7.0-android
SQL Server version: SQL Server 2019
Operating system: Windows 11

About this issue

  • Original URL
  • State: open
  • Created 10 months ago
  • Reactions: 1
  • Comments: 40 (13 by maintainers)

Most upvoted comments

@JRahnama I can send decrypted Wireshark captures showing the actual traffic without TLS record fragmentation that works from Windows, and TLS record fragmentation causing SQL Server to disconnect the Android client, GitHub just doesn’t allow .pcapng file attachments. There is no client certificate authentication involved, and the connection string doesn’t matter, what matters is the issue always happens with TLS and query strings around 956 characters because they produce messages over 2048 bytes that somehow only get sent as multiple TLS records on Android, and no other platform. We suspect this would be an issue within the .NET runtime combined with SQL Server being unable to handle query messages split in multiple TLS records.

I’ve been digging all morning trying to get SQL Server to produce traces to get a hint as to why it disconnects after receiving the query in split TLS records, but I couldn’t get it to be verbose enough for that. This is a critical issue for us and we’re looking for a swift response - and pretty much everyone else in this thread has been able to reproduce it easily: .NET 8, SQL Server with TLS, and a long query string. That’s it, boom. We were able to hit the issue with a smaller sample like the one that was shared by @jFrancoisH, and our analysis now points in the same direction as @guyvaio - https://github.com/dotnet/SqlClient/issues/2141#issuecomment-1731215123

Knowing that at the protocol level, the fragmentation happens at 2048 bytes, the original magical value of 956 characters becomes less relevant. All you really need is a query resulting in a protocol message going over 2048 bytes, for which the end result is a single query message sent over multiple TLS records rather than a single TLS record fragmented at the TCP level. I could not observe the same behavior as Android from any other platform, and only Android is affected. I tried finding a hint in the SqlClient source code, and I don’t see an obvious link with the fragmentation.

However, there are many hints that point towards the .NET runtime implementation of SslStream: as @guyvaio reported earlier, it happens in external code (the .NET runtime here), in between lines from SqlClient. This makes a lot of sense given the poor state of SslStream for Android prior to .NET 8, it only just got fixed. This is not battle-tested code, there’s been churn in that area very recently, so bugs like this would not be a surprise.

In the .NET runtime, we can see the value 2048 used as the InitialBufferSize for SslStream. When _stream.Write() is called from SqlClient, it calls SSLStreamWrite from pal_sslstream.c. The managed wrapper of the native Android SSLStreamWrite is in Interop.Ssl.cs.

I’ve tried to look and compare the various platform-specific implementations to find a hint on what might cause the difference in behavior, but I could not find it yet. However, Wireshark doesn’t lie: those long queries are only split in multiple TLS records with .NET on Android, and all other platforms work. I’m still looking for undeniable proof that this is the reason SQL Server drops the connection, but from observation, there’s a direct cause and effect relationship: if you stick to queries that result in messages smaller than 2048 bytes over the wire, the queries get sent as a single TLS record, and everything works fine. Other platforms send queries over 2048 as a single TLS record, and they work fine.

Would it be possible to increase the priority on this issue and maybe get people from the .NET runtime involved? If you know people from the SQL Server team, maybe this could help confirm the theory about queries sent over multiple TLS records not being handled properly?

We have recently upgraded to .NET 8 for Remote Desktop Manager (RDM) on Android after TLS-related issues were finally fixed, only to start getting error reports from different customers for Microsoft.Data.SqlClient.SqlException (0x80131904): A transport-level error has occurred when receiving results from the server. (provider: TCP Provider, error: 0 - Success).

image

I took the time to set up RDM Android with the latest Visual Studio 2022 Preview to launch it in WSA, and used a test VM with SQL Server as my RDM SQL Server data source with TLS encryption enforced server-side. I then used instructions for TLS pre-master secret dumping from LSASS to decrypt traffic on port 1433 from the server in Wireshark.

RDM Windows can connect to the same SQL Server data source without problem - only RDM Android is affected, and it’s very easy to reproduce, as we do a bunch of queries after the initial connection that go over the weird limit observed. At first sight, there doesn’t seem to be an obvious issue with the query size, or even the response size:

image image

However, looking at the last query sent from the client to the server, we can observe an important difference: the query was fragmented in two TLS records (2048+307 bytes). This isn’t TCP fragmentation of a single large TLS record like the previous query, and the fact that it has been split in two TLS records appears to have triggered a server-initiated disconnection upon reception:

image image

Normally, an application should be able to handle this nicely, but there’s potentially something unhandled there. We do know that the issues only occur with TLS, and that TLS issues were only recently fixed. In my past experience with FreeRDP, I encountered similar weirdness between OpenSSL and SChannel - for instance, zero-length TLS records had to be disabled. Here we seem to be hitting a problem related to the way a single query message is fragmented over multiple TLS records. What about platforms where the issue doesn’t happen, like RDM Windows? The equivalent query is sent as a single TLS record, and the server doesn’t drop the connection!

image

Applications normally handle such fragmentation differences correctly, but I really wouldn’t be surprised to learn that SQL Server + SChannel doesn’t handle this properly. I’ve had issues in the past with FreeRDP for CredSSP+OpenSSL interoperability where we had to disable zero-length TLS records because SChannel couldn’t handle them. SQL Server probably has similar weird unhandled cases. I have no idea what triggers the fragmentation at the TLS record level in .NET 8 for Android, but the fragment size appears to be set at 2048, which is close-enough to a query of 956 characters encoded in UTF-16. The numbers fit, now we just need someone more knowledgeable of the stack to take a closer look at why it’s behaving the way it is right now, and see how queries could be sent as a single TLS record fragmented at the TCP level instead of sending it as multiple TLS records, and I think it should resolve the issue.

Success! I finally managed to prove my point by increasing the value of InitialBufferSize from 2048 to 4096, rebuilt the .NET runtime from the v8.0.0 tag, then manually copied over my patched System.Net.Security.dll in the following locations, which is where it was picked up by Visual Studio according to ProcMon, and surprise, surprise: Remote Desktop Manager works again, no issue resulting from broken SQL queries, because the long TDS RPC requests no longer get fragmented. Of course, the issue would still happen for requests longer than 4096 bytes, but I feel like just doubling the buffer size is good enough to fix this blocking issue (some of our customers are unable to use RDM Android anything until we fix this).

I don’t really know if there’s a better way to swap .NET runtime assemblies for a custom build, but here are the paths on Windows where I replaced System.Net.Security.dll with my locally built patched copy:

  • C:\Program Files\dotnet\packs\Microsoft.NETCore.App.Runtime.Mono.android-x86\8.0.0\runtimes\android-x86\lib\net8.0
  • C:\Program Files\dotnet\packs\Microsoft.NETCore.App.Runtime.Mono.android-x64\8.0.0\runtimes\android-x64\lib\net8.0
  • C:\Program Files\dotnet\packs\Microsoft.NETCore.App.Runtime.Mono.android-arm\8.0.0\runtimes\android-arm\lib\net8.0
  • C:\Program Files\dotnet\packs\Microsoft.NETCore.App.Runtime.Mono.android-arm64\8.0.0\runtimes\android-arm64\lib\net8.0

Despite having a separate copy for reach CPU architecture, the DLLs are exactly the same in all of those folders.

Here’s a zipped copy of my patched DLL for convenience: System.Net.Security.dll.zip

And here’s a patch file, but it’s just a matter of changing private const int InitialBufferSize = 2048; to something higher: workaround-for-dotnet-sql-client-issue-2141.patch

I hadn’t realized that we could attach zip files to GitHub comments until now, so here are the decrypted Wireshark captures mentioned earlier for my initial analysis of the issue: rdm-android-mssql-failure.zip rdm-windows-mssql-success.zip

TL;DR: SQL Server requires TDS RPC requests to be sent as a single TLS record, and the protocol specification hints about this limitation. The initial buffer size of 2048 bytes used in the .NET 8 runtime SslStream stack results in fragmentation of messages into multiple TLS records, breaking SqlClient on Android when sending queries of about 956 characters (header size + UTF-16 string encoding makes it close to 2048 bytes). The only solution is to avoid splitting SQL queries (TDS RPC requests) in multiple TLS records, of face immediate disconnection from SQL Server.

Increasing the initial buffer size to 4096 is sufficient to work around the issue for Remote Desktop Manager on Android: image

Since we need it a fix now, I’ll look into overwriting System.Net.Security.dll in our CI build runners. I would be fine with just increasing the initial buffer size, but I somehow doubt the .NET runtime time will accept this as a proper fix. We should look into the best way to work around the issue in the short term until a more permanent solution can be found.

@wfurt we can certainly try to push the issue to the SQL Server team, but only as an improvement on their side, and something we cannot expect to be fixed in a timely manner.

Also, it works for all platforms other than Android, and MS-TDS documents this finicky behavior as a MUST. I feel like the best we’d get from the SQL Server team might be that they ask the protocol docs team to make the finicky behavior more clearly specified instead of improving the server.

My colleague @thenextman may have found the reason why the issue only happens with Android, from the docs of the Android SSLEngine APIs used by the .NET runtime:

The SSLEngine produces/consumes complete SSL/TLS packets only, and does not store application data internally between calls to wrap()/unwrap(). Thus input and output ByteBuffers must be sized appropriately to hold the maximum record that can be produced. Calls to SSLSession.getPacketBufferSize() and SSLSession.getApplicationBufferSize() should be used to determine the appropriate buffer sizes. The size of the outbound application data buffer generally does not matter. If buffer conditions do not allow for the proper consumption/production of data, the application must determine (via SSLEngineResult) and correct the problem, and then try the call again.

If we look at the .NET runtime DoWrap() implementation using the Android SSLEngine APIs (simplified and commented in-line here):

static PAL_SSLStreamStatus DoWrap(JNIEnv* env, SSLStream* sslStream, int* handshakeStatus)
{
    IGNORE_RETURN((*env)->CallObjectMethod(env, sslStream->appOutBuffer, g_ByteBufferFlip));
    jobject result = (*env)->CallObjectMethod(
        env, sslStream->sslEngine, g_SSLEngineWrap, sslStream->appOutBuffer, sslStream->netOutBuffer);
    if (CheckJNIExceptions(env))
        return SSLStreamStatus_Error;

    IGNORE_RETURN((*env)->CallObjectMethod(env, sslStream->appOutBuffer, g_ByteBufferCompact));

    *handshakeStatus = GetEnumAsInt(env, (*env)->CallObjectMethod(env, result, g_SSLEngineResultGetHandshakeStatus));
    int status = GetEnumAsInt(env, (*env)->CallObjectMethod(env, result, g_SSLEngineResultGetStatus));
    (*env)->DeleteLocalRef(env, result);

    if (g_SSLEngineResultStatusLegacyOrder)
    {
        status = MapLegacySSLEngineResultStatus(status);
    }

    switch (status)
    {
        case STATUS__OK:
        {
            return Flush(env, sslStream);
        }
        case STATUS__CLOSED:
        {
            (void)Flush(env, sslStream);
            (*env)->CallVoidMethod(env, sslStream->sslEngine, g_SSLEngineCloseOutbound);
            return SSLStreamStatus_Closed;
        }
        case STATUS__BUFFER_OVERFLOW:
        {
        	// FIXME: we increase the output buffer size, but fail to call g_SSLEngineWrap again!
        	// The alternative would be increase the buffer size *prior* to calling g_SSLEngineWrap and avoid this
            int32_t newCapacity = (*env)->CallIntMethod(env, sslStream->sslSession, g_SSLSessionGetPacketBufferSize) +
                                  (*env)->CallIntMethod(env, sslStream->netOutBuffer, g_ByteBufferRemaining);
            sslStream->netOutBuffer = ExpandBuffer(env, sslStream->netOutBuffer, newCapacity);
            return SSLStreamStatus_OK;
        }
        default:
        {
            LOG_ERROR("Unknown SSLEngineResult status: %d", status);
            return SSLStreamStatus_Error;
        }
    }
}

TL;DR: the Android SSLEngineWrap API return STATUS__BUFFER_OVERFLOW to tell that it could not write all the data bytes to the output buffer. The end result is a TLS record which holds only part of the data we wanted to send. Here we just catch the condition to increase the buffer size for the next call to SSLEngineWrap, but not the current one, which is already too late for SQLClient. Ideally, we should increase the buffer size before calling SSLEngineWrap to avoid getting the overflow status code, and get a single, clean TLS record.