aspnetcore: Kestrel stops serving https (http2) requests after reboot under load

Describe the bug

I am running Kestrel as an edge server on DigitalOcean (Ubuntu 18.04, Docker 5:19.03.1~3) via docker-compose:

# Production stack: a database container plus the Kestrel application container.
version: '3.5'
services:

  # Database service; presumably MySQL, given the mysql_native_password flag and
  # the /var/lib/mysql data directory (the image itself is built from ./docker/db).
  db:
    build: ./docker/db
    command: ["--default-authentication-plugin=mysql_native_password", "--character-set-server=utf8"]
    volumes:
      # Named volume, so the database files survive container re-creation.
      - dotnet_db_data:/var/lib/mysql
    networks:
      - default
    restart: always
        
  # ASP.NET Core application; Kestrel is bound directly to ports 80 and 443.
  dotnet:
    depends_on:
      - db
    build: .
    environment:
      # Kestrel listens on plain HTTP (80) and HTTPS (443) on all interfaces.
      ASPNETCORE_URLS: "http://+:80;https://+:443"
      ASPNETCORE_ENVIRONMENT: Production
      APPLICATION_NAME: "${APPLICATION_NAME}"
    volumes:
      # Host directory mounted at /etc/dotnet_certs inside the container.
      - ./docker/dotnet_certs:/etc/dotnet_certs
    networks:
      - default
    ports:
      # Host ports are published 1:1 to the container ports.
      - "80:80"
      - "443:443"
    restart: always

volumes:
  dotnet_db_data: {}

networks:
  default:
    driver: bridge

The container is built with mcr.microsoft.com/dotnet/core/sdk:3.1 and mcr.microsoft.com/dotnet/core/aspnet:3.1

I am using Compression and ResponseCaching middlewares in the request pipeline.


        /// <summary>
        /// Registers the application's services: deployment settings, MVC with a default
        /// cache profile, the MySQL-backed <c>TrackDbContext</c>, response caching and
        /// compression, HttpClient, and (outside Development) Let's Encrypt certificate
        /// provisioning through the FluffySpoon library.
        /// </summary>
        /// <param name="services">The service collection to populate.</param>
        public void ConfigureServices(IServiceCollection services)
        {
            // Resolve the deployment description once and share the same instance everywhere.
            var deployment = DeploymentResolver.Resolve(Environment, Configuration);
            services.AddSingleton<Deployment>(deployment);
            
            services.AddHttpContextAccessor();
            services.TryAddScoped<IUserAgentService, UserAgentService>();
            services.TryAddScoped<IDeviceService, DeviceService>();

            var mvc = services.AddMvc(options =>
            {
                // Default cache profile: varies by the platform-detect header,
                // Duration = 60 * 5 (300; presumably seconds, i.e. five minutes).
                options.CacheProfiles.Add(DefinedCacheProfiles.Default, new CacheProfile
                {
                    VaryByHeader = DefinedCacheProfiles.PlatformDetectHeader,
                    Duration = 60 * 5
                });

                // The BasicAuth filter is added only in the Staging environment.
                if (Environment.IsStaging())
                {
                    options.Filters.Add(new BasicAuth("test"));
                }
            })
            // The JSON property naming policy is explicitly set to null.
            .AddJsonOptions(options => options.JsonSerializerOptions.PropertyNamingPolicy = null); ;

            // Razor runtime compilation is enabled for Development only.
            if (Environment.IsDevelopment())
            {
                mvc.AddRazorRuntimeCompilation();
            }

            // EF Core context on MySQL, using the "DefaultConnection" connection string.
            services.AddDbContext<TrackDbContext>(options =>
            {
                options.UseMySql(Configuration.GetConnectionString("DefaultConnection"),
                    mysql =>
                    {
                        mysql.ServerVersion(new Version(8, 0, 16), ServerType.MySql);
                        mysql.MigrationsHistoryTable("migrations_history");
                    });
            });

            services.AddResponseCaching(options => { options.UseCaseSensitivePaths = false; });
            services.AddMemoryCache(options => { });
            
            services.AddHttpClient();
            // Removes every registered IHttpMessageHandlerBuilderFilter right after AddHttpClient().
            // NOTE(review): presumably done to drop the default HttpClient logging handlers - confirm.
            services.RemoveAll<IHttpMessageHandlerBuilderFilter>();
           
            services.Configure<GzipCompressionProviderOptions>(options =>
            {
                options.Level = CompressionLevel.Optimal;
            });

            // Response compression is enabled for HTTPS responses as well, for the MIME
            // types listed below, with both the Brotli and Gzip providers registered.
            services.AddResponseCompression(options =>
            {
                options.EnableForHttps = true;
                options.MimeTypes = new[]
                {
                    "application/atom+xml",
                    "application/javascript",
                    "application/json",
                    "application/ld+json",
                    "application/manifest+json",
                    "application/rss+xml",
                    "application/vnd.geo+json",
                    "application/vnd.ms-fontobject",
                    "application/x-font-ttf",
                    "application/x-web-app-manifest+json",
                    "application/xhtml+xml",
                    "application/xml",
                    "font/opentype",
                    "font/woff2",
                    "image/bmp",
                    "image/svg+xml",
                    "image/x-icon",
                    "text/html",
                    "text/cache-manifest",
                    "text/css",
                    "text/plain",
                    "text/vcard",
                    "text/vnd.rim.location.xloc",
                    "text/vtt",
                    "text/x-component",
                    "text/x-cross-domain-policy",
                };

                options.Providers.Add<BrotliCompressionProvider>();
                options.Providers.Add<GzipCompressionProvider>();
            });

            // Let's Encrypt (FluffySpoon) is configured in every environment except Development.
            if (!Environment.IsDevelopment())
            {
                services.AddFluffySpoonLetsEncrypt(new LetsEncryptOptions
                {
                    Email =  "email@emails.com", 
                    UseStaging = false, 
                    Domains = new[] { deployment.DNS },
                    TimeUntilExpiryBeforeRenewal = TimeSpan.FromDays(30), 
                    TimeAfterIssueDateBeforeRenewal = TimeSpan.FromDays(7), 
                    CertificateSigningRequest = new CsrInfo(),
                    KeyAlgorithm = KeyAlgorithm.ES256,
                });
                
                // Certificates are persisted to a file path under /etc/dotnet_certs;
                // ACME challenges are kept in memory only.
                services.AddFluffySpoonLetsEncryptFileCertificatePersistence("//etc/dotnet_certs/main");
                services.AddFluffySpoonLetsEncryptMemoryChallengePersistence();
            }
        }

        /// <summary>
        /// Builds the HTTP request pipeline. Middleware runs in the order it is added here:
        /// Let's Encrypt, HSTS/HTTPS redirection, status-code pages, the Basic-auth protected
        /// <c>/metrics</c> branch, response compression, static files, routing, HTTP metrics,
        /// device detection, response caching and finally the MVC controller endpoints.
        /// </summary>
        /// <param name="app">The application builder the middleware is added to.</param>
        /// <param name="env">The hosting environment, used to toggle the error pages.</param>
        public void Configure(IApplicationBuilder app, IWebHostEnvironment env)
        {
            // Detailed error pages are enabled in every environment except Production.
            if (!env.IsProduction())
            {
                app.UseDeveloperExceptionPage();
                app.UseDatabaseErrorPage();
            }

            // Added before HTTPS redirection.
            // NOTE(review): presumably so ACME challenge requests are answered over plain HTTP - confirm.
            app.UseFluffySpoonLetsEncrypt();
            
            app.UseHsts();
            app.UseHttpsRedirection();
            app.UseStatusCodePagesWithReExecute("/status/{0}");
            
            // Clears the endpoint on the HttpContext before the rest of the pipeline runs.
            // NOTE(review): the reason is not evident from this snippet - confirm with the author.
            app.Use((context, next) =>
            {
                context.SetEndpoint(null);
                return next();
            });
            
            // "/metrics" branch: HTTP Basic authentication followed by the metric server.
            app.Map("/metrics", metricsApp =>
            {
                // NOTE(review): the credentials are hard-coded in source.
                const string userName = "monitoring";
                const string password = "long-and-secure-password-for-monitoring-2";
                // Expected Authorization header value, computed once when the pipeline is built.
                var valid = "Basic " + Convert.ToBase64String(Encoding.UTF8.GetBytes(userName + ":" + password));

                metricsApp.Use((context, next) =>
                {
                    // NOTE(review): the header is compared with the == operator,
                    // not with a fixed-time comparison.
                    if (context.Request.Headers["Authorization"] == valid)
                    {
                        return next();
                    }

                    // Wrong or missing credentials: respond 401 and ask for Basic authentication.
                    context.Response.Headers["WWW-Authenticate"] = "Basic";
                    context.Response.StatusCode = (int) HttpStatusCode.Unauthorized;
                    return Task.CompletedTask;
                });
                
                // NOTE(review): presumably the prometheus-net exporter, mounted at the root
                // of this branch because of the empty path argument - confirm.
                metricsApp.UseMetricServer("");
            });
            
            app.UseResponseCompression();

            // max-age = 30 * 24 * 60 * 60 = 2592000, i.e. 30 days expressed in seconds.
            var cacheValue = $"public, max-age={30 * 24 * 60 * 60}";
            app.UseStaticFiles(new StaticFileOptions
            {
                // Appends the Cache-Control header to every static file response.
                OnPrepareResponse = ctx => { 
                    ctx.Context.Response.Headers.Append("Cache-Control", cacheValue); 
                }
            });

            app.UseRouting();
            
            // HTTP metrics with the in-progress metric disabled.
            app.UseHttpMetrics(options =>
            {
                options.InProgress.Enabled = false;
            });
            
            // DetectDevice is a middleware delegate defined elsewhere in this class.
            app.Use(DetectDevice);
            
            app.UseResponseCaching();

            app.UseEndpoints(endpoints => { endpoints.MapControllers(); });
        }

The issue did not appear until we started receiving an increased volume of traffic (e.g. about 1 rps before, about 8 rps after).

The deployment process pulls the latest commit from the repo, builds the container on the host, and launches a new instance with: docker-compose -f prod.yml up -d --build

This process restarts the running Kestrel container, and after the restart the newly started instance does not handle any requests.

CPU usage is low during this period (normal average 10%, average while broken 10%).

After a series of reboots, the server starts to handle requests again.

To Reproduce

I am able to consistently reproduce the issue with synthetic traffic on our staging environment:

    /// <summary>
    /// Synthetic load generator used to reproduce the issue. It starts 50 concurrent
    /// workers, each of which requests a randomly chosen URL in an endless loop using
    /// HTTP Basic authentication, until a line is read from the console.
    /// </summary>
    class Program
    {
        /// <summary>Number of concurrent request loops started by <see cref="Main"/>.</summary>
        private const int WorkerCount = 50;

        /// <summary>Pool of target URLs; every request picks one at random.</summary>
        private static readonly string[] Urls =
        {
            "https://server.com/homepage",
            "https://server.com/homepage2",
            "https://server.com/homepage3",
        };

        /// <summary>
        /// Starts the workers without awaiting them and then blocks on console input,
        /// which keeps the process (and therefore the load) alive.
        /// </summary>
        static async Task Main(string[] args)
        {
            // Worker ids are 1-based; the returned tasks are intentionally not awaited.
            for (var workerId = 1; workerId <= WorkerCount; workerId++)
            {
                _ = Go(workerId);
            }

            Console.ReadLine();
        }

        /// <summary>
        /// Endless request loop of a single worker. Failures are written to the console;
        /// the full exception is printed when its message mentions "SSL".
        /// </summary>
        /// <param name="id">1-based worker number, used only for the start-up log line.</param>
        static async Task Go(int id)
        {
            Console.WriteLine($"Running: {id}");
            var rng = new Random();

            for (;;)
            {
                try
                {
                    // A new HttpClient is created for every request and never disposed,
                    // exactly as in the reported repro.
                    var client = new HttpClient();
                    var target = Urls[rng.Next(Urls.Length)];
                    var request = new HttpRequestMessage(HttpMethod.Get, target)
                    {
                        Headers =
                        {
                            Authorization = new AuthenticationHeaderValue("Basic", "YWRtaW46bG9uZy1hbmQtc2VjdXJlLXBhc3N3b3JkLWZvci1hZG1pbjI=")
                        }
                    };

                    var response = await client.SendAsync(request, HttpCompletionOption.ResponseContentRead);

                    // The body is read in full and then thrown away.
                    _ = await response.Content.ReadAsStringAsync();
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.Message);

                    // TLS-related failures get the full exception dump; there is no back-off.
                    if (ex.Message.Contains("SSL"))
                    {
                        Console.WriteLine(ex);
                    }
                }
            }
        }
    }

While the fake load is running, I shut the stack down and bring it up again. The repro rate is around 90%.

Further technical details

  • ASP.NET Core version: 3.1
  • Include the output of dotnet --info:
Host (useful for support):
  Version: 3.1.1
  Commit:  a1388f194c

.NET Core SDKs installed:
  No SDKs were found.

.NET Core runtimes installed:
  Microsoft.AspNetCore.App 3.1.1 [/usr/share/dotnet/shared/Microsoft.AspNetCore.App]
  Microsoft.NETCore.App 3.1.1 [/usr/share/dotnet/shared/Microsoft.NETCore.App]

About this issue

  • Original URL
  • State: closed
  • Created 4 years ago
  • Reactions: 2
  • Comments: 57 (31 by maintainers)

Commits related to this issue

Most upvoted comments

This whole story showed that SSL can be tricky, and since SslStream internals may change in the future, this could lead to other similar issues.

Yes this great issue investigation spawned a set of work that I have tracked here https://github.com/dotnet/aspnetcore/issues/21512. 5.0 should have this situation dramatically improved.

@bartonjs is our certificate crypto expert and we’re looking at ways to represent a “pre-validated” certificate chain for these scenarios to avoid this in the future.

Thanks for the review guys, both PRs (https://github.com/natemcmaster/LetsEncrypt/pull/81 and https://github.com/ffMathy/FluffySpoon.AspNet.LetsEncrypt/pull/71) are merged and will be released soon. I am not aware of any other community projects aimed at acme cert providers.

This whole story showed that SSL can be tricky, and since SslStream internals may change in the future, this could lead to other similar issues.

Idk, I’d be happy to see support from the dotnet team on the LetsEncrypt side. Having SSL benchmarks with popular community libs is a great starting point, as @davidfowl suggested. Dotnet foundation membership is better. Built-in support is best imo.

This issue could be closed I believe.

Really interested if there will be any further steps and if I can help somehow.

This was a wild ride.

Assign this to me. I’m going to drive the fixes to solve this issue

This is what you can put in your docker file to install the tools (assuming you’re doing a multi-stage build)

# Build stage, based on the SDK image; the diagnostic tools are installed here.
FROM mcr.microsoft.com/dotnet/core/sdk:3.1 AS build-env

# Install the diagnostic CLI tools into /tools so they can be copied into the runtime image below.
RUN dotnet tool install dotnet-dump --tool-path /tools
RUN dotnet tool install dotnet-counters --tool-path /tools
RUN dotnet tool install dotnet-trace --tool-path /tools

WORKDIR /app

# Build runtime image
FROM mcr.microsoft.com/dotnet/core/aspnet:3.1

# Bring the tools installed in the build stage into the final image.
COPY --from=build-env /tools /tools

Triage: Queuing for 5.0. There is runtime work that needs to happen first (there are discussions planned to resolve this). @davidfowl let’s make sure the outcome of that meeting and any action for ASP.NET Core gets captured clearly on this issue so that we know what to do when it comes back up for planning in a 5.0 preview.

Am I right in saying that all requests on a fresh/unvalidated certificate ended up querying the cert status from the LetsEncrypt servers and starved the thread pool?

Even though the full chain is being requested we still validate it but this is what happens:

  • We get a pfx file with the full cert chain
  • We feed that cert into X509Certificate2 type which only grabs the first https cert not the full chain (we don’t have a way to represent the validated chain)
  • The logic then tries to download the full cert chain (per connection!), it does this synchronously.
  • It caches the cert chain on disk, which is ephemeral inside of the container…

All badness.

OK so this logic that you wrote here:

https://github.com/dotnet/aspnetcore/issues/21183#issuecomment-619579627

You basically need to replicate that but I would copy this logic:

https://github.com/dotnet/runtime/blob/4f84429fb44801b841c052505935a3cd14da18e2/src/libraries/Common/src/System/Net/Http/TlsCertificateExtensions.cs#L77-L97

// Build the chain for `certificate` with every verification flag set and
// revocation checking turned off.
var chain = new X509Chain();
var policy = chain.ChainPolicy;
policy.VerificationFlags = X509VerificationFlags.AllFlags;
policy.RevocationFlag = X509RevocationFlag.ExcludeRoot;
policy.RevocationMode = X509RevocationMode.NoCheck;

// We're not doing anything client side, so the client application policy
// from the referenced runtime code is left out:
// if (includeClientApplicationPolicy)
// {
//    chain.ChainPolicy.ApplicationPolicy.Add(s_clientCertOidInst);
// }

// A chain that fails to build is disposed here and null is returned instead.
if (!chain.Build(certificate))
{
    chain.Dispose();
    return null;
}

return chain;

Run this code here https://github.com/ffMathy/FluffySpoon.AspNet.LetsEncrypt/blob/02b617896e2ab394cb185a77b9d6f48abe6e5e12/src/FluffySpoon.AspNet.LetsEncrypt/Certificates/CertificateProvider.cs#L80

After getting the cert, Build the chain here.

We should look into making ASP.NET Core work with certbot, or at least understand how to connect the two and document it.