next.js: [Error 500] "Socket Hang Up" Randomly Occurring on any Routes in Production Mode

Verify canary release

  • I verified that the issue exists in the latest Next.js canary release

Provide environment information

Operating System:
      Platform: darwin
      Arch: x64
      Version: Darwin Kernel Version 21.6.0: Mon Aug 22 20:17:10 PDT 2022; root:xnu-8020.140.49~2/RELEASE_X86_64
    Binaries:
      Node: 16.14.2
      npm: 8.5.0
      Yarn: 1.22.15
      pnpm: 6.11.0
    Relevant packages:
      next: 13.4.6
      eslint-config-next: 13.4.2
      react: 18.2.0
      react-dom: 18.2.0
      typescript: 4.9.5

Which area(s) of Next.js are affected? (leave empty if unsure)

No response

Link to the code that reproduces this issue or a replay of the bug

Not possible — the code is confidential, so we cannot share a reproduction.

To Reproduce

This is our package.json:

`{
  "name": "********",
  "version": "0.1.0",
  "private": true,
  "scripts": {
    "dev": "next dev",
    "dev-https": "NODE_TLS_REJECT_UNAUTHORIZED='0' node server.js",
    "ngrok": "ngrok http https://localhost:3000",
    "build": "next build",
    "postbuild": "next-sitemap",
    "start": "next start",
    "clean": "rimraf .next out",
    "lint": "next lint",
    "lint.fix": "next lint --fix",
    "test": "jest --watch",
    "prepare": "husky install",
    "analyze": "ANALYZE=true next build"
  },
  "dependencies": {
    "@everipedia/wagmi-magic-connector": "^0.12.1",
    "@headlessui/react": "^1.7.15",
    "@headlessui/tailwindcss": "^0.1.3",
    "@heroicons/react": "^1.0.6",
    "@next/bundle-analyzer": "^12.2.0",
    "@next/env": "^13.1.5",
    "@radix-ui/react-dropdown-menu": "^2.0.5",
    "@rainbow-me/rainbowkit": "^0.12.15",
    "@ramp-network/ramp-instant-sdk": "^4.0.2",
    "@react-spring/web": "^9.6.1",
    "@react-three/cannon": "^6.4.0",
    "@react-three/drei": "^9.34.3",
    "@react-three/fiber": "^8.8.10",
    "@segment/analytics-next": "^1.52.0",
    "@sentry/nextjs": "^7.54.0",
    "@stripe/react-stripe-js": "^1.16.3",
    "@stripe/stripe-js": "^1.46.0",
    "@tanstack/react-table": "^8.5.13",
    "@use-gesture/react": "^10.2.19",
    "axios": "^1.4.0",
    "clsx": "^1.2.1",
    "cookies-next": "^2.1.1",
    "date-fns": "^2.29.3",
    "ethers": "^5.7.1",
    "i18next": "^22.4.9",
    "next": "^13.4.6",
    "next-auth": "^4.21.1",
    "next-axiom": "^0.17.0",
    "next-i18next": "^11.3.0",
    "next-password-protect": "^1.8.0",
    "next-share": "^0.18.2",
    "next-sitemap": "^3.1.47",
    "nextjs-progressbar": "^0.0.14",
    "react": "^18.2.0",
    "react-canvas-confetti": "^1.3.0",
    "react-countup": "^6.4.0",
    "react-csv": "^2.2.2",
    "react-currency-input-field": "^3.6.10",
    "react-device-detect": "^2.2.3",
    "react-div-100vh": "^0.7.0",
    "react-dom": "^18.2.0",
    "react-fast-marquee": "^1.3.5",
    "react-hook-form": "^7.41.5",
    "react-hot-toast": "^2.4.0",
    "react-i18next": "^12.1.4",
    "react-icons": "^4.8.0",
    "react-infinite-scroll-component": "^6.1.0",
    "react-intersection-observer": "^9.4.1",
    "react-spring-bottom-sheet": "^3.5.0-alpha.0",
    "react-type-animation": "^2.1.1",
    "react-use-intercom": "^3.0.2",
    "recharts": "2.5.0",
    "sharp": "^0.30.7",
    "swiper": "^9.1.1",
    "swr": "1.3.0",
    "tailwind-merge": "^1.13.1",
    "tailwind-scrollbar": "^3.0.0",
    "tailwind-scrollbar-hide": "^1.1.7",
    "tailwindcss": "^3.1.4",
    "three": "^0.144.0",
    "uuid": "^9.0.0",
    "wagmi": "^0.12.12"
  },
  "devDependencies": {
    "@commitlint/cli": "^17.0.3",
    "@commitlint/config-conventional": "^17.3.0",
    "@testing-library/jest-dom": "^5.16.4",
    "@testing-library/react": "^13.3.0",
    "@types/jest": "^28.1.4",
    "@types/node": "18.0.0",
    "@types/react": "18.0.14",
    "@types/react-csv": "^1.1.3",
    "@types/react-dom": "18.0.5",
    "@types/react-stripe-elements": "^6.0.6",
    "@types/three": "^0.143.0",
    "@types/uuid": "^8.3.4",
    "@typescript-eslint/eslint-plugin": "^5.30.0",
    "@typescript-eslint/parser": "^5.30.0",
    "autoprefixer": "^10.4.7",
    "commitizen": "^4.2.6",
    "commitlint": "^11.0.0",
    "commitlint-config-gitmoji": "2.2.5",
    "cssnano": "^5.1.12",
    "cz-conventional-changelog": "^3.3.0",
    "eslint": "8.18.0",
    "eslint-config-airbnb-base": "^15.0.0",
    "eslint-config-airbnb-typescript": "^17.0.0",
    "eslint-config-next": "^13.3.0",
    "eslint-config-prettier": "^8.5.0",
    "eslint-plugin-import": "^2.26.0",
    "eslint-plugin-jsx-a11y": "^6.6.0",
    "eslint-plugin-prettier": "^4.1.0",
    "eslint-plugin-react": "^7.30.1",
    "eslint-plugin-react-hooks": "^4.6.0",
    "eslint-plugin-simple-import-sort": "^7.0.0",
    "eslint-plugin-tailwindcss": "^3.6.0",
    "eslint-plugin-unused-imports": "^2.0.0",
    "husky": "^8.0.0",
    "jest": "^28.1.2",
    "jest-environment-jsdom": "^28.1.2",
    "lint-staged": "^13.0.3",
    "postcss": "^8.4.14",
    "prettier": "^2.7.1",
    "rimraf": "^3.0.2",
    "typescript": "^4.9.5"
  },
  "config": {
    "commitizen": {
      "path": "./node_modules/cz-conventional-changelog"
    }
  }
}`

our next.config.js :

`/** @type {import('next').NextConfig} */

const { withSentryConfig } = require('@sentry/nextjs');
const { withAxiom } = require('next-axiom');
const withBundleAnalyzer = require('@next/bundle-analyzer')({
  enabled: process.env.ANALYZE === 'true',
});

const { i18n } = require('./next-i18next.config');

const IS_PROTECTED = process.env.NEXT_PUBLIC_NODE_ENV === 'staging';

const securityHeaders = [
  {
    key: 'X-XSS-Protection',
    value: '1; mode=block',
  },
  {
    key: 'X-Content-Type-Options',
    value: 'nosniff',
  },
  {
    key: 'Referrer-Policy',
    value: 'origin-when-cross-origin',
  },
  {
    key: 'X-DNS-Prefetch-Control',
    value: 'on',
  },
  {
    key: 'Strict-Transport-Security',
    value: 'max-age=63072000; includeSubDomains; preload',
  },
];

const nextConfig = withAxiom(
  withBundleAnalyzer({
    reactStrictMode: true,
    swcMinify: false,
    i18n,
    env: {
      PASSWORD_PROTECT: IS_PROTECTED,
    },
    images: {
      domains: ['lh3.googleusercontent.com', 'i.scdn.co'],
    },
    sentry: {
      widenClientFileUpload: true,
      hideSourceMaps: true,
      automaticVercelMonitors: false,
    },
    // transpilePackages: ['react-native'],
    async redirects() {
      return [
        {
          source: '/login',
          destination: '/auth/login',
          permanent: true,
        },
        {
          source: '/signup',
          destination: '/auth/signup',
          permanent: true,
        },
        {
          source: '/dashboard',
          destination: '/users/dashboard',
          permanent: true,
        },
        {
          source: '/backstage',
          destination: '/artists/backstage',
          permanent: true,
        },
        {
          source: '/explore',
          destination: '/search',
          permanent: true,
        },
        {
          source: '/faqs',
          destination: '/faq',
          permanent: true,
        },
        {
          source: '/users/reward-tasks',
          destination: '/users/game/explain',
          permanent: true,
        },
      ];
    },
    async headers() {
      return [
        {
          source: '/:path*',
          headers: securityHeaders,
        },
        {
          source: '/.well-known/apple-developer-merchantid-domain-association',
          headers: [{ key: 'Content-Type', value: 'application/json' }],
        },
      ];
    },
    webpack: (config) => {
      config.module.rules.push({
        test: /\.pdf$/,
        use: {
          loader: 'file-loader',
          options: {
            name: '[path][name].[ext]',
          },
        },
      });
      // config.externals.push('react-native');
      return config;
    },
  })
);

const sentryWebpackPluginOptions = {
  org: '*****-*****',
  project: '*****-nextjs',
  silent: true, // Suppresses all logs
  // For all available options, see:
  // https://github.com/getsentry/sentry-webpack-plugin#options.
};

module.exports = withSentryConfig(nextConfig, sentryWebpackPluginOptions);

our middleware.ts

/* eslint-disable consistent-return */
import type { NextRequest } from 'next/server';
import { NextResponse } from 'next/server';
import { withAuth } from 'next-auth/middleware';

const ROLES_ALLOWED_TO_AUTH = new Set<any>(['artist', 'user']);

export default withAuth(
  function middleware(req: NextRequest & { nextauth: { token: any } }) {
    // Redirect if they don't have the appropriate role
    if (
      req.nextUrl.pathname.startsWith('/artists/backstage') ||
      req.nextUrl.pathname.startsWith('/artists/onboarding') ||
      req.nextUrl.pathname.startsWith('/artists/new')
    ) {
      if (!ROLES_ALLOWED_TO_AUTH.has(req.nextauth.token?.userRole)) {
        return NextResponse.redirect(new URL('/auth/login', req.url));
      }
      if (req.nextauth.token?.userRole === 'user' && req.nextauth.token?.userRole !== 'artist') {
        return NextResponse.redirect(new URL('/users/dashboard', req.url));
      }
      if (req.nextauth.token?.userRole === 'artist') {
        return NextResponse.next();
      }
    }
  },
  {
    callbacks: {
      authorized: ({ token }) =>
        token?.userRole !== undefined && ROLES_ALLOWED_TO_AUTH.has(token.userRole),
    },
  }
);

export const config = {
  matcher: [
    '/feed',
    '/artists/new/:path*',
    '/artists/backstage/:path*',
    '/artists/onboarding/:path*',
    '/users/dashboard/:path*',
    '/users/game/:path*',
    '/users/settings',
  ],
};

Describe the Bug

We are experiencing a bug that occurs randomly for some of our users, only in production, on any route of the site, and it has never been reported on Sentry. We can only see it in the Vercel logs.

The full error message is as follows: Uncaught Exception {"errorType":"Error","errorMessage":"socket hang up","code":"ECONNRESET","stack":["Error: socket hang up"," at connResetException (node:internal/errors:717:14)"," at TLSSocket.socketOnEnd (node:_http_client:526:23)"," at TLSSocket.emit (node:events:525:35)"," at TLSSocket.emit (node:domain:489:12)"," at endReadableNT (node:internal/streams/readable:1359:12)"," at process.processTicksAndRejections (node:internal/process/task_queues:82:21)"]} Unknown application error occurred Runtime.Unknown.

We think (but can’t verify) that this bug appeared when we updated to Next.js 13. However, none of our pages use appRouter; we’re still using Page Router for the time being. We’ve seen that rewrites can cause socket hangs, but as you can see in our next.config.js, we don’t use rewrites.

This can happen on SSG (Static Site Generation), SSR (Server-Side Rendering), or Client-side rendered pages. It can also happen on any browser or device.

Honestly, we have no clue or way of reproducing this problem because even in our development environment, we don’t encounter any problems.

Expected Behavior

I expect the application to work seamlessly without any errors or disruptions. Specifically, I anticipate that the mentioned “Socket Hang Up” error will not occur randomly in production mode on any route of the site. Additionally, I hope that better error handling mechanisms will be implemented to address any potential issues that may arise.

Which browser are you using? (if relevant)

No response

How are you deploying your application? (if relevant)

Vercel

About this issue

  • Original URL
  • State: closed
  • Created a year ago
  • Reactions: 37
  • Comments: 71 (10 by maintainers)

Most upvoted comments

I had the same issue, with next@13.4.x + next-auth@4.x.x, deployed to ECS in a standalone mode docker image. The server crash randomly with Error: socket hang up even under super light load. But when running next dev locally, it was all normal.

I tried all different kinds of troubleshooting, including changing AWS health check settings, moving all API routes from app dir to pages dir, trying different next-auth versions, setting httpAgentOptions.keepAlive to false, switching docker image base from alpine to debian, etc. None of which fixed the issue.

With a local docker container and k6 for load testing, I was finally able to reproduce the error. And after trying all different combinations, I found out the error starts from next@13.3.5-canary.9, and there’s the diff link https://github.com/vercel/next.js/compare/v13.3.5-canary.8...v13.3.5-canary.9 if any of you are interested in digging more into it. And 13.3.4 is the last stable version prior to it.

TLDR: use next@13.3.4


Update 18 Aug

Please ignore my workaround above. I just found out that 13.3.4 is also broken, just happened that it didn’t throw the error but crashes the server without printing any logs.

The true fix I found so far is to run a x64 image, and the error is gone even under heavy load:

docker buildx build --platform=linux/amd64 -t app-name .

However, this is far from ideal, it takes way longer for my M chip MacBook to build, and I’ll have to migrate my AWS server architecture from ARM to x64 😢.

And one more thing, the current latest version 13.4.18 is still broken in the standalone mode due to env variables bug, see https://github.com/vercel/next.js/issues/53367, so i have to test with the last working version 13.4.12. While the PR https://github.com/vercel/next.js/pull/54203 to fix the env variable bug has been merged, it might take a while to be released on npm.


@ielaajezdev saw you mention you tried linux/amd64 but it didn’t work, can you share more detail? like how you build the image and how you trigger the error?


Update 19 Aug

Found this issue https://github.com/prisma/prisma/issues/19419. And i tried the workaround posted by @tigranbs, setting Prisma’s DB url with ?connection_limit=1 fixed the error for me, but obviously it’s not the fix to this bug.

I also tried, prisma@4.16.2 and prisma@5.1.1, and getting the error for both. But when I load test on a route that doesn’t fetch data with Prisma, the error is gone.

So for me, the causes of the error can be narrowed down to:

  • Arm64 Linux
  • next@13.4.x + prisma@4/prisma@5
  • More than 2 concurrent requests

@piotrcichosz @0xadada @mthmcalixto @renanrodrigueszup did your case follow this pattern?


Update 19 Aug

Tried the latest next@13.4.19, still getting the error under the conditions mentioned above, but the error message is different this time:

TypeError: fetch failed
    at Object.fetch (node:internal/deps/undici/undici:11457:11)
    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
    at async invokeRequest (/app/node_modules/.pnpm/next@13.4.19_react-dom@18.2.0_react@18.2.0/node_modules/next/dist/server/lib/server-ipc/invoke-request.js:17:12)
    at async invokeRender (/app/node_modules/.pnpm/next@13.4.19_react-dom@18.2.0_react@18.2.0/node_modules/next/dist/server/lib/router-server.js:254:29)
    at async handleRequest (/app/node_modules/.pnpm/next@13.4.19_react-dom@18.2.0_react@18.2.0/node_modules/next/dist/server/lib/router-server.js:447:24)
    at async requestHandler (/app/node_modules/.pnpm/next@13.4.19_react-dom@18.2.0_react@18.2.0/node_modules/next/dist/server/lib/router-server.js:464:13)
    at async Server.<anonymous> (/app/node_modules/.pnpm/next@13.4.19_react-dom@18.2.0_react@18.2.0/node_modules/next/dist/server/lib/start-server.js:117:13) {
  cause: Error: connect ECONNREFUSED 0.0.0.0:39679
      at TCPConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
    errno: -111,
    code: 'ECONNREFUSED',
    syscall: 'connect',
    address: '0.0.0.0',
    port: 39679
  }
}

By running netstat -tulpn | grep LISTEN before the load test, the process on port 39679 is:

tcp        0      0 0.0.0.0:39679           0.0.0.0:*               LISTEN      19/next-render-work

I would like to add my experience with this issue

  • we don’t use Prisma in our project
  • we don’t use Next Auth
  • but still, the error appeared multiple times on prod
  • had to downgrade to 13.3.4 and haven’t seen the error since then (at least, in our case)

So, I don’t think the issue is related to Prisma or NextAuth

In case anyone is using Sentry, our issue turned out to be related to a bug with the @sentry/nextjs package. Bumping it up a version has fixed the issue on our end.

https://github.com/orgs/vercel/discussions/3248#discussioncomment-7851868

I have just tried v14.0.4-canary.47 and the issue persists. I also tried Node.js v18 and v20.

We are only using the App Router. We do not use Prisma or NextAuth. This is affecting builds hosted on Vercel.com (including production).

It takes a little while for the issue to pop up after deploying, but after a few RSC renders, it happens quite often (~15% of the time).

Wow… this is really broad. Personally, I have a deployed project running 13.4.9 on bare Windows Server 2022 Datacenter, and it hasn’t had any of those errors. Though, I did get the errors while developing it, but at the time I believe it was some one-time-thing.

Also, 4 days ago on another project, I was messing around with next.js source code (in node_modules directly) to optimize API compression mechanism and I always got the errors consistently. The first request completed/downloaded its content successfully, but I noticed the request was still going. I closed the tab and opened a new one, and it looked like the request just hung. Any requests to the server completely not responsive.

In that experiment I don’t think the render worker process died, I believe it is something else. I will try to replicate it, but I don’t think I will make an issue out of it (because in next.js defence, I modified its source code). Though I hope I will come with a theory that could explain the errors.

This error sometimes happens in developer mode in version 13.4.12, when there is a lot of refresh it stops working and needs to start the terminal again.

I’m also getting the error using next@13.4.19 and prisma for app/api route.

same here on 13.4.12

Failed to proxy http://127.0.0.1:59062/tapi Error: socket hang up
    at connResetException (node:internal/errors:717:14)
    at Socket.socketOnEnd (node:_http_client:526:23)
    at Socket.emit (node:events:525:35)
    at Socket.emit (node:domain:489:12)
    at endReadableNT (node:internal/streams/readable:1359:12)
    at processTicksAndRejections (node:internal/process/task_queues:82:21) {
  code: 'ECONNRESET'
}
Failed to proxy http://127.0.0.1:59062/tapi Error: socket hang up
    at connResetException (node:internal/errors:717:14)
    at Socket.socketOnEnd (node:_http_client:526:23)
    at Socket.emit (node:events:525:35)
    at Socket.emit (node:domain:489:12)
    at endReadableNT (node:internal/streams/readable:1359:12)
    at processTicksAndRejections (node:internal/process/task_queues:82:21) {
  code: 'ECONNRESET'
}
Failed to proxy http://127.0.0.1:59062/tapi Error: socket hang up
    at connResetException (node:internal/errors:717:14)
    at Socket.socketOnEnd (node:_http_client:526:23)
    at Socket.emit (node:events:525:35)
    at Socket.emit (node:domain:489:12)
    at endReadableNT (node:internal/streams/readable:1359:12)
    at processTicksAndRejections (node:internal/process/task_queues:82:21) {
  code: 'ECONNRESET'
}

Just happened with me today using version 13.4.10. This app I’m working on is being deployed within a docker container on a EKS. Funny thing is, that moments before the container started logging the “Socket Hang Up” thing, the Ephemeral storage was nearly full, up to 86% usage. Maybe it could be something related to cache? I’m keeping an eye on it trying to find a pattern.

Glad to see this thread since I am also experiencing frequent (seemingly random) “socket hang up errors” and it was quite hard to debug the root cause. I think my use case is different so I will add my error scenario just in case it makes the problem exploration easier.

Setup

  • NextJS version: 13.4.11
  • Using the pages/* and pages/api/* directories. not using the new app router

I am developing my next app in a Docker compose composition. The next app runs in the node 18 alpine image as per the Docker examples provided. Other containers in the composition are a postgres db, Prisma (studio), cerbos and strapi. I am developing on MacOS (Mac with M2). I have not yet deployed my app to production and only use development mode currently.

Answers to your questions

  • The socket hang up occurs seemingly random and when it happens, I get about 3 socket hang up/ECONNRESET errors in a row.
  • After a socket hang up, no other requests are handled anymore: I need to restart the container again to restore functionality.
  • I thought at first that the problem was connected to my file API endpoint (an endpoint that downloads a file from a remote server and then sends it to the client after logging the request) but that does not seem to be the case: even when not loading any images, this error occurs.

Things I have tried

  • I have tried to use the specific linux/amd64 platform for my Dockerfile and that did seem to resolve issues but was too slow in the development environment on my M2 mac
  • I have tried to decrease the MTU for the docker bridge network as suggested here but that also did not resolve issues

Hope this helps! Following this issue and reverting to next 13.4.5 for now…

@SebastienSusini I also isolated this issue in my app to have started in next 13.4.6 (same version in your bug report), and reverting to 13.4.5 resolved the issue 🎉

13.4.12 with the same problem man

Same issue here, happening very often, impossible to find where it comes from.

I am using “next”: “^14.0.4”, with nextAuth, and nextJS middleware (my app uses also Wundergraph/sdk)

Any update on this issue?

Thx a lot

“next”: “^14.0.3” the same issue when running custom server

In our case it is not random. There is a feed.xml route in our app and it sends a request to S3 to get the actual feed. But the file size is about ~100mb, it gives “socket hang up” error and then it breaks the entire app. No way to handle the error in try-catch.

I’ve tried almost everything to get rid of this error, including trying to use node https or other 3rd party http clients instead of fetch api. But it seems, the problem is not related to next, it’s a node.js or maybe undici related problem. Also tried some other node images but I think a stable node image could fix this issue.

We could “solve” or at least work around the socket hang up errors by replacing node with bun.

After updating to Prisma 5.3.0, the issue seems to be resolved (in development) so it might be that the issues are unrelated, but I will continue to monitor.

Currently I have this issue very often (almost everyday). Situation is always the same: socket hang up, next-render-worker-app is missing in processes, server doesn’t respond - just hangs.

BTW prisma maintainers said they fixed related problem: https://github.com/prisma/prisma/issues/19419 – maybe this helps. I haven’t tried it yet since the fix is in unstable branch for now.

So for me, the causes of the error can be narrowed down to:

  • Arm64 Linux
  • next@13.4.x + prisma@4/prisma@5
  • More than 2 concurrent requests

@piotrcichosz @0xadada @mthmcalixto @renanrodrigueszup did your case follow this pattern?

@mdluo not exactly, we don’t use prisma, and can see it occur after a single (non-concurrent) request.

@0xadada Interesting… It seems like your issue is a bit different but I think it correlates. I think this could be just the client aborting the connection and next.js still actively waiting for incoming TCP packets.

I’ll try to reproduce it by making a long request and abort it. See if I can get a reproduction.

We are also running into this issue, with the same circumstances as described before:

  • Only seems to occur in production. (Possibly due to load?)
  • Occurs only after a couple of days or a week+, after millions of requests have already happened
  • Unable to pin-point one specific action/request that causes this issue
Error: socket hang up
    at connResetException (node:internal/errors:705:14)
    at Socket.socketOnEnd (node:_http_client:518:23)
    at Socket.emit (node:events:525:35)
    at Socket.emit (node:domain:489:12)
    at endReadableNT (node:internal/streams/readable:1358:12)
    at processTicksAndRejections (node:internal/process/task_queues:83:21) {
  code: 'ECONNRESET'

We will now downgrade to 13.4.5 and watch if it happens again, unfortunately we have no way to test it consistently. Time will tell…?

For info, we are running on AWS EC2 instances. @0xadada do I understand correctly that in your case following requests do get handled? For us it seems to completely stop the server from being able to handle any requests after that point.