jaeger: [Bug]: jaeger ingester module nil pointer error (when consuming to ES)

What happened?

Started the Jaeger containers with ES as the data store; the ingester container keeps crashing with a nil pointer error.

Versions 1.35 and 1.36 have the same issue.

Steps to reproduce

Step 0: run Jaeger with ES in containers

#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail

BASE=/data/jaeger
mkdir -p $BASE
cd $BASE

if [ $# -ge 1 ] && [ "$1" = "clean" ]; then
    sudo docker-compose down 2>/dev/null || true
    sudo rm -rf esdata01 esdata02 esdata03 kibanadata 2>/dev/null
    rm -f .env docker-compose.yml nginx_query.conf nginx_collector.conf 2>/dev/null
    exit 0
fi

mkdir -p esdata01 esdata02 esdata03 kibanadata
chown 1000:1000 -R esdata01 esdata02 esdata03 kibanadata

cat >.env <<'EOF'
# Password for the 'elastic' user (at least 6 characters)
ELASTIC_PASSWORD=passwd

# Password for the 'kibana_system' user (at least 6 characters)
KIBANA_PASSWORD=passwd

# Version of Elastic products
STACK_VERSION=7.17.5

# Set the cluster name
CLUSTER_NAME=jaeger-cluster

# Set to 'basic' or 'trial' to automatically start the 30-day trial
LICENSE=basic
#LICENSE=trial

# Port to expose Elasticsearch HTTP API to the host
ES_PORT=9200

# Port to expose Kibana to the host
KIBANA_PORT=5601

# Increase or decrease based on the available host memory (in bytes)
MEM_LIMIT=4294967296

KAFKA_BROKER=<broker1>:9002,<broker2>:9002,<broker3>:9002
EOF

cat >nginx_query.conf <<'EOF'
upstream jaeger_query {
    server query01:16686;
    server query02:16686;
    server query03:16686;
}

server {
    listen          16686;
    server_name     0.0.0.0;

    location / {
      proxy_pass  http://jaeger_query;
    }
}
EOF

cat >nginx_collector.conf <<'EOF'
upstream jaeger_collector {
    server collector01:4318;
    server collector02:4318;
    server collector03:4318;
}

server {
    listen          4318;
    server_name     0.0.0.0;
    location / {
        proxy_pass      http://jaeger_collector;
    }
}
EOF

cat >docker-compose.yml <<'EOF'
version: "2.2"

services:
  es01:
    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
    volumes:
      - ./esdata01:/usr/share/elasticsearch/data
    ports:
      - 9200
    restart: always
    environment:
      - node.name=es01
      - cluster.name=${CLUSTER_NAME}
      - cluster.initial_master_nodes=es01,es02,es03
      - discovery.seed_hosts=es02,es03
      - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
      - bootstrap.memory_lock=true
      - xpack.security.enabled=false
      - xpack.license.self_generated.type=${LICENSE}
    mem_limit: ${MEM_LIMIT}
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -s http://localhost:9200 | grep 'You Know, for Search'",
        ]
      interval: 10s
      timeout: 10s
      retries: 120

  es02:
    depends_on:
      - es01
    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
    volumes:
      - ./esdata02:/usr/share/elasticsearch/data
    ports:
      - 9200
    restart: always
    environment:
      - node.name=es02
      - cluster.name=${CLUSTER_NAME}
      - cluster.initial_master_nodes=es01,es02,es03
      - discovery.seed_hosts=es01,es03
      - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
      - bootstrap.memory_lock=true
      - xpack.security.enabled=false
      - xpack.license.self_generated.type=${LICENSE}
    mem_limit: ${MEM_LIMIT}
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -s http://localhost:9200 | grep 'You Know, for Search'",
        ]
      interval: 10s
      timeout: 10s
      retries: 120

  es03:
    depends_on:
      - es02
    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
    volumes:
      - ./esdata03:/usr/share/elasticsearch/data
    ports:
      - 9200
    restart: always
    environment:
      - node.name=es03
      - cluster.name=${CLUSTER_NAME}
      - cluster.initial_master_nodes=es01,es02,es03
      - discovery.seed_hosts=es01,es02
      - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
      - bootstrap.memory_lock=true
      - xpack.security.enabled=false
      - xpack.license.self_generated.type=${LICENSE}
    mem_limit: ${MEM_LIMIT}
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -s http://localhost:9200 | grep 'You Know, for Search'",
        ]
      interval: 10s
      timeout: 10s
      retries: 120

  kibana:
    depends_on:
      es01:
        condition: service_healthy
      es02:
        condition: service_healthy
      es03:
        condition: service_healthy
    image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
    volumes:
      - ./kibanadata:/usr/share/kibana/data
    ports:
      - ${KIBANA_PORT}:5601
    restart: always
    environment:
      - SERVERNAME=kibana
      - ELASTICSEARCH_HOSTS=["http://es01:9200","http://es02:9200","http://es03:9200"]
      - ELASTICSEARCH_USERNAME=kibana_system
      - ELASTICSEARCH_PASSWORD=${KIBANA_PASSWORD}
    mem_limit: ${MEM_LIMIT}
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -s -I http://localhost:5601 | grep 'HTTP/1.1 302 Found'",
        ]
      interval: 10s
      timeout: 10s
      retries: 120

  collector01:
    depends_on:
      es01:
        condition: service_healthy
      es02:
        condition: service_healthy
      es03:
        condition: service_healthy
    image: jaegertracing/jaeger-collector:1.35
    ports:
      - 9411
      - 14250
      - 14268
      - 14269
      - 4318
      - 4317
    restart: always
    environment:
      - COLLECTOR_OTLP_ENABLED=true
      - SPAN_STORAGE_TYPE=kafka
      - KAFKA_PRODUCER_BROKERS=${KAFKA_BROKER}

  collector02:
    depends_on:
      es01:
        condition: service_healthy
      es02:
        condition: service_healthy
      es03:
        condition: service_healthy
    image: jaegertracing/jaeger-collector:1.35
    ports:
      - 9411
      - 14250
      - 14268
      - 14269
      - 4318
      - 4317
    restart: always
    environment:
      - COLLECTOR_OTLP_ENABLED=true
      - SPAN_STORAGE_TYPE=kafka
      - KAFKA_PRODUCER_BROKERS=${KAFKA_BROKER}

  collector03:
    depends_on:
      es01:
        condition: service_healthy
      es02:
        condition: service_healthy
      es03:
        condition: service_healthy
    image: jaegertracing/jaeger-collector:1.35
    ports:
      - 9411
      - 14250
      - 14268
      - 14269
      - 4318
      - 4317
    restart: always
    environment:
      - COLLECTOR_OTLP_ENABLED=true
      - SPAN_STORAGE_TYPE=kafka
      - KAFKA_PRODUCER_BROKERS=${KAFKA_BROKER}

  ingestor:
    depends_on:
      es01:
        condition: service_healthy
      es02:
        condition: service_healthy
      es03:
        condition: service_healthy
    image: jaegertracing/jaeger-ingester:1.35
    ports:
      - 14270
    restart: always
    environment:
      - SPAN_STORAGE_TYPE=elasticsearch
    command:
      - "--kafka.consumer.brokers=${KAFKA_BROKER}"
      - "--es.server-urls=http://es01:${ES_PORT},http://es02:${ES_PORT},http://es02:${ES_PORT}"

  query01:
    depends_on:
      es01:
        condition: service_healthy
      es02:
        condition: service_healthy
      es03:
        condition: service_healthy
    image: jaegertracing/jaeger-query:1.35
    ports:
      - 16685
      - 16686
      - 16687
    restart: always
    environment:
      - SPAN_STORAGE_TYPE=elasticsearch
    command:
      - "--es.server-urls=http://es01:${ES_PORT},http://es02:${ES_PORT},http://es02:${ES_PORT}"

  query02:
    depends_on:
      es01:
        condition: service_healthy
      es02:
        condition: service_healthy
      es03:
        condition: service_healthy
    image: jaegertracing/jaeger-query:1.35
    ports:
      - 16685
      - 16686
      - 16687
    restart: always
    environment:
      - SPAN_STORAGE_TYPE=elasticsearch
    command:
      - "--es.server-urls=http://es01:${ES_PORT},http://es02:${ES_PORT},http://es02:${ES_PORT}"

  query03:
    depends_on:
      es01:
        condition: service_healthy
      es02:
        condition: service_healthy
      es03:
        condition: service_healthy
    image: jaegertracing/jaeger-query:1.35
    ports:
      - 16685
      - 16686
      - 16687
    restart: always
    environment:
      - SPAN_STORAGE_TYPE=elasticsearch
    command:
      - "--es.server-urls=http://es01:${ES_PORT},http://es02:${ES_PORT},http://es02:${ES_PORT}"

  nginx:
    depends_on:
      - query01
      - query02
      - query03
      - collector01
      - collector02
      - collector03
    image: nginx
    ports:
      - 16686:16686
      - 4318:4318
    restart: always
    volumes:
      - ./nginx_query.conf:/etc/nginx/conf.d/nginx_query.conf
      - ./nginx_collector.conf:/etc/nginx/conf.d/nginx_collector.conf
EOF

sudo docker-compose up -d

Step 1: At first it works fine and I can see data in the Jaeger UI, but then the ingester container keeps crashing. Error log:

2022/07/26 10:24:35 maxprocs: Leaving GOMAXPROCS=36: CPU quota undefined
{"level":"info","ts":1658831075.124132,"caller":"flags/service.go:119","msg":"Mounting metrics handler on admin server","route":"/metrics"}
{"level":"info","ts":1658831075.1241765,"caller":"flags/service.go:125","msg":"Mounting expvar handler on admin server","route":"/debug/vars"}
{"level":"info","ts":1658831075.1243546,"caller":"flags/admin.go:128","msg":"Mounting health check on admin server","route":"/"}
{"level":"info","ts":1658831075.1244004,"caller":"flags/admin.go:141","msg":"Starting admin HTTP server","http-addr":":14270"}
{"level":"info","ts":1658831075.1244335,"caller":"flags/admin.go:120","msg":"Admin server started","http.host-port":"[::]:14270","health-status":"unavailable"}
{"level":"info","ts":1658831075.1393647,"caller":"config/config.go:206","msg":"Elasticsearch detected","version":7}
{"level":"info","ts":1658831075.73236,"caller":"healthcheck/handler.go:129","msg":"Health Check state change","status":"ready"}
{"level":"info","ts":1658831075.7324462,"caller":"consumer/consumer.go:79","msg":"Starting main loop"}
{"level":"info","ts":1658831102.2093694,"caller":"consumer/consumer.go:167","msg":"Starting error handler","partition":0}
{"level":"info","ts":1658831102.2093897,"caller":"consumer/consumer.go:110","msg":"Starting message handler","partition":0}
{"level":"info","ts":1658831102.408139,"caller":"consumer/consumer.go:167","msg":"Starting error handler","partition":2}
{"level":"info","ts":1658831102.4081435,"caller":"consumer/consumer.go:110","msg":"Starting message handler","partition":2}
{"level":"info","ts":1658831102.4151018,"caller":"consumer/consumer.go:167","msg":"Starting error handler","partition":1}
{"level":"info","ts":1658831102.4151034,"caller":"consumer/consumer.go:110","msg":"Starting message handler","partition":1}
{"level":"info","ts":1658831104.133721,"caller":"consumer/processor_factory.go:65","msg":"Creating new processors","partition":0}
{"level":"info","ts":1658831104.194396,"caller":"consumer/processor_factory.go:65","msg":"Creating new processors","partition":2}
{"level":"info","ts":1658831104.380035,"caller":"consumer/processor_factory.go:65","msg":"Creating new processors","partition":1}
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x10 pc=0xe417c6]

goroutine 2291 [running]:
github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/dbmodel.FromDomain.convertProcess(...)
        github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/dbmodel/from_domain.go:123
github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/dbmodel.FromDomain.convertSpanEmbedProcess({0x80?, 0xc0003f6ba0?, {0x15a7e88?, 0xfd1bf2?}}, 0xc0015432c0)
        github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/dbmodel/from_domain.go:64 +0x126
github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/dbmodel.FromDomain.FromDomainEmbedProcess(...)
        github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/dbmodel/from_domain.go:43
github.com/jaegertracing/jaeger/plugin/storage/es/spanstore.(*SpanWriter).WriteSpan(0xc0001152c0, {0x0?, 0x0?}, 0xc0015432c0)
        github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/writer.go:152 +0x7a
github.com/jaegertracing/jaeger/cmd/ingester/app/processor.KafkaSpanProcessor.Process({{0x15ac040, 0x1e84f90}, {0x15abec0, 0xc0001152c0}, {0x0, 0x0}}, {0x15aeac0?, 0xc000f6b5e0?})
        github.com/jaegertracing/jaeger/cmd/ingester/app/processor/span_processor.go:67 +0xd3
github.com/jaegertracing/jaeger/cmd/ingester/app/processor/decorator.(*retryDecorator).Process(0xc000515180, {0x15aeac0, 0xc000f6b5e0})
        github.com/jaegertracing/jaeger/cmd/ingester/app/processor/decorator/retry.go:110 +0x37
github.com/jaegertracing/jaeger/cmd/ingester/app/consumer.(*comittingProcessor).Process(0xc000741e00, {0x15aeac0, 0xc000f6b5e0})
        github.com/jaegertracing/jaeger/cmd/ingester/app/consumer/committing_processor.go:44 +0x5e
github.com/jaegertracing/jaeger/cmd/ingester/app/processor.(*metricsDecorator).Process(0xc000651840, {0x15aeac0, 0xc000f6b5e0})
        github.com/jaegertracing/jaeger/cmd/ingester/app/processor/metrics_decorator.go:44 +0x5b
github.com/jaegertracing/jaeger/cmd/ingester/app/processor.(*ParallelProcessor).Start.func1()
        github.com/jaegertracing/jaeger/cmd/ingester/app/processor/parallel_processor.go:57 +0x42
created by github.com/jaegertracing/jaeger/cmd/ingester/app/processor.(*ParallelProcessor).Start
        github.com/jaegertracing/jaeger/cmd/ingester/app/processor/parallel_processor.go:53 +0xf5
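
The panic happens inside FromDomain.convertProcess, which suggests the span read from Kafka reaches the ES writer with a nil Process field. Below is a minimal sketch of that failure mode; the function names loosely mirror the stack trace, but the types are simplified stand-ins for illustration, not the real Jaeger model definitions.

// Minimal sketch of the failure mode seen in the stack trace above.
// Span and Process are simplified assumptions, not Jaeger's actual model.
package main

import "fmt"

// Process identifies the service that emitted a span.
type Process struct {
	ServiceName string
	Tags        map[string]string
}

// Span is a stripped-down stand-in for the domain span read off Kafka.
type Span struct {
	OperationName string
	Process       *Process // may be nil if the message is malformed
}

// esSpan is the document written to Elasticsearch.
type esSpan struct {
	OperationName string
	ServiceName   string
}

// convertProcess mirrors the frame that panics: it assumes p is non-nil.
func convertProcess(p *Process) string {
	return p.ServiceName // nil pointer dereference when p == nil
}

// fromDomain converts a domain span into an ES document.
func fromDomain(s *Span) esSpan {
	return esSpan{
		OperationName: s.OperationName,
		ServiceName:   convertProcess(s.Process),
	}
}

func main() {
	// A span deserialized from a corrupted or incomplete Kafka message.
	bad := &Span{OperationName: "GET /api", Process: nil}

	// Panics with "invalid memory address or nil pointer dereference",
	// matching the ingester crash in the log above.
	fmt.Println(fromDomain(bad))
}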

Expected behavior

Everything works.

Relevant log output

No response

Screenshot

No response

Additional context

No response

Jaeger backend version

No response

SDK

No response

Pipeline

No response

Storage backend

No response

Operating system

No response

Deployment model

No response

Deployment configs

No response

About this issue

  • State: open
  • Created 2 years ago
  • Comments: 19 (11 by maintainers)

Most upvoted comments

So all the fixes we have so far are defensive. We still don’t know how messages exiting collector->Kafka or entering Kafka->ingester might end up with a nil Process. If it’s due to data corruption in Kafka, it seems very peculiar that it’s always the nil Process that people experience (although it’s very possible that, because we never checked for nil there, it’s just the most obvious symptom). I just merged the fix #3578 that should prevent panics; maybe then someone can investigate what the spans look like when stored, i.e. maybe some other parts of the span are corrupted too.
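
For illustration only, a defensive guard of the kind described above might look like the following sketch. This is not the code merged in #3578; it reuses the simplified Span/Process types from the earlier sketch, and the placeholder service name is made up here.

// Sketch of a defensive nil-Process guard applied before the span is
// converted and written to Elasticsearch. Illustrative only; not the
// actual Jaeger fix.
package main

import "log"

type Process struct {
	ServiceName string
}

type Span struct {
	OperationName string
	Process       *Process
}

// sanitizeProcess replaces a nil Process with a sentinel instead of letting
// a corrupted message crash the whole ingester. Keeping the span and tagging
// it with a placeholder service name leaves the corruption visible in
// storage so it can be investigated later.
func sanitizeProcess(s *Span) *Span {
	if s.Process == nil {
		log.Printf("span %q has nil Process, substituting placeholder", s.OperationName)
		s.Process = &Process{ServiceName: "missing-service-name"}
	}
	return s
}

func main() {
	bad := &Span{OperationName: "GET /api"} // Process left nil

	fixed := sanitizeProcess(bad)
	log.Printf("service=%q operation=%q", fixed.Process.ServiceName, fixed.OperationName)
}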