jaeger: [Bug]: jaeger ingester nil pointer panic when consuming from Kafka to ES
What happened?
I start the Jaeger containers with Elasticsearch as the data storage; the ingester container keeps going down with a nil pointer panic.
Versions 1.35 and 1.36 both have the same issue.
Steps to reproduce
Step 0: run Jaeger with Elasticsearch in containers
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
BASE=/data/jaeger
mkdir -p $BASE
cd $BASE
if [ $# -ge 1 ] && [ $1 = "clean" ]; then
sudo docker-compose down 2>/dev/null
sudo rm -rf esdata01 esdata02 esdata03 kibanadata 2>/dev/null
rm .env docker-compose.yml nginx_query.conf nginx_collector.conf 2>/dev/null
exit 0
fi
mkdir -p esdata01 esdata02 esdata03 kibanadata
chown 1000:1000 -R esdata01 esdata02 esdata03 kibanadata
cat >.env <<'EOF'
# Password for the 'elastic' user (at least 6 characters)
ELASTIC_PASSWORD=passwd
# Password for the 'kibana_system' user (at least 6 characters)
KIBANA_PASSWORD=passwd
# Version of Elastic products
STACK_VERSION=7.17.5
# Set the cluster name
CLUSTER_NAME=jeager-cluster
# Set to 'basic' or 'trial' to automatically start the 30-day trial
LICENSE=basic
#LICENSE=trial
# Port to expose Elasticsearch HTTP API to the host
ES_PORT=9200
# Port to expose Kibana to the host
KIBANA_PORT=5601
# Increase or decrease based on the available host memory (in bytes)
MEM_LIMIT=4294967296
KAFKA_BROKER=<broker1>:9002,<broker2>:9002,<broker3>:9002
EOF
cat >nginx_query.conf <<'EOF'
upstream jaeger_query {
server query01:16686;
server query02:16686;
server query03:16686;
}
server {
listen 16686;
server_name 0.0.0.0;
location / {
proxy_pass http://jaeger_query;
}
}
EOF
cat >nginx_collector.conf <<'EOF'
upstream jaeger_collector {
server collector01:4318;
server collector02:4318;
server collector03:4318;
}
server {
listen 4318;
server_name 0.0.0.0;
location / {
proxy_pass http://jaeger_collector;
}
}
EOF
cat >docker-compose.yml <<'EOF'
version: "2.2"
services:
es01:
image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
volumes:
- ./esdata01:/usr/share/elasticsearch/data
ports:
- 9200
restart: always
environment:
- node.name=es01
- cluster.name=${CLUSTER_NAME}
- cluster.initial_master_nodes=es01,es02,es03
- discovery.seed_hosts=es02,es03
- ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
- bootstrap.memory_lock=true
- xpack.security.enabled=false
- xpack.license.self_generated.type=${LICENSE}
mem_limit: ${MEM_LIMIT}
ulimits:
memlock:
soft: -1
hard: -1
healthcheck:
test:
[
"CMD-SHELL",
"curl -s http://localhost:9200 | grep 'You Know, for Search'",
]
interval: 10s
timeout: 10s
retries: 120
es02:
depends_on:
- es01
image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
volumes:
- ./esdata02:/usr/share/elasticsearch/data
ports:
- 9200
restart: always
environment:
- node.name=es02
- cluster.name=${CLUSTER_NAME}
- cluster.initial_master_nodes=es01,es02,es03
- discovery.seed_hosts=es01,es03
- ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
- bootstrap.memory_lock=true
- xpack.security.enabled=false
- xpack.license.self_generated.type=${LICENSE}
mem_limit: ${MEM_LIMIT}
ulimits:
memlock:
soft: -1
hard: -1
healthcheck:
test:
[
"CMD-SHELL",
"curl -s http://localhost:9200 | grep 'You Know, for Search'",
]
interval: 10s
timeout: 10s
retries: 120
es03:
depends_on:
- es02
image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
volumes:
- ./esdata03:/usr/share/elasticsearch/data
ports:
- 9200
restart: always
environment:
- node.name=es03
- cluster.name=${CLUSTER_NAME}
- cluster.initial_master_nodes=es01,es02,es03
- discovery.seed_hosts=es01,es02
- ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
- bootstrap.memory_lock=true
- xpack.security.enabled=false
- xpack.license.self_generated.type=${LICENSE}
mem_limit: ${MEM_LIMIT}
ulimits:
memlock:
soft: -1
hard: -1
healthcheck:
test:
[
"CMD-SHELL",
"curl -s http://localhost:9200 | grep 'You Know, for Search'",
]
interval: 10s
timeout: 10s
retries: 120
kibana:
depends_on:
es01:
condition: service_healthy
es02:
condition: service_healthy
es03:
condition: service_healthy
image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
volumes:
- ./kibanadata:/usr/share/kibana/data
ports:
- ${KIBANA_PORT}:5601
restart: always
environment:
- SERVERNAME=kibana
- ELASTICSEARCH_HOSTS=["http://es01:9200","http://es02:9200","http://es03:9200"]
- ELASTICSEARCH_USERNAME=kibana_system
- ELASTICSEARCH_PASSWORD=${KIBANA_PASSWORD}
mem_limit: ${MEM_LIMIT}
healthcheck:
test:
[
"CMD-SHELL",
"curl -s -I http://localhost:5601 | grep 'HTTP/1.1 302 Found'",
]
interval: 10s
timeout: 10s
retries: 120
collector01:
depends_on:
es01:
condition: service_healthy
es02:
condition: service_healthy
es03:
condition: service_healthy
image: jaegertracing/jaeger-collector:1.35
ports:
- 9411
- 14250
- 14268
- 14269
- 4318
- 4317
restart: always
environment:
- COLLECTOR_OTLP_ENABLED=true
- SPAN_STORAGE_TYPE=kafka
- KAFKA_PRODUCER_BROKERS=${KAFKA_BROKER}
collector02:
depends_on:
es01:
condition: service_healthy
es02:
condition: service_healthy
es03:
condition: service_healthy
image: jaegertracing/jaeger-collector:1.35
ports:
- 9411
- 14250
- 14268
- 14269
- 4318
- 4317
restart: always
environment:
- COLLECTOR_OTLP_ENABLED=true
- SPAN_STORAGE_TYPE=kafka
- KAFKA_PRODUCER_BROKERS=${KAFKA_BROKER}
collector03:
depends_on:
es01:
condition: service_healthy
es02:
condition: service_healthy
es03:
condition: service_healthy
image: jaegertracing/jaeger-collector:1.35
ports:
- 9411
- 14250
- 14268
- 14269
- 4318
- 4317
restart: always
environment:
- COLLECTOR_OTLP_ENABLED=true
- SPAN_STORAGE_TYPE=kafka
- KAFKA_PRODUCER_BROKERS=${KAFKA_BROKER}
ingestor:
depends_on:
es01:
condition: service_healthy
es02:
condition: service_healthy
es03:
condition: service_healthy
image: jaegertracing/jaeger-ingester:1.35
ports:
- 14270
restart: always
environment:
- SPAN_STORAGE_TYPE=elasticsearch
command:
- "--kafka.consumer.brokers=${KAFKA_BROKER}"
- "--es.server-urls=http://es01:${ES_PORT},http://es02:${ES_PORT},http://es02:${ES_PORT}"
query01:
depends_on:
es01:
condition: service_healthy
es02:
condition: service_healthy
es03:
condition: service_healthy
image: jaegertracing/jaeger-query:1.35
ports:
- 16685
- 16686
- 16687
restart: always
environment:
- SPAN_STORAGE_TYPE=elasticsearch
command:
- "--es.server-urls=http://es01:${ES_PORT},http://es02:${ES_PORT},http://es02:${ES_PORT}"
query02:
depends_on:
es01:
condition: service_healthy
es02:
condition: service_healthy
es03:
condition: service_healthy
image: jaegertracing/jaeger-query:1.35
ports:
- 16685
- 16686
- 16687
restart: always
environment:
- SPAN_STORAGE_TYPE=elasticsearch
command:
- "--es.server-urls=http://es01:${ES_PORT},http://es02:${ES_PORT},http://es02:${ES_PORT}"
query03:
depends_on:
es01:
condition: service_healthy
es02:
condition: service_healthy
es03:
condition: service_healthy
image: jaegertracing/jaeger-query:1.35
ports:
- 16685
- 16686
- 16687
restart: always
environment:
- SPAN_STORAGE_TYPE=elasticsearch
command:
- "--es.server-urls=http://es01:${ES_PORT},http://es02:${ES_PORT},http://es02:${ES_PORT}"
nginx:
depends_on:
- query01
- query02
- query03
- collector01
- collector02
- collector03
image: nginx
ports:
- 16686:16686
- 4318:4318
restart: always
volumes:
- ./nginx_query.conf:/etc/nginx/conf.d/nginx_query.conf
- ./nginx_collector.conf:/etc/nginx/conf.d/nginx_collector.conf
EOF
sudo docker-compose up -d
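Once the stack is up, a quick check along these lines (a sketch that assumes the service names es01 and ingestor from the compose file above; the ES health query is run inside the es01 container because 9200 is not published to a fixed host port) confirms the cluster is healthy and captures the ingester log shown in Step 1:
# Check container state; the ingester shows as restarting once it starts panicking.
sudo docker-compose ps
# ES cluster health, queried inside es01 since 9200 is not mapped to a fixed host port.
sudo docker-compose exec es01 curl -s 'http://localhost:9200/_cluster/health?pretty'
# Follow the ingester log to capture the panic shown below.
sudo docker-compose logs -f ingestor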
Step 1: At first it works fine and I can see data in the Jaeger UI, but then the ingester container keeps crashing. Error log:
2022/07/26 10:24:35 maxprocs: Leaving GOMAXPROCS=36: CPU quota undefined
{"level":"info","ts":1658831075.124132,"caller":"flags/service.go:119","msg":"Mounting metrics handler on admin server","route":"/metrics"}
{"level":"info","ts":1658831075.1241765,"caller":"flags/service.go:125","msg":"Mounting expvar handler on admin server","route":"/debug/vars"}
{"level":"info","ts":1658831075.1243546,"caller":"flags/admin.go:128","msg":"Mounting health check on admin server","route":"/"}
{"level":"info","ts":1658831075.1244004,"caller":"flags/admin.go:141","msg":"Starting admin HTTP server","http-addr":":14270"}
{"level":"info","ts":1658831075.1244335,"caller":"flags/admin.go:120","msg":"Admin server started","http.host-port":"[::]:14270","health-status":"unavailable"}
{"level":"info","ts":1658831075.1393647,"caller":"config/config.go:206","msg":"Elasticsearch detected","version":7}
{"level":"info","ts":1658831075.73236,"caller":"healthcheck/handler.go:129","msg":"Health Check state change","status":"ready"}
{"level":"info","ts":1658831075.7324462,"caller":"consumer/consumer.go:79","msg":"Starting main loop"}
{"level":"info","ts":1658831102.2093694,"caller":"consumer/consumer.go:167","msg":"Starting error handler","partition":0}
{"level":"info","ts":1658831102.2093897,"caller":"consumer/consumer.go:110","msg":"Starting message handler","partition":0}
{"level":"info","ts":1658831102.408139,"caller":"consumer/consumer.go:167","msg":"Starting error handler","partition":2}
{"level":"info","ts":1658831102.4081435,"caller":"consumer/consumer.go:110","msg":"Starting message handler","partition":2}
{"level":"info","ts":1658831102.4151018,"caller":"consumer/consumer.go:167","msg":"Starting error handler","partition":1}
{"level":"info","ts":1658831102.4151034,"caller":"consumer/consumer.go:110","msg":"Starting message handler","partition":1}
{"level":"info","ts":1658831104.133721,"caller":"consumer/processor_factory.go:65","msg":"Creating new processors","partition":0}
{"level":"info","ts":1658831104.194396,"caller":"consumer/processor_factory.go:65","msg":"Creating new processors","partition":2}
{"level":"info","ts":1658831104.380035,"caller":"consumer/processor_factory.go:65","msg":"Creating new processors","partition":1}
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x10 pc=0xe417c6]
goroutine 2291 [running]:
github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/dbmodel.FromDomain.convertProcess(...)
github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/dbmodel/from_domain.go:123
github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/dbmodel.FromDomain.convertSpanEmbedProcess({0x80?, 0xc0003f6ba0?, {0x15a7e88?, 0xfd1bf2?}}, 0xc0015432c0)
github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/dbmodel/from_domain.go:64 +0x126
github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/dbmodel.FromDomain.FromDomainEmbedProcess(...)
github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/dbmodel/from_domain.go:43
github.com/jaegertracing/jaeger/plugin/storage/es/spanstore.(*SpanWriter).WriteSpan(0xc0001152c0, {0x0?, 0x0?}, 0xc0015432c0)
github.com/jaegertracing/jaeger/plugin/storage/es/spanstore/writer.go:152 +0x7a
github.com/jaegertracing/jaeger/cmd/ingester/app/processor.KafkaSpanProcessor.Process({{0x15ac040, 0x1e84f90}, {0x15abec0, 0xc0001152c0}, {0x0, 0x0}}, {0x15aeac0?, 0xc000f6b5e0?})
github.com/jaegertracing/jaeger/cmd/ingester/app/processor/span_processor.go:67 +0xd3
github.com/jaegertracing/jaeger/cmd/ingester/app/processor/decorator.(*retryDecorator).Process(0xc000515180, {0x15aeac0, 0xc000f6b5e0})
github.com/jaegertracing/jaeger/cmd/ingester/app/processor/decorator/retry.go:110 +0x37
github.com/jaegertracing/jaeger/cmd/ingester/app/consumer.(*comittingProcessor).Process(0xc000741e00, {0x15aeac0, 0xc000f6b5e0})
github.com/jaegertracing/jaeger/cmd/ingester/app/consumer/committing_processor.go:44 +0x5e
github.com/jaegertracing/jaeger/cmd/ingester/app/processor.(*metricsDecorator).Process(0xc000651840, {0x15aeac0, 0xc000f6b5e0})
github.com/jaegertracing/jaeger/cmd/ingester/app/processor/metrics_decorator.go:44 +0x5b
github.com/jaegertracing/jaeger/cmd/ingester/app/processor.(*ParallelProcessor).Start.func1()
github.com/jaegertracing/jaeger/cmd/ingester/app/processor/parallel_processor.go:57 +0x42
created by github.com/jaegertracing/jaeger/cmd/ingester/app/processor.(*ParallelProcessor).Start
github.com/jaegertracing/jaeger/cmd/ingester/app/processor/parallel_processor.go:53 +0xf5
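For readers of the trace: the top frames are in dbmodel.FromDomain.convertProcess and convertSpanEmbedProcess, which read fields off span.Process without a nil check, so a span that comes out of Kafka with a nil Process panics before anything reaches Elasticsearch. A minimal standalone Go illustration of that failure mode (the Process, Span, and convertProcess names below only mirror the shape of the Jaeger types; this is not the actual Jaeger code):
package main

import "fmt"

// Process and Span mirror only the shape of the Jaeger domain model types
// named in the trace above; this is an illustration, not Jaeger source.
type Process struct{ ServiceName string }
type Span struct{ Process *Process }

// convertProcess mimics the converter's behaviour: it dereferences the
// Process pointer without checking for nil.
func convertProcess(p *Process) string { return p.ServiceName }

func main() {
	span := &Span{Process: nil} // a span deserialized without a Process
	// Panics with "invalid memory address or nil pointer dereference",
	// matching the SIGSEGV in the trace above.
	fmt.Println(convertProcess(span.Process))
}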
Expected behavior
Everything works.
Relevant log output
No response
Screenshot
No response
Additional context
No response
Jaeger backend version
No response
SDK
No response
Pipeline
No response
Storage backend
No response
Operating system
No response
Deployment model
No response
Deployment configs
No response
About this issue
- Original URL
- State: open
- Created 2 years ago
- Comments: 19 (11 by maintainers)
So all the fixes we have so far are defensive. We still don't know how messages exiting collector->Kafka or entering Kafka->ingester might end up with a nil Process. If it's due to data corruption in Kafka, it seems very peculiar that it's always the nil Process that people experience (although it's very possible that, because we never checked for nil there, it's just the most obvious symptom). I just merged the fix #3578, which should prevent the panics; maybe then someone can investigate what the spans look like when stored, i.e. whether some other parts of the span are corrupted as well.
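For context, a defensive guard of roughly this shape (an illustrative sketch only, not the literal change merged in #3578; model.Span and model.Process are the real Jaeger domain types, while sanitizeProcess and the "unknown-service" placeholder are assumptions of this sketch) would substitute an empty Process before the span reaches the Elasticsearch converter:
package sanitizer

import "github.com/jaegertracing/jaeger/model"

// sanitizeProcess is a hypothetical guard, not the patch from #3578: if a
// span deserialized from Kafka carries no Process, substitute a placeholder
// so dbmodel.FromDomain.convertProcess never dereferences nil.
func sanitizeProcess(span *model.Span) *model.Span {
	if span.Process == nil {
		span.Process = &model.Process{ServiceName: "unknown-service"}
	}
	return span
}

With a guard like this in the write path, a span with a corrupted or missing Process lands in Elasticsearch under the placeholder service name instead of killing the ingester, which is what makes the follow-up investigation suggested above possible.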