Skip to content

Commit

Permalink
feat: Better support for OTLP (#1886)
Browse files Browse the repository at this point in the history
Add ability to configure OTLP trace exporters with more options such as
the choice of protocols, sampler configuration, TLS settings etc.

As part of this change, the `tracing` configuration block in the Cerbos
configuration file has been completely deprecated with the aim of
removing it in the release after next. This is because of the following
reasons:
- Jaeger native protocol is no longer supported by the Otel SDK.
- The Otel specification defines standard environment variables that can
be used to configure OTLP exporters. Trying to replicate all possible
configuration options in our configuration would be brittle and just
complicate our code and documentation for not much benefit.

Fixes #1784 
Part of #341

---------

Signed-off-by: Charith Ellawala <charith@cerbos.dev>
  • Loading branch information
charithe committed Nov 22, 2023
1 parent d51d9c9 commit 1722454
Show file tree
Hide file tree
Showing 16 changed files with 461 additions and 228 deletions.
8 changes: 7 additions & 1 deletion cmd/cerbos/server/server.go
Expand Up @@ -108,9 +108,15 @@ func (c *Cmd) Run() error {
}

// initialize tracing
if err := tracing.Init(ctx); err != nil {
tracingDone, err := tracing.Init(ctx)
if err != nil {
return err
}
defer func() {
if err := tracingDone(); err != nil {
log.Warnw("Trace exporter did not shutdown cleanly", "error", err)
}
}()

if err := server.Start(ctx, c.ZPagesEnabled); err != nil {
log.Errorw("Failed to start server", "error", err)
Expand Down
@@ -1,4 +1,4 @@
# Illustrates how to deploy Cerbos with an auto-injected Jaeger agent.
# Illustrates how to deploy Cerbos with traces exported via OTLP.

cerbos:
config:
Expand All @@ -7,19 +7,15 @@ cerbos:
driver: "sqlite3"
sqlite3:
dsn: "file:/data/cerbos.sqlite?mode=rwc&_fk=true"
# Configure tracing
tracing:
serviceName: cerbos
sampleProbability: 0.5
exporter: jaeger
jaeger:
agentEndpoint: "localhost:6831"


# Annotate the deployment to inject the Jaeger agent.
deployment:
annotations:
sidecar.jaegertracing.io/inject: "true"
# Environment variables to configure OTLP exporter.
env:
- name: OTEL_SERVICE_NAME
value: cerbos.myns.svc
- name: OTEL_TRACES_SAMPLER
value: parentbased_always_on
- name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
value: https://otlp.monitoring.svc.cluster.local

# Optional common labels for resources.
commonLabels:
Expand All @@ -45,4 +41,3 @@ volumes:
volumeMounts:
- name: cerbos-policies
mountPath: /data

95 changes: 53 additions & 42 deletions docs/modules/configuration/pages/tracing.adoc
@@ -1,57 +1,68 @@
include::ROOT:partial$attributes.adoc[]

= Tracing block
= Distributed traces

Cerbos supports distributed tracing to provide insights into application performance and request lifecycle. To enable tracing, set `sampleProbability` to a value between 0.0 and 1.0. Setting the probability to 1.0 makes Cerbos capture tracing information for all requests and setting it to 0.0 disables capturing any traces.
Cerbos supports distributed tracing to provide insights into application performance and request lifecycle. Traces from Cerbos can be exported to any compatible collector that uses the OpenTelemetry OTLP protocol.

The system to export the trace data must be specified using the `exporter` setting. Currently link:https://www.jaegertracing.io[Jaeger] and link:https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/protocol/otlp.md[OTLP collectors] are supported. If using Jaeger, traces can be sent to either a Jaeger Agent (compact Thrift format) or a Jaeger Collector (Thrift format).
Trace configuration should be done using link:https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/[OpenTelemetry environment variables]. The following environment variables are supported.

[%header,cols=".^1m,6a",grid=rows]
|===
| Environment variable | Description

.OpenTelemetry
****
link:https://opentelemetry.io[OpenTelemetry] is the evolving standard for observability. Cerbos supports OpenTelemetry with a few caveats due to limitations in the current Go implementation of OpenTelemetry.
| OTEL_SDK_DISABLED
| Disable traces if set to `true`

| OTEL_SERVICE_NAME
| Service name reported in the traces. Defaults to `cerbos`.

* gRPC clients should use the link:https://github.com/open-telemetry/opentelemetry-go-contrib/tree/main/propagators/opencensus[OpenCensus binary propagation format] for distributed traces.
| OTEL_TRACES_SAMPLER
| link:https://opentelemetry.io/docs/specs/otel/trace/sdk/#sampling[Trace sampler]. Defaults to `parentbased_always_off`. Supported values: +
--
`always_on`:: Record every trace.
`always_off`:: Don't record any traces.
`traceidratio`:: Record a fraction of traces based on ID. Set `OTEL_TRACES_SAMPLER_ARG` to a value between 0 and 1 to define the fraction.
`parentbased_always_on`:: Record all traces except those where the parent span is not sampled.
`parentbased_always_off`:: Don't record any traces unless the parent span is sampled.
`parentbased_traceidratio`:: Record a fraction of traces where the parent span is sampled. Set `OTEL_TRACES_SAMPLER_ARG` to a value between 0 and 1 to define the fraction.
--

* Metrics are not yet supported.
| OTEL_TRACES_SAMPLER_ARG
| Set the sampling ratio when `OTEL_TRACES_SAMPLER` is a ratio-based sampler. Defaults to `0.1`.

****
| OTEL_EXPORTER_OTLP_TRACES_ENDPOINT or OTEL_EXPORTER_OTLP_ENDPOINT
| Address of the OTLP collector (for example: `https://localhost:4317`). If not defined, traces are disabled.

== Jaeger
| OTEL_EXPORTER_OTLP_TRACES_INSECURE or OTEL_EXPORTER_OTLP_INSECURE
| Skip validating the TLS certificate of the endpoint

.Send trace data to Jaeger Agent (compact Thrift)
[source,yaml,linenums]
----
tracing:
serviceName: cerbos
sampleProbability: 0.5
exporter: jaeger
jaeger:
agentEndpoint: "localhost:6831"
----
| OTEL_EXPORTER_OTLP_TRACES_CERTIFICATE or OTEL_EXPORTER_OTLP_CERTIFICATE
| Path to the certificate to use for validating the server's TLS credentials.

.Send trace data to Jaeger Collector (Thrift)
[source,yaml,linenums]
----
tracing:
serviceName: cerbos
sampleProbability: 0.5
exporter: jaeger
jaeger:
collectorEndpoint: "http://localhost:14268/api/traces"
----
| OTEL_EXPORTER_OTLP_TRACES_CLIENT_CERTIFICATE or OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE
| Path to the client certificate to use for mTLS

[#otlp]
== OTLP
| OTEL_EXPORTER_OTLP_TRACES_CLIENT_KEY or OTEL_EXPORTER_OTLP_CLIENT_KEY
| Path to the client key to use for mTLS

| OTEL_EXPORTER_OTLP_TRACES_PROTOCOL or OTEL_EXPORTER_OTLP_PROTOCOL
| OTLP protocol. Supported values are `grpc` and `http/protobuf`. Defaults to `grpc`.
|===

Refer to https://opentelemetry.io/docs/specs/otel/protocol/exporter/ for more information about exporter configuration through environment variables. Note that the OpenTelemetry Go SDK used by Cerbos might not have full support for some of the environment variables listed in the OpenTelemetry specification.

[#migration]
== Migrating tracing configuration from previous Cerbos versions

From Cerbos 0.32.0, the preferred method of trace configuration is through the OpenTelemetry environment variables described above. The `tracing` section of the Cerbos configuration file is deprecated and will be removed in Cerbos 0.33.0. Native Jaeger protocol is deprecated as well and will be removed in Cerbos 0.33.0. Follow the instructions below to migrate your existing configuration.

[%header,cols=".^1m,6a",grid=rows]
|===
| Configuration setting | New configuration

| tracing.serviceName | Set `OTEL_SERVICE_NAME` environment variable
| tracing.sampleProbability | Set `OTEL_TRACES_SAMPLER` to `parentbased_traceidratio` and `OTEL_TRACES_SAMPLER_ARG` to the probability value
| tracing.jaeger.agentEndpoint or tracing.jaeger.collectorEndpoint | Jaeger now has link:https://www.jaegertracing.io/docs/1.51/apis/#opentelemetry-protocol-stable[stable support for OTLP], which is the recommended way to send traces to it. Set `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` to the address of your Jaeger instance (for example: `https://your.jaeger.instance:4317`) and, optionally, set `OTEL_EXPORTER_OTLP_TRACES_INSECURE=true` if Jaeger is using a self-signed certificate. If you want to use the HTTP API or customize other aspects, refer to the documentation above for other supported environment variables.
| tracing.otlp.collectorEndpoint | Set `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` to the value of the collector endpoint and `OTEL_EXPORTER_OTLP_INSECURE=true` to emulate the behaviour of Cerbos OTLP exporter before version 0.32.0.
|===

.Send trace data to an OTLP collector
[source,yaml,linenums]
----
tracing:
serviceName: cerbos
sampleProbability: 0.5
exporter: otlp
otlp:
collectorEndpoint: "otel:4317"
----
10 changes: 5 additions & 5 deletions docs/modules/configuration/partials/fullconfiguration.adoc
Expand Up @@ -211,12 +211,12 @@ telemetry:
reportInterval: 1h # ReportInterval is the interval between telemetry pings.
stateDir: ${HOME}/.config/cerbos # StateDir is used to persist state to avoid repeatedly sending the data over and over again.
tracing:
exporter: jaeger # Exporter is the type of trace exporter to use.
jaeger: # Jaeger configures the Jaeger exporter.
exporter: jaeger # [Deprecated] Only OTLP is supported.
jaeger: # [Deprecated] Use OTLP to send traces to Jaeger.
agentEndpoint: "localhost:6831" # AgentEndpoint is the Jaeger agent endpoint to report to.
collectorEndpoint: "http://localhost:14268/api/traces" # CollectorEndpoint is the Jaeger collector endpoint to report to.
serviceName: cerbos # [Deprecated] Use top level ServiceName config. ServiceName is the name of the service to report to Jaeger.
otlp: # OTLP configures the OpenTelemetry exporter.
otlp: # [Deprecated] Use OpenTelemetry environment variables to configure OTLP.
collectorEndpoint: "otel:4317" # CollectorEndpoint is the Open Telemetry collector endpoint to export to.
sampleProbability: 0.1 # SampleProbability is the probability of sampling expressed as a number between 0 and 1.
serviceName: cerbos # ServiceName is the name of the service reported to the exporter.
sampleProbability: 0.1 # [Deprecated] Use OTEL_TRACES_SAMPLER and OTEL_TRACES_SAMPLER_ARG to configure trace sampler.
serviceName: cerbos # [Deprecated] Use OTEL_SERVICE_NAME environment variable.
3 changes: 2 additions & 1 deletion go.mod
Expand Up @@ -87,7 +87,9 @@ require (
go.opentelemetry.io/contrib/propagators/b3 v1.21.1
go.opentelemetry.io/otel v1.21.0
go.opentelemetry.io/otel/exporters/jaeger v1.17.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.21.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.21.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.21.0
go.opentelemetry.io/otel/sdk v1.21.0
go.opentelemetry.io/otel/trace v1.21.0
go.uber.org/automaxprocs v1.5.3
Expand Down Expand Up @@ -267,7 +269,6 @@ require (
go.opentelemetry.io/contrib/propagators/aws v1.21.1 // indirect
go.opentelemetry.io/contrib/propagators/jaeger v1.21.1 // indirect
go.opentelemetry.io/contrib/propagators/ot v1.21.1 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.21.0 // indirect
go.opentelemetry.io/otel/metric v1.21.0 // indirect
go.opentelemetry.io/otel/sdk/metric v1.20.0 // indirect
go.opentelemetry.io/proto/otlp v1.0.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Expand Up @@ -952,6 +952,8 @@ go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.21.0 h1:cl5P5/GIfFh4t6xyruO
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.21.0/go.mod h1:zgBdWWAu7oEEMC06MMKc5NLbA/1YDXV1sMpSqEeLQLg=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.21.0 h1:tIqheXEFWAZ7O8A7m+J0aPTmpJN3YQ7qetUAdkkkKpk=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.21.0/go.mod h1:nUeKExfxAQVbiVFn32YXpXZZHZ61Cc3s3Rn1pDBGAb0=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.21.0 h1:digkEZCJWobwBqMwC0cwCq8/wkkRy/OowZg5OArWZrM=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.21.0/go.mod h1:/OpE/y70qVkndM0TrxT4KBoN3RsFZP0QaofcfYrj76I=
go.opentelemetry.io/otel/metric v1.21.0 h1:tlYWfeo+Bocx5kLEloTjbcDwBuELRrIFxwdQ36PlJu4=
go.opentelemetry.io/otel/metric v1.21.0/go.mod h1:o1p3CA8nNHW8j5yuQLdc1eeqEaPfzug24uvsyIEJRWM=
go.opentelemetry.io/otel/sdk v1.21.0 h1:FTt8qirL1EysG6sTQRZ5TokkU8d0ugCj8htOgThZXQ8=
Expand Down
7 changes: 0 additions & 7 deletions hack/dev/conf.secure.bundle.yaml
Expand Up @@ -30,13 +30,6 @@ audit:
file:
path: stdout

tracing:
sampleProbability: 1.0
exporter: jaeger
jaeger:
agentEndpoint: "localhost:6831"
serviceName: "cerbos"

storage:
driver: "bundle"
bundle:
Expand Down
7 changes: 0 additions & 7 deletions hack/dev/conf.secure.yaml
Expand Up @@ -37,13 +37,6 @@ audit:
maxFileAgeDays: 1
maxFileCount: 3

tracing:
sampleProbability: 1.0
exporter: jaeger
jaeger:
agentEndpoint: "localhost:6831"
serviceName: "cerbos"

storage:
driver: "disk"
disk:
Expand Down
25 changes: 14 additions & 11 deletions hack/dev/dev.mk
Expand Up @@ -11,15 +11,22 @@ $(DEV_DIR)/tls.crt:

.PHONY: dev-server
dev-server: $(DEV_DIR)/tls.crt
@ go run cmd/cerbos/main.go server --log-level=debug --debug-listen-addr=":6666" --z-pages-enabled --config=$(DEV_DIR)/conf.secure.yaml
@ OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 \
OTEL_EXPORTER_OTLP_INSECURE=true \
OTEL_TRACES_SAMPLER=parentbased_traceidratio \
OTEL_TRACES_SAMPLER_ARG=1.0 \
go run cmd/cerbos/main.go server --log-level=debug --debug-listen-addr=":6666" --z-pages-enabled --config=$(DEV_DIR)/conf.secure.yaml

.PHONY: perf-server
perf-server: $(DEV_DIR)/tls.crt
@ go run cmd/cerbos/main.go server --log-level=error --debug-listen-addr=":6666" --z-pages-enabled --config=$(DEV_DIR)/conf.secure.yaml --set=tracing.sampleProbability=0 --set=storage.disk.watchForChanges=false

.PHONY: dev-server-insecure
dev-server-insecure:
@ go run cmd/cerbos/main.go server --log-level=debug --debug-listen-addr=":6666" --z-pages-enabled --config=$(DEV_DIR)/conf.insecure.yaml
@ OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 \
OTEL_EXPORTER_OTLP_INSECURE=true \
OTEL_TRACES_SAMPLER=parentbased_always_on \
go run cmd/cerbos/main.go server --log-level=debug --debug-listen-addr=":6666" --z-pages-enabled --config=$(DEV_DIR)/conf.insecure.yaml

.PHONY: protoset
protoset: $(BUF)
Expand Down Expand Up @@ -231,13 +238,9 @@ perf: $(GHZ)
.PHONY: jaeger
jaeger:
@ docker run -i -t --rm --name jaeger \
-e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
-p 5775:5775/udp \
-p 6831:6831/udp \
-p 6832:6832/udp \
-p 5778:5778 \
-e COLLECTOR_OTLP_ENABLED=true \
-p 14269:14269 \
-p 16686:16686 \
-p 14268:14268 \
-p 14250:14250 \
-p 9411:9411 \
jaegertracing/all-in-one:1.28
-p 4317:4317 \
-p 6831:6831/udp \
jaegertracing/all-in-one:1.51
2 changes: 1 addition & 1 deletion hack/dev/playground.hurl
Expand Up @@ -29,7 +29,7 @@ HTTP 400
header "Content-Type" == "application/json"
jsonpath "$.failure.errors" count == 2
jsonpath "$.failure.errors[?(@.file == 'resource.yaml')].error" nth 0 == "Derived roles import 'apatr_common_roles' not found"
jsonpath "$.failure.errors[?(@.file == 'common_roles.yaml')].error" nth 0 == "Failed to read: file is not valid: [/: missing properties: 'resourcePolicy', /: missing properties: 'principalPolicy', /: missing properties: 'derivedRoles', /: missing properties: 'exportVariables']"
jsonpath "$.failure.errors[?(@.file == 'common_roles.yaml')].error" nth 0 == "Failed to read: file is not valid: { /: [missing properties: 'derivedRoles' | missing properties: 'exportVariables' | missing properties: 'principalPolicy' | missing properties: 'resourcePolicy'] }"


# Playground evaluate request 1
Expand Down
101 changes: 101 additions & 0 deletions internal/observability/otel/helpers.go
@@ -0,0 +1,101 @@
// Copyright 2021-2023 Zenauth Ltd.
// SPDX-License-Identifier: Apache-2.0

package otel

import (
"context"
"fmt"
"strconv"

"go.opentelemetry.io/otel/sdk/resource"
semconv "go.opentelemetry.io/otel/semconv/v1.21.0"
"go.uber.org/zap"
)

// EnvVar describes an environment variable by its primary name and an
// optional alternative name. Env.Get consults Name first and only looks up
// Alt when Name is not found and Alt is non-empty.
type EnvVar struct {
// Name is the primary environment variable name.
Name string
// Alt is the alternative name to try when Name is not defined. Leave empty
// when there is no alternative.
Alt string
}

// Descriptors for the OpenTelemetry environment variables referenced by this
// package. Signal-specific OTLP variables (traces/metrics) list the generic
// OTEL_EXPORTER_OTLP_* variable as their alternative name.
var (
// DisabledEV names the variable that isDisabled parses as a boolean.
DisabledEV = EnvVar{Name: "OTEL_SDK_DISABLED"}
MetricsEndpointEV = EnvVar{Name: "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", Alt: "OTEL_EXPORTER_OTLP_ENDPOINT"}
MetricsProtocolEV = EnvVar{Name: "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL", Alt: "OTEL_EXPORTER_OTLP_PROTOCOL"}
ServiceNameEV = EnvVar{Name: "OTEL_SERVICE_NAME"}
// TracesEndpointEV is consulted by isDisabled: when neither name is defined
// (and OTEL_SDK_DISABLED is not set), traces are disabled.
TracesEndpointEV = EnvVar{Name: "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", Alt: "OTEL_EXPORTER_OTLP_ENDPOINT"}
TracesEndpointInsecureEV = EnvVar{Name: "OTEL_EXPORTER_OTLP_TRACES_INSECURE", Alt: "OTEL_EXPORTER_OTLP_INSECURE"}
TracesSamplerEV = EnvVar{Name: "OTEL_TRACES_SAMPLER"}
TracesSamplerArgEV = EnvVar{Name: "OTEL_TRACES_SAMPLER_ARG"}
TracesProtocolEV = EnvVar{Name: "OTEL_EXPORTER_OTLP_TRACES_PROTOCOL", Alt: "OTEL_EXPORTER_OTLP_PROTOCOL"}
)

// String values recognised for the OTLP protocol and trace sampler
// environment variables.
const (
// OTLP protocol names (OTEL_EXPORTER_OTLP_TRACES_PROTOCOL / OTEL_EXPORTER_OTLP_PROTOCOL).
GRPCProtocol = "grpc"
HTTPProtobufProtocol = "http/protobuf"

// Trace sampler names (OTEL_TRACES_SAMPLER).
// NOTE(review): the jaeger_remote variants are not among the supported values
// listed in the Cerbos tracing docs; confirm how the consumer of these
// constants treats them.
AlwaysOffSampler = "always_off"
AlwaysOnSampler = "always_on"
JaegerRemoteSampler = "jaeger_remote"
ParentBasedAlwaysOffSampler = "parentbased_always_off"
ParentBasedAlwaysOnSampler = "parentbased_always_on"
ParentBasedJaegerRemoteSampler = "parentbased_jaeger_remote"
ParentBasedTraceIDRatioSampler = "parentbased_traceidratio"
TraceIDRatioSampler = "traceidratio"
)

// Env is a lookup function that resolves a variable name to its value. The
// boolean result reports whether the variable was found.
// NOTE(review): the signature matches os.LookupEnv, which is presumably the
// production implementation; confirm against callers.
type Env func(string) (string, bool)

// Get resolves ev by trying its primary name first. The alternative name is
// consulted only when the primary lookup fails and an alternative is set.
func (env Env) Get(ev EnvVar) (string, bool) {
	value, found := env(ev.Name)
	if found || ev.Alt == "" {
		return value, found
	}

	return env(ev.Alt)
}

// GetOrDefault resolves ev like Get and falls back to defaultVal when neither
// the primary nor the alternative name is found.
func (env Env) GetOrDefault(ev EnvVar, defaultVal string) string {
	if value, found := env.Get(ev); found {
		return value
	}

	return defaultVal
}

// isDisabled reports whether trace export should be turned off, based on the
// environment exposed through env.
//
// The decision is made as follows:
//   - If OTEL_SDK_DISABLED is set and parses as a boolean, that value is
//     returned as-is and the endpoint variables are not consulted.
//   - If OTEL_SDK_DISABLED is set but cannot be parsed, a warning is logged,
//     the value is ignored and false is returned (traces stay enabled).
//   - Otherwise traces are disabled unless OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
//     or OTEL_EXPORTER_OTLP_ENDPOINT is defined.
func isDisabled(env Env) bool {
	log := zap.L().Named("otel")

	if dv, ok := env.Get(DisabledEV); ok {
		disabled, err := strconv.ParseBool(dv)
		if err != nil {
			// An unparseable value is treated as "not disabled", so the warning
			// must say the variable is being ignored rather than claim that
			// traces are being disabled.
			log.Warn("Ignoring OTEL_SDK_DISABLED environment variable because its value couldn't be parsed", zap.Error(err))
			return false
		}

		if disabled {
			log.Debug("Disabling traces because OTEL_SDK_DISABLED environment variable is set")
		}

		return disabled
	}

	// Without an explicit switch, the presence of an endpoint decides.
	if _, endpointDefined := env.Get(TracesEndpointEV); endpointDefined {
		return false
	}

	log.Debug("Disabling traces because neither OTEL_EXPORTER_OTLP_ENDPOINT nor OTEL_EXPORTER_OTLP_TRACES_ENDPOINT is defined")
	return true
}

// NewResource builds the OpenTelemetry resource for this process. The
// resource carries serviceName as the service name attribute and is assembled
// with the process PID, host, and from-environment resource options.
//
// Any error from resource construction is wrapped and returned with a nil
// resource.
func NewResource(ctx context.Context, serviceName string) (*resource.Resource, error) {
	opts := []resource.Option{
		resource.WithAttributes(semconv.ServiceNameKey.String(serviceName)),
		resource.WithProcessPID(),
		resource.WithHost(),
		resource.WithFromEnv(),
	}

	created, err := resource.New(ctx, opts...)
	if err != nil {
		return nil, fmt.Errorf("failed to initialize otel resource: %w", err)
	}

	return created, nil
}

0 comments on commit 1722454

Please sign in to comment.