Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Core] Sanitize query params in url tracing attribute #35546

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions sdk/core/azure-core/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
### Features Added

- Tracing: `DistributedTracingPolicy` will now set an attribute, `http.request.resend_count`, on HTTP spans for resent requests to indicate the resend attempt number. #35069
- Introduced the keyword argument `allowed_query_params` to `DistributedTracingPolicy` to allow users to specify which query parameters should be allowed in URL span attributes. #35546

### Breaking Changes

Expand All @@ -13,6 +14,7 @@
### Other Changes

- HTTP tracing spans will now include an `error.type` attribute if an error status code is returned. #34619
- URL attributes in HTTP tracing spans will now have query parameters sanitized by default. #35546
- Minimum required Python version is now 3.8

## 1.30.1 (2024-02-29)
Expand Down
1 change: 1 addition & 0 deletions sdk/core/azure-core/CLIENT_LIBRARY_DEVELOPER.md
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,7 @@ from azure.core.pipeline.policies import (
| DistributedTracingPolicy | SansIOHTTPPolicy | | | | |
| | | network_span_namer | x | x | A callable to customize the span name. |
| | | tracing_attributes | x | x | Attributes to set on all created spans. |
| | | allowed_query_params | x | | Query parameters that should not be sanitized in URL span attributes. |
| --- | --- | --- | --- | --- | --- |
| RedirectPolicy | HTTPPolicy | | | | |
| | | permit_redirects | x | x | Whether the client allows redirects. Defaults to `True`. |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,16 @@
#
# --------------------------------------------------------------------------
"""Traces network calls using the implementation library from the settings."""
import copy
import logging
import sys
import urllib.parse
from typing import TYPE_CHECKING, Optional, Tuple, TypeVar, Union, Any, Type
from typing import TYPE_CHECKING, Optional, Tuple, TypeVar, Union, Any, Type, Set
from types import TracebackType

from azure.core.pipeline import PipelineRequest, PipelineResponse
from azure.core.pipeline.policies import SansIOHTTPPolicy
from azure.core.pipeline.policies._utils import sanitize_url_query_params
from azure.core.pipeline.transport import HttpResponse as LegacyHttpResponse, HttpRequest as LegacyHttpRequest
from azure.core.rest import HttpResponse, HttpRequest
from azure.core.settings import settings
Expand Down Expand Up @@ -71,16 +73,26 @@ class DistributedTracingPolicy(SansIOHTTPPolicy[HTTPRequestType, HTTPResponseTyp
:type network_span_namer: callable[[~azure.core.pipeline.transport.HttpRequest], str]
:keyword tracing_attributes: Attributes to set on all created spans
:type tracing_attributes: dict[str, str]
:keyword allowed_query_params: Query parameters that should not be sanitized in URL span attributes
:type allowed_query_params: list[str]
"""

DEFAULT_QUERY_PARAMS_ALLOWLIST: Set[str] = set(["api-version"])
TRACING_CONTEXT = "TRACING_CONTEXT"
_REDACTED_PLACEHOLDER: str = "REDACTED"
_REQUEST_ID = "x-ms-client-request-id"
_RESPONSE_ID = "x-ms-request-id"
_HTTP_RESEND_COUNT = "http.request.resend_count"

def __init__(self, **kwargs: Any):
self._network_span_namer = kwargs.get("network_span_namer", _default_network_span_namer)
self._tracing_attributes = kwargs.get("tracing_attributes", {})
allowed_query_params = kwargs.get("allowed_query_params")
self._allowed_query_params: Set[str] = (
self.__class__.DEFAULT_QUERY_PARAMS_ALLOWLIST
if allowed_query_params is None
else {param.lower() for param in allowed_query_params}
)

def on_request(self, request: PipelineRequest[HTTPRequestType]) -> None:
ctxt = request.context.options
Expand Down Expand Up @@ -123,8 +135,12 @@ def end_span(
return

span: "AbstractSpan" = request.context[self.TRACING_CONTEXT]
http_request: Union[HttpRequest, LegacyHttpRequest] = request.http_request
if span is not None:
http_request: Union[HttpRequest, LegacyHttpRequest] = copy.copy(request.http_request)
redacted_url = sanitize_url_query_params(
http_request.url, self._allowed_query_params, self._REDACTED_PLACEHOLDER
)
http_request.url = redacted_url
span.set_http_attributes(http_request, response=response)
if request.context.get("retry_count"):
span.add_attribute(self._HTTP_RESEND_COUNT, request.context["retry_count"])
Expand Down
23 changes: 21 additions & 2 deletions sdk/core/azure-core/azure/core/pipeline/policies/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
# --------------------------------------------------------------------------
import datetime
import email.utils
from typing import Optional, cast, Union
from urllib.parse import urlparse
from typing import Optional, cast, Union, Set
from urllib.parse import urlparse, urlunparse, parse_qsl

from azure.core.pipeline.transport import (
HttpResponse as LegacyHttpResponse,
Expand Down Expand Up @@ -102,3 +102,22 @@ def get_domain(url: str) -> str:
:return: The domain of the url.
"""
return str(urlparse(url).netloc).lower()


def sanitize_url_query_params(url: str, allowed_query_params: Set[str], placeholder: str = "REDACTED") -> str:
"""Redact query parameters from the specified URL.

:param url: The URL to sanitize.
:type url: str
:param allowed_query_params: The set of query parameters to allow. The values are expected to be lowercase.
:type allowed_query_params: set[str]
:param placeholder: The placeholder to use for redacted query parameters.
:type placeholder: str
:rtype: str
:return: The sanitized URL.
"""
scheme, netloc, path, params, query, fragment = urlparse(url)
parsed_qp = parse_qsl(query, keep_blank_values=True)
filtered_qp = ((key, value if key.lower() in allowed_query_params else placeholder) for key, value in parsed_qp)
query = "&".join(f"{k}={v}" for k, v in filtered_qp)
return urlunparse((scheme, netloc, path, params, query, fragment))
57 changes: 54 additions & 3 deletions sdk/core/azure-core/tests/test_tracing_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def test_distributed_tracing_policy_solo(http_request, http_response):
with FakeSpan(name="parent") as root_span:
policy = DistributedTracingPolicy()

request = http_request("GET", "http://localhost/temp?query=query")
request = http_request("GET", "http://localhost/temp")
request.headers["x-ms-client-request-id"] = "some client request id"

pipeline_request = PipelineRequest(request, PipelineContext(None))
Expand All @@ -53,7 +53,7 @@ def test_distributed_tracing_policy_solo(http_request, http_response):
assert network_span.name == "/temp"
assert network_span.attributes.get("http.method") == "GET"
assert network_span.attributes.get("component") == "http"
assert network_span.attributes.get("http.url") == "http://localhost/temp?query=query"
assert network_span.attributes.get("http.url") == "http://localhost/temp"
assert network_span.attributes.get("http.user_agent") is None
assert network_span.attributes.get("x-ms-request-id") == "some request id"
assert network_span.attributes.get("x-ms-client-request-id") == "some client request id"
Expand All @@ -65,7 +65,7 @@ def test_distributed_tracing_policy_solo(http_request, http_response):
assert network_span.name == "/temp"
assert network_span.attributes.get("http.method") == "GET"
assert network_span.attributes.get("component") == "http"
assert network_span.attributes.get("http.url") == "http://localhost/temp?query=query"
assert network_span.attributes.get("http.url") == "http://localhost/temp"
assert network_span.attributes.get("x-ms-client-request-id") == "some client request id"
assert network_span.attributes.get("http.user_agent") is None
assert network_span.attributes.get("x-ms-request-id") == None
Expand Down Expand Up @@ -292,3 +292,54 @@ def operation_namer(http_request):
# Check operation kwargs
network_span = root_span.children[1]
assert network_span.name == "operation level name"


@pytest.mark.parametrize("http_request,http_response", request_and_responses_product(HTTP_RESPONSES))
def test_distributed_tracing_policy_query_param_default_sanitization(http_request, http_response):
settings.tracing_implementation.set_value(FakeSpan)
with FakeSpan(name="parent") as root_span:

request = http_request("GET", "https://localhost/temp?foo=1&bar=2&api-version=2.2&BIZ=baz")
pipeline_request = PipelineRequest(request, PipelineContext(None))

policy = DistributedTracingPolicy()

policy.on_request(pipeline_request)

response = create_http_response(http_response, request, None)
response.headers = request.headers
response.status_code = 202

policy.on_response(pipeline_request, PipelineResponse(request, response, PipelineContext(None)))

assert (
root_span.children[0].attributes[FakeSpan._HTTP_URL]
== "https://localhost/temp?foo=REDACTED&bar=REDACTED&api-version=2.2&BIZ=REDACTED"
)


@pytest.mark.parametrize("http_request,http_response", request_and_responses_product(HTTP_RESPONSES))
def test_distributed_tracing_policy_query_param_custom_sanitization(http_request, http_response):
settings.tracing_implementation.set_value(FakeSpan)
with FakeSpan(name="parent") as root_span:

request = http_request("GET", "https://localhost/temp?foo=1&bar=2&api-version=2.2&BIZ=baz")
pipeline_request = PipelineRequest(request, PipelineContext(None))

allowed_query_params = [
"api-version",
"foo",
]
policy = DistributedTracingPolicy(allowed_query_params=allowed_query_params)

policy.on_request(pipeline_request)

response = create_http_response(http_response, request, None)
response.headers = request.headers
response.status_code = 202

policy.on_response(pipeline_request, PipelineResponse(request, response, PipelineContext(None)))
assert (
root_span.children[0].attributes[FakeSpan._HTTP_URL]
== "https://localhost/temp?foo=1&bar=REDACTED&api-version=2.2&BIZ=REDACTED"
)
27 changes: 26 additions & 1 deletion sdk/core/azure-core/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pytest
from azure.core.utils import case_insensitive_dict
from azure.core.utils._utils import get_running_async_lock
from azure.core.pipeline.policies._utils import parse_retry_after
from azure.core.pipeline.policies._utils import parse_retry_after, sanitize_url_query_params


@pytest.fixture()
Expand Down Expand Up @@ -146,3 +146,28 @@ def test_parse_retry_after():
assert ret == 0
ret = parse_retry_after("0.9")
assert ret == 0.9


def test_sanitize_url_query_params():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to see more test cases. Examples: multiple instances of the same key. Empty value, REDACTED as the parameter name. Query parameter in the path. (e.g. https://a.b.com/hello/world=true/something).

url = "https://www.example.com?q1=value1&q2=value2"
allowed_query_params = {"q1"}
assert sanitize_url_query_params(url, allowed_query_params) == "https://www.example.com?q1=value1&q2=REDACTED"
allowed_query_params = {"q1", "q2"}
assert sanitize_url_query_params(url, allowed_query_params) == url
allowed_query_params = {"q1", "q2", "q3"}
assert sanitize_url_query_params(url, allowed_query_params) == url
allowed_query_params = {"q1", "q3"}
assert sanitize_url_query_params(url, allowed_query_params) == "https://www.example.com?q1=value1&q2=REDACTED"
url = "https://www.example.com"
assert sanitize_url_query_params(url, allowed_query_params) == url
url = "https://www.example.com?q1=value1"
assert sanitize_url_query_params(url, allowed_query_params) == url
url = "https://www.example.com?q1=value1&q2="
assert sanitize_url_query_params(url, allowed_query_params) == "https://www.example.com?q1=value1&q2=REDACTED"
url = "https://www.example.com?q1=value1&q2"
assert sanitize_url_query_params(url, allowed_query_params) == "https://www.example.com?q1=value1&q2=REDACTED"
url = "https://www.example.com?q1=value1&q2&q3=value3"
assert (
sanitize_url_query_params(url, allowed_query_params)
== "https://www.example.com?q1=value1&q2=REDACTED&q3=value3"
)