You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I am trying to deploy a TensorFlowModel (YOLOv5) directly from artifacts stored in S3 on a GPU instance (ml.g4dn.2xlarge),
but the GPU does not seem to be used during inference, as the inference time is ~1 s.
To reproduce
#script deploy.py
def get_tensorflow_inference_image_uri(
    region="eu-north-1",
    py_version="3.10",
    version="2.12",
    instance_type="ml.g4dn.2xlarge",
    framework="tensorflow",
    image_scope="inference",
    accelerator_type=None,
):
    """Look up the ECR URI of a TensorFlow inference image.

    Args:
        region: AWS region whose ECR registry is queried.
        py_version: Python version baked into the container image.
        version: TensorFlow framework version of the image.
        instance_type: Target instance type (drives CPU vs. GPU image choice).
        framework: Framework name passed through to the SDK lookup.
        image_scope: Image purpose, e.g. "inference" or "training".
        accelerator_type: Optional Elastic Inference accelerator type.

    Returns:
        str: Fully qualified ECR image URI for the requested configuration.
    """
    # Build a region-pinned boto session and wrap it in a SageMaker session
    # so the lookup resolves against the right regional registry.
    boto_session = boto3.Session(region_name=region)
    sagemaker_session = Session(boto_session=boto_session)

    # Delegate the actual registry lookup to the SageMaker SDK.
    return image_uris.retrieve(
        framework=framework,
        region=region,
        version=version,
        py_version=py_version,
        instance_type=instance_type,
        accelerator_type=accelerator_type,
        image_scope=image_scope,
        sagemaker_session=sagemaker_session,
    )
# Retrieve the ECR URI for the GPU TensorFlow inference image.
# NOTE(review): region, instance_type, framework_version, artifact_url, role
# and model_name are expected to be defined earlier in the script.
gpu_image_uri = get_tensorflow_inference_image_uri(
    region=region, instance_type=instance_type, version=framework_version
)

# NOTE(review): the original script created an unused `boto3.Session()` here;
# it was never passed to the model or predictor, so the dead code is removed.

# Package the S3 artifact plus custom handlers into a deployable model.
model = TensorFlowModel(
    model_data=artifact_url,      # S3 URI of the model.tar.gz artifact
    role=role,                    # IAM execution role for the endpoint
    entry_point="inference.py",   # custom pre/post-processing handlers
    source_dir="code",            # local directory bundled with the model
    image_uri=gpu_image_uri,      # GPU serving container resolved above
    name=model_name,
)

# Create a single-instance real-time endpoint; blocks until it is in service.
predictor = model.deploy(initial_instance_count=1, instance_type=instance_type)
#script inference.py
import json
import numpy as np
import io
from PIL import Image
from helpers import read_model_config, decode_yolo_preds, read_labels
import logging
import os
# Configure root logging so the messages below reach CloudWatch.
logging.basicConfig(level=logging.INFO)

# Report whether a CUDA device is exposed to this process.
# NOTE(review): the presence of CUDA_VISIBLE_DEVICES only shows what the
# container exposes; it does not prove TensorFlow actually runs on the GPU.
gpu_id = os.environ.get("CUDA_VISIBLE_DEVICES")
if gpu_id is not None:
    logging.info(f"Using GPU {gpu_id}")
else:
    logging.info("No GPU found")
def input_handler(data, context):
    """Pre-process request input before it is sent to TensorFlow Serving REST API

    Args:
        data (obj): the request data, in format of dict or string
        context (Context): an object containing request and configuration details

    Returns:
        (dict): a JSON-serializable dict that contains request body and headers
    """
    # Decode the raw request bytes into an RGB image.
    raw = io.BytesIO(data.read())
    image = Image.open(raw).convert("RGB")

    # Resize to the model's fixed 640x640 input and replicate into a batch.
    # batch_size is fixed at 1, so the concatenate is a single-frame batch.
    batch_size = 1
    resized = np.asarray(image.resize((640, 640)))
    batch = np.concatenate([resized[np.newaxis, :, :]] * batch_size)

    # Build the TF Serving REST payload for the default signature.
    payload = {"signature_name": "serving_default", "instances": batch.tolist()}
    return json.dumps(payload)
def output_handler(data, context):
    """Post-process TensorFlow Serving output before it is returned to the client.

    Args:
        data (obj): the TensorFlow serving response
        context (Context): an object containing request and configuration details

    Returns:
        (bytes, string): data to return to client, response content type
    """
    # Surface TF Serving errors verbatim to the caller.
    if data.status_code != 200:
        raise ValueError(data.content.decode("utf-8"))

    response_content_type = context.accept_header
    predictions = json.loads(data.content)

    # Decode raw YOLO predictions into detections using project helpers.
    config = read_model_config()
    labels = read_labels()
    boxes, confidences, classes = decode_yolo_preds(predictions, config)

    # Convert arrays/scalars to JSON-serializable types and map class
    # indices to their human-readable labels.
    response = {
        "boxes": [b.tolist() for b in boxes],
        "confidences": [float(c) for c in confidences],
        "classes": [labels[int(c)] for c in classes],
    }
    return json.dumps(response), response_content_type
Expected behavior
While the GPU appears to be loaded (as logged in CloudWatch) the inference time should ideally be
quite fast.
System information
A description of your system. Please provide:
SageMaker Python SDK version:
Framework name (eg. PyTorch) or algorithm (eg. KMeans): Tensorflow
Framework version: 2.12
Python version: 3.10
CPU or GPU: GPU
Custom Docker image (Y/N): N
Extra notes
I have tried removing all the post-processing code and inference time still remains >1s.
Image used for my testing was: 763104351884.dkr.ecr.eu-north-1.amazonaws.com/tensorflow-inference:2.12-gpu
The text was updated successfully, but these errors were encountered:
I am trying to deploy a TensorFlowModel (YOLOv5) directly from artifacts stored in S3 on a GPU instance (ml.g4dn.2xlarge),
but the GPU does not seem to be used during inference, as the inference time is ~1 s.
To reproduce
Expected behavior
While the GPU appears to be loaded (as logged in CloudWatch) the inference time should ideally be
quite fast.
System information
A description of your system. Please provide:
Extra notes
I have tried removing all the post-processing code and inference time still remains >1s.
Image used for my testing was: 763104351884.dkr.ecr.eu-north-1.amazonaws.com/tensorflow-inference:2.12-gpu
The text was updated successfully, but these errors were encountered: