Skip to content

Commit fb33939

Browse files
committed
init code for supporting service models
1 parent 4284885 commit fb33939

File tree

5 files changed

+110
-13
lines changed

5 files changed

+110
-13
lines changed

ads/aqua/modeldeployment/deployment.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1289,6 +1289,9 @@ def recommend_shape(self, **kwargs) -> Union[Table, ShapeRecommendationReport]:
12891289
AquaValueError
12901290
If model type is unsupported by tool (no recommendation report generated)
12911291
"""
1292+
deployment_config = self.get_deployment_config(model_id=kwargs.get("model_id"))
1293+
kwargs["deployment_config"] = deployment_config
1294+
print(deployment_config)
12921295
try:
12931296
request = RequestRecommend(**kwargs)
12941297
except ValidationError as e:

ads/aqua/shaperecommend/constants.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -78,14 +78,20 @@
7878

7979
IN_FLIGHT_QUANTIZATION = {"4bit"} # vLLM only supports 4bit in-flight-quantization
8080

81+
VLLM_PARAMS_KEY = "VLLM_PARAMS"
82+
VLLM_ENV_KEY = "VLLM"
83+
QUANT_FLAG = "--quantization"
84+
MAX_MODEL_LEN_FLAG = "--max-model-len"
85+
8186
TROUBLESHOOT_MSG = "The selected model is too large to fit on standard GPU shapes with the current configuration.\nAs troubleshooting, we have suggested the two largest available GPU shapes using the smallest quantization level ('4bit') to maximize chances of fitting the model. "
8287

8388
VLLM_PARAMS = {
8489
"max_model_len": "--max-model-len",
8590
"in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes",
8691
}
8792

88-
DEFAULT_WEIGHT_SIZE = "float32"
93+
DEFAULT_WEIGHT_SIZE = "bfloat16"
94+
DEFAULT_MAX_SEQ_LEN = 4096
8995

9096
BITS_AND_BYTES_8BIT = "8bit"
9197
BITS_AND_BYTES_4BIT = "4bit"

ads/aqua/shaperecommend/llm_config.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
from ads.aqua.shaperecommend.constants import (
1212
BITS_AND_BYTES_4BIT,
1313
BITS_AND_BYTES_8BIT,
14+
DEFAULT_MAX_SEQ_LEN,
1415
DEFAULT_WEIGHT_SIZE,
1516
NEXT_QUANT,
1617
QUANT_MAPPING,
@@ -42,7 +43,7 @@ class LLMConfig(BaseModel):
4243
description="Dimension of each attention head. Typically hidden_size // num_attention_heads.",
4344
)
4445
max_seq_len: Optional[int] = Field(
45-
4096, description="Maximum input sequence length (context window)."
46+
DEFAULT_MAX_SEQ_LEN, description="Maximum input sequence length (context window)."
4647
)
4748
weight_dtype: Optional[str] = Field(
4849
DEFAULT_WEIGHT_SIZE,

ads/aqua/shaperecommend/recommend.py

Lines changed: 21 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -93,15 +93,24 @@ def which_shapes(
9393
shapes = self.valid_compute_shapes(compartment_id=request.compartment_id)
9494

9595
ds_model = self._validate_model_ocid(request.model_id)
96-
data = self._get_model_config(ds_model)
97-
98-
llm_config = LLMConfig.from_raw_config(data)
9996

10097
model_name = ds_model.display_name if ds_model.display_name else ""
10198

102-
shape_recommendation_report = self._summarize_shapes_for_seq_lens(
103-
llm_config, shapes, model_name
104-
)
99+
if request.deployment_config:
100+
shape_recommendation_report = (
101+
ShapeRecommendationReport.from_deployment_config(
102+
request.deployment_config, model_name, shapes
103+
)
104+
)
105+
106+
else:
107+
data = self._get_model_config(ds_model)
108+
109+
llm_config = LLMConfig.from_raw_config(data)
110+
111+
shape_recommendation_report = self._summarize_shapes_for_seq_lens(
112+
llm_config, shapes, model_name
113+
)
105114

106115
if request.generate_table and shape_recommendation_report.recommendations:
107116
shape_recommendation_report = self._rich_diff_table(
@@ -248,13 +257,18 @@ def _rich_diff_table(shape_report: ShapeRecommendationReport) -> Table:
248257
else:
249258
total_memory = f"CPU: {str(shape.memory_in_gbs)}"
250259

260+
if model:
261+
model_size = str(model.total_model_gb)
262+
else:
263+
model_size = "Using Pre-Defined Config"
264+
251265
table.add_row(
252266
shape.name,
253267
str(shape.available),
254268
str(shape.shape_series),
255269
str(gpu.gpu_count),
256270
total_memory,
257-
str(model.total_model_gb),
271+
model_size,
258272
deploy.quantization,
259273
recommendation,
260274
)

ads/aqua/shaperecommend/shape_report.py

Lines changed: 77 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,21 @@
22
# Copyright (c) 2025 Oracle and/or its affiliates.
33
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
44

5+
import json
56
from typing import List, Optional
67

78
from pydantic import BaseModel, Field
89

910
from ads.aqua.common.entities import ComputeShapeSummary
10-
from ads.aqua.shaperecommend.constants import QUANT_MAPPING
11+
from ads.aqua.modeldeployment.config_loader import AquaDeploymentConfig
12+
from ads.aqua.shaperecommend.constants import (
13+
DEFAULT_WEIGHT_SIZE,
14+
MAX_MODEL_LEN_FLAG,
15+
QUANT_FLAG,
16+
QUANT_MAPPING,
17+
VLLM_ENV_KEY,
18+
VLLM_PARAMS_KEY,
19+
)
1120
from ads.aqua.shaperecommend.estimator import MemoryEstimator
1221
from ads.config import COMPARTMENT_OCID
1322

@@ -30,6 +39,10 @@ class RequestRecommend(BaseModel):
3039
COMPARTMENT_OCID, description="The OCID of user's compartment"
3140
)
3241

42+
deployment_config: Optional[AquaDeploymentConfig] = Field(
43+
{}, description="The deployment configuration for model (only available for service models)."
44+
)
45+
3346
class Config:
3447
protected_namespaces = ()
3548

@@ -42,7 +55,7 @@ class DeploymentParams(BaseModel): # noqa: N801
4255
quantization: Optional[str] = Field(
4356
None, description="Type of quantization (e.g. 4bit)."
4457
)
45-
max_model_len: int = Field(..., description="Maximum length of input sequence.")
58+
max_model_len: Optional[int] = Field(None, description="Maximum length of input sequence.")
4659
params: str = Field(
4760
..., description="Runtime parameters for deployment with vLLM, etc."
4861
)
@@ -68,11 +81,12 @@ class ModelConfig(BaseModel):
6881
The configuration for a model based on specific set of deployment parameters and memory capacity of shape.
6982
"""
7083

71-
model_details: ModelDetail = Field(..., description="Details about the model.")
7284
deployment_params: DeploymentParams = Field(
7385
..., description="Parameters for deployment."
7486
)
75-
recommendation: str = Field(..., description="GPU recommendation for the model.")
87+
model_details: Optional[ModelDetail] = Field(None, description="Details about the model.")
88+
89+
recommendation: Optional[str] = Field("", description="GPU recommendation for the model.")
7690

7791
class Config:
7892
protected_namespaces = ()
@@ -231,3 +245,62 @@ class ShapeRecommendationReport(BaseModel):
231245
None,
232246
description="Details for troubleshooting if no shapes fit the current model.",
233247
)
248+
249+
250+
@classmethod
def from_deployment_config(
    cls,
    deployment_config: AquaDeploymentConfig,
    model_name: str,
    valid_shapes: List[ComputeShapeSummary],
) -> "ShapeRecommendationReport":
    """
    Build a ShapeRecommendationReport from a pre-set deployment configuration.

    For service models, pre-set deployment configurations (AquaDeploymentConfig)
    are available. This derives one ShapeReport per compute shape that appears
    in the configuration, parsing the quantization and max-model-len values out
    of the stored vLLM parameter string.

    Parameters
    ----------
    deployment_config : AquaDeploymentConfig
        Pre-defined deployment configuration, keyed by shape name.
    model_name : str
        Display name of the model; used as the report's display name.
    valid_shapes : List[ComputeShapeSummary]
        Compute shapes available to the user, matched by name against the
        configuration.

    Returns
    -------
    ShapeRecommendationReport
        Report containing one recommendation entry per configured shape.
    """

    def flag_value(tokens: List[str], flag: str) -> Optional[str]:
        # Return the token immediately following `flag`, or None when the
        # flag is absent or is the last token (no value to read).
        if flag in tokens and (idx := tokens.index(flag)) + 1 < len(tokens):
            return tokens[idx + 1]
        return None

    recs = []
    # NOTE(review): shapes are emitted in `valid_shapes` order — confirm
    # whether callers expect them sorted.
    for shape in valid_shapes:
        current_config = deployment_config.configuration.get(shape.name)
        if not current_config:
            # No pre-defined configuration for this shape; skip it.
            continue

        quantization = None
        max_model_len = None
        recommendation = ""
        current_params = current_config.parameters.get(VLLM_PARAMS_KEY)
        current_env = current_config.env.get(VLLM_ENV_KEY)

        if current_params:
            param_list = current_params.split()
            quantization = flag_value(param_list, QUANT_FLAG)
            raw_max_len = flag_value(param_list, MAX_MODEL_LEN_FLAG)
            if raw_max_len is not None:
                try:
                    max_model_len = int(raw_max_len)
                except ValueError:
                    # Malformed value in the stored config: leave unset
                    # rather than failing the entire report.
                    max_model_len = None

        if current_env:
            recommendation += f"ENV: {json.dumps(current_env)}\n\n"

        recommendation += "Model fits well within the allowed compute shape."

        deployment_params = DeploymentParams(
            # Fall back to the default weight dtype when the stored
            # parameters carry no explicit --quantization flag.
            quantization=quantization if quantization else DEFAULT_WEIGHT_SIZE,
            max_model_len=max_model_len,
            params=current_params if current_params else "",
        )

        # TODO: calculate memory footprint based on params??
        # TODO: add --env vars not just params, current_config.env
        # are there multiple configurations in the SMM configs per shape??
        configuration = [
            ModelConfig(
                deployment_params=deployment_params,
                recommendation=recommendation,
            )
        ]

        recs.append(
            ShapeReport(
                shape_details=shape,
                configurations=configuration,
            )
        )

    return ShapeRecommendationReport(
        display_name=model_name,
        recommendations=recs,
    )

0 commit comments

Comments
 (0)