Skip to content

Commit fb33939

Browse files
committed
init code for supporting service models
1 parent 4284885 commit fb33939

File tree

5 files changed

+110
-13
lines changed

5 files changed

+110
-13
lines changed

ads/aqua/modeldeployment/deployment.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1289,6 +1289,9 @@ def recommend_shape(self, **kwargs) -> Union[Table, ShapeRecommendationReport]:
12891289
AquaValueError
12901290
If model type is unsupported by tool (no recommendation report generated)
12911291
"""
1292+
deployment_config = self.get_deployment_config(model_id=kwargs.get("model_id"))
1293+
kwargs["deployment_config"] = deployment_config
1294+
print(deployment_config)
12921295
try:
12931296
request = RequestRecommend(**kwargs)
12941297
except ValidationError as e:

ads/aqua/shaperecommend/constants.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -78,14 +78,20 @@
7878

7979
IN_FLIGHT_QUANTIZATION = {"4bit"} # vLLM only supports 4bit in-flight-quantization
8080

81+
VLLM_PARAMS_KEY = "VLLM_PARAMS"
82+
VLLM_ENV_KEY = "VLLM"
83+
QUANT_FLAG = "--quantization"
84+
MAX_MODEL_LEN_FLAG = "--max-model-len"
85+
8186
TROUBLESHOOT_MSG = "The selected model is too large to fit on standard GPU shapes with the current configuration.\nAs troubleshooting, we have suggested the two largest available GPU shapes using the smallest quantization level ('4bit') to maximize chances of fitting the model. "
8287

8388
VLLM_PARAMS = {
8489
"max_model_len": "--max-model-len",
8590
"in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes",
8691
}
8792

88-
DEFAULT_WEIGHT_SIZE = "float32"
93+
DEFAULT_WEIGHT_SIZE = "bfloat16"
94+
DEFAULT_MAX_SEQ_LEN = 4096
8995

9096
BITS_AND_BYTES_8BIT = "8bit"
9197
BITS_AND_BYTES_4BIT = "4bit"

ads/aqua/shaperecommend/llm_config.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
from ads.aqua.shaperecommend.constants import (
1212
BITS_AND_BYTES_4BIT,
1313
BITS_AND_BYTES_8BIT,
14+
DEFAULT_MAX_SEQ_LEN,
1415
DEFAULT_WEIGHT_SIZE,
1516
NEXT_QUANT,
1617
QUANT_MAPPING,
@@ -42,7 +43,7 @@ class LLMConfig(BaseModel):
4243
description="Dimension of each attention head. Typically hidden_size // num_attention_heads.",
4344
)
4445
max_seq_len: Optional[int] = Field(
45-
4096, description="Maximum input sequence length (context window)."
46+
DEFAULT_MAX_SEQ_LEN, description="Maximum input sequence length (context window)."
4647
)
4748
weight_dtype: Optional[str] = Field(
4849
DEFAULT_WEIGHT_SIZE,

ads/aqua/shaperecommend/recommend.py

Lines changed: 21 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -93,15 +93,24 @@ def which_shapes(
9393
shapes = self.valid_compute_shapes(compartment_id=request.compartment_id)
9494

9595
ds_model = self._validate_model_ocid(request.model_id)
96-
data = self._get_model_config(ds_model)
97-
98-
llm_config = LLMConfig.from_raw_config(data)
9996

10097
model_name = ds_model.display_name if ds_model.display_name else ""
10198

102-
shape_recommendation_report = self._summarize_shapes_for_seq_lens(
103-
llm_config, shapes, model_name
104-
)
99+
if request.deployment_config:
100+
shape_recommendation_report = (
101+
ShapeRecommendationReport.from_deployment_config(
102+
request.deployment_config, model_name, shapes
103+
)
104+
)
105+
106+
else:
107+
data = self._get_model_config(ds_model)
108+
109+
llm_config = LLMConfig.from_raw_config(data)
110+
111+
shape_recommendation_report = self._summarize_shapes_for_seq_lens(
112+
llm_config, shapes, model_name
113+
)
105114

106115
if request.generate_table and shape_recommendation_report.recommendations:
107116
shape_recommendation_report = self._rich_diff_table(
@@ -248,13 +257,18 @@ def _rich_diff_table(shape_report: ShapeRecommendationReport) -> Table:
248257
else:
249258
total_memory = f"CPU: {str(shape.memory_in_gbs)}"
250259

260+
if model:
261+
model_size = str(model.total_model_gb)
262+
else:
263+
model_size = "Using Pre-Defined Config"
264+
251265
table.add_row(
252266
shape.name,
253267
str(shape.available),
254268
str(shape.shape_series),
255269
str(gpu.gpu_count),
256270
total_memory,
257-
str(model.total_model_gb),
271+
model_size,
258272
deploy.quantization,
259273
recommendation,
260274
)

ads/aqua/shaperecommend/shape_report.py

Lines changed: 77 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,21 @@
22
# Copyright (c) 2025 Oracle and/or its affiliates.
33
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
44

5+
import json
56
from typing import List, Optional
67

78
from pydantic import BaseModel, Field
89

910
from ads.aqua.common.entities import ComputeShapeSummary
10-
from ads.aqua.shaperecommend.constants import QUANT_MAPPING
11+
from ads.aqua.modeldeployment.config_loader import AquaDeploymentConfig
12+
from ads.aqua.shaperecommend.constants import (
13+
DEFAULT_WEIGHT_SIZE,
14+
MAX_MODEL_LEN_FLAG,
15+
QUANT_FLAG,
16+
QUANT_MAPPING,
17+
VLLM_ENV_KEY,
18+
VLLM_PARAMS_KEY,
19+
)
1120
from ads.aqua.shaperecommend.estimator import MemoryEstimator
1221
from ads.config import COMPARTMENT_OCID
1322

@@ -30,6 +39,10 @@ class RequestRecommend(BaseModel):
3039
COMPARTMENT_OCID, description="The OCID of user's compartment"
3140
)
3241

42+
deployment_config: Optional[AquaDeploymentConfig] = Field(
43+
{}, description="The deployment configuration for model (only available for service models)."
44+
)
45+
3346
class Config:
3447
protected_namespaces = ()
3548

@@ -42,7 +55,7 @@ class DeploymentParams(BaseModel): # noqa: N801
4255
quantization: Optional[str] = Field(
4356
None, description="Type of quantization (e.g. 4bit)."
4457
)
45-
max_model_len: int = Field(..., description="Maximum length of input sequence.")
58+
max_model_len: Optional[int] = Field(None, description="Maximum length of input sequence.")
4659
params: str = Field(
4760
..., description="Runtime parameters for deployment with vLLM, etc."
4861
)
@@ -68,11 +81,12 @@ class ModelConfig(BaseModel):
6881
The configuration for a model based on specific set of deployment parameters and memory capacity of shape.
6982
"""
7083

71-
model_details: ModelDetail = Field(..., description="Details about the model.")
7284
deployment_params: DeploymentParams = Field(
7385
..., description="Parameters for deployment."
7486
)
75-
recommendation: str = Field(..., description="GPU recommendation for the model.")
87+
model_details: Optional[ModelDetail] = Field(None, description="Details about the model.")
88+
89+
recommendation: Optional[str] = Field("", description="GPU recommendation for the model.")
7690

7791
class Config:
7892
protected_namespaces = ()
@@ -231,3 +245,62 @@ class ShapeRecommendationReport(BaseModel):
231245
None,
232246
description="Details for troubleshooting if no shapes fit the current model.",
233247
)
248+
249+
250+
@classmethod
def from_deployment_config(
    cls,
    deployment_config: AquaDeploymentConfig,
    model_name: str,
    valid_shapes: List[ComputeShapeSummary],
) -> "ShapeRecommendationReport":
    """
    Build a ShapeRecommendationReport from a pre-set deployment configuration.

    For service models, pre-set deployment configurations (AquaDeploymentConfig)
    are available. This derives one ShapeReport per compute shape that appears
    in the configuration, parsing the quantization and max-model-len values out
    of the stored vLLM parameter string.

    Parameters
    ----------
    deployment_config : AquaDeploymentConfig
        Pre-defined deployment configuration, keyed by shape name.
    model_name : str
        Display name of the model; used as the report's display name.
    valid_shapes : List[ComputeShapeSummary]
        Compute shapes available to the user, matched by name against the
        configuration.

    Returns
    -------
    ShapeRecommendationReport
        Report containing one recommendation entry per configured shape.
    """

    def flag_value(tokens: List[str], flag: str) -> Optional[str]:
        # Return the token immediately following `flag`, or None when the
        # flag is absent or is the last token (no value to read).
        if flag in tokens and (idx := tokens.index(flag)) + 1 < len(tokens):
            return tokens[idx + 1]
        return None

    recs = []
    # NOTE(review): shapes are emitted in `valid_shapes` order — confirm
    # whether callers expect them sorted.
    for shape in valid_shapes:
        current_config = deployment_config.configuration.get(shape.name)
        if not current_config:
            # No pre-defined configuration for this shape; skip it.
            continue

        quantization = None
        max_model_len = None
        recommendation = ""
        current_params = current_config.parameters.get(VLLM_PARAMS_KEY)
        current_env = current_config.env.get(VLLM_ENV_KEY)

        if current_params:
            param_list = current_params.split()
            quantization = flag_value(param_list, QUANT_FLAG)
            raw_max_len = flag_value(param_list, MAX_MODEL_LEN_FLAG)
            if raw_max_len is not None:
                try:
                    max_model_len = int(raw_max_len)
                except ValueError:
                    # Malformed value in the stored config: leave unset
                    # rather than failing the entire report.
                    max_model_len = None

        if current_env:
            recommendation += f"ENV: {json.dumps(current_env)}\n\n"

        recommendation += "Model fits well within the allowed compute shape."

        deployment_params = DeploymentParams(
            # Fall back to the default weight dtype when the stored
            # parameters carry no explicit --quantization flag.
            quantization=quantization if quantization else DEFAULT_WEIGHT_SIZE,
            max_model_len=max_model_len,
            params=current_params if current_params else "",
        )

        # TODO: calculate memory footprint based on params??
        # TODO: add --env vars not just params, current_config.env
        # are there multiple configurations in the SMM configs per shape??
        configuration = [
            ModelConfig(
                deployment_params=deployment_params,
                recommendation=recommendation,
            )
        ]

        recs.append(
            ShapeReport(
                shape_details=shape,
                configurations=configuration,
            )
        )

    return ShapeRecommendationReport(
        display_name=model_name,
        recommendations=recs,
    )

0 commit comments

Comments
 (0)