Skip to content

Commit 1f12b32

Browse files
Xeelee33, JoshuaWilshere, and paullizer
authored
Bugfix/govcloud document intelligence, ai search, content safety managed identity authentication (#388)
* Add support for govt and custom search resource manager, update document intelligence, ai search, and content safety client initialization for govt and custom environments
* Fix Azure Document Intelligence operation via managed identity authentication for government and custom environments by adding base64 encoding support for document uploads, which is required by current GA document intelligence API version (2024-11-30)
* Fix Azure AI services (content safety, document intelligence, search) managed identity integration for government and custom environments. Consolidated Document Intelligence calls to use same module and API and added base64 document uploads in Document Intelligence API, required by most current API GA version (2024-11-30).
* Update README.md to correct Managed Identity role requirements for Azure services
* Add support for govt and custom search resource manager, update document intelligence, ai search, and content safety client initialization for govt and custom environments
* Fix Azure Document Intelligence operation via managed identity authentication for government and custom environments by adding base64 encoding support for document uploads, which is required by current GA document intelligence API version (2024-11-30)
* Fix Azure AI services (content safety, document intelligence, search) managed identity integration for government and custom environments. Consolidated Document Intelligence calls to use same module and API and added base64 document uploads in Document Intelligence API, required by most current API GA version (2024-11-30).
* Update README.md to correct Managed Identity role requirements for Azure services
* added search_client_public to managed identity auth flow

---------

Co-authored-by: Joshua Wilshere <joshua.wilshere@oig.dhs.gov>
Co-authored-by: Paul Lizer <paullizer@microsoft.com>
1 parent 8c3c02a commit 1f12b32

File tree

3 files changed

+137
-73
lines changed

3 files changed

+137
-73
lines changed

application/single_app/config.py

Lines changed: 63 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,8 @@
112112
CUSTOM_RESOURCE_MANAGER_URL_VALUE = os.getenv("CUSTOM_RESOURCE_MANAGER_URL_VALUE", "")
113113
CUSTOM_BLOB_STORAGE_URL_VALUE = os.getenv("CUSTOM_BLOB_STORAGE_URL_VALUE", "")
114114
CUSTOM_COGNITIVE_SERVICES_URL_VALUE = os.getenv("CUSTOM_COGNITIVE_SERVICES_URL_VALUE", "")
115+
CUSTOM_SEARCH_RESOURCE_MANAGER_URL_VALUE = os.getenv("CUSTOM_SEARCH_RESOURCE_MANAGER_URL_VALUE", "")
116+
115117

116118
# Azure AD Configuration
117119
CLIENT_ID = os.getenv("CLIENT_ID")
@@ -139,11 +141,13 @@
139141
authority = AzureAuthorityHosts.AZURE_GOVERNMENT
140142
credential_scopes=[resource_manager + "/.default"]
141143
cognitive_services_scope = "https://cognitiveservices.azure.us/.default"
144+
search_resource_manager = "https://search.azure.us"
142145
elif AZURE_ENVIRONMENT == "custom":
143146
resource_manager = CUSTOM_RESOURCE_MANAGER_URL_VALUE
144147
authority = CUSTOM_IDENTITY_URL_VALUE
145148
credential_scopes=[resource_manager + "/.default"]
146149
cognitive_services_scope = CUSTOM_COGNITIVE_SERVICES_URL_VALUE
150+
search_resource_manager = CUSTOM_SEARCH_RESOURCE_MANAGER_URL_VALUE
147151
else:
148152
OIDC_METADATA_URL = f"https://login.microsoftonline.com/{TENANT_ID}/v2.0/.well-known/openid-configuration"
149153
resource_manager = "https://management.azure.com"
@@ -447,12 +451,20 @@ def initialize_clients(settings):
447451
)
448452
else:
449453
if settings.get("azure_document_intelligence_authentication_type") == "managed_identity":
450-
document_intelligence_client = DocumentIntelligenceClient(
451-
endpoint=form_recognizer_endpoint,
452-
credential=DefaultAzureCredential()
453-
)
454+
if AZURE_ENVIRONMENT in ("usgovernment", "custom"):
455+
document_intelligence_client = DocumentIntelligenceClient(
456+
endpoint=form_recognizer_endpoint,
457+
credential=DefaultAzureCredential(),
458+
credential_scopes=[cognitive_services_scope],
459+
api_version="2024-11-30"
460+
)
461+
else:
462+
document_intelligence_client = DocumentIntelligenceClient(
463+
endpoint=form_recognizer_endpoint,
464+
credential=DefaultAzureCredential()
465+
)
454466
else:
455-
document_intelligence_client = DocumentAnalysisClient(
467+
document_intelligence_client = DocumentIntelligenceClient(
456468
endpoint=form_recognizer_endpoint,
457469
credential=AzureKeyCredential(form_recognizer_key)
458470
)
@@ -479,21 +491,41 @@ def initialize_clients(settings):
479491
)
480492
else:
481493
if settings.get("azure_ai_search_authentication_type") == "managed_identity":
482-
search_client_user = SearchClient(
483-
endpoint=azure_ai_search_endpoint,
484-
index_name="simplechat-user-index",
485-
credential=DefaultAzureCredential()
486-
)
487-
search_client_group = SearchClient(
488-
endpoint=azure_ai_search_endpoint,
489-
index_name="simplechat-group-index",
490-
credential=DefaultAzureCredential()
491-
)
492-
search_client_public = SearchClient(
493-
endpoint=azure_ai_search_endpoint,
494-
index_name="simplechat-public-index",
495-
credential=DefaultAzureCredential()
496-
)
494+
if AZURE_ENVIRONMENT in ("usgovernment", "custom"):
495+
search_client_user = SearchClient(
496+
endpoint=azure_ai_search_endpoint,
497+
index_name="simplechat-user-index",
498+
credential=DefaultAzureCredential(),
499+
audience=search_resource_manager
500+
)
501+
search_client_group = SearchClient(
502+
endpoint=azure_ai_search_endpoint,
503+
index_name="simplechat-group-index",
504+
credential=DefaultAzureCredential(),
505+
audience=search_resource_manager
506+
)
507+
search_client_public = SearchClient(
508+
endpoint=azure_ai_search_endpoint,
509+
index_name="simplechat-public-index",
510+
credential=DefaultAzureCredential(),
511+
audience=search_resource_manager
512+
)
513+
else:
514+
search_client_user = SearchClient(
515+
endpoint=azure_ai_search_endpoint,
516+
index_name="simplechat-user-index",
517+
credential=DefaultAzureCredential()
518+
)
519+
search_client_group = SearchClient(
520+
endpoint=azure_ai_search_endpoint,
521+
index_name="simplechat-group-index",
522+
credential=DefaultAzureCredential()
523+
)
524+
search_client_public = SearchClient(
525+
endpoint=azure_ai_search_endpoint,
526+
index_name="simplechat-public-index",
527+
credential=DefaultAzureCredential()
528+
)
497529
else:
498530
search_client_user = SearchClient(
499531
endpoint=azure_ai_search_endpoint,
@@ -532,10 +564,17 @@ def initialize_clients(settings):
532564
)
533565
else:
534566
if settings.get("content_safety_authentication_type") == "managed_identity":
535-
content_safety_client = ContentSafetyClient(
536-
endpoint=safety_endpoint,
537-
credential=DefaultAzureCredential()
538-
)
567+
if AZURE_ENVIRONMENT in ("usgovernment", "custom"):
568+
content_safety_client = ContentSafetyClient(
569+
endpoint=safety_endpoint,
570+
credential=DefaultAzureCredential(),
571+
credential_scopes=[cognitive_services_scope]
572+
)
573+
else:
574+
content_safety_client = ContentSafetyClient(
575+
endpoint=safety_endpoint,
576+
credential=DefaultAzureCredential()
577+
)
539578
else:
540579
content_safety_client = ContentSafetyClient(
541580
endpoint=safety_endpoint,

application/single_app/functions_content.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,22 @@ def extract_content_with_azure_di(file_path):
1919
"""
2020
try:
2121
document_intelligence_client = CLIENTS['document_intelligence_client'] # Ensure CLIENTS is populated
22-
with open(file_path, "rb") as f:
22+
if AZURE_ENVIRONMENT in ("usgovernment", "custom"):
23+
# Required format for Document Intelligence API version 2024-11-30
24+
with open(file_path, 'rb') as f:
25+
file_bytes = f.read()
26+
base64_source = base64.b64encode(file_bytes).decode('utf-8')
27+
2328
poller = document_intelligence_client.begin_analyze_document(
24-
model_id="prebuilt-read",
25-
document=f
29+
"prebuilt-read",
30+
{"base64Source": base64_source}
2631
)
32+
else:
33+
with open(file_path, 'rb') as f:
34+
poller = document_intelligence_client.begin_analyze_document(
35+
model_id="prebuilt-read",
36+
document=f
37+
)
2738

2839
max_wait_time = 600
2940
start_time = time.time()

application/single_app/route_backend_settings.py

Lines changed: 60 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,10 @@ def get_index_client() -> SearchIndexClient:
176176
endpoint = settings["azure_ai_search_endpoint"].rstrip("/")
177177
if settings.get("azure_ai_search_authentication_type", "key") == "managed_identity":
178178
credential = DefaultAzureCredential()
179+
if AZURE_ENVIRONMENT in ("usgovernment", "custom"):
180+
return SearchIndexClient(endpoint=endpoint,
181+
credential=credential,
182+
audience=search_resource_manager)
179183
else:
180184
credential = AzureKeyCredential(settings["azure_ai_search_key"])
181185

@@ -415,11 +419,17 @@ def _test_safety_connection(payload):
415419
key = direct_data.get('key')
416420

417421
if direct_data.get('auth_type') == 'managed_identity':
418-
419-
content_safety_client = ContentSafetyClient(
420-
endpoint=endpoint,
421-
credential=DefaultAzureCredential()
422-
)
422+
if AZURE_ENVIRONMENT in ("usgovernment", "custom"):
423+
content_safety_client = ContentSafetyClient(
424+
endpoint=endpoint,
425+
credential=DefaultAzureCredential(),
426+
credential_scopes=[cognitive_services_scope]
427+
)
428+
else:
429+
content_safety_client = ContentSafetyClient(
430+
endpoint=endpoint,
431+
credential=DefaultAzureCredential()
432+
)
423433
else:
424434
content_safety_client = ContentSafetyClient(
425435
endpoint=endpoint,
@@ -476,32 +486,6 @@ def _test_azure_ai_search_connection(payload):
476486
"""Attempt to connect to Azure Cognitive Search (or APIM-wrapped)."""
477487
enable_apim = payload.get('enable_apim', False)
478488

479-
if enable_apim:
480-
apim_data = payload.get('apim', {})
481-
endpoint = apim_data.get('endpoint')
482-
subscription_key = apim_data.get('subscription_key')
483-
484-
content_safety_client = ContentSafetyClient(
485-
endpoint=endpoint,
486-
credential=AzureKeyCredential(subscription_key)
487-
)
488-
else:
489-
direct_data = payload.get('direct', {})
490-
endpoint = direct_data.get('endpoint')
491-
key = direct_data.get('key')
492-
493-
if direct_data.get('auth_type') == 'managed_identity':
494-
495-
content_safety_client = ContentSafetyClient(
496-
endpoint=endpoint,
497-
credential=DefaultAzureCredential()
498-
)
499-
else:
500-
content_safety_client = ContentSafetyClient(
501-
endpoint=endpoint,
502-
credential=AzureKeyCredential(key)
503-
)
504-
505489
if enable_apim:
506490
apim_data = payload.get('apim', {})
507491
endpoint = apim_data.get('endpoint') # e.g. https://my-apim.azure-api.net/search
@@ -516,10 +500,22 @@ def _test_azure_ai_search_connection(payload):
516500
endpoint = direct_data.get('endpoint') # e.g. https://<searchservice>.search.windows.net
517501
key = direct_data.get('key')
518502
url = f"{endpoint.rstrip('/')}/indexes?api-version=2023-11-01"
519-
headers = {
520-
'api-key': key,
521-
'Content-Type': 'application/json'
522-
}
503+
504+
if direct_data.get('auth_type') == 'managed_identity':
505+
if AZURE_ENVIRONMENT in ("usgovernment", "custom"): # change credential scopes for US Gov or custom environments
506+
credential_scopes=search_resource_manager + "/.default"
507+
arm_scope = credential_scopes
508+
credential = DefaultAzureCredential()
509+
arm_token = credential.get_token(arm_scope).token
510+
headers = {
511+
'Authorization': f'Bearer {arm_token}',
512+
'Content-Type': 'application/json'
513+
}
514+
else:
515+
headers = {
516+
'api-key': key,
517+
'Content-Type': 'application/json'
518+
}
523519

524520
# A small GET to /indexes to verify we have connectivity
525521
resp = requests.get(url, headers=headers, timeout=10)
@@ -540,7 +536,7 @@ def _test_azure_doc_intelligence_connection(payload):
540536
endpoint = apim_data.get('endpoint')
541537
subscription_key = apim_data.get('subscription_key')
542538

543-
document_intelligence_client = DocumentAnalysisClient(
539+
document_intelligence_client = DocumentIntelligenceClient(
544540
endpoint=endpoint,
545541
credential=AzureKeyCredential(subscription_key)
546542
)
@@ -550,24 +546,42 @@ def _test_azure_doc_intelligence_connection(payload):
550546
key = direct_data.get('key')
551547

552548
if direct_data.get('auth_type') == 'managed_identity':
553-
554-
document_intelligence_client = DocumentAnalysisClient(
555-
endpoint=endpoint,
556-
credential=DefaultAzureCredential()
557-
)
549+
if AZURE_ENVIRONMENT in ("usgovernment", "custom"):
550+
document_intelligence_client = DocumentIntelligenceClient(
551+
endpoint=endpoint,
552+
credential=DefaultAzureCredential(),
553+
credential_scopes=[cognitive_services_scope],
554+
api_version="2024-11-30" # Must be specified otherwise looks for 2023-07-31-preview by default which is not a valid version in Azure Government
555+
)
556+
else:
557+
document_intelligence_client = DocumentIntelligenceClient(
558+
endpoint=endpoint,
559+
credential=DefaultAzureCredential()
560+
)
558561
else:
559-
document_intelligence_client = DocumentAnalysisClient(
562+
document_intelligence_client = DocumentIntelligenceClient(
560563
endpoint=endpoint,
561564
credential=AzureKeyCredential(key)
562565
)
563566

564567
# Use local test file instead of URL for better offline testing
565568
test_file_path = os.path.join(current_app.root_path, 'static', 'test_files', 'test_document.pdf')
566-
with open(test_file_path, 'rb') as f:
569+
if AZURE_ENVIRONMENT in ("usgovernment", "custom"):
570+
# Required format for Document Intelligence API version 2024-11-30 and later
571+
with open(test_file_path, 'rb') as f:
572+
file_bytes = f.read()
573+
base64_source = base64.b64encode(file_bytes).decode('utf-8')
574+
567575
poller = document_intelligence_client.begin_analyze_document(
568-
model_id="prebuilt-read",
569-
document=f
576+
"prebuilt-read",
577+
{"base64Source": base64_source}
570578
)
579+
else:
580+
with open(test_file_path, 'rb') as f:
581+
poller = document_intelligence_client.begin_analyze_document(
582+
model_id="prebuilt-read",
583+
document=f
584+
)
571585

572586
max_wait_time = 600
573587
start_time = time.time()
@@ -578,7 +592,7 @@ def _test_azure_doc_intelligence_connection(payload):
578592
break
579593
if time.time() - start_time > max_wait_time:
580594
raise TimeoutError("Document analysis took too long.")
581-
time.sleep(30)
595+
time.sleep(10)
582596

583597
if status == "succeeded":
584598
return jsonify({'message': 'Azure document intelligence connection successful'}), 200

0 commit comments

Comments
 (0)