Skip to content
This repository was archived by the owner on May 28, 2025. It is now read-only.

Commit a040d03

Browse files
committed
Finalize orchestration for invoice data extraction
1 parent fb28566 commit a040d03

File tree

6 files changed

+61
-11
lines changed

6 files changed

+61
-11
lines changed
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import azure.functions as func
22
import azure.durable_functions as df
3-
from invoices import process_invoice_batch_workflow
3+
from invoices import process_invoice_batch_workflow, extract_invoice_data_workflow
44
from invoices.activities import extract_invoice_data, get_invoice_folders
55
from shared.storage import write_bytes_to_blob
66

@@ -9,3 +9,4 @@
99
app.register_functions(extract_invoice_data.bp)
1010
app.register_functions(get_invoice_folders.bp)
1111
app.register_functions(process_invoice_batch_workflow.bp)
12+
app.register_functions(extract_invoice_data_workflow.bp)

src/AIDocumentPipeline/invoices/activities/extract_invoice_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def run(input: Request) -> InvoiceData | None:
3232
system_prompt="You are an AI assistant that extracts data from documents and returns them as structured JSON objects. Do not return as a code block.",
3333
extraction_prompt=f"Extract the data from this invoice. If a value is not present, provide null. Use the following structure: {InvoiceData.empty().to_dict()}",
3434
endpoint=app_config.openai_endpoint,
35-
deployment_name=app_config.openai_vision_completion_deployment,
35+
deployment_name=app_config.openai_completion_deployment,
3636
max_tokens=4096,
3737
temperature=0.1,
3838
top_p=0.1
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from __future__ import annotations
2+
from shared.storage import write_bytes_to_blob
3+
from invoices.activities import extract_invoice_data
4+
from shared.workflow_result import WorkflowResult
5+
import azure.durable_functions as df
6+
import azure.functions as func
7+
import logging
8+
from shared import config as app_config
9+
10+
name = "ExtractInvoiceDataWorkflow"
11+
bp = df.Blueprint()
12+
13+
@bp.function_name(name)
@bp.orchestration_trigger(context_name="context", orchestration=name)
def run(context: df.DurableOrchestrationContext) -> WorkflowResult:
    """Orchestrate data extraction for every invoice file listed in the input.

    For each entry in the input's ``invoice_file_names`` the orchestrator
    calls the ``extract_invoice_data`` activity and, when that yields a
    truthy value, calls the ``write_bytes_to_blob`` activity to persist it
    as ``<invoice>.Data.json`` in the input's container. A failure of either
    activity is recorded on the result and processing moves on to the next
    invoice.

    Args:
        context: Durable Functions orchestration context. Its input must
            expose ``name``, ``validate()``, ``invoice_file_names`` and
            ``container_name`` (presumably an invoice-folder object, judging
            by the "InvoiceFolder.validate" message key -- confirm with the
            caller).

    Returns:
        The ``WorkflowResult`` named after the input, carrying the
        validation outcome and any per-invoice errors.
    """
    # Step 1: Read the orchestration input and start a result named after it.
    folder = context.get_input()
    result = WorkflowResult(folder.name)

    # Step 2: Stop early, reporting the validation findings, on bad input.
    validation = folder.validate()
    if not validation.is_valid:
        result.merge(validation)
        return result

    result.add_message("InvoiceFolder.validate", "input is valid")

    # Step 3: Extract the data of each invoice and store it next to the source.
    for invoice in folder.invoice_file_names:
        extract_request = extract_invoice_data.Request(
            folder.container_name, invoice)
        extracted = yield context.call_activity(
            extract_invoice_data.name, extract_request)

        if not extracted:
            result.add_error(
                extract_invoice_data.name,
                f"Failed to extract data for {invoice}.")
            continue

        store_request = write_bytes_to_blob.Request(
            app_config.invoices_storage_account_name,
            folder.container_name,
            f"{invoice}.Data.json",
            extracted)
        stored = yield context.call_activity(
            write_bytes_to_blob.name, store_request)

        if not stored:
            result.add_error(
                write_bytes_to_blob.name,
                f"Failed to store extracted data for {invoice}.")

    return result

src/AIDocumentPipeline/invoices/process_invoice_batch_workflow.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
from __future__ import annotations
2+
from invoices import extract_invoice_data_workflow
23
from shared.workflow_result import WorkflowResult
34
from invoices.invoice_batch_request import InvoiceBatchRequest
4-
from shared import config as app_config
55
import azure.durable_functions as df
66
import azure.functions as func
77
import logging
8-
from invoices.activities import extract_invoice_data, get_invoice_folders
8+
from invoices.activities import get_invoice_folders
99

1010
name = "ProcessInvoiceBatchWorkflow"
1111
http_trigger_name = "ProcessInvoiceBatchHttp"
@@ -49,5 +49,15 @@ def run(context: df.DurableOrchestrationContext) -> WorkflowResult:
4949
f"Retrieved {len(invoice_folders)} invoice folders.")
5050

5151
# Step 4: Process the invoices in each folder.
52+
extract_invoice_data_tasks = []
53+
for folder in invoice_folders:
54+
extract_invoice_data_task = context.call_sub_orchestrator(
55+
extract_invoice_data_workflow.name, folder)
56+
extract_invoice_data_tasks.append(extract_invoice_data_task)
57+
58+
yield context.task_all(extract_invoice_data_tasks)
59+
60+
for task in extract_invoice_data_tasks:
61+
logging.info(f"Task {task} completed.")
5262

5363
return result

src/AIDocumentPipeline/shared/config.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,8 @@
22

33
otlp_exporter_endpoint = os.environ.get("OTLP_EXPORTER_ENDPOINT", None)
44
openai_endpoint = os.environ.get("OPENAI_ENDPOINT", None)
5-
openai_embedding_deployment = os.environ.get(
6-
"OPENAI_EMBEDDING_DEPLOYMENT", None)
75
openai_completion_deployment = os.environ.get(
86
"OPENAI_COMPLETION_DEPLOYMENT", None)
9-
openai_vision_completion_deployment = os.environ.get(
10-
"OPENAI_VISION_COMPLETION_DEPLOYMENT", None)
11-
document_intelligence_endpoint = os.environ.get(
12-
"DOCUMENT_INTELLIGENCE_ENDPOINT", None)
137
managed_identity_client_id = os.environ.get("MANAGED_IDENTITY_CLIENT_ID", None)
148
invoices_storage_account_name = os.environ.get(
159
"INVOICES_STORAGE_ACCOUNT_NAME", None)

src/AIDocumentPipeline/shared/documents/document_data_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def from_bytes(self, document_bytes: bytes, options: DocumentDataExtractorOption
3333

3434
for image_uri in image_uris:
3535
user_content.append({
36-
"type": "image_uri",
36+
"type": "image_url",
3737
"image_url": {
3838
"url": image_uri
3939
}

0 commit comments

Comments
 (0)