Commit 81e2f37

Add demonstration for configuring Blueprints for modular Durable Function logic

1 parent dbef7d0 commit 81e2f37

35 files changed: +1094 -46 lines

.devcontainer/Dockerfile
+20

@@ -0,0 +1,20 @@
# Use a base image that supports Python.
FROM mcr.microsoft.com/vscode/devcontainers/python:1-3.11-bullseye

# Install Python dependencies
COPY ./src/AIDocumentPipeline/requirements.txt /tmp/pip-tmp/
RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
    && rm -rf /tmp/pip-tmp

# Install additional tools and dependencies
COPY ./.devcontainer/install-tools.sh /tmp/tools-tmp/
RUN chmod +x /tmp/tools-tmp/install-tools.sh \
    && /tmp/tools-tmp/install-tools.sh \
    && rm -rf /tmp/tools-tmp

# Default to bash shell
ENV SHELL=/bin/bash \
    DOCKER_BUILDKIT=1

# Mount for docker-in-docker
VOLUME [ "/var/lib/docker" ]
.devcontainer/devcontainer.json
+60

@@ -0,0 +1,60 @@
{
  "name": "Azure Functions AI Document Pipeline",
  "build": {
    "dockerfile": "Dockerfile",
    "context": ".."
  },
  "features": {
    "ghcr.io/devcontainers/features/git:1": {
      "version": "latest",
      "ppa": "false"
    },
    "ghcr.io/devcontainers/features/powershell:1": {},
    "ghcr.io/devcontainers/features/azure-cli:1": {},
    "ghcr.io/azure/azure-dev/azd:0": {},
    "ghcr.io/devcontainers/features/git-lfs:1": {
      "version": "latest"
    },
    "ghcr.io/devcontainers/features/github-cli:1": {
      "version": "latest"
    },
    "ghcr.io/devcontainers/features/docker-in-docker:2": {
      "version": "latest"
    }
  },
  "overrideFeatureInstallOrder": [
    "ghcr.io/devcontainers/features/git",
    "ghcr.io/devcontainers/features/powershell",
    "ghcr.io/devcontainers/features/azure-cli",
    "ghcr.io/azure/azure-dev/azd",
    "ghcr.io/devcontainers/features/git-lfs",
    "ghcr.io/devcontainers/features/github-cli",
    "ghcr.io/devcontainers/features/docker-in-docker"
  ],
  "remoteUser": "vscode",
  "containerUser": "vscode",
  "forwardPorts": [],
  "otherPortsAttributes": {
    "onAutoForward": "ignore"
  },
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-python.vscode-pylance",
        "ms-python.python",
        "ms-python.debugpy",
        "tomoki1207.pdf",
        "ms-azuretools.vscode-bicep",
        "ms-vscode.vscode-node-azure-pack",
        "ms-vscode.PowerShell",
        "GitHub.vscode-pull-request-github",
        "ms-azuretools.vscode-azurefunctions",
        "DurableFunctionsMonitor.durablefunctionsmonitor",
        "EditorConfig.EditorConfig",
        "humao.rest-client",
        "ms-azuretools.vscode-docker",
        "ms-vscode-remote.remote-containers"
      ]
    }
  }
}
.devcontainer/install-tools.sh
+11

@@ -0,0 +1,11 @@
curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor >microsoft.gpg
mv microsoft.gpg /etc/apt/trusted.gpg.d/microsoft.gpg

sh -c 'echo "deb [arch=$(dpkg --print-architecture)] https://packages.microsoft.com/debian/$(lsb_release -rs | cut -d'.' -f 1)/prod $(lsb_release -cs) main" > /etc/apt/sources.list.d/dotnetdev.list'

apt-get update &&
    apt-get upgrade -y &&
    export DEBIAN_FRONTEND=noninteractive &&
    apt-get -y install --no-install-recommends \
    poppler-utils \
    azure-functions-core-tools-4
.vscode/launch.json
+2 -2

@@ -9,7 +9,7 @@
       "host": "localhost",
       "port": 9091
     },
-    "preLaunchTask": "func: host start"
+    "preLaunchTask": "Start Functions Host"
   }
 ]
-}
+}
.vscode/tasks.json
+14 -14

@@ -1,15 +1,15 @@
 {
-  "version": "2.0.0",
-  "tasks": [
-    {
-      "type": "func",
-      "label": "func: host start",
-      "command": "host start",
-      "problemMatcher": "$func-python-watch",
-      "isBackground": true,
-      "options": {
-        "cwd": "${workspaceFolder}/src\\AIDocumentPipeline"
-      }
-    }
-  ]
-}
+  "version": "2.0.0",
+  "tasks": [
+    {
+      "type": "func",
+      "label": "Start Functions Host",
+      "command": "host start",
+      "problemMatcher": "$func-python-watch",
+      "isBackground": true,
+      "options": {
+        "cwd": "${workspaceFolder}/src/AIDocumentPipeline"
+      }
+    }
+  ]
+}
README.md
+159 -1

@@ -1 +1,159 @@
-# azure-ai-document-pipeline-python-sample

# Azure AI Document Data Extraction Pipeline using Durable Functions (Python)

This sample project demonstrates how to build a scalable document data extraction pipeline by combining the capabilities of Durable Functions with various extraction techniques using Azure AI services. The sample specifically processes structured invoices in PDF format, but it can be adapted to process any structured or unstructured document format.

This approach takes advantage of the following techniques for document data extraction:

- [Using Azure OpenAI GPT-4 Omni vision capabilities to extract data from PDF files by converting them to images](https://github.com/Azure-Samples/azure-openai-gpt-4-vision-pdf-extraction-sample)

## Prerequisites - Understanding

Before continuing with this sample, please ensure that you have a good understanding of the following:

### Python Pipeline Specific

- [Durable Functions](https://learn.microsoft.com/en-us/azure/azure-functions/durable/durable-functions-overview?tabs=in-process%2Cnodejs-v3%2Cv1-model&pivots=python)
- [Using Blueprints in Azure Functions for modular components](https://learn.microsoft.com/en-gb/azure/azure-functions/functions-reference-python?tabs=get-started%2Casgi%2Capplication-level&pivots=python-mode-decorators#blueprints)
- [Azure Functions as Containers](https://learn.microsoft.com/en-us/azure/azure-functions/functions-deploy-container-apps?tabs=acr%2Cbash&pivots=programming-language-python)

### Azure Services

- [Azure OpenAI Service](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview)
- [Azure Blob Storage](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blobs-introduction)
- [Azure Storage Queues](https://learn.microsoft.com/en-us/azure/storage/queues/storage-queues-introduction)
- [Azure Container Apps](https://learn.microsoft.com/en-us/azure/azure-functions/functions-deploy-container-apps?tabs=acr%2Cbash&pivots=programming-language-csharp)

## Prerequisites - Setup

The sample repository comes with a [**Dev Container**](https://code.visualstudio.com/docs/remote/containers) that contains all the necessary tools and dependencies to run the sample. To use the Dev Container, you need to have the following tools installed on your local machine:

- Install [**Visual Studio Code**](https://code.visualstudio.com/download)
- Install [**Docker Desktop**](https://www.docker.com/products/docker-desktop)
- Install the [**Remote - Containers**](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension for Visual Studio Code

Additionally, you will require:

- An Azure subscription. If you don't have one, create an [account](https://azure.microsoft.com/en-us/).

## Understanding the pipeline

The purpose of this sample is to demonstrate how to effectively build stateful orchestration workflows for batch processing documents stored in an Azure Storage blob container, triggered via a queue (managed by Azure Storage queues in this example).

Below is an illustration of how the pipeline might integrate into an intelligent application that consumes it in a real-world scenario.

![Azure AI Document Processing Pipeline](./assets/Flow.png)

> [!IMPORTANT]
> This illustration contains additional actions that are not covered in this sample project. The implementation provided focuses on the Durable Functions element of the pipeline, excluding the classification of documents. For this sample project, it is assumed that all documents are invoices, so classification is not required.

### Azure Components

- [**Azure Container Apps**](https://learn.microsoft.com/en-us/azure/azure-functions/functions-deploy-container-apps?tabs=acr%2Cbash&pivots=programming-language-csharp), used to host the containerized functions used in the document processing pipeline.
  - **Note**: By containerizing the functions app, you can integrate this specific orchestration pipeline into an existing microservices architecture or deploy it as a standalone service.
- [**Azure OpenAI Service**](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview), a managed service for OpenAI GPT models, deploying the latest GPT-4 Omni model to support vision-based extraction techniques.
  - **Note**: The GPT-4 Omni model is not available in all Azure OpenAI regions. For more information, see the [Azure OpenAI Service documentation](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#standard-deployment-model-availability).
- [**Azure Storage Account**](https://learn.microsoft.com/en-us/azure/storage/common/storage-introduction), used to store the batch of documents to be processed and the extracted data from the documents. The storage account also holds the queue messages for the document processing pipeline.
- [**Azure Monitor**](https://learn.microsoft.com/en-us/azure/azure-monitor/overview), used to store logs and traces from the document processing pipeline for monitoring and troubleshooting purposes.
- [**Azure Container Registry**](https://learn.microsoft.com/en-us/azure/container-registry/container-registry-intro), used to store the container images for the document processing pipeline service that will be consumed by Azure Container Apps.
- [**Azure User-Assigned Managed Identity**](https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/overview-for-developers?tabs=portal%2Cdotnet), used to authenticate the service deployed in the Azure Container Apps environment so that it can securely access other Azure services without key-based authentication, including the Azure Storage account and Azure OpenAI service.
- [**Azure Bicep**](https://learn.microsoft.com/en-us/azure/azure-resource-manager/bicep/overview?tabs=bicep), used to create a repeatable infrastructure deployment for the Azure resources.

### Project Structure

The project is structured as follows:

- **[AIDocumentPipeline](./src/AIDocumentPipeline/)**: The main project containing the Durable Functions implementation for the document processing pipeline.
  - **[Invoices](./src/AIDocumentPipeline/invoices/)**: Contains the specific workflows and activities used for processing invoices (a minimal sketch of the blueprint structure follows this list).
    - Workflows are orchestrations in Durable Functions that manage the execution of activities. They are long-running and stateful.
    - Activities are the individual, discrete actions that are executed by the orchestration to process the documents. State is maintained across activities by the Durable Functions runtime.
  - **[Shared](./src/AIDocumentPipeline/shared/)**: Contains shared components that are exclusive to the Durable Functions project, including service classes for abstracting the data extraction and Azure Storage account interactions.
    - **[Documents](./src/AIDocumentPipeline/shared/documents)**: Contains the document data extractor services for Azure OpenAI.
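Each of these workflow and activity modules is exposed as an Azure Functions blueprint that the main function app registers at startup (see the `function_app.py` changes in this commit, which call `app.register_functions(...)` for each module). As a rough illustration of that structure, the sketch below shows how an activity module such as `get_invoice_folders` might be laid out; the function body, input shape, and return value are simplified assumptions for illustration, not the actual implementation.

```python
import azure.durable_functions as df

# Each module exposes a blueprint; function_app.py registers it with
# app.register_functions(get_invoice_folders.bp).
bp = df.Blueprint()


@bp.activity_trigger(input_name="request")
def get_invoice_folders(request: dict) -> list:
    """Hypothetical activity: list the invoice folder prefixes in a batch container."""
    container_name = request["container_name"]

    # The real activity would delegate to a shared storage service class here;
    # this placeholder simply returns illustrative folder names.
    return [f"{container_name}/customer-a", f"{container_name}/customer-b"]
```

Keeping each activity and workflow in its own blueprint module keeps the function app entry point small and lets the invoice-specific logic evolve independently of the shared components.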
### Flow

The sample pipeline is implemented using Durable Functions and consists of the following steps (a sketch of the fan-out step follows this list):

- Upload a batch of documents to an Azure Storage blob container.
- Once the documents are uploaded, send a message to the Azure Storage queue containing the container reference to trigger the document processing pipeline.
- The **[Process Invoice Batch workflow](./src/AIDocumentPipeline/invoices/process_invoice_batch_workflow.py)** picks up the message from the queue and starts to process the request.
- First, the batch document folders are retrieved from the blob container using the container reference in the message. **See [Get Invoice Folders](./src/AIDocumentPipeline/invoices/activities/get_invoice_folders.py).**
  - _Authentication to the Azure Storage account is established via a user-assigned managed identity when deployed in Azure._
- The initial workflow then triggers the specific invoice data extraction workflow for each document folder in the batch in parallel using the **[Extract Invoice Data workflow](./src/AIDocumentPipeline/invoices/extract_invoice_data_workflow.py)**. These process the folders as follows:
  - For each folder in the batch:
    - For each file in the folder:
      - Extract the content of the file using the document data extractor service, [Azure OpenAI with Vision](./src/AIDocumentPipeline/shared/documents/document_data_extractor.py).
      - Extract the structured data expected for the invoice using the defined [Invoice Data object](./src/AIDocumentPipeline/invoices/invoice_data.py). **See [Extract Invoice Data](./src/AIDocumentPipeline/invoices/activities/extract_invoice_data.py).**
        - _By using a defined data-transfer object, the prompt to the language model can be strictly controlled by providing a schema of the expected data to ensure accurate extraction._
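To make the parallel fan-out step concrete, here is a minimal sketch of how the batch orchestration might schedule the per-folder extraction workflow concurrently with `context.task_all`. The workflow and activity names mirror the modules referenced above, but the inputs, return values, and error handling are simplified assumptions rather than the actual contract.

```python
import azure.durable_functions as df

bp = df.Blueprint()


@bp.orchestration_trigger(context_name="context")
def process_invoice_batch_workflow(context: df.DurableOrchestrationContext):
    request = context.get_input()

    # Retrieve the invoice folders in the batch container via an activity call.
    folders = yield context.call_activity("get_invoice_folders", request)

    # Fan out: start one extraction sub-orchestration per folder, then wait
    # for all of them to complete before returning the combined results.
    tasks = [
        context.call_sub_orchestrator("extract_invoice_data_workflow", folder)
        for folder in folders
    ]
    results = yield context.task_all(tasks)

    return results
```

Because the orchestrator is replayed by the Durable Functions runtime, all folder-level work happens in activities and sub-orchestrations, and the state between steps is persisted automatically.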
## Run the sample

The sample project is designed to be deployed as a containerized application using Azure Container Apps. The deployment is defined using Azure Bicep in the [infra folder](./infra/).

The deployment is split into two parts, run by separate PowerShell scripts using the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli):

- **[Core Infrastructure](./infra/main.bicep)**: Deploys all of the necessary core components that are required for the document processing pipeline, including the Azure AI services, Azure Storage account, Azure Container Registry, and Azure Container Apps environment. See the [Deploy Core Infrastructure PowerShell script](./infra/Deploy-Infrastructure.ps1) for more detail on the infrastructure deployment process.
- **[Application Deployment](./infra/apps/AIDocumentPipeline/app.bicep)**: Deploys the containerized application to the Azure Container Apps environment. See the [Deploy App PowerShell script](./infra/apps/AIDocumentPipeline/Deploy-App.ps1) for more detail on the containerization and deployment process.

### Set up the local environment

To set up an environment locally, simply run the [Setup-Environment.ps1](./Setup-Environment.ps1) script from the root of the project:

> [!IMPORTANT]
> Docker Desktop must be running to set up the necessary local development environment.

```powershell
.\Setup-Environment.ps1 -DeploymentName <DeploymentName> -Location <Location> -IsLocal $true -SkipInfrastructure $false
```

> [!NOTE]
> The `-IsLocal` parameter determines whether the complete containerized deployment is made in Azure, or whether to deploy only the components to Azure that are needed to support a local development environment. The `-SkipInfrastructure` parameter skips the deployment of the core infrastructure components if they are already deployed.

When configured for local development, you will need to grant the following role-based access to your identity, scoped to the specific Azure resources (an Azure CLI example follows this list):

- **Azure Container Registry**:
  - **Role**: AcrPull
- **Azure Storage Account**:
  - **Role**: Storage Blob Data Contributor
  - **Role**: Storage Queue Data Contributor
- **Azure OpenAI Service**:
  - **Role**: Cognitive Services OpenAI User
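If you prefer to script these assignments rather than use the portal, the Azure CLI `az role assignment create` command can grant them. The snippet below is a sketch for the Storage Blob Data Contributor role; the placeholder values are yours to supply, and the same pattern applies to the other roles listed above.

```powershell
# Sketch: grant your signed-in identity Storage Blob Data Contributor on the
# deployed storage account. Replace the placeholders with your own values.
az role assignment create `
    --assignee "<your-user-object-id-or-upn>" `
    --role "Storage Blob Data Contributor" `
    --scope "/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.Storage/storageAccounts/<storage-account-name>"
```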
With the local development environment set up, you can open the solution in Visual Studio Code using the Dev Container. The Dev Container contains all the necessary tools and dependencies to run the sample project with F5 debugging support.

### Set up the complete Azure environment

To set up an environment in Azure, simply run the [Setup-Environment.ps1](./Setup-Environment.ps1) script from the root of the project:

```powershell
.\Setup-Environment.ps1 -DeploymentName <DeploymentName> -Location <Location> -IsLocal $false -SkipInfrastructure $false
```

> [!NOTE]
> The `-IsLocal` parameter determines whether the complete containerized deployment is made in Azure, or whether to deploy only the components to Azure that are needed to support a local development environment. The `-SkipInfrastructure` parameter skips the deployment of the core infrastructure components if they are already deployed.

### Running the document processing pipeline

Once an environment is set up, you can run the document processing pipeline by uploading a batch of documents to the Azure Storage blob container and sending a message to the Azure Storage queue containing the container reference.

> [!TIP]
> Use the [Azure Storage Explorer](https://azure.microsoft.com/en-us/features/storage-explorer/) to upload the batch of documents to the Azure Storage blob container and send a message to the Azure Storage queue.

A batch of invoices is provided in the tests [Invoice Batch folder](./tests/InvoiceBatch/), which can be uploaded into an Azure Storage blob container.

> [!NOTE]
> Upload the individual folders into the container, not the individual files. This sample processes a container that contains multiple folders, each representing a customer's data to be processed, which may contain one or more invoices.

Once uploaded, add the following message to the **invoices** queue in the Azure Storage account:

> [!IMPORTANT]
> When running locally, the batch must be uploaded to the deployed Azure Storage account. However, the queue message must be created in the local development storage account, Azurite, running as a Docker container. You may need to create the **invoices** queue in the local storage account first via the Azure Storage Explorer.

```json
{
  "container_name": "<container-name>"
}
```

![Azure Storage Explorer invoices queue](./assets/Azure-Storage-Explorer-Queue.png)
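As an alternative to Storage Explorer, the message can also be sent programmatically. The sketch below uses the `azure-storage-queue` Python SDK against the local Azurite emulator; the connection string is Azurite's well-known development account on its default queue port, and the Base64 encoding reflects the typical Functions queue trigger default — both are assumptions to verify against your configuration.

```python
import base64
import json

from azure.storage.queue import QueueClient

# Azurite's well-known local development account (assumes the default queue port 10001).
AZURITE_CONNECTION_STRING = (
    "DefaultEndpointsProtocol=http;"
    "AccountName=devstoreaccount1;"
    "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;"
    "QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;"
)

queue_client = QueueClient.from_connection_string(AZURITE_CONNECTION_STRING, "invoices")

# Create the queue if it does not already exist (see the note above).
try:
    queue_client.create_queue()
except Exception:
    pass

message = json.dumps({"container_name": "<container-name>"})

# The Functions queue trigger typically expects Base64-encoded messages (the
# extension default); adjust if your host.json configures a different messageEncoding.
queue_client.send_message(base64.b64encode(message.encode()).decode())
```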
The document processing pipeline will then be triggered, processing the batch of invoices and extracting the structured data from each invoice.

src/AIDocumentPipeline/Dockerfile
+4

@@ -8,4 +8,8 @@ ENV AzureWebJobsScriptRoot=/home/site/wwwroot \
 COPY requirements.txt /
 RUN pip install -r /requirements.txt

+RUN apt-get update \
+    && apt-get -y install --no-install-recommends \
+    poppler-utils
+
 COPY . /home/site/wwwroot
+9 -25

@@ -1,27 +1,11 @@
 import azure.functions as func
 import azure.durable_functions as df
-
-myApp = df.DFApp(http_auth_level=func.AuthLevel.ANONYMOUS)
-
-# An HTTP-triggered function with a Durable Functions client binding
-@myApp.route(route="orchestrators/{functionName}")
-@myApp.durable_client_input(client_name="client")
-async def http_start(req: func.HttpRequest, client):
-    function_name = req.route_params.get('functionName')
-    instance_id = await client.start_new(function_name)
-    response = client.create_check_status_response(req, instance_id)
-    return response
-
-# Orchestrator
-@myApp.orchestration_trigger(context_name="context")
-def hello_orchestrator(context):
-    result1 = yield context.call_activity("hello", "Seattle")
-    result2 = yield context.call_activity("hello", "Tokyo")
-    result3 = yield context.call_activity("hello", "London")
-
-    return [result1, result2, result3]
-
-# Activity
-@myApp.activity_trigger(input_name="city")
-def hello(city: str):
-    return f"Hello {city}"
+from invoices import process_invoice_batch_workflow
+from invoices.activities import extract_invoice_data, get_invoice_folders
+from shared.storage import write_bytes_to_blob
+
+app = df.DFApp(http_auth_level=func.AuthLevel.ANONYMOUS)
+app.register_functions(write_bytes_to_blob.bp)
+app.register_functions(extract_invoice_data.bp)
+app.register_functions(get_invoice_folders.bp)
+app.register_functions(process_invoice_batch_workflow.bp)

src/AIDocumentPipeline/invoices/__init__.py

Whitespace-only changes.

src/AIDocumentPipeline/invoices/activities/__init__.py

Whitespace-only changes.
