Skip to content

Commit 1b8a3e3

Browse files
authored
Upgrade to text-embedding-3-large model as default, with vector storage optimizations (#2470)
* Initial changes for text-embedding-3
* Change text-embedding-3
* Bicep fixes
* More embedding-related changes
* Add dimension truncation
* Mypy fix
* Fix mypy issues
* Fix tests, add parameter
* Upgrade integrated vectorization for new embedding model
* Add missing env vars in other files
* Remove en-us from markdown
* Test vision feature, refactor vector_fields to make sense, address feedback
* Fix type for vectorfields
* Update StackOverflow link
* Default to gpt4v when enabled
* Update tests
1 parent 0c07ba6 commit 1b8a3e3

File tree

105 files changed

+489
-390
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

105 files changed

+489
-390
lines changed

.azdo/pipelines/azure-dev.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ steps:
6060
AZURE_SEARCH_QUERY_SPELLER: $(AZURE_SEARCH_QUERY_SPELLER)
6161
AZURE_SEARCH_SEMANTIC_RANKER: $(AZURE_SEARCH_SEMANTIC_RANKER)
6262
AZURE_SEARCH_QUERY_REWRITING: $(AZURE_SEARCH_QUERY_REWRITING)
63+
AZURE_SEARCH_FIELD_NAME_EMBEDDING: $(AZURE_SEARCH_FIELD_NAME_EMBEDDING)
6364
AZURE_STORAGE_ACCOUNT: $(AZURE_STORAGE_ACCOUNT)
6465
AZURE_STORAGE_RESOURCE_GROUP: $(AZURE_STORAGE_RESOURCE_GROUP)
6566
AZURE_STORAGE_SKU: $(AZURE_STORAGE_SKU)

.github/workflows/azure-dev.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ jobs:
5050
AZURE_SEARCH_QUERY_SPELLER: ${{ vars.AZURE_SEARCH_QUERY_SPELLER }}
5151
AZURE_SEARCH_SEMANTIC_RANKER: ${{ vars.AZURE_SEARCH_SEMANTIC_RANKER }}
5252
AZURE_SEARCH_QUERY_REWRITING: ${{ vars.AZURE_SEARCH_QUERY_REWRITING }}
53+
AZURE_SEARCH_FIELD_NAME_EMBEDDING: ${{ vars.AZURE_SEARCH_FIELD_NAME_EMBEDDING }}
5354
AZURE_STORAGE_ACCOUNT: ${{ vars.AZURE_STORAGE_ACCOUNT }}
5455
AZURE_STORAGE_RESOURCE_GROUP: ${{ vars.AZURE_STORAGE_RESOURCE_GROUP }}
5556
AZURE_STORAGE_SKU: ${{ vars.AZURE_STORAGE_SKU }}

app/backend/app.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,8 @@ async def setup_clients():
464464
AZURE_SEARCH_QUERY_SPELLER = os.getenv("AZURE_SEARCH_QUERY_SPELLER") or "lexicon"
465465
AZURE_SEARCH_SEMANTIC_RANKER = os.getenv("AZURE_SEARCH_SEMANTIC_RANKER", "free").lower()
466466
AZURE_SEARCH_QUERY_REWRITING = os.getenv("AZURE_SEARCH_QUERY_REWRITING", "false").lower()
467+
# This defaults to the previous field name "embedding", for backwards compatibility
468+
AZURE_SEARCH_FIELD_NAME_EMBEDDING = os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding")
467469

468470
AZURE_SPEECH_SERVICE_ID = os.getenv("AZURE_SPEECH_SERVICE_ID")
469471
AZURE_SPEECH_SERVICE_LOCATION = os.getenv("AZURE_SPEECH_SERVICE_LOCATION")
@@ -580,7 +582,10 @@ async def setup_clients():
580582
disable_vectors=os.getenv("USE_VECTORS", "").lower() == "false",
581583
)
582584
ingester = UploadUserFileStrategy(
583-
search_info=search_info, embeddings=text_embeddings_service, file_processors=file_processors
585+
search_info=search_info,
586+
embeddings=text_embeddings_service,
587+
file_processors=file_processors,
588+
search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
584589
)
585590
current_app.config[CONFIG_INGESTER] = ingester
586591

@@ -677,6 +682,7 @@ async def setup_clients():
677682
embedding_model=OPENAI_EMB_MODEL,
678683
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
679684
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
685+
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
680686
sourcepage_field=KB_FIELDS_SOURCEPAGE,
681687
content_field=KB_FIELDS_CONTENT,
682688
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
@@ -695,6 +701,7 @@ async def setup_clients():
695701
embedding_model=OPENAI_EMB_MODEL,
696702
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
697703
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
704+
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
698705
sourcepage_field=KB_FIELDS_SOURCEPAGE,
699706
content_field=KB_FIELDS_CONTENT,
700707
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
@@ -734,6 +741,7 @@ async def setup_clients():
734741
embedding_model=OPENAI_EMB_MODEL,
735742
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
736743
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
744+
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
737745
sourcepage_field=KB_FIELDS_SOURCEPAGE,
738746
content_field=KB_FIELDS_CONTENT,
739747
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
@@ -755,6 +763,7 @@ async def setup_clients():
755763
embedding_model=OPENAI_EMB_MODEL,
756764
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
757765
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
766+
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
758767
sourcepage_field=KB_FIELDS_SOURCEPAGE,
759768
content_field=KB_FIELDS_CONTENT,
760769
query_language=AZURE_SEARCH_QUERY_LANGUAGE,

app/backend/approaches/approach.py

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,6 @@
3838
class Document:
3939
id: Optional[str]
4040
content: Optional[str]
41-
embedding: Optional[list[float]]
42-
image_embedding: Optional[list[float]]
4341
category: Optional[str]
4442
sourcepage: Optional[str]
4543
sourcefile: Optional[str]
@@ -50,11 +48,9 @@ class Document:
5048
reranker_score: Optional[float] = None
5149

5250
def serialize_for_results(self) -> dict[str, Any]:
53-
return {
51+
result_dict = {
5452
"id": self.id,
5553
"content": self.content,
56-
"embedding": Document.trim_embedding(self.embedding),
57-
"imageEmbedding": Document.trim_embedding(self.image_embedding),
5854
"category": self.category,
5955
"sourcepage": self.sourcepage,
6056
"sourcefile": self.sourcefile,
@@ -75,18 +71,7 @@ def serialize_for_results(self) -> dict[str, Any]:
7571
"score": self.score,
7672
"reranker_score": self.reranker_score,
7773
}
78-
79-
@classmethod
80-
def trim_embedding(cls, embedding: Optional[list[float]]) -> Optional[str]:
81-
"""Returns a trimmed list of floats from the vector embedding."""
82-
if embedding:
83-
if len(embedding) > 2:
84-
# Format the embedding list to show the first 2 items followed by the count of the remaining items."""
85-
return f"[{embedding[0]}, {embedding[1]} ...+{len(embedding) - 2} more]"
86-
else:
87-
return str(embedding)
88-
89-
return None
74+
return result_dict
9075

9176

9277
@dataclass
@@ -159,6 +144,7 @@ def __init__(
159144
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
160145
embedding_model: str,
161146
embedding_dimensions: int,
147+
embedding_field: str,
162148
openai_host: str,
163149
vision_endpoint: str,
164150
vision_token_provider: Callable[[], Awaitable[str]],
@@ -173,6 +159,7 @@ def __init__(
173159
self.embedding_deployment = embedding_deployment
174160
self.embedding_model = embedding_model
175161
self.embedding_dimensions = embedding_dimensions
162+
self.embedding_field = embedding_field
176163
self.openai_host = openai_host
177164
self.vision_endpoint = vision_endpoint
178165
self.vision_token_provider = vision_token_provider
@@ -238,8 +225,6 @@ async def search(
238225
Document(
239226
id=document.get("id"),
240227
content=document.get("content"),
241-
embedding=document.get("embedding"),
242-
image_embedding=document.get("imageEmbedding"),
243228
category=document.get("category"),
244229
sourcepage=document.get("sourcepage"),
245230
sourcefile=document.get("sourcefile"),
@@ -314,12 +299,14 @@ class ExtraArgs(TypedDict, total=False):
314299
**dimensions_args,
315300
)
316301
query_vector = embedding.data[0].embedding
317-
return VectorizedQuery(vector=query_vector, k_nearest_neighbors=50, fields="embedding")
302+
# This performs oversampling because of how the search index was set up,
303+
# so we do not need to explicitly pass in an oversampling parameter here
304+
return VectorizedQuery(vector=query_vector, k_nearest_neighbors=50, fields=self.embedding_field)
318305

319306
async def compute_image_embedding(self, q: str):
320307
endpoint = urljoin(self.vision_endpoint, "computervision/retrieval:vectorizeText")
321308
headers = {"Content-Type": "application/json"}
322-
params = {"api-version": "2023-02-01-preview", "modelVersion": "latest"}
309+
params = {"api-version": "2024-02-01", "model-version": "2023-04-15"}
323310
data = {"text": q}
324311

325312
headers["Authorization"] = "Bearer " + await self.vision_token_provider()

app/backend/approaches/chatreadretrieveread.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def __init__(
3535
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
3636
embedding_model: str,
3737
embedding_dimensions: int,
38+
embedding_field: str,
3839
sourcepage_field: str,
3940
content_field: str,
4041
query_language: str,
@@ -50,6 +51,7 @@ def __init__(
5051
self.embedding_deployment = embedding_deployment
5152
self.embedding_model = embedding_model
5253
self.embedding_dimensions = embedding_dimensions
54+
self.embedding_field = embedding_field
5355
self.sourcepage_field = sourcepage_field
5456
self.content_field = content_field
5557
self.query_language = query_language

app/backend/approaches/chatreadretrievereadvision.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def __init__(
3939
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
4040
embedding_model: str,
4141
embedding_dimensions: int,
42+
embedding_field: str,
4243
sourcepage_field: str,
4344
content_field: str,
4445
query_language: str,
@@ -58,6 +59,7 @@ def __init__(
5859
self.embedding_deployment = embedding_deployment
5960
self.embedding_model = embedding_model
6061
self.embedding_dimensions = embedding_dimensions
62+
self.embedding_field = embedding_field
6163
self.sourcepage_field = sourcepage_field
6264
self.content_field = content_field
6365
self.query_language = query_language
@@ -89,7 +91,7 @@ async def run_until_final_call(
8991
minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
9092
filter = self.build_filter(overrides, auth_claims)
9193

92-
vector_fields = overrides.get("vector_fields", ["embedding"])
94+
vector_fields = overrides.get("vector_fields", "textAndImageEmbeddings")
9395
send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None]
9496
send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None]
9597

@@ -122,13 +124,10 @@ async def run_until_final_call(
122124
# If retrieval mode includes vectors, compute an embedding for the query
123125
vectors = []
124126
if use_vector_search:
125-
for field in vector_fields:
126-
vector = (
127-
await self.compute_text_embedding(query_text)
128-
if field == "embedding"
129-
else await self.compute_image_embedding(query_text)
130-
)
131-
vectors.append(vector)
127+
if vector_fields == "textEmbeddingOnly" or vector_fields == "textAndImageEmbeddings":
128+
vectors.append(await self.compute_text_embedding(query_text))
129+
if vector_fields == "imageEmbeddingOnly" or vector_fields == "textAndImageEmbeddings":
130+
vectors.append(await self.compute_image_embedding(query_text))
132131

133132
results = await self.search(
134133
top,

app/backend/approaches/retrievethenread.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def __init__(
2828
embedding_model: str,
2929
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
3030
embedding_dimensions: int,
31+
embedding_field: str,
3132
sourcepage_field: str,
3233
content_field: str,
3334
query_language: str,
@@ -44,6 +45,7 @@ def __init__(
4445
self.embedding_dimensions = embedding_dimensions
4546
self.chatgpt_deployment = chatgpt_deployment
4647
self.embedding_deployment = embedding_deployment
48+
self.embedding_field = embedding_field
4749
self.sourcepage_field = sourcepage_field
4850
self.content_field = content_field
4951
self.query_language = query_language

app/backend/approaches/retrievethenreadvision.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def __init__(
3333
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
3434
embedding_model: str,
3535
embedding_dimensions: int,
36+
embedding_field: str,
3637
sourcepage_field: str,
3738
content_field: str,
3839
query_language: str,
@@ -48,6 +49,7 @@ def __init__(
4849
self.embedding_model = embedding_model
4950
self.embedding_deployment = embedding_deployment
5051
self.embedding_dimensions = embedding_dimensions
52+
self.embedding_field = embedding_field
5153
self.sourcepage_field = sourcepage_field
5254
self.content_field = content_field
5355
self.gpt4v_deployment = gpt4v_deployment
@@ -84,20 +86,17 @@ async def run(
8486
minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
8587
filter = self.build_filter(overrides, auth_claims)
8688

87-
vector_fields = overrides.get("vector_fields", ["embedding"])
89+
vector_fields = overrides.get("vector_fields", "textAndImageEmbeddings")
8890
send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None]
8991
send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None]
9092

9193
# If retrieval mode includes vectors, compute an embedding for the query
9294
vectors = []
9395
if use_vector_search:
94-
for field in vector_fields:
95-
vector = (
96-
await self.compute_text_embedding(q)
97-
if field == "embedding"
98-
else await self.compute_image_embedding(q)
99-
)
100-
vectors.append(vector)
96+
if vector_fields == "textEmbeddingOnly" or vector_fields == "textAndImageEmbeddings":
97+
vectors.append(await self.compute_text_embedding(q))
98+
if vector_fields == "imageEmbeddingOnly" or vector_fields == "textAndImageEmbeddings":
99+
vectors.append(await self.compute_image_embedding(q))
101100

102101
results = await self.search(
103102
top,

app/backend/prepdocs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
398398
blob_manager=blob_manager,
399399
document_action=document_action,
400400
embeddings=openai_embeddings_service,
401+
search_field_name_embedding=os.environ["AZURE_SEARCH_FIELD_NAME_EMBEDDING"],
401402
subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
402403
search_service_user_assigned_id=args.searchserviceassignedid,
403404
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
@@ -430,6 +431,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
430431
embeddings=openai_embeddings_service,
431432
image_embeddings=image_embeddings_service,
432433
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
434+
# Default to the previous field name ("embedding") for backward compatibility
435+
search_field_name_embedding=os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding"),
433436
use_acls=use_acls,
434437
category=args.category,
435438
use_content_understanding=use_content_understanding,

app/backend/prepdocslib/embeddings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ def __init__(self, endpoint: str, token_provider: Callable[[], Awaitable[str]]):
239239
async def create_embeddings(self, blob_urls: list[str]) -> list[list[float]]:
240240
endpoint = urljoin(self.endpoint, "computervision/retrieval:vectorizeImage")
241241
headers = {"Content-Type": "application/json"}
242-
params = {"api-version": "2023-02-01-preview", "modelVersion": "latest"}
242+
params = {"api-version": "2024-02-01", "model-version": "2023-04-15"}
243243
headers["Authorization"] = "Bearer " + await self.token_provider()
244244

245245
embeddings: list[list[float]] = []

app/backend/prepdocslib/filestrategy.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def __init__(
5151
embeddings: Optional[OpenAIEmbeddings] = None,
5252
image_embeddings: Optional[ImageEmbeddings] = None,
5353
search_analyzer_name: Optional[str] = None,
54+
search_field_name_embedding: Optional[str] = None,
5455
use_acls: bool = False,
5556
category: Optional[str] = None,
5657
use_content_understanding: bool = False,
@@ -63,22 +64,27 @@ def __init__(
6364
self.embeddings = embeddings
6465
self.image_embeddings = image_embeddings
6566
self.search_analyzer_name = search_analyzer_name
67+
self.search_field_name_embedding = search_field_name_embedding
6668
self.search_info = search_info
6769
self.use_acls = use_acls
6870
self.category = category
6971
self.use_content_understanding = use_content_understanding
7072
self.content_understanding_endpoint = content_understanding_endpoint
7173

72-
async def setup(self):
73-
search_manager = SearchManager(
74+
def setup_search_manager(self):
75+
self.search_manager = SearchManager(
7476
self.search_info,
7577
self.search_analyzer_name,
7678
self.use_acls,
7779
False,
7880
self.embeddings,
81+
field_name_embedding=self.search_field_name_embedding,
7982
search_images=self.image_embeddings is not None,
8083
)
81-
await search_manager.create_index()
84+
85+
async def setup(self):
86+
self.setup_search_manager()
87+
await self.search_manager.create_index()
8288

8389
if self.use_content_understanding:
8490
if self.content_understanding_endpoint is None:
@@ -91,9 +97,7 @@ async def setup(self):
9197
await cu_manager.create_analyzer()
9298

9399
async def run(self):
94-
search_manager = SearchManager(
95-
self.search_info, self.search_analyzer_name, self.use_acls, False, self.embeddings
96-
)
100+
self.setup_search_manager()
97101
if self.document_action == DocumentAction.Add:
98102
files = self.list_file_strategy.list()
99103
async for file in files:
@@ -104,18 +108,18 @@ async def run(self):
104108
blob_image_embeddings: Optional[list[list[float]]] = None
105109
if self.image_embeddings and blob_sas_uris:
106110
blob_image_embeddings = await self.image_embeddings.create_embeddings(blob_sas_uris)
107-
await search_manager.update_content(sections, blob_image_embeddings, url=file.url)
111+
await self.search_manager.update_content(sections, blob_image_embeddings, url=file.url)
108112
finally:
109113
if file:
110114
file.close()
111115
elif self.document_action == DocumentAction.Remove:
112116
paths = self.list_file_strategy.list_paths()
113117
async for path in paths:
114118
await self.blob_manager.remove_blob(path)
115-
await search_manager.remove_content(path)
119+
await self.search_manager.remove_content(path)
116120
elif self.document_action == DocumentAction.RemoveAll:
117121
await self.blob_manager.remove_blob()
118-
await search_manager.remove_content()
122+
await self.search_manager.remove_content()
119123

120124

121125
class UploadUserFileStrategy:
@@ -129,12 +133,22 @@ def __init__(
129133
file_processors: dict[str, FileProcessor],
130134
embeddings: Optional[OpenAIEmbeddings] = None,
131135
image_embeddings: Optional[ImageEmbeddings] = None,
136+
search_field_name_embedding: Optional[str] = None,
132137
):
133138
self.file_processors = file_processors
134139
self.embeddings = embeddings
135140
self.image_embeddings = image_embeddings
136141
self.search_info = search_info
137-
self.search_manager = SearchManager(self.search_info, None, True, False, self.embeddings)
142+
self.search_manager = SearchManager(
143+
search_info=self.search_info,
144+
search_analyzer_name=None,
145+
use_acls=True,
146+
use_int_vectorization=False,
147+
embeddings=self.embeddings,
148+
field_name_embedding=search_field_name_embedding,
149+
search_images=False,
150+
)
151+
self.search_field_name_embedding = search_field_name_embedding
138152

139153
async def add_file(self, file: File):
140154
if self.image_embeddings:

0 commit comments

Comments
 (0)