Skip to content

Commit f91edc5

Browse files
refactor: split Document into TextDocument and VectorDocument (#110)
* Require contains non empty string and simplify API
* fix: pinecone example
* Introduce `EmbeddedDocument`
* Update src/Document/EmbeddedDocument.php
* -
* -
* tests: fix DocumentEmbedderTest
* refactor: split Document into TextDocument and VectorDocument

---------

Co-authored-by: Oskar Stark <[email protected]>
1 parent f1e89ab commit f91edc5

15 files changed

+109
-220
lines changed

examples/store-mongodb-similarity-search.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22

33
use MongoDB\Client as MongoDBClient;
44
use PhpLlm\LlmChain\Chain;
5-
use PhpLlm\LlmChain\Document\Document;
65
use PhpLlm\LlmChain\Document\Metadata;
6+
use PhpLlm\LlmChain\Document\TextDocument;
77
use PhpLlm\LlmChain\DocumentEmbedder;
88
use PhpLlm\LlmChain\Message\Message;
99
use PhpLlm\LlmChain\Message\MessageBag;
@@ -46,9 +46,9 @@
4646

4747
// create embeddings and documents
4848
foreach ($movies as $movie) {
49-
$documents[] = Document::fromText(
49+
$documents[] = new TextDocument(
5050
id: Uuid::v4(),
51-
text: 'Title: '.$movie['title'].PHP_EOL.'Director: '.$movie['director'].PHP_EOL.'Description: '.$movie['description'],
51+
content: 'Title: '.$movie['title'].PHP_EOL.'Director: '.$movie['director'].PHP_EOL.'Description: '.$movie['description'],
5252
metadata: new Metadata($movie),
5353
);
5454
}

examples/store-pinecone-similarity-search.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
<?php
22

33
use PhpLlm\LlmChain\Chain;
4-
use PhpLlm\LlmChain\Document\Document;
54
use PhpLlm\LlmChain\Document\Metadata;
5+
use PhpLlm\LlmChain\Document\TextDocument;
66
use PhpLlm\LlmChain\DocumentEmbedder;
77
use PhpLlm\LlmChain\Message\Message;
88
use PhpLlm\LlmChain\Message\MessageBag;
@@ -40,9 +40,9 @@
4040

4141
// create embeddings and documents
4242
foreach ($movies as $movie) {
43-
$documents[] = Document::fromText(
43+
$documents[] = new TextDocument(
4444
id: Uuid::v4(),
45-
text: 'Title: '.$movie['title'].PHP_EOL.'Director: '.$movie['director'].PHP_EOL.'Description: '.$movie['description'],
45+
content: 'Title: '.$movie['title'].PHP_EOL.'Director: '.$movie['director'].PHP_EOL.'Description: '.$movie['description'],
4646
metadata: new Metadata($movie),
4747
);
4848
}

src/Document/Document.php

Lines changed: 0 additions & 69 deletions
This file was deleted.

src/Document/TextDocument.php

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Document;
6+
7+
use Symfony\Component\Uid\Uuid;
8+
use Webmozart\Assert\Assert;
9+
10+
/**
 * Immutable text document made of an id, its textual content and metadata.
 *
 * The content is checked on construction: after trim() it must pass
 * Assert::stringNotEmpty(), so blank or whitespace-only content is rejected.
 */
final readonly class TextDocument
{
    public function __construct(
        public Uuid $id,
        public string $content,
        public Metadata $metadata = new Metadata(),
    ) {
        // Only the trimmed copy is validated; the stored content stays untouched.
        $trimmedContent = trim($content);

        Assert::stringNotEmpty($trimmedContent);
    }
}

src/Document/VectorDocument.php

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Document;
6+
7+
use Symfony\Component\Uid\Uuid;
8+
9+
/**
 * Immutable pairing of a document id with its vector and metadata.
 *
 * The metadata defaults to an empty Metadata instance when none is given.
 */
final readonly class VectorDocument
{
    public function __construct(
        public Uuid $id,
        public Vector $vector,
        public Metadata $metadata = new Metadata(),
    ) {}
}

src/DocumentEmbedder.php

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44

55
namespace PhpLlm\LlmChain;
66

7-
use PhpLlm\LlmChain\Document\Document;
7+
use PhpLlm\LlmChain\Document\TextDocument;
8+
use PhpLlm\LlmChain\Document\VectorDocument;
89
use PhpLlm\LlmChain\Store\StoreInterface;
910
use Psr\Log\LoggerInterface;
1011
use Psr\Log\NullLogger;
@@ -25,17 +26,14 @@ public function __construct(
2526
}
2627

2728
/**
28-
* @param Document|list<Document> $documents
29+
* @param TextDocument|TextDocument[] $documents
2930
*/
30-
public function embed(Document|array $documents, int $chunkSize = 0, int $sleep = 0): void
31+
public function embed(TextDocument|array $documents, int $chunkSize = 0, int $sleep = 0): void
3132
{
32-
if ($documents instanceof Document) {
33+
if ($documents instanceof TextDocument) {
3334
$documents = [$documents];
3435
}
3536

36-
// Filter out documents without text
37-
$documents = array_filter($documents, fn (Document $document) => is_string($document->text));
38-
3937
if ([] === $documents) {
4038
$this->logger->debug('No documents to embed');
4139

@@ -45,14 +43,14 @@ public function embed(Document|array $documents, int $chunkSize = 0, int $sleep
4543
$chunks = 0 !== $chunkSize ? array_chunk($documents, $chunkSize) : [$documents];
4644

4745
foreach ($chunks as $chunk) {
48-
$vectors = $this->embeddings->multiCreate(array_map(fn (Document $document) => $document->text, $chunk));
46+
$vectors = $this->embeddings->multiCreate(array_map(fn (TextDocument $document) => $document->content, $chunk));
4947

50-
$vectorizedDocuments = [];
48+
$vectorDocuments = [];
5149
foreach ($chunk as $i => $document) {
52-
$vectorizedDocuments[] = $document->withVector($vectors[$i]);
50+
$vectorDocuments[] = new VectorDocument($document->id, $vectors[$i], $document->metadata);
5351
}
5452

55-
$this->store->addDocuments($vectorizedDocuments);
53+
$this->store->add(...$vectorDocuments);
5654

5755
if (0 !== $sleep) {
5856
$this->clock->sleep($sleep);

src/Store/Azure/SearchStore.php

Lines changed: 8 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44

55
namespace PhpLlm\LlmChain\Store\Azure;
66

7-
use PhpLlm\LlmChain\Document\Document;
87
use PhpLlm\LlmChain\Document\Metadata;
98
use PhpLlm\LlmChain\Document\Vector;
9+
use PhpLlm\LlmChain\Document\VectorDocument;
1010
use PhpLlm\LlmChain\Store\VectorStoreInterface;
1111
use Symfony\Component\Uid\Uuid;
1212
use Symfony\Contracts\HttpClient\HttpClientInterface;
@@ -26,28 +26,20 @@ public function __construct(
2626
) {
2727
}
2828

29-
public function addDocument(Document $document): void
30-
{
31-
$this->addDocuments([$document]);
32-
}
33-
34-
public function addDocuments(array $documents): void
29+
/**
 * Sends the given vector documents to the 'index' endpoint.
 *
 * Every document is converted with convertToIndexableArray() and the
 * converted entries are submitted under the 'value' key of the payload.
 */
public function add(VectorDocument ...$documents): void
{
    $indexableDocuments = array_map($this->convertToIndexableArray(...), $documents);

    $this->request('index', ['value' => $indexableDocuments]);
}
4035

41-
/**
42-
* @return list<Document>
43-
*/
4436
/**
 * Runs a vector query against the 'search' endpoint and maps every entry of
 * the response's 'value' list through convertToVectorDocument().
 *
 * The $options argument is accepted but not read by this method.
 *
 * @return VectorDocument[]
 */
public function query(Vector $vector, array $options = []): array
{
    $payload = ['vectorQueries' => [$this->buildVectorQuery($vector)]];
    $response = $this->request('search', $payload);

    return array_map($this->convertToVectorDocument(...), $response['value']);
}
5244

5345
/**
@@ -73,7 +65,7 @@ private function request(string $endpoint, array $payload): array
7365
/**
7466
* @return array<string, mixed>
7567
*/
76-
private function convertDocumentToIndexableArray(Document $document): array
68+
private function convertToIndexableArray(VectorDocument $document): array
7769
{
7870
return array_merge([
7971
'id' => $document->id,
@@ -84,12 +76,11 @@ private function convertDocumentToIndexableArray(Document $document): array
8476
/**
 * Builds a VectorDocument from a single search result row.
 *
 * The row's 'id' becomes the document id, the value stored under the
 * configured vector field becomes the vector, and the complete row is kept
 * as metadata.
 *
 * @param array<string, mixed> $data
 *
 * @throws \UnexpectedValueException when the row has no vector data
 */
private function convertToVectorDocument(array $data): VectorDocument
{
    // VectorDocument::$vector is a non-nullable Vector, so falling back to
    // null (as the previous ternary did) could only end in a TypeError.
    // Reading through ?? also avoids an "undefined array key" warning when
    // the search response does not include the vector field at all.
    $vectorData = $data[$this->vectorFieldName] ?? null;

    if (null === $vectorData || [] === $vectorData) {
        throw new \UnexpectedValueException(sprintf(
            'Search result "%s" does not contain vector data in field "%s".',
            $data['id'] ?? 'unknown',
            $this->vectorFieldName,
        ));
    }

    return new VectorDocument(
        id: Uuid::fromString($data['id']),
        vector: new Vector($vectorData),
        metadata: new Metadata($data),
    );
}

src/Store/ChromaDB/Store.php

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,37 +5,26 @@
55
namespace PhpLlm\LlmChain\Store\ChromaDB;
66

77
use Codewithkyrian\ChromaDB\Client;
8-
use PhpLlm\LlmChain\Document\Document;
98
use PhpLlm\LlmChain\Document\Metadata;
109
use PhpLlm\LlmChain\Document\Vector;
10+
use PhpLlm\LlmChain\Document\VectorDocument;
1111
use PhpLlm\LlmChain\Store\VectorStoreInterface;
12-
use Psr\Log\LoggerInterface;
1312
use Symfony\Component\Uid\Uuid;
1413

1514
final readonly class Store implements VectorStoreInterface
1615
{
1716
/**
 * Stores the ChromaDB client and the collection name on the instance.
 */
public function __construct(
    private Client $client,
    private string $collectionName,
) {}
2321

24-
public function addDocument(Document $document): void
25-
{
26-
$this->addDocuments([$document]);
27-
}
28-
29-
public function addDocuments(array $documents): void
22+
public function add(VectorDocument ...$documents): void
3023
{
3124
$ids = [];
3225
$vectors = [];
3326
$metadata = [];
3427
foreach ($documents as $document) {
35-
if (!$document->hasVector()) {
36-
$this->logger->warning('Document {id} does not have a vector', ['id' => $document->id]);
37-
}
38-
3928
$ids[] = (string) $document->id;
4029
$vectors[] = $document->vector->getData();
4130
$metadata[] = $document->metadata->getArrayCopy();
@@ -55,10 +44,10 @@ public function query(Vector $vector, array $options = []): array
5544

5645
$documents = [];
5746
for ($i = 0; $i < count($queryResponse->metadatas[0]); ++$i) {
58-
$documents[] = Document::fromVector(
59-
new Vector($queryResponse->embeddings[0][$i]),
60-
Uuid::fromString($queryResponse->ids[0][$i]),
61-
new Metadata($queryResponse->metadatas[0][$i]),
47+
$documents[] = new VectorDocument(
48+
id: Uuid::fromString($queryResponse->ids[0][$i]),
49+
vector: new Vector($queryResponse->embeddings[0][$i]),
50+
metadata: new Metadata($queryResponse->metadatas[0][$i]),
6251
);
6352
}
6453

src/Store/MongoDB/Store.php

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
use MongoDB\Client;
99
use MongoDB\Collection;
1010
use MongoDB\Driver\Exception\CommandException;
11-
use PhpLlm\LlmChain\Document\Document;
1211
use PhpLlm\LlmChain\Document\Metadata;
1312
use PhpLlm\LlmChain\Document\Vector;
13+
use PhpLlm\LlmChain\Document\VectorDocument;
1414
use PhpLlm\LlmChain\Exception\InvalidArgumentException;
1515
use PhpLlm\LlmChain\Store\InitializableStoreInterface;
1616
use PhpLlm\LlmChain\Store\VectorStoreInterface;
@@ -61,26 +61,16 @@ public function __construct(
6161
) {
6262
}
6363

64-
public function addDocument(Document $document): void
65-
{
66-
$this->addDocuments([$document]);
67-
}
68-
69-
public function addDocuments(array $documents): void
64+
public function add(VectorDocument ...$documents): void
7065
{
7166
$operations = [];
7267

7368
foreach ($documents as $document) {
74-
if (!$document->hasVector()) {
75-
$this->logger->warning('Document {id} does not have a vector', ['id' => $document->id]);
76-
}
77-
7869
$operation = [
7970
['_id' => $this->toBinary($document->id)], // we use binary for the id, because of storage efficiency
8071
array_filter([
8172
'metadata' => $document->metadata->getArrayCopy(),
8273
$this->vectorFieldName => $document->vector->getData(),
83-
'text' => $document->text,
8474
]),
8575
['upsert' => true], // insert if not exists
8676
];
@@ -104,8 +94,6 @@ public function addDocuments(array $documents): void
10494
* numCandidates?: positive-int,
10595
* filter?: array<mixed>
10696
* } $options
107-
*
108-
* @return Document[]
10997
*/
11098
public function query(Vector $vector, array $options = []): array
11199
{
@@ -129,10 +117,10 @@ public function query(Vector $vector, array $options = []): array
129117
$documents = [];
130118

131119
foreach ($results as $result) {
132-
$documents[] = Document::fromVector(
133-
new Vector($result[$this->vectorFieldName]),
134-
$this->toUuid($result['_id']),
135-
new Metadata($result['metadata'] ?? []),
120+
$documents[] = new VectorDocument(
121+
id: $this->toUuid($result['_id']),
122+
vector: new Vector($result[$this->vectorFieldName]),
123+
metadata: new Metadata($result['metadata'] ?? []),
136124
);
137125
}
138126

0 commit comments

Comments
 (0)