Skip to content

refactor: extract logic from blog embed command #5

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions src/Blog/Embedder.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
<?php

declare(strict_types=1);

namespace App\Blog;

use Codewithkyrian\ChromaDB\Client;
use PhpLlm\LlmChain\Bridge\OpenAI\Embeddings;
use PhpLlm\LlmChain\Document\Vector;
use PhpLlm\LlmChain\Model\Response\AsyncResponse;
use PhpLlm\LlmChain\Model\Response\VectorResponse;
use PhpLlm\LlmChain\PlatformInterface;

/**
 * Turns the loaded blog posts into embeddings and stores them in ChromaDB.
 *
 * The posts come from the injected Loader, the embeddings are requested from
 * the injected platform using the OpenAI Embeddings model, and the result is
 * upserted into the "symfony_blog" collection.
 */
final readonly class Embedder
{
    public function __construct(
        private Loader $loader,
        private PlatformInterface $platform,
        private Client $chromaClient,
    ) {
    }

    /**
     * Loads the posts, requests their embeddings and pushes both to ChromaDB.
     *
     * Returns without calling the platform or ChromaDB when the loader yields
     * no posts.
     */
    public function embedBlog(): void
    {
        $posts = $this->loader->load();

        // Nothing to embed: skip the (billable) embeddings request and the
        // upsert of empty lists, both of which presumably get rejected by the
        // remote services anyway.
        if ([] === $posts) {
            return;
        }

        $vectors = $this->createEmbeddings($posts);
        $this->pushToChromaDB($posts, $vectors);
    }

    /**
     * Requests embeddings for the textual representation of every post.
     *
     * @param Post[] $posts
     *
     * @return Vector[] presumably one vector per post, in the same order — TODO confirm against the platform
     */
    private function createEmbeddings(array $posts): array
    {
        $texts = array_map(static fn (Post $post): string => $post->toString(), $posts);
        $response = $this->platform->request(new Embeddings(), $texts);

        // The assertions narrow the response type for static analysis; they
        // are not a runtime guarantee when assertions are disabled.
        assert($response instanceof AsyncResponse);
        $response = $response->unwrap();
        assert($response instanceof VectorResponse);

        return $response->getContent();
    }

    /**
     * Upserts the posts and their vectors into the "symfony_blog" collection.
     *
     * @param Post[]   $posts
     * @param Vector[] $vectors
     */
    private function pushToChromaDB(array $posts, array $vectors): void
    {
        $collection = $this->chromaClient->getOrCreateCollection('symfony_blog');

        $ids = array_map(static fn (Post $post) => $post->id, $posts);
        $vectors = array_map(static fn (Vector $vector) => $vector->getData(), $vectors);

        // NOTE(review): the Post objects themselves are handed over as the
        // third argument, while Post::toArray() exists and is not used here —
        // confirm the client serializes them as intended.
        $collection->upsert($ids, $vectors, $posts);
    }
}
42 changes: 42 additions & 0 deletions src/Blog/Loader.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<?php

declare(strict_types=1);

namespace App\Blog;

use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\Uid\Uuid;
use Symfony\Contracts\HttpClient\HttpClientInterface;

/**
 * Reads the Symfony blog RSS feed and turns every <item> element into a Post.
 */
class Loader
{
    private const FEED_URL = 'https://feeds.feedburner.com/symfony/blog';

    /**
     * Namespace for the name-based (v5) post ids; the value equals the
     * RFC 4122 DNS namespace UUID.
     */
    private const ID_NAMESPACE = '6ba7b810-9dad-11d1-80b4-00c04fd430c8';

    public function __construct(
        private HttpClientInterface $httpClient,
    ) {
    }

    /**
     * Fetches the feed over HTTP and maps its items to posts.
     *
     * The post id is a v5 UUID derived from the item title.
     *
     * @return Post[]
     */
    public function load(): array
    {
        $feed = $this->httpClient->request('GET', self::FEED_URL)->getContent();
        $namespace = Uuid::fromString(self::ID_NAMESPACE);

        return (new Crawler($feed))
            ->filter('item')
            ->each(static function (Crawler $item) use ($namespace): Post {
                $title = $item->filter('title')->text();
                $id = Uuid::v5($namespace, $title);
                $link = $item->filter('link')->text();
                $description = $item->filter('description')->text();
                // The encoded content is run through a second Crawler and
                // only its text is kept.
                $content = (new Crawler($item->filter('content\:encoded')->text()))->text();
                $author = $item->filter('dc\:creator')->text();
                $date = new \DateTimeImmutable($item->filter('pubDate')->text());

                return new Post($id, $title, $link, $description, $content, $author, $date);
            });
    }
}
55 changes: 55 additions & 0 deletions src/Blog/Post.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
<?php

declare(strict_types=1);

namespace App\Blog;

use Symfony\Component\Uid\Uuid;

/**
 * Immutable value object describing a single blog post.
 */
final readonly class Post
{
    /** Date format shared by the text and the array representation. */
    private const DATE_FORMAT = 'Y-m-d';

    public function __construct(
        public Uuid $id,
        public string $title,
        public string $link,
        public string $description,
        public string $content,
        public string $author,
        public \DateTimeImmutable $date,
    ) {
    }

    /**
     * Renders the post as a plain-text block: title, author with date,
     * description, and finally the content.
     */
    public function toString(): string
    {
        $publishedOn = $this->date->format(self::DATE_FORMAT);

        return <<<TEXT
            Title: {$this->title}
            From: {$this->author} on {$publishedOn}
            Description: {$this->description}
            {$this->content}
            TEXT;
    }

    /**
     * Exports the post as a flat array of strings; the id is rendered in
     * RFC 4122 form and the date as Y-m-d.
     *
     * @return array{
     *     id: string,
     *     title: string,
     *     link: string,
     *     description: string,
     *     content: string,
     *     author: string,
     *     date: string,
     * }
     */
    public function toArray(): array
    {
        $id = $this->id->toRfc4122();
        $date = $this->date->format(self::DATE_FORMAT);

        return [
            'id' => $id,
            'title' => $this->title,
            'link' => $this->link,
            'description' => $this->description,
            'content' => $this->content,
            'author' => $this->author,
            'date' => $date,
        ];
    }
}
102 changes: 3 additions & 99 deletions src/Command/BlogEmbedCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,39 +4,18 @@

namespace App\Command;

use Codewithkyrian\ChromaDB\Client;
use PhpLlm\LlmChain\Bridge\OpenAI\Embeddings;
use PhpLlm\LlmChain\Document\Vector;
use PhpLlm\LlmChain\Model\Response\AsyncResponse;
use PhpLlm\LlmChain\Model\Response\VectorResponse;
use PhpLlm\LlmChain\PlatformInterface;
use App\Blog\Embedder;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\Uid\Uuid;
use Symfony\Contracts\HttpClient\HttpClientInterface;

/**
* @phpstan-type Post array{
* id: Uuid,
* title: string,
* link: string,
* description: string,
* content: string,
* author: string,
* date: \DateTimeImmutable,
* }
*/
#[AsCommand('app:blog:embed', description: 'Create embeddings for Symfony blog and push to ChromaDB.')]
final class BlogEmbedCommand extends Command
{
public function __construct(
private readonly HttpClientInterface $httpClient,
private readonly PlatformInterface $platform,
private readonly Client $chromaClient,
private readonly Embedder $embedder,
) {
parent::__construct();
}
Expand All @@ -46,85 +25,10 @@ protected function execute(InputInterface $input, OutputInterface $output): int
$io = new SymfonyStyle($input, $output);
$io->title('Loading RSS of Symfony blog as embeddings into ChromaDB');

$posts = $this->loadBlogPosts();
$vectors = $this->createEmbeddings($posts);
$this->pushToChromaDB($posts, $vectors);
$this->embedder->embedBlog();

$io->success('Symfony Blog Successfully Embedded!');

return Command::SUCCESS;
}

/**
 * Downloads the Symfony blog RSS feed and maps every <item> element to an
 * array-shaped post whose id is a v5 UUID derived from the item title.
 *
 * @return list<array{
 *     id: Uuid,
 *     title: string,
 *     link: string,
 *     description: string,
 *     content: string,
 *     author: string,
 *     date: \DateTimeImmutable,
 * }>
 */
private function loadBlogPosts(): array
{
    $feed = $this->httpClient->request('GET', 'https://feeds.feedburner.com/symfony/blog')->getContent();
    $namespace = Uuid::fromString('6ba7b810-9dad-11d1-80b4-00c04fd430c8');

    return (new Crawler($feed))
        ->filter('item')
        ->each(static function (Crawler $item) use ($namespace): array {
            $title = $item->filter('title')->text();

            return [
                'id' => Uuid::v5($namespace, $title),
                'title' => $title,
                'link' => $item->filter('link')->text(),
                'description' => $item->filter('description')->text(),
                // The encoded content is run through a second Crawler and only its text is kept.
                'content' => (new Crawler($item->filter('content\:encoded')->text()))->text(),
                'author' => $item->filter('dc\:creator')->text(),
                'date' => new \DateTimeImmutable($item->filter('pubDate')->text()),
            ];
        });
}

/**
 * Builds one text per post (title, author with date, description, content)
 * and requests embeddings for all of them from the platform.
 *
 * @param Post[] $posts
 *
 * @return Vector[]
 */
private function createEmbeddings(array $posts): array
{
    $texts = [];
    foreach ($posts as $post) {
        [
            'title' => $title,
            'author' => $author,
            'date' => $date,
            'description' => $description,
            'content' => $content,
        ] = $post;
        $publishedOn = $date->format('Y-m-d');

        $texts[] = <<<TEXT
            Title: {$title}
            From: {$author} on {$publishedOn}
            Description: {$description}
            {$content}
            TEXT;
    }

    $asyncResponse = $this->platform->request(new Embeddings(), $texts);
    assert($asyncResponse instanceof AsyncResponse);

    $vectorResponse = $asyncResponse->unwrap();
    assert($vectorResponse instanceof VectorResponse);

    return $vectorResponse->getContent();
}

/**
 * Upserts the post ids, the raw vector data and the posts themselves into
 * the "symfony_blog" collection.
 *
 * @param Post[]   $posts
 * @param Vector[] $vectors
 */
private function pushToChromaDB(array $posts, array $vectors): void
{
    $this->chromaClient
        ->getOrCreateCollection('symfony_blog')
        ->upsert(
            array_column($posts, 'id'),
            array_map(fn (Vector $vector) => $vector->getData(), $vectors),
            $posts,
        );
}
}
101 changes: 101 additions & 0 deletions tests/Blog/EmbedderTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
<?php

declare(strict_types=1);

namespace App\Tests\Blog;

use App\Blog\Embedder;
use App\Blog\Loader;
use App\Blog\Post;
use Codewithkyrian\ChromaDB\Client;
use Codewithkyrian\ChromaDB\Resources\CollectionResource;
use PhpLlm\LlmChain\Document\Vector;
use PhpLlm\LlmChain\Model\Model;
use PhpLlm\LlmChain\Model\Response\AsyncResponse;
use PhpLlm\LlmChain\Model\Response\ResponseInterface as LlmResponse;
use PhpLlm\LlmChain\Model\Response\VectorResponse;
use PhpLlm\LlmChain\Platform\ResponseConverter;
use PhpLlm\LlmChain\PlatformInterface;
use PHPUnit\Framework\Attributes\CoversClass;
use PHPUnit\Framework\Attributes\UsesClass;
use PHPUnit\Framework\TestCase;
use Symfony\Component\HttpClient\MockHttpClient;
use Symfony\Component\HttpClient\Response\MockResponse;
use Symfony\Contracts\HttpClient\ResponseInterface as HttpResponse;

#[CoversClass(Embedder::class)]
#[UsesClass(Loader::class)]
#[UsesClass(Post::class)]
final class EmbedderTest extends TestCase
{
    /**
     * The embedder must upsert the ids, vector data and posts of the fixture
     * feed into the "symfony_blog" collection exactly once.
     */
    public function testEmbedBlog(): void
    {
        // The same canned feed response is queued twice: the loader is called
        // once here to build the expectation and once by the embedder.
        $feedResponse = MockResponse::fromFile(__DIR__.'/fixtures/blog.rss');
        $loader = new Loader(new MockHttpClient([$feedResponse, $feedResponse]));
        $expectedPosts = $loader->load();

        $vectors = [
            new Vector([0.1, 0.2, 0.3]),
            new Vector([0.4, 0.5, 0.6]),
            new Vector([0.7, 0.8, 0.9]),
            new Vector([1.0, 1.1, 1.2]),
            new Vector([1.3, 1.4, 1.5]),
            new Vector([1.6, 1.7, 1.8]),
            new Vector([1.9, 2.0, 2.1]),
            new Vector([2.2, 2.3, 2.4]),
            new Vector([2.5, 2.6, 2.7]),
            new Vector([2.8, 2.9, 3.0]),
        ];

        $platform = $this->createMock(PlatformInterface::class);
        $platform
            ->method('request')
            ->willReturn($this->createAsyncResponse($vectors));

        $collection = $this->createMock(CollectionResource::class);
        $collection
            ->expects(self::once())
            ->method('upsert')
            ->with(
                array_map(static fn (Post $post) => $post->id, $expectedPosts),
                array_map(static fn (Vector $vector) => $vector->getData(), $vectors),
                $expectedPosts,
            );

        $chromaClient = $this->createMock(Client::class);
        $chromaClient
            ->expects(self::once())
            ->method('getOrCreateCollection')
            ->with('symfony_blog')
            ->willReturn($collection);

        (new Embedder($loader, $platform, $chromaClient))->embedBlog();
    }

    /**
     * Wraps the given vectors in an AsyncResponse whose converter ignores the
     * HTTP response and always yields them as a VectorResponse.
     *
     * @param Vector[] $vectors
     */
    private function createAsyncResponse(array $vectors): AsyncResponse
    {
        $stubConverter = new class($vectors) implements ResponseConverter {
            /**
             * @param Vector[] $vectors
             */
            public function __construct(private readonly array $vectors)
            {
            }

            public function supports(Model $model, object|array|string $input): bool
            {
                return true;
            }

            public function convert(HttpResponse $response, array $options = []): LlmResponse
            {
                return new VectorResponse(...$this->vectors);
            }
        };

        return new AsyncResponse($stubConverter, new MockResponse());
    }
}
Loading
Loading