Skip to content

Commit 86b80f9

Browse files
authored
feat: audio support (#189)
1 parent af5e00d commit 86b80f9

19 files changed

+238
-6
lines changed

README.md

+34-3
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ The core feature of LLM Chain is to interact with language models via messages.
7373
a **MessageBag** to a **Chain**, which takes care of LLM invocation and response handling.
7474

7575
Messages can be of different types, most importantly `UserMessage`, `SystemMessage`, or `AssistantMessage`, and can also
76-
have different content types, like `Text` or `Image`.
76+
have different content types, like `Text`, `Image` or `Audio`.
7777

7878
#### Example Chain call with messages
7979

@@ -453,13 +453,13 @@ use PhpLlm\LlmChain\Model\Message\Content\Image;
453453
use PhpLlm\LlmChain\Model\Message\Message;
454454
use PhpLlm\LlmChain\Model\Message\MessageBag;
455455

456-
// Initialize Platoform, LLM & Chain
456+
// Initialize Platform, LLM & Chain
457457

458458
$messages = new MessageBag(
459459
Message::forSystem('You are an image analyzer bot that helps identify the content of images.'),
460460
Message::ofUser(
461461
'Describe the image as a comedian would do it.',
462-
new Image(dirname(__DIR__).'/tests/Fixture/image.png'), // Path to an image file
462+
new Image(dirname(__DIR__).'/tests/Fixture/image.jpg'), // Path to an image file
463463
new Image('https://foo.com/bar.png'), // URL to an image
464464
new Image('data:image/png;base64,...'), // Data URL of an image
465465
),
@@ -472,6 +472,30 @@ $response = $chain->call($messages);
472472
1. **Image Description**: [image-describer-binary.php](examples/image-describer-binary.php) (with binary file)
473473
1. **Image Description**: [image-describer-url.php](examples/image-describer-url.php) (with URL)
474474

475+
### Audio Processing
476+
477+
Similar to images, some LLMs also support audio as input, which is just another `Content` type within the `UserMessage`:
478+
479+
```php
480+
use PhpLlm\LlmChain\Model\Message\Content\Audio;
481+
use PhpLlm\LlmChain\Model\Message\Message;
482+
use PhpLlm\LlmChain\Model\Message\MessageBag;
483+
484+
// Initialize Platform, LLM & Chain
485+
486+
$messages = new MessageBag(
487+
Message::ofUser(
488+
'What is this recording about?',
489+
new Audio(dirname(__DIR__).'/tests/Fixture/audio.mp3'), // Path to an audio file
490+
),
491+
);
492+
$response = $chain->call($messages);
493+
```
494+
495+
#### Code Examples
496+
497+
1. **Audio Description**: [audio-describer.php](examples/audio-describer.php)
498+
475499
### Embeddings
476500

477501
Creating embeddings of words, sentences or paragraphs is a typical use case around the interaction with LLMs and
@@ -617,3 +641,10 @@ Contributions are always welcome, so feel free to join the development of this l
617641
[![LLM Chain Contributors](https://contrib.rocks/image?repo=php-llm/llm-chain 'LLM Chain Contributors')](https://github.com/php-llm/llm-chain/graphs/contributors)
618642

619643
Made with [contrib.rocks](https://contrib.rocks).
644+
645+
### Fixture Licenses
646+
647+
For testing multi-modal features, the repository contains binary media content, with the following owners and licenses:
648+
649+
* `tests/Fixture/image.jpg`: Chris F., Creative Commons, see [pexels.com](https://www.pexels.com/photo/blauer-und-gruner-elefant-mit-licht-1680755/)
650+
* `tests/Fixture/audio.mp3`: davidbain, Creative Commons, see [freesound.org](https://freesound.org/people/davidbain/sounds/136777/)

examples/audio-describer.php

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
<?php
2+
3+
use PhpLlm\LlmChain\Bridge\OpenAI\GPT;
4+
use PhpLlm\LlmChain\Bridge\OpenAI\PlatformFactory;
5+
use PhpLlm\LlmChain\Chain;
6+
use PhpLlm\LlmChain\Model\Message\Content\Audio;
7+
use PhpLlm\LlmChain\Model\Message\Message;
8+
use PhpLlm\LlmChain\Model\Message\MessageBag;
9+
use Symfony\Component\Dotenv\Dotenv;
10+
11+
require_once dirname(__DIR__).'/vendor/autoload.php';
12+
(new Dotenv())->loadEnv(dirname(__DIR__).'/.env');
13+
14+
if (empty($_ENV['OPENAI_API_KEY'])) {
15+
echo 'Please set the OPENAI_API_KEY environment variable.'.PHP_EOL;
16+
exit(1);
17+
}
18+
19+
$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']);
20+
$llm = new GPT(GPT::GPT_4O_AUDIO);
21+
22+
$chain = new Chain($platform, $llm);
23+
$messages = new MessageBag(
24+
Message::ofUser(
25+
'What is this recording about?',
26+
new Audio(dirname(__DIR__).'/tests/Fixture/audio.mp3'),
27+
),
28+
);
29+
$response = $chain->call($messages);
30+
31+
echo $response->getContent().PHP_EOL;

examples/image-describer-binary.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
Message::forSystem('You are an image analyzer bot that helps identify the content of images.'),
2525
Message::ofUser(
2626
'Describe the image as a comedian would do it.',
27-
new Image(dirname(__DIR__).'/tests/Fixture/image.png'),
27+
new Image(dirname(__DIR__).'/tests/Fixture/image.jpg'),
2828
),
2929
);
3030
$response = $chain->call($messages);

src/Bridge/Anthropic/Claude.php

+5
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ public function getOptions(): array
3232
return $this->options;
3333
}
3434

35+
public function supportsAudioInput(): bool
36+
{
37+
return false;
38+
}
39+
3540
public function supportsImageInput(): bool
3641
{
3742
return false; // it does, but implementation here is still open.

src/Bridge/Meta/Llama.php

+5
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ public function getOptions(): array
4242
return $this->options;
4343
}
4444

45+
public function supportsAudioInput(): bool
46+
{
47+
return false;
48+
}
49+
4550
public function supportsImageInput(): bool
4651
{
4752
return false; // it does, but implementation here is still open.

src/Bridge/OpenAI/GPT.php

+11
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ final class GPT implements LanguageModel
1414
public const GPT_4_TURBO = 'gpt-4-turbo';
1515
public const GPT_4O = 'gpt-4o';
1616
public const GPT_4O_MINI = 'gpt-4o-mini';
17+
public const GPT_4O_AUDIO = 'gpt-4o-audio-preview';
1718
public const O1_MINI = 'o1-mini';
1819
public const O1_PREVIEW = 'o1-preview';
1920

@@ -23,9 +24,14 @@ final class GPT implements LanguageModel
2324
public function __construct(
2425
private readonly string $version = self::GPT_4O,
2526
private readonly array $options = ['temperature' => 1.0],
27+
private bool $supportsAudioInput = false,
2628
private bool $supportsImageInput = false,
2729
private bool $supportsStructuredOutput = false,
2830
) {
31+
if (false === $this->supportsAudioInput) {
32+
$this->supportsAudioInput = self::GPT_4O_AUDIO === $this->version;
33+
}
34+
2935
if (false === $this->supportsImageInput) {
3036
$this->supportsImageInput = in_array($this->version, [self::GPT_4_TURBO, self::GPT_4O, self::GPT_4O_MINI, self::O1_MINI, self::O1_PREVIEW], true);
3137
}
@@ -45,6 +51,11 @@ public function getOptions(): array
4551
return $this->options;
4652
}
4753

54+
/**
 * Whether this model instance accepts audio content in user messages.
 *
 * Reports the audio flag resolved in the constructor (explicitly passed, or
 * derived from the model version), not the image flag.
 */
public function supportsAudioInput(): bool
{
    return $this->supportsAudioInput;
}
58+
4859
public function supportsImageInput(): bool
4960
{
5061
return $this->supportsImageInput;

src/Chain.php

+4
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ public function call(MessageBagInterface $messages, array $options = []): Respon
6060
$messages = $input->messages;
6161
$options = $input->getOptions();
6262

63+
if ($messages->containsAudio() && !$llm->supportsAudioInput()) {
64+
throw MissingModelSupport::forAudioInput($llm::class);
65+
}
66+
6367
if ($messages->containsImage() && !$llm->supportsImageInput()) {
6468
throw MissingModelSupport::forImageInput($llm::class);
6569
}

src/Exception/MissingModelSupport.php

+5
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ public static function forToolCalling(string $model): self
1616
return new self($model, 'tool calling');
1717
}
1818

19+
public static function forAudioInput(string $model): self
20+
{
21+
return new self($model, 'audio input');
22+
}
23+
1924
public static function forImageInput(string $model): self
2025
{
2126
return new self($model, 'image input');

src/Model/LanguageModel.php

+2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
interface LanguageModel extends Model
88
{
9+
public function supportsAudioInput(): bool;
10+
911
public function supportsImageInput(): bool;
1012

1113
public function supportsStreaming(): bool;

src/Model/Message/Content/Audio.php

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Model\Message\Content;
6+
7+
use PhpLlm\LlmChain\Exception\InvalidArgumentException;
8+
9+
/**
 * Audio content of a message, backed by a file on the local filesystem.
 *
 * The path is validated on construction; the file content is only loaded
 * when the object is serialized.
 */
final readonly class Audio implements Content
{
    /**
     * @param string $path Path to the audio file
     *
     * @throws InvalidArgumentException When $path is not a readable regular file
     */
    public function __construct(
        public string $path,
    ) {
        // is_file() rejects directories, which is_readable() alone would accept.
        // The content is deliberately not read here to avoid loading the file twice.
        if (!is_file($path) || !is_readable($path)) {
            throw new InvalidArgumentException(sprintf('The file "%s" does not exist or is not readable.', $path));
        }
    }

    /**
     * Serializes the audio file as base64 data plus a format taken from the file extension.
     *
     * @return array{type: 'input_audio', input_audio: array{data: string, format: string}}
     *
     * @throws InvalidArgumentException When the file can no longer be read
     */
    public function jsonSerialize(): array
    {
        $data = file_get_contents($this->path);

        // The file may have disappeared or become unreadable since construction;
        // base64_encode(false) would be a TypeError under strict_types.
        if (false === $data) {
            throw new InvalidArgumentException(sprintf('The file "%s" does not exist or is not readable.', $this->path));
        }

        // Lower-cased so that e.g. "recording.MP3" yields the format "mp3".
        $format = strtolower(pathinfo($this->path, PATHINFO_EXTENSION));

        return [
            'type' => 'input_audio',
            'input_audio' => [
                'data' => base64_encode($data),
                'format' => $format,
            ],
        ];
    }
}

src/Model/Message/MessageBag.php

+11
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,17 @@ public function prepend(MessageInterface $message): self
7878
return $messages;
7979
}
8080

81+
/**
 * Tells whether at least one user message in this bag carries audio content.
 */
public function containsAudio(): bool
{
    $found = false;

    foreach ($this->messages as $candidate) {
        // Only user messages are asked for audio content.
        if (!$candidate instanceof UserMessage) {
            continue;
        }

        if ($candidate->hasAudioContent()) {
            $found = true;
            break;
        }
    }

    return $found;
}
91+
8192
public function containsImage(): bool
8293
{
8394
foreach ($this->messages as $message) {

src/Model/Message/MessageBagInterface.php

+2
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,7 @@ public function withoutSystemMessage(): self;
2323

2424
public function prepend(MessageInterface $message): self;
2525

26+
public function containsAudio(): bool;
27+
2628
public function containsImage(): bool;
2729
}

src/Model/Message/UserMessage.php

+12
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
namespace PhpLlm\LlmChain\Model\Message;
66

7+
use PhpLlm\LlmChain\Model\Message\Content\Audio;
78
use PhpLlm\LlmChain\Model\Message\Content\Content;
89
use PhpLlm\LlmChain\Model\Message\Content\Image;
910
use PhpLlm\LlmChain\Model\Message\Content\Text;
@@ -26,6 +27,17 @@ public function getRole(): Role
2627
return Role::User;
2728
}
2829

30+
/**
 * Tells whether any content part of this message is an Audio instance.
 */
public function hasAudioContent(): bool
{
    $hasAudio = false;

    foreach ($this->content as $part) {
        if (!$part instanceof Audio) {
            continue;
        }

        // One audio part is enough; stop scanning.
        $hasAudio = true;
        break;
    }

    return $hasAudio;
}
40+
2941
public function hasImageContent(): bool
3042
{
3143
foreach ($this->content as $content) {

tests/Fixture/audio.mp3

49.8 KB
Binary file not shown.

tests/Fixture/image.jpg

58.7 KB
Loading

tests/Fixture/image.png

-631 KB
Binary file not shown.
+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpLlm\LlmChain\Tests\Model\Message\Content;
6+
7+
use PhpLlm\LlmChain\Model\Message\Content\Audio;
8+
use PHPUnit\Framework\Attributes\CoversClass;
9+
use PHPUnit\Framework\Attributes\DataProvider;
10+
use PHPUnit\Framework\Attributes\Small;
11+
use PHPUnit\Framework\Attributes\Test;
12+
use PHPUnit\Framework\TestCase;
13+
14+
#[CoversClass(Audio::class)]
15+
#[Small]
16+
final class AudioTest extends TestCase
17+
{
18+
#[Test]
19+
public function constructWithValidPath(): void
20+
{
21+
$audio = new Audio(dirname(__DIR__, 3).'/Fixture/audio.mp3');
22+
23+
self::assertSame(dirname(__DIR__, 3).'/Fixture/audio.mp3', $audio->path);
24+
}
25+
26+
#[Test]
27+
#[DataProvider('provideValidPaths')]
28+
public function jsonSerializeWithValid(string $path, array $expected): void
29+
{
30+
$audio = new Audio($path);
31+
32+
$expected = [
33+
'type' => 'input_audio',
34+
'input_audio' => $expected,
35+
];
36+
37+
$actual = $audio->jsonSerialize();
38+
39+
// shortening the base64 data
40+
$actual['input_audio']['data'] = substr($actual['input_audio']['data'], 0, 30);
41+
42+
self::assertSame($expected, $actual);
43+
}
44+
45+
public static function provideValidPaths(): \Generator
46+
{
47+
yield 'mp3' => [dirname(__DIR__, 3).'/Fixture/audio.mp3', [
48+
'data' => 'SUQzBAAAAAAAfVREUkMAAAAMAAADMj', // shortened
49+
'format' => 'mp3',
50+
]];
51+
}
52+
53+
#[Test]
54+
public function constructWithInvalidPath(): void
55+
{
56+
$this->expectExceptionMessage('The file "foo.mp3" does not exist or is not readable.');
57+
58+
new Audio('foo.mp3');
59+
}
60+
}

tests/Model/Message/Content/ImageTest.php

+2-2
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ public function constructWithValidDataUrl(): void
3333
#[Test]
3434
public function withValidFile(): void
3535
{
36-
$image = new Image(dirname(__DIR__, 3).'/Fixture/image.png');
36+
$image = new Image(dirname(__DIR__, 3).'/Fixture/image.jpg');
3737

38-
self::assertStringStartsWith('data:image/png;base64,', $image->url);
38+
self::assertStringStartsWith('data:image/jpg;base64,', $image->url);
3939
}
4040

4141
#[Test]

tests/Model/Message/UserMessageTest.php

+18
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
namespace PhpLlm\LlmChain\Tests\Model\Message;
66

7+
use PhpLlm\LlmChain\Model\Message\Content\Audio;
78
use PhpLlm\LlmChain\Model\Message\Content\Image;
89
use PhpLlm\LlmChain\Model\Message\Content\Text;
910
use PhpLlm\LlmChain\Model\Message\Role;
@@ -17,6 +18,7 @@
1718

1819
#[CoversClass(UserMessage::class)]
1920
#[UsesClass(Text::class)]
21+
#[UsesClass(Audio::class)]
2022
#[UsesClass(Image::class)]
2123
#[UsesClass(Role::class)]
2224
#[Small]
@@ -39,6 +41,22 @@ public function constructionIsPossibleWithMultipleContent(): void
3941
self::assertCount(2, $message->content);
4042
}
4143

44+
#[Test]
45+
public function hasAudioContentWithoutAudio(): void
46+
{
47+
$message = new UserMessage(new Text('foo'), new Text('bar'));
48+
49+
self::assertFalse($message->hasAudioContent());
50+
}
51+
52+
#[Test]
53+
public function hasAudioContentWithAudio(): void
54+
{
55+
$message = new UserMessage(new Text('foo'), new Audio(dirname(__DIR__, 2).'/Fixture/audio.mp3'));
56+
57+
self::assertTrue($message->hasAudioContent());
58+
}
59+
4260
#[Test]
4361
public function hasImageContentWithoutImage(): void
4462
{

0 commit comments

Comments
 (0)