|
4 | 4 |
|
5 | 5 | // third-party utilities
|
6 | 6 | // use your favorite implementations
|
7 |
| -#define DR_WAV_IMPLEMENTATION |
8 |
| -#include "dr_wav.h" |
| 7 | +#define STB_VORBIS_HEADER_ONLY |
| 8 | +#include "stb_vorbis.c" /* Enables Vorbis decoding. */ |
| 9 | + |
| 10 | +#define MA_NO_DEVICE_IO |
| 11 | +#define MA_NO_THREADING |
| 12 | +#define MA_NO_ENCODING |
| 13 | +#define MA_NO_GENERATION |
| 14 | +#define MA_NO_RESOURCE_MANAGER |
| 15 | +#define MA_NO_NODE_GRAPH |
| 16 | +#define MINIAUDIO_IMPLEMENTATION |
| 17 | +#include "miniaudio.h" |
9 | 18 |
|
10 | 19 | #include <cmath>
|
11 | 20 | #include <cstring>
|
@@ -639,111 +648,95 @@ bool is_wav_buffer(const std::string buf) {
|
639 | 648 | return true;
|
640 | 649 | }
|
641 | 650 |
|
642 |
| -bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) { |
643 |
| - drwav wav; |
644 |
| - std::vector<uint8_t> wav_data; // used for pipe input from stdin or ffmpeg decoding output |
| 651 | +bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) { |
| 652 | + std::vector<uint8_t> audio_data; // used for pipe input from stdin or ffmpeg decoding output |
| 653 | + |
| 654 | + ma_result result; |
| 655 | + ma_decoder_config decoder_config; |
| 656 | + ma_decoder decoder; |
| 657 | + |
| 658 | + decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, COMMON_SAMPLE_RATE); |
645 | 659 |
|
646 | 660 | if (fname == "-") {
|
647 |
| - { |
648 |
| - #ifdef _WIN32 |
649 |
| - _setmode(_fileno(stdin), _O_BINARY); |
650 |
| - #endif |
651 |
| - |
652 |
| - uint8_t buf[1024]; |
653 |
| - while (true) |
654 |
| - { |
655 |
| - const size_t n = fread(buf, 1, sizeof(buf), stdin); |
656 |
| - if (n == 0) { |
657 |
| - break; |
658 |
| - } |
659 |
| - wav_data.insert(wav_data.end(), buf, buf + n); |
660 |
| - } |
661 |
| - } |
| 661 | + #ifdef _WIN32 |
| 662 | + _setmode(_fileno(stdin), _O_BINARY); |
| 663 | + #endif |
662 | 664 |
|
663 |
| - if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) { |
664 |
| - fprintf(stderr, "error: failed to open WAV file from stdin\n"); |
665 |
| - return false; |
666 |
| - } |
| 665 | + uint8_t buf[1024]; |
| 666 | + while (true) |
| 667 | + { |
| 668 | + const size_t n = fread(buf, 1, sizeof(buf), stdin); |
| 669 | + if (n == 0) { |
| 670 | + break; |
| 671 | + } |
| 672 | + audio_data.insert(audio_data.end(), buf, buf + n); |
| 673 | + } |
| 674 | + |
| 675 | + if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) { |
| 676 | + |
| 677 | + fprintf(stderr, "Error: failed to open audio data from stdin (%s)\n", ma_result_description(result)); |
667 | 678 |
|
668 |
| - fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); |
| 679 | + return false; |
| 680 | + } |
| 681 | + |
| 682 | + fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, audio_data.size()); |
669 | 683 | }
|
670 | 684 | else if (is_wav_buffer(fname)) {
|
671 |
| - if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) { |
672 |
| - fprintf(stderr, "error: failed to open WAV file from fname buffer\n"); |
673 |
| - return false; |
674 |
| - } |
| 685 | + if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) { |
| 686 | + fprintf(stderr, "Error: failed to open audio data from fname buffer (%s)\n", ma_result_description(result)); |
| 687 | + |
| 688 | + return false; |
| 689 | + } |
675 | 690 | }
|
676 |
| - else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) { |
| 691 | + else if ((result = ma_decoder_init_file(fname.c_str(), &decoder_config, &decoder)) != MA_SUCCESS) { |
677 | 692 | #if defined(WHISPER_FFMPEG)
|
678 |
| - if (ffmpeg_decode_audio(fname, wav_data) != 0) { |
679 |
| - fprintf(stderr, "error: failed to ffmpeg decode '%s' \n", fname.c_str()); |
680 |
| - return false; |
681 |
| - } |
682 |
| - if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) { |
683 |
| - fprintf(stderr, "error: failed to read wav data as wav \n"); |
684 |
| - return false; |
685 |
| - } |
| 693 | + if (ffmpeg_decode_audio(fname, audio_data) != 0) { |
| 694 | + fprintf(stderr, "error: failed to ffmpeg decode '%s'\n", fname.c_str()); |
| 695 | + |
| 696 | + return false; |
| 697 | + } |
| 698 | + |
| 699 | + if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) { |
| 700 | + fprintf(stderr, "error: failed to read audio data as wav (%s)\n", ma_result_description(result)); |
| 701 | + |
| 702 | + return false; |
| 703 | + } |
686 | 704 | #else
|
687 |
| - fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str()); |
688 |
| - return false; |
689 |
| -#endif |
690 |
| - } |
| 705 | + fprintf(stderr, "error: failed to open '%s' file (%s)\n", fname.c_str(), ma_result_description(result)); |
691 | 706 |
|
692 |
| - if (wav.channels != 1 && wav.channels != 2) { |
693 |
| - fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str()); |
694 |
| - drwav_uninit(&wav); |
695 |
| - return false; |
| 707 | + return false; |
| 708 | +#endif |
696 | 709 | }
|
697 | 710 |
|
698 |
| - if (stereo && wav.channels != 2) { |
699 |
| - fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str()); |
700 |
| - drwav_uninit(&wav); |
701 |
| - return false; |
702 |
| - } |
| 711 | + ma_uint64 frame_count; |
| 712 | + ma_uint64 frames_read; |
703 | 713 |
|
704 |
| - if (wav.sampleRate != COMMON_SAMPLE_RATE) { |
705 |
| - fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000); |
706 |
| - drwav_uninit(&wav); |
707 |
| - return false; |
708 |
| - } |
| 714 | + if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) { |
| 715 | + fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result)); |
709 | 716 |
|
710 |
| - if (wav.bitsPerSample != 16) { |
711 |
| - fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str()); |
712 |
| - drwav_uninit(&wav); |
713 |
| - return false; |
| 717 | + return false; |
714 | 718 | }
|
715 | 719 |
|
716 |
| - const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8); |
| 720 | + pcmf32.resize(stereo ? frame_count*2 : frame_count); |
717 | 721 |
|
718 |
| - std::vector<int16_t> pcm16; |
719 |
| - pcm16.resize(n*wav.channels); |
720 |
| - drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); |
721 |
| - drwav_uninit(&wav); |
| 722 | + if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) { |
| 723 | + fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result)); |
722 | 724 |
|
723 |
| - // convert to mono, float |
724 |
| - pcmf32.resize(n); |
725 |
| - if (wav.channels == 1) { |
726 |
| - for (uint64_t i = 0; i < n; i++) { |
727 |
| - pcmf32[i] = float(pcm16[i])/32768.0f; |
728 |
| - } |
729 |
| - } else { |
730 |
| - for (uint64_t i = 0; i < n; i++) { |
731 |
| - pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f; |
732 |
| - } |
| 725 | + return false; |
733 | 726 | }
|
734 | 727 |
|
735 | 728 | if (stereo) {
|
736 |
| - // convert to stereo, float |
737 |
| - pcmf32s.resize(2); |
738 |
| - |
739 |
| - pcmf32s[0].resize(n); |
740 |
| - pcmf32s[1].resize(n); |
741 |
| - for (uint64_t i = 0; i < n; i++) { |
742 |
| - pcmf32s[0][i] = float(pcm16[2*i])/32768.0f; |
743 |
| - pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f; |
744 |
| - } |
| 729 | + pcmf32s.resize(2); |
| 730 | + pcmf32s[0].resize(frame_count); |
| 731 | + pcmf32s[1].resize(frame_count); |
| 732 | + for (uint64_t i = 0; i < frame_count; i++) { |
| 733 | + pcmf32s[0][i] = pcmf32[2*i]; |
| 734 | + pcmf32s[1][i] = pcmf32[2*i + 1]; |
| 735 | + } |
745 | 736 | }
|
746 | 737 |
|
| 738 | + ma_decoder_uninit(&decoder); |
| 739 | + |
747 | 740 | return true;
|
748 | 741 | }
|
749 | 742 |
|
@@ -909,3 +902,6 @@ bool speak_with_file(const std::string & command, const std::string & text, cons
|
909 | 902 | }
|
910 | 903 | return true;
|
911 | 904 | }
|
| 905 | + |
| 906 | +#undef STB_VORBIS_HEADER_ONLY |
| 907 | +#include "stb_vorbis.c" |
0 commit comments