Skip to content

Karaoke feature does not work on windows #3511

@edery-ai

Description

@edery-ai

I tried to execute the command contained in the generated .wts file by converting the .wts into a .bat, but I received an error indicating that the system could not find the specified path.
I then attempted to modify the function in cli.cpp, between lines 767 and 886, but I believe I do not fully understand how the function works.

`static bool output_wts(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s, const char * fname_inp, float t_sec, const char * fname_out) {
// Writes a bash script to `fout` that invokes ffmpeg to render a karaoke-style video.
// For every segment the full segment text is drawn in gray, and each token is drawn in
// light green (text plus an underline row) while `t` is inside that token's [t0, t1].
//
//   ctx       - whisper context the segments and tokens are read from
//   fout      - stream receiving the script; it is closed before returning on success
//   params    - supplies `font_path` and the `diarize` flag
//   pcmf32s   - passed to estimate_diarization_speaker() when `diarize` is set and it has
//               exactly 2 entries (presumably one sample buffer per audio channel)
//   fname_inp - input media path; the video is written to "<fname_inp>.mp4"
//   t_sec     - duration given to the lavfi color source that forms the black background
//   fname_out - path of the script, used only in the hint printed to stderr
//
// Returns false when the font file cannot be opened, true otherwise.

// Deliberately not `static`: a function-local static is initialised on the first call only
// and would keep pointing at that call's `params.font_path` buffer on every later call.
const char * font = params.font_path.c_str();

// Opening the font is only an existence check; nothing is read from it.
std::ifstream fin(font);
if (!fin.is_open()) {
    fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font);
    return false;
}

fout << "#!/bin/bash" << "\n";
fout << "\n";

// Start of the ffmpeg command; the -vf filter chain is opened here and closed after the loop.
fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << t_sec << ":rate=25:color=black -vf \"";

for (int i = 0; i < whisper_full_n_segments(ctx); i++) {
    const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
    const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

    const int n = whisper_full_n_tokens(ctx, i);

    std::vector<whisper_token_data> tokens(n);
    for (int j = 0; j < n; ++j) {
        tokens[j] = whisper_full_get_token_data(ctx, i, j);
    }

    // Filters are comma-separated; every segment after the first needs a leading separator.
    if (i > 0) {
        fout << ",";
    }

    // background text
    // Empty-text drawtext enabled on [t0, t0]; it gives each segment a first filter so that
    // everything emitted below can unconditionally start with a comma.
    fout << "drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='':enable='between(t," << t0/100.0 << "," << t0/100.0 << ")'";

    bool is_first = true;
    std::string speaker = "";

    if (params.diarize && pcmf32s.size() == 2) {
        speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
    }

    for (int j = 0; j < n; ++j) {
        const auto & token = tokens[j];

        // Token ids at or above the end-of-text id are skipped and never drawn.
        if (tokens[j].id >= whisper_token_eot(ctx)) {
            continue;
        }

        std::string txt_bg = "";
        std::string txt_fg = ""; // highlight token
        std::string txt_ul = ""; // underline

        if (params.diarize && pcmf32s.size() == 2) {
            txt_bg = speaker;
            txt_fg = speaker;
            // Escaped-space padding for the underline row in place of the speaker label.
            txt_ul = "\\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ ";
        }

        txt_bg.append("> ");
        txt_fg.append("> ");
        txt_ul.append("\\ \\ ");

        {
            // Rebuild the whole segment line for token j: the highlighted token keeps its
            // characters, every other token is replaced by escaped spaces of the same length
            // so the overlay lines up with the background text.
            for (int k = 0; k < n; ++k) {
                const auto & token2 = tokens[k];

                if (tokens[k].id >= whisper_token_eot(ctx)) {
                    continue;
                }

                const std::string txt = whisper_token_to_str(ctx, token2.id);

                txt_bg += txt;

                if (k == j) {
                    for (int l = 0; l < (int) txt.size(); ++l) {
                        txt_fg += txt[l];
                        txt_ul += "_";
                    }
                    txt_fg += "|";
                } else {
                    for (int l = 0; l < (int) txt.size(); ++l) {
                        txt_fg += "\\ ";
                        txt_ul += "\\ ";
                    }
                }
            }

            // The text sits inside text='...' within a double-quoted -vf argument, so
            // apostrophes are swapped for U+2019 and double quotes are backslash-escaped.
            ::replace_all(txt_bg, "'", "\u2019");
            ::replace_all(txt_bg, "\"", "\\\"");
            ::replace_all(txt_fg, "'", "\u2019");
            ::replace_all(txt_fg, "\"", "\\\"");
        }

        if (is_first) {
            // background text
            // Emitted once per segment and shown for the whole segment interval.
            fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << t0/100.0 << "," << t1/100.0 << ")'";
            is_first = false;
        }

        // foreground text
        fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2:text='" << txt_fg << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";

        // underline
        fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2+16:text='" << txt_ul << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
    }
}

// Close the -vf argument and finish the ffmpeg command line.
fout << "\" -c:v libx264 -pix_fmt yuv420p -y " << fname_inp << ".mp4" << "\n";

fout << "\n\n";
fout << "echo \"Your video has been saved to " << fname_inp << ".mp4\"" << "\n";
fout << "\n";
fout << "echo \"  ffplay " << fname_inp << ".mp4\"\n";
fout << "\n";

fout.close();

fprintf(stderr, "# %s: run 'source %s' to generate karaoke video\n", __func__, fname_out);

return true;

}
`

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions