Allows generating captions via the Windows Speech Recognition API (SAPI). This is currently marked as experimental because speech recognition technology is still less than ideal, and speech recognition in general is probably never going to be anywhere near perfect. Microsoft's speech recognition in particular requires a bit of training via the Windows Speech Recognition tool before it dictates well. Clear speech with a good microphone is recognized fairly well, but casual speech and/or a poor microphone will cause significant issues. Captions can often be way off when speaking casually rather than with clear diction.
355 lines
8.1 KiB
C++
355 lines
8.1 KiB
C++
#include <obs-frontend-api.h>
|
|
#include "captions-stream.hpp"
|
|
#include "captions.hpp"
|
|
#include "tool-helpers.hpp"
|
|
#include <sphelper.h>
|
|
#include <util/platform.h>
|
|
#include <util/windows/HRError.hpp>
|
|
#include <util/windows/ComPtr.hpp>
|
|
#include <util/windows/CoTaskMemPtr.hpp>
|
|
#include <util/threading.h>
|
|
#include <obs-module.h>
|
|
|
|
#include <string>
|
|
#include <thread>
|
|
#include <mutex>
|
|
|
|
/* Logging helpers: every message goes through blog() with a "[Captions] "
 * prefix prepended to the caller's format string. */
#define do_log(type, format, ...) \
	blog(type, "[Captions] " format, ##__VA_ARGS__)

#define error(format, ...) do_log(LOG_ERROR, format, ##__VA_ARGS__)
#define debug(format, ...) do_log(LOG_DEBUG, format, ##__VA_ARGS__)
|
|
|
|
using namespace std;
|
|
|
|
struct obs_captions {
|
|
thread th;
|
|
recursive_mutex m;
|
|
WinHandle stop_event;
|
|
|
|
string source_name;
|
|
OBSWeakSource source;
|
|
|
|
void main_thread();
|
|
void start();
|
|
void stop();
|
|
|
|
inline obs_captions() :
|
|
stop_event(CreateEvent(nullptr, false, false, nullptr))
|
|
{
|
|
}
|
|
|
|
inline ~obs_captions() {stop();}
|
|
};
|
|
|
|
/* Single instance shared by the whole file: allocated in InitCaptions() and
 * destroyed (then reset to nullptr) in FreeCaptions(). */
static obs_captions *captions = nullptr;
|
|
|
|
/* ------------------------------------------------------------------------- */
|
|
|
|
/* Builds the dialog and mirrors the current captions state into its widgets:
 * the combo box gets an empty entry plus one entry per audio-capable source,
 * and the checkbox reflects whether the worker thread is joinable.  Widget
 * signals are blocked while populating so the slots do not fire. */
CaptionsDialog::CaptionsDialog(QWidget *parent)
	: QDialog(parent),
	  ui(new Ui_CaptionsDialog)
{
	ui->setupUi(this);

	lock_guard<recursive_mutex> guard(captions->m);

	/* Invoked once per source; returning true continues the enumeration. */
	auto add_source = [this] (obs_source_t *src)
	{
		uint32_t flags = obs_source_get_output_flags(src);
		QString src_name = obs_source_get_name(src);

		if ((flags & OBS_SOURCE_AUDIO) != 0)
			ui->source->addItem(src_name);

		OBSWeakSource candidate = OBSGetWeakRef(src);
		if (candidate == captions->source)
			ui->source->setCurrentText(src_name);

		return true;
	};

	using add_source_t = decltype(add_source);

	/* C-style trampoline: recovers the closure from the opaque pointer. */
	auto trampoline = [] (void *data, obs_source_t *src)
	{
		return (*static_cast<add_source_t*>(data))(src);
	};

	ui->source->blockSignals(true);
	ui->source->addItem(QStringLiteral(""));
	ui->source->setCurrentIndex(0);
	obs_enum_sources(trampoline, &add_source);
	ui->source->blockSignals(false);

	const bool running = captions->th.joinable();
	ui->enable->blockSignals(true);
	ui->enable->setChecked(running);
	ui->enable->blockSignals(false);
}
|
|
|
|
void CaptionsDialog::on_source_currentIndexChanged(int)
|
|
{
|
|
bool started = captions->th.joinable();
|
|
if (started)
|
|
captions->stop();
|
|
|
|
captions->m.lock();
|
|
captions->source_name = ui->source->currentText().toUtf8().constData();
|
|
captions->source = GetWeakSourceByName(captions->source_name.c_str());
|
|
captions->m.unlock();
|
|
|
|
if (started)
|
|
captions->start();
|
|
}
|
|
|
|
void CaptionsDialog::on_enable_clicked(bool checked)
|
|
{
|
|
if (checked)
|
|
captions->start();
|
|
else
|
|
captions->stop();
|
|
}
|
|
|
|
/* ------------------------------------------------------------------------- */
|
|
|
|
void obs_captions::main_thread()
|
|
try {
|
|
ComPtr<CaptionStream> audio;
|
|
ComPtr<ISpObjectToken> token;
|
|
ComPtr<ISpRecoGrammar> grammar;
|
|
ComPtr<ISpRecognizer> recognizer;
|
|
ComPtr<ISpRecoContext> context;
|
|
HRESULT hr;
|
|
|
|
auto cb = [&] (const struct audio_data *audio_data,
|
|
bool muted)
|
|
{
|
|
audio->PushAudio(audio_data, muted);
|
|
};
|
|
|
|
using cb_t = decltype(cb);
|
|
|
|
auto pre_cb = [] (void *param, obs_source_t*,
|
|
const struct audio_data *audio_data, bool muted)
|
|
{
|
|
return (*static_cast<cb_t*>(param))(audio_data, muted);
|
|
};
|
|
|
|
os_set_thread_name(__FUNCTION__);
|
|
|
|
CoInitialize(nullptr);
|
|
|
|
hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=409", nullptr,
|
|
&token);
|
|
if (FAILED(hr))
|
|
throw HRError("SpFindBestToken failed", hr);
|
|
|
|
hr = CoCreateInstance(CLSID_SpInprocRecognizer, nullptr, CLSCTX_ALL,
|
|
__uuidof(ISpRecognizer), (void**)&recognizer);
|
|
if (FAILED(hr))
|
|
throw HRError("CoCreateInstance for recognizer failed", hr);
|
|
|
|
hr = recognizer->SetRecognizer(token);
|
|
if (FAILED(hr))
|
|
throw HRError("SetRecognizer failed", hr);
|
|
|
|
hr = recognizer->SetRecoState(SPRST_INACTIVE);
|
|
if (FAILED(hr))
|
|
throw HRError("SetRecoState(SPRST_INACTIVE) failed", hr);
|
|
|
|
hr = recognizer->CreateRecoContext(&context);
|
|
if (FAILED(hr))
|
|
throw HRError("CreateRecoContext failed", hr);
|
|
|
|
ULONGLONG interest = SPFEI(SPEI_RECOGNITION) |
|
|
SPFEI(SPEI_END_SR_STREAM);
|
|
hr = context->SetInterest(interest, interest);
|
|
if (FAILED(hr))
|
|
throw HRError("SetInterest failed", hr);
|
|
|
|
HANDLE notify;
|
|
|
|
hr = context->SetNotifyWin32Event();
|
|
if (FAILED(hr))
|
|
throw HRError("SetNotifyWin32Event", hr);
|
|
|
|
notify = context->GetNotifyEventHandle();
|
|
if (notify == INVALID_HANDLE_VALUE)
|
|
throw HRError("GetNotifyEventHandle failed", E_NOINTERFACE);
|
|
|
|
size_t sample_rate = audio_output_get_sample_rate(obs_get_audio());
|
|
audio = new CaptionStream((DWORD)sample_rate);
|
|
audio->Release();
|
|
|
|
hr = recognizer->SetInput(audio, false);
|
|
if (FAILED(hr))
|
|
throw HRError("SetInput failed", hr);
|
|
|
|
hr = context->CreateGrammar(1, &grammar);
|
|
if (FAILED(hr))
|
|
throw HRError("CreateGrammar failed", hr);
|
|
|
|
hr = grammar->LoadDictation(nullptr, SPLO_STATIC);
|
|
if (FAILED(hr))
|
|
throw HRError("LoadDictation failed", hr);
|
|
|
|
hr = grammar->SetDictationState(SPRS_ACTIVE);
|
|
if (FAILED(hr))
|
|
throw HRError("SetDictationState failed", hr);
|
|
|
|
hr = recognizer->SetRecoState(SPRST_ACTIVE);
|
|
if (FAILED(hr))
|
|
throw HRError("SetRecoState(SPRST_ACTIVE) failed", hr);
|
|
|
|
HANDLE events[] = {notify, stop_event};
|
|
|
|
{
|
|
captions->source = GetWeakSourceByName(
|
|
captions->source_name.c_str());
|
|
OBSSource strong = OBSGetStrongRef(source);
|
|
if (strong)
|
|
obs_source_add_audio_capture_callback(strong,
|
|
pre_cb, &cb);
|
|
}
|
|
|
|
for (;;) {
|
|
DWORD ret = WaitForMultipleObjects(2, events, false, INFINITE);
|
|
if (ret != WAIT_OBJECT_0)
|
|
break;
|
|
|
|
CSpEvent event;
|
|
bool exit = false;
|
|
|
|
while (event.GetFrom(context) == S_OK) {
|
|
if (event.eEventId == SPEI_RECOGNITION) {
|
|
ISpRecoResult *result = event.RecoResult();
|
|
|
|
CoTaskMemPtr<wchar_t> text;
|
|
hr = result->GetText((ULONG)-1, (ULONG)-1,
|
|
true, &text, nullptr);
|
|
if (FAILED(hr))
|
|
continue;
|
|
|
|
char text_utf8[512];
|
|
os_wcs_to_utf8(text, 0, text_utf8, 512);
|
|
|
|
obs_output_t *output =
|
|
obs_frontend_get_streaming_output();
|
|
if (output)
|
|
obs_output_output_caption_text1(output,
|
|
text_utf8);
|
|
|
|
debug("\"%s\"", text_utf8);
|
|
|
|
obs_output_release(output);
|
|
|
|
} else if (event.eEventId == SPEI_END_SR_STREAM) {
|
|
exit = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (exit)
|
|
break;
|
|
}
|
|
|
|
{
|
|
OBSSource strong = OBSGetStrongRef(source);
|
|
if (strong)
|
|
obs_source_remove_audio_capture_callback(strong,
|
|
pre_cb, &cb);
|
|
}
|
|
|
|
audio->Stop();
|
|
|
|
CoUninitialize();
|
|
|
|
} catch (HRError err) {
|
|
error("%s failed: %s (%lX)", __FUNCTION__, err.str, err.hr);
|
|
CoUninitialize();
|
|
}
|
|
|
|
void obs_captions::start()
|
|
{
|
|
if (!captions->th.joinable())
|
|
captions->th = thread([] () {captions->main_thread();});
|
|
}
|
|
|
|
void obs_captions::stop()
|
|
{
|
|
if (!captions->th.joinable())
|
|
return;
|
|
|
|
SetEvent(captions->stop_event);
|
|
captions->th.join();
|
|
}
|
|
|
|
/* ------------------------------------------------------------------------- */
|
|
|
|
extern "C" void FreeCaptions()
|
|
{
|
|
delete captions;
|
|
captions = nullptr;
|
|
}
|
|
|
|
static void obs_event(enum obs_frontend_event event, void *)
|
|
{
|
|
if (event == OBS_FRONTEND_EVENT_EXIT)
|
|
FreeCaptions();
|
|
}
|
|
|
|
/* Frontend save/load callback.
 *
 * saving == true:  writes a "captions" object holding the selected source
 *                  name ("source") and whether the worker exists ("enabled")
 *                  into save_data.
 * saving == false: stops the worker, restores the source from the "captions"
 *                  object in save_data (an empty object is used when it is
 *                  missing) and starts the worker again if "enabled" was set.
 */
static void save_caption_data(obs_data_t *save_data, bool saving, void*)
{
	if (saving) {
		lock_guard<recursive_mutex> lock(captions->m);
		obs_data_t *obj = obs_data_create();

		obs_data_set_string(obj, "source",
				captions->source_name.c_str());
		obs_data_set_bool(obj, "enabled", captions->th.joinable());

		obs_data_set_obj(save_data, "captions", obj);
		obs_data_release(obj);
		return;
	}

	/* The worker is joined before the lock is taken. */
	captions->stop();

	bool enabled = false;
	{
		/* Scoped guard instead of manual lock()/unlock(), matching
		 * the saving branch: the string assignment can throw, which
		 * would otherwise leave the mutex locked. */
		lock_guard<recursive_mutex> lock(captions->m);

		obs_data_t *obj = obs_data_get_obj(save_data, "captions");
		if (!obj)
			obj = obs_data_create();

		enabled = obs_data_get_bool(obj, "enabled");
		captions->source_name = obs_data_get_string(obj, "source");
		captions->source = GetWeakSourceByName(
				captions->source_name.c_str());
		obs_data_release(obj);
	}

	if (enabled)
		captions->start();
}
|
|
|
|
extern "C" void InitCaptions()
|
|
{
|
|
QAction *action = (QAction*)obs_frontend_add_tools_menu_qaction(
|
|
obs_module_text("Captions"));
|
|
|
|
captions = new obs_captions;
|
|
|
|
auto cb = [] ()
|
|
{
|
|
obs_frontend_push_ui_translation(obs_module_get_string);
|
|
|
|
QWidget *window =
|
|
(QWidget*)obs_frontend_get_main_window();
|
|
|
|
CaptionsDialog dialog(window);
|
|
dialog.exec();
|
|
|
|
obs_frontend_pop_ui_translation();
|
|
};
|
|
|
|
obs_frontend_add_save_callback(save_caption_data, nullptr);
|
|
obs_frontend_add_event_callback(obs_event, nullptr);
|
|
|
|
action->connect(action, &QAction::triggered, cb);
|
|
}
|