obs-studio/UI/frontend-plugins/frontend-tools/captions.cpp

#include <obs-frontend-api.h>
#include "captions-stream.hpp"
#include "captions.hpp"
#include "tool-helpers.hpp"
#include <sphelper.h>
#include <util/platform.h>
#include <util/windows/HRError.hpp>
#include <util/windows/ComPtr.hpp>
#include <util/windows/CoTaskMemPtr.hpp>
#include <util/threading.h>
#include <obs-module.h>

#include <string>
#include <thread>
#include <mutex>

/* Logging helper: forwards to blog() at the given level with every message
 * prefixed by "[Captions] ". */
#define do_log(type, format, ...) blog(type, "[Captions] " format, \
		##__VA_ARGS__)

/* Shorthand wrappers around do_log() for LOG_ERROR and LOG_DEBUG. */
#define error(format, ...) do_log(LOG_ERROR, format, ##__VA_ARGS__)
#define debug(format, ...) do_log(LOG_DEBUG, format, ##__VA_ARGS__)

using namespace std;

/*
 * State shared between the UI/save callbacks and the caption worker thread:
 * the thread itself, the event used to ask it to stop, and the source it
 * captures audio from.
 */
struct obs_captions {
	/* Worker running main_thread(); the rest of this file uses
	 * th.joinable() as its "captions enabled" indicator */
	thread th;
	/* Taken by the dialog and the save/load callback around accesses to
	 * source_name and source */
	recursive_mutex m;
	/* Signaled by stop(); main_thread() waits on it in its event loop */
	WinHandle stop_event;

	/* Name of the selected source and a weak reference resolved from it */
	string source_name;
	OBSWeakSource source;

	void main_thread();
	void start();
	void stop();

	/* CreateEvent arguments: no security attributes, auto-reset
	 * (bManualReset = false), initially non-signaled, unnamed */
	inline obs_captions() :
		stop_event(CreateEvent(nullptr, false, false, nullptr))
	{
	}

	/* Stops and joins the worker (if any) before members are destroyed */
	inline ~obs_captions() {stop();}
};

/* Module-wide caption state; allocated in InitCaptions() and deleted in
 * FreeCaptions() */
static obs_captions *captions = nullptr;

/* ------------------------------------------------------------------------- */

/*
 * Builds the captions dialog: fills the source combo box with an empty entry
 * plus every source that outputs audio, pre-selects the source currently
 * stored in the global caption state, and mirrors the worker's running state
 * into the "enable" checkbox. Signals are blocked while the widgets are
 * populated so the slots below do not fire during setup.
 */
CaptionsDialog::CaptionsDialog(QWidget *parent) :
	QDialog(parent),
	ui(new Ui_CaptionsDialog)
{
	ui->setupUi(this);

	/* held for the whole constructor while the shared state is read */
	lock_guard<recursive_mutex> lock(captions->m);

	/* invoked once per enumerated source; returning true keeps the
	 * enumeration going */
	auto add_source = [this] (obs_source_t *source)
	{
		const uint32_t flags = obs_source_get_output_flags(source);
		QString source_name = obs_source_get_name(source);

		if (flags & OBS_SOURCE_AUDIO)
			ui->source->addItem(source_name);

		OBSWeakSource candidate = OBSGetWeakRef(source);
		if (candidate == captions->source)
			ui->source->setCurrentText(source_name);

		return true;
	};

	using add_source_t = decltype(add_source);

	/* C-compatible trampoline: the closure travels through the opaque
	 * data pointer */
	auto trampoline = [] (void *data, obs_source_t *source)
	{
		return (*static_cast<add_source_t*>(data))(source);
	};

	ui->source->blockSignals(true);
	ui->source->addItem(QStringLiteral(""));
	ui->source->setCurrentIndex(0);
	obs_enum_sources(trampoline, &add_source);
	ui->source->blockSignals(false);

	const bool running = captions->th.joinable();

	ui->enable->blockSignals(true);
	ui->enable->setChecked(running);
	ui->enable->blockSignals(false);
}

void CaptionsDialog::on_source_currentIndexChanged(int)
{
	bool started = captions->th.joinable();
	if (started)
		captions->stop();

	captions->m.lock();
	captions->source_name = ui->source->currentText().toUtf8().constData();
	captions->source = GetWeakSourceByName(captions->source_name.c_str());
	captions->m.unlock();

	if (started)
		captions->start();
}

/*
 * Slot for the "enable" checkbox: starts the caption worker when the box is
 * checked and stops it when the box is cleared.
 */
void CaptionsDialog::on_enable_clicked(bool checked)
{
	if (!checked) {
		captions->stop();
		return;
	}

	captions->start();
}

/* ------------------------------------------------------------------------- */

void obs_captions::main_thread()
try {
	ComPtr<CaptionStream>  audio;
	ComPtr<ISpObjectToken> token;
	ComPtr<ISpRecoGrammar> grammar;
	ComPtr<ISpRecognizer>  recognizer;
	ComPtr<ISpRecoContext> context;
	HRESULT hr;

	auto cb = [&] (const struct audio_data *audio_data,
			bool muted)
	{
		audio->PushAudio(audio_data, muted);
	};

	using cb_t = decltype(cb);

	auto pre_cb = [] (void *param, obs_source_t*,
		const struct audio_data *audio_data, bool muted)
	{
		return (*static_cast<cb_t*>(param))(audio_data, muted);
	};

	os_set_thread_name(__FUNCTION__);

	CoInitialize(nullptr);

	hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=409", nullptr,
			&token);
	if (FAILED(hr))
		throw HRError("SpFindBestToken failed", hr);

	hr = CoCreateInstance(CLSID_SpInprocRecognizer, nullptr, CLSCTX_ALL,
			__uuidof(ISpRecognizer), (void**)&recognizer);
	if (FAILED(hr))
		throw HRError("CoCreateInstance for recognizer failed", hr);

	hr = recognizer->SetRecognizer(token);
	if (FAILED(hr))
		throw HRError("SetRecognizer failed", hr);

	hr = recognizer->SetRecoState(SPRST_INACTIVE);
	if (FAILED(hr))
		throw HRError("SetRecoState(SPRST_INACTIVE) failed", hr);

	hr = recognizer->CreateRecoContext(&context);
	if (FAILED(hr))
		throw HRError("CreateRecoContext failed", hr);

	ULONGLONG interest = SPFEI(SPEI_RECOGNITION) |
		SPFEI(SPEI_END_SR_STREAM);
	hr = context->SetInterest(interest, interest);
	if (FAILED(hr))
		throw HRError("SetInterest failed", hr);

	HANDLE notify;

	hr = context->SetNotifyWin32Event();
	if (FAILED(hr))
		throw HRError("SetNotifyWin32Event", hr);

	notify = context->GetNotifyEventHandle();
	if (notify == INVALID_HANDLE_VALUE)
		throw HRError("GetNotifyEventHandle failed", E_NOINTERFACE);

	size_t sample_rate = audio_output_get_sample_rate(obs_get_audio());
	audio = new CaptionStream((DWORD)sample_rate);
	audio->Release();

	hr = recognizer->SetInput(audio, false);
	if (FAILED(hr))
		throw HRError("SetInput failed", hr);

	hr = context->CreateGrammar(1, &grammar);
	if (FAILED(hr))
		throw HRError("CreateGrammar failed", hr);

	hr = grammar->LoadDictation(nullptr, SPLO_STATIC);
	if (FAILED(hr))
		throw HRError("LoadDictation failed", hr);

	hr = grammar->SetDictationState(SPRS_ACTIVE);
	if (FAILED(hr))
		throw HRError("SetDictationState failed", hr);

	hr = recognizer->SetRecoState(SPRST_ACTIVE);
	if (FAILED(hr))
		throw HRError("SetRecoState(SPRST_ACTIVE) failed", hr);

	HANDLE events[] = {notify, stop_event};

	{
		captions->source = GetWeakSourceByName(
				captions->source_name.c_str());
		OBSSource strong = OBSGetStrongRef(source);
		if (strong)
			obs_source_add_audio_capture_callback(strong,
					pre_cb, &cb);
	}

	for (;;) {
		DWORD ret = WaitForMultipleObjects(2, events, false, INFINITE);
		if (ret != WAIT_OBJECT_0)
			break;

		CSpEvent event;
		bool exit = false;

		while (event.GetFrom(context) == S_OK) {
			if (event.eEventId == SPEI_RECOGNITION) {
				ISpRecoResult *result = event.RecoResult();

				CoTaskMemPtr<wchar_t> text;
				hr = result->GetText((ULONG)-1, (ULONG)-1,
						true, &text, nullptr);
				if (FAILED(hr))
					continue;

				char text_utf8[512];
				os_wcs_to_utf8(text, 0, text_utf8, 512);

				obs_output_t *output =
					obs_frontend_get_streaming_output();
				if (output)
					obs_output_output_caption_text1(output,
							text_utf8);

				debug("\"%s\"", text_utf8);

				obs_output_release(output);

			} else if (event.eEventId == SPEI_END_SR_STREAM) {
				exit = true;
				break;
			}
		}

		if (exit)
			break;
	}

	{
		OBSSource strong = OBSGetStrongRef(source);
		if (strong)
			obs_source_remove_audio_capture_callback(strong,
					pre_cb, &cb);
	}

	audio->Stop();

	CoUninitialize();

} catch (HRError err) {
	error("%s failed: %s (%lX)", __FUNCTION__, err.str, err.hr);
	CoUninitialize();
}

void obs_captions::start()
{
	if (!captions->th.joinable())
		captions->th = thread([] () {captions->main_thread();});
}

/*
 * Asks the caption worker to exit by signaling stop_event and waits for it
 * to finish. Does nothing when no worker thread is attached.
 */
void obs_captions::stop()
{
	if (captions->th.joinable()) {
		SetEvent(captions->stop_event);
		captions->th.join();
	}
}

/* ------------------------------------------------------------------------- */

/*
 * Destroys the global caption state. The obs_captions destructor calls
 * stop(), which reaches the object through the `captions` global, so the
 * pointer is only cleared after the delete has finished.
 */
extern "C" void FreeCaptions()
{
	delete captions;
	captions = nullptr;
}

/*
 * Frontend event callback: tears down the caption state when the frontend
 * reports that it is exiting; every other event is ignored.
 */
static void obs_event(enum obs_frontend_event event, void *)
{
	if (event != OBS_FRONTEND_EVENT_EXIT)
		return;

	FreeCaptions();
}

/*
 * Frontend save/load callback.
 *
 * Saving: writes a "captions" object holding the selected source name
 * ("source") and whether the worker is running ("enabled") into save_data.
 *
 * Loading: stops the worker, restores the source name and weak reference
 * from the "captions" object, and restarts the worker if "enabled" was set.
 */
static void save_caption_data(obs_data_t *save_data, bool saving, void*)
{
	if (saving) {
		lock_guard<recursive_mutex> lock(captions->m);
		obs_data_t *obj = obs_data_create();

		obs_data_set_string(obj, "source",
				captions->source_name.c_str());
		obs_data_set_bool(obj, "enabled", captions->th.joinable());

		obs_data_set_obj(save_data, "captions", obj);
		obs_data_release(obj);
		return;
	}

	/* the worker is stopped before the source it uses is replaced */
	captions->stop();

	bool enabled = false;

	{
		/* scoped lock: released even if the string assignment throws,
		 * and consistent with the saving branch above */
		lock_guard<recursive_mutex> lock(captions->m);

		/* fall back to an empty object so the getters below always
		 * have something to read */
		obs_data_t *obj = obs_data_get_obj(save_data, "captions");
		if (!obj)
			obj = obs_data_create();

		enabled = obs_data_get_bool(obj, "enabled");
		captions->source_name = obs_data_get_string(obj, "source");
		captions->source = GetWeakSourceByName(
				captions->source_name.c_str());
		obs_data_release(obj);
	}

	/* started outside the lock, as in the original flow */
	if (enabled)
		captions->start();
}

extern "C" void InitCaptions()
{
	QAction *action = (QAction*)obs_frontend_add_tools_menu_qaction(
			obs_module_text("Captions"));

	captions = new obs_captions;

	auto cb = [] ()
	{
		obs_frontend_push_ui_translation(obs_module_get_string);

		QWidget *window =
			(QWidget*)obs_frontend_get_main_window();

		CaptionsDialog dialog(window);
		dialog.exec();

		obs_frontend_pop_ui_translation();
	};

	obs_frontend_add_save_callback(save_caption_data, nullptr);
	obs_frontend_add_event_callback(obs_event, nullptr);

	action->connect(action, &QAction::triggered, cb);
}