frontend-tools: Add caption generation tool (windows)

Allows generating captions via the Windows speech recognition API (SAPI). This is currently marked as experimental because speech recognition technology is still less than ideal, and speech recognition in general is probably never going to be anywhere near perfect. Microsoft's speech recognition in particular requires a bit of training via the Windows speech recognition tool to ensure it can dictate better. Clear speech with a good mic is recognized fairly well, but casual speech and/or speaking with a poor microphone will have some significant issues. Captions can often be way off when speaking casually rather than with clear diction.
2016-11-17 05:30:45 -08:00 · 2016-11-17 05:30:45 -08:00 · b8e078f2bf
commit b8e078f2bf
parent 2cbb3c2505
9 changed files with 1064 additions and 2 deletions
--- a/UI/frontend-plugins/frontend-tools/CMakeLists.txt
+++ b/UI/frontend-plugins/frontend-tools/CMakeLists.txt
@ -17,8 +17,13 @@ if(WIN32 OR APPLE)
 		)
 endif()

+configure_file(
+	"${CMAKE_CURRENT_SOURCE_DIR}/frontend-tools-config.h.in"
+	"${CMAKE_BINARY_DIR}/config/frontend-tools-config.h")
+
 set(frontend-tools_HEADERS
 	${frontend-tools_HEADERS}
+	"${CMAKE_BINARY_DIR}/config/frontend-tools-config.h"
 	output-timer.hpp
 	tool-helpers.hpp
 	)
@ -35,6 +40,18 @@ set(frontend-tools_UI
 if(WIN32)
 	set(frontend-tools_PLATFORM_SOURCES
 		auto-scene-switcher-win.cpp)
+
+	if(BUILD_CAPTIONS)
+		set(frontend-tools_PLATFORM_SOURCES
+			${frontend-tools_PLATFORM_SOURCES}
+			captions.cpp
+			captions-stream.cpp)
+		set(frontend-tools_PLATFORM_HEADERS
+			captions.hpp
+			captions-stream.hpp)
+		set(frontend-tools_PLATFORM_UI
+			forms/captions.ui)
+	endif()
 elseif(APPLE)
 	set(frontend-tools_PLATFORM_SOURCES
 		auto-scene-switcher-osx.mm)
@ -45,13 +62,16 @@ elseif(APPLE)
 		${COCOA})
 endif()

-qt5_wrap_ui(frontend-tools_UI_HEADERS ${frontend-tools_UI})
+qt5_wrap_ui(frontend-tools_UI_HEADERS
+	${frontend-tools_UI}
+	${frontend-tools_PLATFORM_UI})

 add_library(frontend-tools MODULE
 	${frontend-tools_HEADERS}
 	${frontend-tools_SOURCES}
-	${frontend-tools_PLATFORM_SOURCES}
 	${frontend-tools_UI_HEADERS}
+	${frontend-tools_PLATFORM_SOURCES}
+	${frontend-tools_PLATFORM_HEADERS}
 	)
 target_link_libraries(frontend-tools
 	${frontend-tools_PLATFORM_LIBS}
--- a/UI/frontend-plugins/frontend-tools/captions-stream.cpp
+++ b/UI/frontend-plugins/frontend-tools/captions-stream.cpp
@ -0,0 +1,418 @@
+#include "captions-stream.hpp"
+#include <mmreg.h>
+#include <util/windows/CoTaskMemPtr.hpp>
+#include <util/threading.h>
+#include <util/base.h>
+
+using namespace std;
+
+#if 0
+#define debugfunc(format, ...) blog(LOG_DEBUG, "[Captions] %s(" format ")", \
+		__FUNCTION__, ##__VA_ARGS__)
+#else
+#define debugfunc(format, ...)
+#endif
+
+/* Sets up the stream with default buffer notification settings and a default
+ * output format of 16 kHz, 16-bit, mono PCM, then creates the resampler that
+ * converts OBS audio to that format. */
+CaptionStream::CaptionStream(DWORD samplerate_) :
+	samplerate(samplerate_),
+	event(CreateEvent(nullptr, false, false, nullptr))
+{
+	buf_info.ulMsMinNotification = 50;
+	buf_info.ulMsBufferSize = 500;
+	buf_info.ulMsEventBias = 0;
+
+	format.wFormatTag = WAVE_FORMAT_PCM;
+	format.nChannels = 1;
+	format.nSamplesPerSec = 16000;
+	format.nAvgBytesPerSec = format.nSamplesPerSec * sizeof(uint16_t);
+	format.nBlockAlign = 2;
+	format.wBitsPerSample = 16;
+	/* cbSize counts only the extra bytes appended after the WAVEFORMATEX
+	 * structure, not the structure itself.  Plain PCM carries no extra
+	 * data, and GetFormat()/GetDefaultFormat() hand out a buffer of
+	 * exactly sizeof(format) bytes, so any non-zero value here would
+	 * describe memory that does not exist. */
+	format.cbSize = 0;
+
+	resampler.Reset(&format);
+}
+
+/* Frees the buffered audio and then wakes one thread waiting on the
+ * condition variable (see Read()). */
+void CaptionStream::Stop()
+{
+	unique_lock<mutex> guard(m);
+	circlebuf_free(buf);
+	guard.unlock();
+
+	cv.notify_one();
+}
+
+/* Resamples one block of source audio to the stream format and appends the
+ * result to the circular buffer.  When `muted` is set the resampled samples
+ * are replaced with silence.  Wakes a waiting Read() once at least
+ * `wait_size` bytes are buffered. */
+void CaptionStream::PushAudio(const struct audio_data *data, bool muted)
+{
+	uint8_t *output[MAX_AV_PLANES] = {};
+	uint32_t frames = data->frames;
+	uint64_t ts_offset;
+	bool ready = false;
+
+	{
+		/* SetFormat() destroys and recreates the resampler while
+		 * holding this mutex, so the resampler must only be used
+		 * under the same mutex; otherwise it could be torn down in
+		 * the middle of this call. */
+		lock_guard<mutex> lock(m);
+
+		audio_resampler_resample(resampler, output, &frames,
+				&ts_offset, data->data, data->frames);
+
+		if (output[0]) {
+			const size_t size = frames * sizeof(int16_t);
+
+			if (muted)
+				memset(output[0], 0, size);
+
+			circlebuf_push_back(buf, output[0], size);
+			write_pos += size;
+
+			if (wait_size && buf->size >= wait_size)
+				ready = true;
+		}
+	}
+
+	/* notify outside of the lock so the reader can take it right away */
+	if (ready)
+		cv.notify_one();
+}
+
+// IUnknown methods
+
+/* Returns a pointer to the requested interface and adds a reference.
+ * Every interface in the ISpAudio inheritance chain is supported.
+ *
+ * Returns E_POINTER when `ppv` is null and E_NOINTERFACE (with *ppv set to
+ * null) for any other interface id. */
+STDMETHODIMP CaptionStream::QueryInterface(REFIID riid, void **ppv)
+{
+	if (!ppv)
+		return E_POINTER;
+
+	if (riid == IID_IUnknown) {
+		*ppv = this;
+
+	} else if (riid == IID_ISequentialStream) {
+		*ppv = (ISequentialStream*)this;
+
+	} else if (riid == IID_IStream) {
+		*ppv = (IStream*)this;
+
+	} else if (riid == IID_ISpStreamFormat) {
+		*ppv = (ISpStreamFormat*)this;
+
+	} else if (riid == IID_ISpAudio) {
+		*ppv = (ISpAudio*)this;
+
+	} else {
+		*ppv = nullptr;
+		return E_NOINTERFACE;
+	}
+
+	AddRef();
+	return NOERROR;
+}
+
+/* Atomically increments the reference count and returns the result. */
+STDMETHODIMP_(ULONG) CaptionStream::AddRef()
+{
+	const long count = os_atomic_inc_long(&refs);
+	return (ULONG)count;
+}
+
+/* Atomically decrements the reference count; the object deletes itself when
+ * the count reaches zero.  Returns the count after the decrement. */
+STDMETHODIMP_(ULONG) CaptionStream::Release()
+{
+	const ULONG remaining = (ULONG)os_atomic_dec_long(&refs);
+
+	if (remaining == 0)
+		delete this;
+
+	return remaining;
+}
+
+// ISequentialStream methods
+
+/* Copies up to `bytes` bytes of buffered audio into `data`.  If less than
+ * `bytes` is buffered, waits once on the condition variable (signalled by
+ * PushAudio() and Stop()) before reading.
+ *
+ * Returns S_OK on a full read, S_FALSE when fewer bytes than requested were
+ * available after waking, and STG_E_INVALIDPOINTER when `data` is null.
+ * `read_bytes` is optional and receives the number of bytes copied. */
+STDMETHODIMP CaptionStream::Read(void *data, ULONG bytes, ULONG *read_bytes)
+{
+	HRESULT hr = S_OK;
+
+	debugfunc("data, %lu, read_bytes", bytes);
+	if (!data)
+		return STG_E_INVALIDPOINTER;
+
+	/* Publish wait_size and test the buffer under one continuous lock.
+	 * Releasing the mutex between those two steps would let PushAudio()
+	 * or Stop() notify before the wait has started, and that wake-up
+	 * would be lost. */
+	unique_lock<mutex> lock(m);
+	wait_size = bytes;
+
+	/* Intentionally a single wait rather than a predicate loop: Stop()
+	 * wakes the reader with an empty buffer, which must end in a short
+	 * read instead of another wait. */
+	if ((size_t)bytes > buf->size)
+		cv.wait(lock);
+
+	if (bytes > (ULONG)buf->size) {
+		bytes = (ULONG)buf->size;
+		hr = S_FALSE;
+	}
+	if (bytes)
+		circlebuf_pop_front(buf, data, bytes);
+	if (read_bytes)
+		*read_bytes = bytes;
+
+	wait_size = 0;
+	pos.QuadPart += bytes;
+	return hr;
+}
+
+/* Writing is not supported; always returns STG_E_INVALIDFUNCTION. */
+STDMETHODIMP CaptionStream::Write(const void *, ULONG bytes,
+		ULONG*)
+{
+	debugfunc("data, %lu, written_bytes", bytes);
+	/* `bytes` is only consumed by debugfunc, which is compiled out */
+	UNUSED_PARAMETER(bytes);
+
+	return STG_E_INVALIDFUNCTION;
+}
+
+// IStream methods
+
+/* Only supports querying the current position, i.e. a move of 0 relative to
+ * STREAM_SEEK_CUR; every other request returns E_NOTIMPL.
+ *
+ * `new_pos` is optional (IStream::Seek allows callers to pass null); when
+ * given it receives the number of bytes read from the stream so far. */
+STDMETHODIMP CaptionStream::Seek(LARGE_INTEGER move, DWORD origin,
+		ULARGE_INTEGER *new_pos)
+{
+	debugfunc("%lld, %lx, new_pos", move.QuadPart, origin);
+
+	if (origin != STREAM_SEEK_CUR || move.QuadPart != 0)
+		return E_NOTIMPL;
+
+	if (new_pos) {
+		/* pos is advanced by Read() while it holds the mutex */
+		lock_guard<mutex> lock(m);
+		*new_pos = pos;
+	}
+
+	return S_OK;
+}
+
+/* Resizing is not supported; always returns STG_E_INVALIDFUNCTION. */
+STDMETHODIMP CaptionStream::SetSize(ULARGE_INTEGER new_size)
+{
+	debugfunc("%llu", new_size);
+	UNUSED_PARAMETER(new_size);
+	return STG_E_INVALIDFUNCTION;
+}
+
+/* Copies up to `bytes` bytes of buffered audio into `stream` without
+ * removing them from the buffer (the data is peeked, not popped).
+ *
+ * `read_bytes` and `written_bytes` are optional and receive the number of
+ * bytes taken from the buffer and the number the target stream reported as
+ * written.  Returns STG_E_INVALIDPOINTER when `stream` is null, otherwise
+ * the result of the target stream's Write(). */
+STDMETHODIMP CaptionStream::CopyTo(IStream *stream, ULARGE_INTEGER bytes,
+		ULARGE_INTEGER *read_bytes,
+		ULARGE_INTEGER *written_bytes)
+{
+	HRESULT hr = S_OK;
+
+	debugfunc("stream, %llu, read_bytes, written_bytes", bytes);
+
+	if (!stream)
+		return STG_E_INVALIDPOINTER;
+
+	ULONG written = 0;
+
+	/* take the lock before looking at buf->size; PushAudio() and Read()
+	 * change it concurrently */
+	lock_guard<mutex> lock(m);
+
+	if (bytes.QuadPart > (ULONGLONG)buf->size)
+		bytes.QuadPart = (ULONGLONG)buf->size;
+
+	const size_t size = (size_t)bytes.QuadPart;
+
+	/* nothing buffered: skip the copy so an empty vector is never
+	 * indexed and no zero-length write is issued */
+	if (size) {
+		/* temp_buf stores int16_t, so round the byte count up */
+		temp_buf.resize((size + sizeof(int16_t) - 1) /
+				sizeof(int16_t));
+		circlebuf_peek_front(buf, temp_buf.data(), size);
+
+		hr = stream->Write(temp_buf.data(), (ULONG)size, &written);
+	}
+
+	if (read_bytes)
+		*read_bytes = bytes;
+	if (written_bytes)
+		written_bytes->QuadPart = written;
+
+	return hr;
+}
+
+/* No-op; always reports success. */
+STDMETHODIMP CaptionStream::Commit(DWORD commit_flags)
+{
+	debugfunc("%lx", commit_flags);
+	UNUSED_PARAMETER(commit_flags);
+	/* TODO? */
+	return S_OK;
+}
+
+/* No-op; always reports success. */
+STDMETHODIMP CaptionStream::Revert(void)
+{
+	debugfunc("");
+	return S_OK;
+}
+
+/* Region locking is not supported; always returns STG_E_INVALIDFUNCTION. */
+STDMETHODIMP CaptionStream::LockRegion(ULARGE_INTEGER offset,
+		ULARGE_INTEGER size, DWORD type)
+{
+	debugfunc("%llu, %llu, %ld", offset, size, type);
+	UNUSED_PARAMETER(offset);
+	UNUSED_PARAMETER(size);
+	UNUSED_PARAMETER(type);
+	/* TODO? */
+	return STG_E_INVALIDFUNCTION;
+}
+
+/* Region unlocking is not supported; always returns
+ * STG_E_INVALIDFUNCTION. */
+STDMETHODIMP CaptionStream::UnlockRegion(ULARGE_INTEGER offset,
+		ULARGE_INTEGER size, DWORD type)
+{
+	debugfunc("%llu, %llu, %ld", offset, size, type);
+	UNUSED_PARAMETER(offset);
+	UNUSED_PARAMETER(size);
+	UNUSED_PARAMETER(type);
+	/* TODO? */
+	return STG_E_INVALIDFUNCTION;
+}
+
+/* Declared as an array rather than a pointer so that sizeof() yields the
+ * size of the whole string, including its terminator. */
+static const wchar_t stat_name[] = L"Caption stream";
+
+/* Fills `stg` with the stream type and the number of bytes currently
+ * buffered.  With STATFLAG_DEFAULT the stream name is returned as well, in
+ * memory obtained from CoTaskMemAlloc.
+ *
+ * Returns E_POINTER when `stg` is null and E_OUTOFMEMORY when the name
+ * cannot be allocated. */
+STDMETHODIMP CaptionStream::Stat(STATSTG *stg, DWORD flag)
+{
+	debugfunc("stg, %lu", flag);
+
+	if (!stg)
+		return E_POINTER;
+
+	lock_guard<mutex> lock(m);
+	*stg = {};
+	stg->type = STGTY_STREAM;
+	stg->cbSize.QuadPart = (ULONGLONG)buf->size;
+
+	if (flag == STATFLAG_DEFAULT) {
+		wchar_t *name = (wchar_t*)CoTaskMemAlloc(sizeof(stat_name));
+		if (!name)
+			return E_OUTOFMEMORY;
+
+		memcpy(name, stat_name, sizeof(stat_name));
+		stg->pwcsName = name;
+	}
+
+	return S_OK;
+}
+
+/* Cloning is not supported.  Sets *stream to null and returns E_NOTIMPL, or
+ * returns E_POINTER when `stream` itself is null. */
+STDMETHODIMP CaptionStream::Clone(IStream **stream)
+{
+	debugfunc("stream");
+
+	if (!stream)
+		return E_POINTER;
+
+	*stream = nullptr;
+	return E_NOTIMPL;
+}
+
+// ISpStreamFormat methods
+
+/* Reports the current stream format.
+ *
+ * On success `*guid` receives SPDFID_WaveFormatEx and `*co_mem_wfex_out` a
+ * CoTaskMemAlloc'ed copy of the format, which the caller must free.  When no
+ * format is set (wFormatTag == 0), `*guid` is zeroed and `*co_mem_wfex_out`
+ * is null.
+ *
+ * Returns E_POINTER when either output pointer is null and E_OUTOFMEMORY
+ * when the copy cannot be allocated. */
+STDMETHODIMP CaptionStream::GetFormat(GUID *guid,
+		WAVEFORMATEX **co_mem_wfex_out)
+{
+	debugfunc("guid, co_mem_wfex_out");
+
+	if (!guid || !co_mem_wfex_out)
+		return E_POINTER;
+
+	if (format.wFormatTag == 0) {
+		*guid = {};
+		*co_mem_wfex_out = nullptr;
+		return S_OK;
+	}
+
+	void *wfex = CoTaskMemAlloc(sizeof(format));
+	if (!wfex) {
+		*co_mem_wfex_out = nullptr;
+		return E_OUTOFMEMORY;
+	}
+
+	memcpy(wfex, &format, sizeof(format));
+
+	/* the format id is an output too, so it must not be left unset */
+	*guid = SPDFID_WaveFormatEx;
+	*co_mem_wfex_out = (WAVEFORMATEX*)wfex;
+	return S_OK;
+}
+
+// ISpAudio methods
+
+/* Records the requested audio state (later reported by GetStatus()); the
+ * reserved argument is ignored.  Always returns S_OK. */
+STDMETHODIMP CaptionStream::SetState(SPAUDIOSTATE state_, ULONGLONG)
+{
+	debugfunc("%lu, reserved", state_);
+	state = state_;
+	return S_OK;
+}
+
+/* Adopts a new output format.  For SPDFID_WaveFormatEx the format is
+ * stored, the resampler is rebuilt for it and the buffer is reserved for
+ * 50 msec of audio; any other format id is accepted without effect.
+ * Returns E_INVALIDARG when `wfex` is null. */
+STDMETHODIMP CaptionStream::SetFormat(REFGUID guid_ref,
+		const WAVEFORMATEX *wfex)
+{
+	debugfunc("guid, wfex");
+	if (wfex == nullptr)
+		return E_INVALIDARG;
+
+	if (!(guid_ref == SPDFID_WaveFormatEx))
+		return S_OK;
+
+	lock_guard<mutex> lock(m);
+	memcpy(&format, wfex, sizeof(format));
+	resampler.Reset(wfex);
+
+	/* 50 msec */
+	const DWORD frames = format.nSamplesPerSec / 20;
+	const DWORD reserve_bytes = frames * format.nBlockAlign;
+	circlebuf_reserve(buf, (size_t)reserve_bytes);
+
+	return S_OK;
+}
+
+/* Reports the buffered byte count, the current state, and the read and
+ * write positions.  Returns E_POINTER when `status` is null. */
+STDMETHODIMP CaptionStream::GetStatus(SPAUDIOSTATUS *status)
+{
+	debugfunc("status");
+
+	if (status == nullptr)
+		return E_POINTER;
+
+	/* TODO? */
+	SPAUDIOSTATUS snapshot = {};
+
+	lock_guard<mutex> lock(m);
+	snapshot.cbNonBlockingIO = (ULONG)buf->size;
+	snapshot.State = state;
+	snapshot.CurSeekPos = pos.QuadPart;
+	snapshot.CurDevicePos = write_pos;
+
+	*status = snapshot;
+	return S_OK;
+}
+
+/* Stores the caller-provided buffer settings; they are only reported back
+ * by GetBufferInfo() and do not otherwise affect buffering yet.
+ * Returns E_INVALIDARG when `buf_info_` is null. */
+STDMETHODIMP CaptionStream::SetBufferInfo(const SPAUDIOBUFFERINFO *buf_info_)
+{
+	debugfunc("buf_info");
+
+	if (!buf_info_)
+		return E_INVALIDARG;
+
+	/* TODO */
+	buf_info = *buf_info_;
+	return S_OK;
+}
+
+/* Copies the stored buffer settings to the caller.  Returns E_POINTER when
+ * `buf_info_` is null. */
+STDMETHODIMP CaptionStream::GetBufferInfo(SPAUDIOBUFFERINFO *buf_info_)
+{
+	debugfunc("buf_info");
+
+	if (buf_info_ == nullptr)
+		return E_POINTER;
+
+	*buf_info_ = buf_info;
+
+	return S_OK;
+}
+
+/* Reports SPDFID_WaveFormatEx together with a CoTaskMemAlloc'ed copy of the
+ * stream's WAVEFORMATEX, which the caller must free.
+ *
+ * Returns E_POINTER when either output pointer is null and E_OUTOFMEMORY
+ * when the copy cannot be allocated. */
+STDMETHODIMP CaptionStream::GetDefaultFormat(GUID *format,
+		WAVEFORMATEX **co_mem_wfex_out)
+{
+	debugfunc("format, co_mem_wfex_out");
+
+	if (!format || !co_mem_wfex_out)
+		return E_POINTER;
+
+	/* The `format` parameter hides the member of the same name, so the
+	 * member has to be named through this->.  Without that, sizeof and
+	 * memcpy would operate on the GUID pointer instead of the wave
+	 * format. */
+	const WAVEFORMATEX &wave_format = this->format;
+
+	void *wfex = CoTaskMemAlloc(sizeof(wave_format));
+	if (!wfex) {
+		*co_mem_wfex_out = nullptr;
+		return E_OUTOFMEMORY;
+	}
+
+	memcpy(wfex, &wave_format, sizeof(wave_format));
+
+	*format = SPDFID_WaveFormatEx;
+	*co_mem_wfex_out = (WAVEFORMATEX*)wfex;
+	return S_OK;
+}
+
+/* Returns the event handle created in the constructor. */
+STDMETHODIMP_(HANDLE) CaptionStream::EventHandle(void)
+{
+	debugfunc("");
+	return event;
+}
+
+/* Returns the value last stored by SetVolumeLevel().  Returns E_POINTER
+ * when `level` is null. */
+STDMETHODIMP CaptionStream::GetVolumeLevel(ULONG *level)
+{
+	debugfunc("level");
+
+	if (level == nullptr)
+		return E_POINTER;
+
+	*level = vol;
+
+	return S_OK;
+}
+
+/* Stores the volume level so GetVolumeLevel() can report it back. */
+STDMETHODIMP CaptionStream::SetVolumeLevel(ULONG level)
+{
+	debugfunc("%lu", level);
+	vol = level;
+	return S_OK;
+}
+
+/* Returns the value last stored by SetBufferNotifySize().  Returns
+ * E_POINTER when `size` is null. */
+STDMETHODIMP CaptionStream::GetBufferNotifySize(ULONG *size)
+{
+	debugfunc("size");
+
+	if (size == nullptr)
+		return E_POINTER;
+
+	*size = notify_size;
+
+	return S_OK;
+}
+
+/* Stores the notify size so GetBufferNotifySize() can report it back. */
+STDMETHODIMP CaptionStream::SetBufferNotifySize(ULONG size)
+{
+	debugfunc("%lu", size);
+	notify_size = size;
+	return S_OK;
+}
--- a/UI/frontend-plugins/frontend-tools/captions-stream.hpp
+++ b/UI/frontend-plugins/frontend-tools/captions-stream.hpp
@ -0,0 +1,119 @@
+#include <windows.h>
+#include <sapi.h>
+#include <condition_variable>
+#include <mutex>
+#include <vector>
+#include <obs.h>
+#include <media-io/audio-resampler.h>
+#include <util/circlebuf.h>
+#include <util/windows/WinHandle.hpp>
+
+#include <fstream>
+
+/* Owning wrapper around a libobs circlebuf: the buffer is freed when the
+ * wrapper is destroyed.  Converts implicitly to circlebuf* so it can be
+ * passed straight to the circlebuf_* functions. */
+class CircleBuf {
+	circlebuf buf = {};
+public:
+	inline CircleBuf() = default;
+	inline ~CircleBuf() {circlebuf_free(&buf);}
+
+	/* non-copyable: a copy would free the same buffer a second time */
+	CircleBuf(const CircleBuf&) = delete;
+	CircleBuf &operator=(const CircleBuf&) = delete;
+
+	inline operator circlebuf*() {return &buf;}
+	inline circlebuf *operator->() {return &buf;}
+};
+
+/* Owning wrapper around a libobs audio resampler that converts from the OBS
+ * audio output format to a WAVEFORMATEX-described 16-bit format.  Converts
+ * implicitly to audio_resampler_t* for use with the audio_resampler_*
+ * functions. */
+class Resampler {
+	audio_resampler_t *resampler = nullptr;
+
+public:
+	inline Resampler() = default;
+
+	/* non-copyable: a copy would destroy the same resampler twice */
+	Resampler(const Resampler&) = delete;
+	Resampler &operator=(const Resampler&) = delete;
+
+	/* Destroys any existing resampler and creates a new one whose source
+	 * is the current OBS audio output format and whose destination uses
+	 * the sample rate and channel count of `wfex` with 16-bit samples. */
+	inline void Reset(const WAVEFORMATEX *wfex)
+	{
+		const struct audio_output_info *aoi =
+			audio_output_get_info(obs_get_audio());
+
+		struct resample_info src;
+		src.samples_per_sec = aoi->samples_per_sec;
+		src.format = aoi->format;
+		src.speakers = aoi->speakers;
+
+		struct resample_info dst;
+		dst.samples_per_sec = uint32_t(wfex->nSamplesPerSec);
+		dst.format = AUDIO_FORMAT_16BIT;
+		/* the channel count is used directly as the speaker layout */
+		dst.speakers = (enum speaker_layout)wfex->nChannels;
+
+		if (resampler)
+			audio_resampler_destroy(resampler);
+		resampler = audio_resampler_create(&dst, &src);
+	}
+
+	inline ~Resampler() {audio_resampler_destroy(resampler);}
+	inline operator audio_resampler_t*() {return resampler;}
+};
+
+/* ISpAudio implementation that feeds OBS audio to the speech recognizer.
+ *
+ * PushAudio() resamples incoming audio into a circular buffer, and Read()
+ * hands that data out, waiting until enough of it is available.  The object
+ * is reference counted: it starts with one reference and deletes itself when
+ * Release() drops the count to zero. */
+class CaptionStream : public ISpAudio {
+	volatile long refs = 1;
+	SPAUDIOBUFFERINFO buf_info = {};
+	ULONG notify_size = 0;
+	/* initialized so GetStatus() never reports an indeterminate state
+	 * before the first SetState() call */
+	SPAUDIOSTATE state = SPAS_CLOSED;
+	WinHandle event;
+	ULONG vol = 0;
+
+	/* cv/m synchronize PushAudio(), Read() and Stop() */
+	std::condition_variable cv;
+	std::mutex m;
+	/* scratch space used by CopyTo() */
+	std::vector<int16_t> temp_buf;
+	WAVEFORMATEX format = {};
+	Resampler resampler;
+
+	CircleBuf buf;
+	/* number of bytes a pending Read() is waiting for (0 when none) */
+	ULONG wait_size = 0;
+	DWORD samplerate = 0;
+	/* total bytes handed out by Read() */
+	ULARGE_INTEGER pos = {};
+	/* total bytes appended by PushAudio() */
+	ULONGLONG write_pos = 0;
+
+public:
+	CaptionStream(DWORD samplerate);
+
+	void Stop();
+	void PushAudio(const struct audio_data *audio_data, bool muted);
+
+	// IUnknown methods
+	STDMETHODIMP QueryInterface(REFIID riid, void **ppv) override;
+	STDMETHODIMP_(ULONG) AddRef() override;
+	STDMETHODIMP_(ULONG) Release() override;
+
+	// ISequentialStream methods
+	STDMETHODIMP Read(void *data, ULONG bytes, ULONG *read_bytes) override;
+	STDMETHODIMP Write(const void *data, ULONG bytes, ULONG *written_bytes)
+		override;
+
+	// IStream methods
+	STDMETHODIMP Seek(LARGE_INTEGER move, DWORD origin,
+			ULARGE_INTEGER *new_pos) override;
+	STDMETHODIMP SetSize(ULARGE_INTEGER new_size) override;
+	STDMETHODIMP CopyTo(IStream *stream, ULARGE_INTEGER bytes,
+			ULARGE_INTEGER *read_bytes,
+			ULARGE_INTEGER *written_bytes) override;
+	STDMETHODIMP Commit(DWORD commit_flags) override;
+	STDMETHODIMP Revert(void) override;
+	STDMETHODIMP LockRegion(ULARGE_INTEGER offset, ULARGE_INTEGER size,
+			DWORD type) override;
+	STDMETHODIMP UnlockRegion(ULARGE_INTEGER offset, ULARGE_INTEGER size,
+			DWORD type) override;
+	STDMETHODIMP Stat(STATSTG *stg, DWORD flags) override;
+	STDMETHODIMP Clone(IStream **stream) override;
+
+	// ISpStreamFormat methods
+	STDMETHODIMP GetFormat(GUID *guid, WAVEFORMATEX **co_mem_wfex_out)
+		override;
+
+	// ISpAudio methods
+	STDMETHODIMP SetState(SPAUDIOSTATE state, ULONGLONG reserved) override;
+	STDMETHODIMP SetFormat(REFGUID guid_ref, const WAVEFORMATEX *wfex)
+		override;
+	STDMETHODIMP GetStatus(SPAUDIOSTATUS *status) override;
+	STDMETHODIMP SetBufferInfo(const SPAUDIOBUFFERINFO *buf_info) override;
+	STDMETHODIMP GetBufferInfo(SPAUDIOBUFFERINFO *buf_info) override;
+	STDMETHODIMP GetDefaultFormat(GUID *format,
+			WAVEFORMATEX **co_mem_wfex_out) override;
+	STDMETHODIMP_(HANDLE) EventHandle(void) override;
+	STDMETHODIMP GetVolumeLevel(ULONG *level) override;
+	STDMETHODIMP SetVolumeLevel(ULONG level) override;
+	STDMETHODIMP GetBufferNotifySize(ULONG *size) override;
+	STDMETHODIMP SetBufferNotifySize(ULONG size) override;
+};
--- a/UI/frontend-plugins/frontend-tools/captions.cpp
+++ b/UI/frontend-plugins/frontend-tools/captions.cpp
@ -0,0 +1,354 @@
+#include <obs-frontend-api.h>
+#include "captions-stream.hpp"
+#include "captions.hpp"
+#include "tool-helpers.hpp"
+#include <sphelper.h>
+#include <util/platform.h>
+#include <util/windows/HRError.hpp>
+#include <util/windows/ComPtr.hpp>
+#include <util/windows/CoTaskMemPtr.hpp>
+#include <util/threading.h>
+#include <obs-module.h>
+
+#include <string>
+#include <thread>
+#include <mutex>
+
+#define do_log(type, format, ...) blog(type, "[Captions] " format, \
+		##__VA_ARGS__)
+
+#define error(format, ...) do_log(LOG_ERROR, format, ##__VA_ARGS__)
+#define debug(format, ...) do_log(LOG_DEBUG, format, ##__VA_ARGS__)
+
+using namespace std;
+
+/* State of the caption tool: the recognition thread, the event used to ask
+ * it to stop, and the audio source selected for captioning. */
+struct obs_captions {
+	/* recognition thread; joinable while captions are running */
+	thread th;
+	/* guards source_name/source against concurrent access */
+	recursive_mutex m;
+	/* signalled by stop() to make main_thread() leave its wait loop */
+	WinHandle stop_event;
+
+	string source_name;
+	OBSWeakSource source;
+
+	void main_thread();
+	void start();
+	void stop();
+
+	inline obs_captions() :
+		stop_event(CreateEvent(nullptr, false, false, nullptr))
+	{
+	}
+
+	/* make sure the thread is joined before the members go away */
+	inline ~obs_captions() {stop();}
+};
+
+static obs_captions *captions = nullptr;
+
+/* ------------------------------------------------------------------------- */
+
+/* Builds the dialog: fills the source combo box with an empty entry plus
+ * every source that has audio, selects the currently configured source, and
+ * reflects whether the caption thread is running in the enable check box. */
+CaptionsDialog::CaptionsDialog(QWidget *parent) :
+	QDialog(parent),
+	ui(new Ui_CaptionsDialog)
+{
+	ui->setupUi(this);
+
+	lock_guard<recursive_mutex> lock(captions->m);
+
+	/* invoked once per source by obs_enum_sources below; returns true
+	 * for every source */
+	auto cb = [this] (obs_source_t *source)
+	{
+		uint32_t caps = obs_source_get_output_flags(source);
+		QString name = obs_source_get_name(source);
+
+		if (caps & OBS_SOURCE_AUDIO)
+			ui->source->addItem(name);
+
+		OBSWeakSource weak = OBSGetWeakRef(source);
+		if (weak == captions->source)
+			ui->source->setCurrentText(name);
+		return true;
+	};
+
+	using cb_t = decltype(cb);
+
+	/* signals are blocked while populating so that the change handlers
+	 * (on_source_currentIndexChanged / on_enable_clicked) do not fire
+	 * for programmatic updates */
+	ui->source->blockSignals(true);
+	ui->source->addItem(QStringLiteral(""));
+	ui->source->setCurrentIndex(0);
+	/* the capturing lambda is passed through the void* parameter and
+	 * called from a captureless trampoline */
+	obs_enum_sources([] (void *data, obs_source_t *source) {
+			return (*static_cast<cb_t*>(data))(source);}, &cb);
+	ui->source->blockSignals(false);
+
+	ui->enable->blockSignals(true);
+	ui->enable->setChecked(captions->th.joinable());
+	ui->enable->blockSignals(false);
+}
+
+/* Switches captioning to the newly selected source.  A running caption
+ * thread is stopped first and restarted afterwards. */
+void CaptionsDialog::on_source_currentIndexChanged(int)
+{
+	const bool was_running = captions->th.joinable();
+	if (was_running)
+		captions->stop();
+
+	{
+		lock_guard<recursive_mutex> lock(captions->m);
+
+		captions->source_name =
+			ui->source->currentText().toUtf8().constData();
+		captions->source = GetWeakSourceByName(
+				captions->source_name.c_str());
+	}
+
+	if (was_running)
+		captions->start();
+}
+
+/* Starts or stops the caption thread to match the check box. */
+void CaptionsDialog::on_enable_clicked(bool checked)
+{
+	if (!checked) {
+		captions->stop();
+		return;
+	}
+
+	captions->start();
+}
+
+/* ------------------------------------------------------------------------- */
+
+/* Body of the caption thread.
+ *
+ * Sets up an in-process SAPI recognizer for language 409 that takes its
+ * input from a CaptionStream, hooks that stream up to the selected source's
+ * audio, and then sends every recognized phrase to the streaming output as
+ * caption text.  Runs until stop_event is signalled, the wait fails, or the
+ * recognizer reports the end of its input stream.
+ *
+ * Any failing setup step is logged and ends the thread. */
+void obs_captions::main_thread()
+try {
+	/* Keeps COM initialized for the lifetime of this scope.  It is
+	 * declared ahead of the COM smart pointers below so that it is
+	 * destroyed after them: the objects are then released while COM is
+	 * still initialized, both on normal exit and when an HRError
+	 * unwinds the stack. */
+	struct com_scope {
+		HRESULT hr;
+
+		inline com_scope() : hr(CoInitialize(nullptr)) {}
+		inline ~com_scope()
+		{
+			if (SUCCEEDED(hr))
+				CoUninitialize();
+		}
+	} com;
+
+	ComPtr<CaptionStream>  audio;
+	ComPtr<ISpObjectToken> token;
+	ComPtr<ISpRecoGrammar> grammar;
+	ComPtr<ISpRecognizer>  recognizer;
+	ComPtr<ISpRecoContext> context;
+	HRESULT hr;
+
+	/* forwards captured source audio into the caption stream */
+	auto cb = [&] (const struct audio_data *audio_data,
+			bool muted)
+	{
+		audio->PushAudio(audio_data, muted);
+	};
+
+	using cb_t = decltype(cb);
+
+	/* captureless trampoline with the signature libobs expects; the
+	 * capturing lambda above is passed through `param` */
+	auto pre_cb = [] (void *param, obs_source_t*,
+		const struct audio_data *audio_data, bool muted)
+	{
+		return (*static_cast<cb_t*>(param))(audio_data, muted);
+	};
+
+	os_set_thread_name(__FUNCTION__);
+
+	hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=409", nullptr,
+			&token);
+	if (FAILED(hr))
+		throw HRError("SpFindBestToken failed", hr);
+
+	hr = CoCreateInstance(CLSID_SpInprocRecognizer, nullptr, CLSCTX_ALL,
+			__uuidof(ISpRecognizer), (void**)&recognizer);
+	if (FAILED(hr))
+		throw HRError("CoCreateInstance for recognizer failed", hr);
+
+	hr = recognizer->SetRecognizer(token);
+	if (FAILED(hr))
+		throw HRError("SetRecognizer failed", hr);
+
+	hr = recognizer->SetRecoState(SPRST_INACTIVE);
+	if (FAILED(hr))
+		throw HRError("SetRecoState(SPRST_INACTIVE) failed", hr);
+
+	hr = recognizer->CreateRecoContext(&context);
+	if (FAILED(hr))
+		throw HRError("CreateRecoContext failed", hr);
+
+	ULONGLONG interest = SPFEI(SPEI_RECOGNITION) |
+		SPFEI(SPEI_END_SR_STREAM);
+	hr = context->SetInterest(interest, interest);
+	if (FAILED(hr))
+		throw HRError("SetInterest failed", hr);
+
+	HANDLE notify;
+
+	hr = context->SetNotifyWin32Event();
+	if (FAILED(hr))
+		throw HRError("SetNotifyWin32Event", hr);
+
+	notify = context->GetNotifyEventHandle();
+	if (notify == INVALID_HANDLE_VALUE)
+		throw HRError("GetNotifyEventHandle failed", E_NOINTERFACE);
+
+	size_t sample_rate = audio_output_get_sample_rate(obs_get_audio());
+	/* the stream starts with one reference and the assignment is
+	 * followed by a Release(), leaving `audio` as the holder */
+	audio = new CaptionStream((DWORD)sample_rate);
+	audio->Release();
+
+	hr = recognizer->SetInput(audio, false);
+	if (FAILED(hr))
+		throw HRError("SetInput failed", hr);
+
+	hr = context->CreateGrammar(1, &grammar);
+	if (FAILED(hr))
+		throw HRError("CreateGrammar failed", hr);
+
+	hr = grammar->LoadDictation(nullptr, SPLO_STATIC);
+	if (FAILED(hr))
+		throw HRError("LoadDictation failed", hr);
+
+	hr = grammar->SetDictationState(SPRS_ACTIVE);
+	if (FAILED(hr))
+		throw HRError("SetDictationState failed", hr);
+
+	hr = recognizer->SetRecoState(SPRST_ACTIVE);
+	if (FAILED(hr))
+		throw HRError("SetRecoState(SPRST_ACTIVE) failed", hr);
+
+	HANDLE events[] = {notify, stop_event};
+
+	{
+		captions->source = GetWeakSourceByName(
+				captions->source_name.c_str());
+		OBSSource strong = OBSGetStrongRef(source);
+		if (strong)
+			obs_source_add_audio_capture_callback(strong,
+					pre_cb, &cb);
+	}
+
+	for (;;) {
+		/* index 0 is the recognizer notification; anything else
+		 * (stop_event or a failed wait) ends the loop */
+		DWORD ret = WaitForMultipleObjects(2, events, false, INFINITE);
+		if (ret != WAIT_OBJECT_0)
+			break;
+
+		CSpEvent event;
+		bool exit = false;
+
+		while (event.GetFrom(context) == S_OK) {
+			if (event.eEventId == SPEI_RECOGNITION) {
+				ISpRecoResult *result = event.RecoResult();
+
+				CoTaskMemPtr<wchar_t> text;
+				hr = result->GetText((ULONG)-1, (ULONG)-1,
+						true, &text, nullptr);
+				if (FAILED(hr))
+					continue;
+
+				char text_utf8[512];
+				os_wcs_to_utf8(text, 0, text_utf8, 512);
+
+				obs_output_t *output =
+					obs_frontend_get_streaming_output();
+				if (output)
+					obs_output_output_caption_text1(output,
+							text_utf8);
+
+				debug("\"%s\"", text_utf8);
+
+				obs_output_release(output);
+
+			} else if (event.eEventId == SPEI_END_SR_STREAM) {
+				exit = true;
+				break;
+			}
+		}
+
+		if (exit)
+			break;
+	}
+
+	{
+		OBSSource strong = OBSGetStrongRef(source);
+		if (strong)
+			obs_source_remove_audio_capture_callback(strong,
+					pre_cb, &cb);
+	}
+
+	audio->Stop();
+
+	/* the COM pointers are released at the end of this scope, followed
+	 * by com_scope uninitializing COM */
+
+} catch (const HRError &err) {
+	/* stack unwinding has already released the COM objects and
+	 * uninitialized COM by the time this handler runs */
+	error("%s failed: %s (%lX)", __FUNCTION__, err.str, err.hr);
+}
+
+/* Launches the caption thread unless one is already running. */
+void obs_captions::start()
+{
+	if (!captions->th.joinable())
+		captions->th = thread([] () {captions->main_thread();});
+}
+
+/* Signals the caption thread to stop and waits for it to finish; does
+ * nothing when no thread is running. */
+void obs_captions::stop()
+{
+	if (captions->th.joinable()) {
+		SetEvent(captions->stop_event);
+		captions->th.join();
+	}
+}
+
+/* ------------------------------------------------------------------------- */
+
+/* Destroys the global caption state (its destructor stops the thread).
+ * Safe to call more than once: the pointer is reset to null afterwards. */
+extern "C" void FreeCaptions()
+{
+	delete captions;
+	captions = nullptr;
+}
+
+/* Frontend event callback: tears the caption state down on exit. */
+static void obs_event(enum obs_frontend_event event, void *)
+{
+	if (event == OBS_FRONTEND_EVENT_EXIT)
+		FreeCaptions();
+}
+
+/* Frontend save/load callback.
+ *
+ * When saving, stores the selected source name and whether captions are
+ * running under the "captions" key of `save_data`.  When loading, stops any
+ * running caption thread, restores the source from that key (an absent key
+ * is treated as an empty object) and restarts captions if they were
+ * enabled. */
+static void save_caption_data(obs_data_t *save_data, bool saving, void*)
+{
+	if (!saving) {
+		captions->stop();
+
+		bool enabled;
+		{
+			lock_guard<recursive_mutex> lock(captions->m);
+
+			obs_data_t *settings =
+				obs_data_get_obj(save_data, "captions");
+			if (!settings)
+				settings = obs_data_create();
+
+			enabled = obs_data_get_bool(settings, "enabled");
+			captions->source_name =
+				obs_data_get_string(settings, "source");
+			captions->source = GetWeakSourceByName(
+					captions->source_name.c_str());
+			obs_data_release(settings);
+		}
+
+		if (enabled)
+			captions->start();
+		return;
+	}
+
+	lock_guard<recursive_mutex> lock(captions->m);
+	obs_data_t *settings = obs_data_create();
+
+	obs_data_set_string(settings, "source",
+			captions->source_name.c_str());
+	obs_data_set_bool(settings, "enabled", captions->th.joinable());
+
+	obs_data_set_obj(save_data, "captions", settings);
+	obs_data_release(settings);
+}
+
+/* Module entry point for the caption tool: adds the "Captions" entry to the
+ * tools menu, creates the global caption state, and registers the save and
+ * frontend event callbacks. */
+extern "C" void InitCaptions()
+{
+	QAction *action = (QAction*)obs_frontend_add_tools_menu_qaction(
+			obs_module_text("Captions"));
+
+	captions = new obs_captions;
+
+	/* opens the captions dialog modally when the menu entry is
+	 * triggered; the translation lookup is pushed only for the
+	 * lifetime of the dialog */
+	auto cb = [] ()
+	{
+		obs_frontend_push_ui_translation(obs_module_get_string);
+
+		QWidget *window =
+			(QWidget*)obs_frontend_get_main_window();
+
+		CaptionsDialog dialog(window);
+		dialog.exec();
+
+		obs_frontend_pop_ui_translation();
+	};
+
+	obs_frontend_add_save_callback(save_caption_data, nullptr);
+	obs_frontend_add_event_callback(obs_event, nullptr);
+
+	action->connect(action, &QAction::triggered, cb);
+}
--- a/UI/frontend-plugins/frontend-tools/captions.hpp
+++ b/UI/frontend-plugins/frontend-tools/captions.hpp
@ -0,0 +1,19 @@
+#pragma once
+
+#include <QDialog>
+#include <memory>
+
+#include "ui_captions.h"
+
+/* Dialog for the caption tool: lets the user pick the audio source and
+ * enable or disable captions. */
+class CaptionsDialog : public QDialog {
+	Q_OBJECT
+
+	/* widgets generated from forms/captions.ui */
+	std::unique_ptr<Ui_CaptionsDialog> ui;
+
+public:
+	CaptionsDialog(QWidget *parent);
+
+public slots:
+	/* slots for the "source" combo box and the "enable" check box */
+	void on_source_currentIndexChanged(int idx);
+	void on_enable_clicked(bool checked);
+};
--- a/UI/frontend-plugins/frontend-tools/data/locale/en-US.ini
+++ b/UI/frontend-plugins/frontend-tools/data/locale/en-US.ini
@ -11,6 +11,9 @@ Inactive="Inactive"
 Start="Start"
 Stop="Stop"

+Captions="Captions (Experimental)"
+Captions.AudioSource="Audio source:"
+
 OutputTimer="Output Timer"
 OutputTimer.Stream="Stop streaming after:"
 OutputTimer.Record="Stop recording after:"
--- a/UI/frontend-plugins/frontend-tools/forms/captions.ui
+++ b/UI/frontend-plugins/frontend-tools/forms/captions.ui
@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>CaptionsDialog</class>
+ <widget class="QDialog" name="CaptionsDialog">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>519</width>
+    <height>140</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>Captions</string>
+  </property>
+  <layout class="QVBoxLayout" name="verticalLayout">
+   <item>
+    <layout class="QFormLayout" name="formLayout">
+     <item row="1" column="0">
+      <widget class="QLabel" name="label">
+       <property name="text">
+        <string>Captions.AudioSource</string>
+       </property>
+      </widget>
+     </item>
+     <item row="1" column="1">
+      <widget class="QComboBox" name="source">
+       <property name="insertPolicy">
+        <enum>QComboBox::InsertAlphabetically</enum>
+       </property>
+      </widget>
+     </item>
+     <item row="0" column="1">
+      <widget class="QCheckBox" name="enable">
+       <property name="text">
+        <string>Enable</string>
+       </property>
+      </widget>
+     </item>
+    </layout>
+   </item>
+   <item>
+    <spacer name="verticalSpacer">
+     <property name="orientation">
+      <enum>Qt::Vertical</enum>
+     </property>
+     <property name="sizeHint" stdset="0">
+      <size>
+       <width>0</width>
+       <height>0</height>
+      </size>
+     </property>
+    </spacer>
+   </item>
+   <item>
+    <layout class="QHBoxLayout" name="horizontalLayout">
+     <item>
+      <spacer name="horizontalSpacer">
+       <property name="orientation">
+        <enum>Qt::Horizontal</enum>
+       </property>
+       <property name="sizeHint" stdset="0">
+        <size>
+         <width>40</width>
+         <height>20</height>
+        </size>
+       </property>
+      </spacer>
+     </item>
+     <item>
+      <widget class="QPushButton" name="accept">
+       <property name="text">
+        <string>OK</string>
+       </property>
+      </widget>
+     </item>
+     <item>
+      <spacer name="horizontalSpacer_2">
+       <property name="orientation">
+        <enum>Qt::Horizontal</enum>
+       </property>
+       <property name="sizeHint" stdset="0">
+        <size>
+         <width>40</width>
+         <height>20</height>
+        </size>
+       </property>
+      </spacer>
+     </item>
+    </layout>
+   </item>
+  </layout>
+ </widget>
+ <resources/>
+ <connections>
+  <connection>
+   <sender>accept</sender>
+   <signal>clicked()</signal>
+   <receiver>CaptionsDialog</receiver>
+   <slot>accept()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>268</x>
+     <y>331</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>229</x>
+     <y>-11</y>
+    </hint>
+   </hints>
+  </connection>
+ </connections>
+</ui>
--- a/UI/frontend-plugins/frontend-tools/frontend-tools-config.h.in
+++ b/UI/frontend-plugins/frontend-tools/frontend-tools-config.h.in
@ -0,0 +1,3 @@
+#pragma once
+
+/* Always defined as 0 or 1 so it can be used directly in an #if expression,
+ * whether the CMake option is unset or set to ON/OFF/TRUE/FALSE. */
+#cmakedefine01 BUILD_CAPTIONS
--- a/UI/frontend-plugins/frontend-tools/frontend-tools.c
+++ b/UI/frontend-plugins/frontend-tools/frontend-tools.c
@ -1,4 +1,5 @@
 #include <obs-module.h>
+#include "frontend-tools-config.h"

 OBS_DECLARE_MODULE()
 OBS_MODULE_USE_DEFAULT_LOCALE("frontend-tools", "en-US")
@ -7,6 +8,12 @@ OBS_MODULE_USE_DEFAULT_LOCALE("frontend-tools", "en-US")
 void InitSceneSwitcher();
 void FreeSceneSwitcher();
 #endif
+
+#if defined(_WIN32) && BUILD_CAPTIONS
+void InitCaptions();
+void FreeCaptions();
+#endif
+
 void InitOutputTimer();
 void FreeOutputTimer();

@ -14,6 +21,9 @@ bool obs_module_load(void)
 {
 #if defined(_WIN32) || defined(__APPLE__)
 	InitSceneSwitcher();
+#endif
+#if defined(_WIN32) && BUILD_CAPTIONS
+	InitCaptions();
 #endif
 	InitOutputTimer();
 	return true;
@ -23,6 +33,9 @@ void obs_module_unload(void)
 {
 #if defined(_WIN32) || defined(__APPLE__)
 	FreeSceneSwitcher();
+#endif
+#if defined(_WIN32) && BUILD_CAPTIONS
+	FreeCaptions();
 #endif
 	FreeOutputTimer();
 }