obs-studio/plugins/win-dshow/tiny-nv12-scale.c
jp9000 6377fe3177 win-dshow: Add Virtual Camera (Windows)
The virtual camera adds the ability to use the output of OBS itself as a
camera that can be selected within other Windows applications.  This is
very loosely based upon the catxfish virtual camera plugin design.

There is a shared memory queue, but instead of having 10-20 frames in
the queue, there are now only 3 frames in the queue to minimize latency
and reduce memory usage.  The third frame is mostly to ensure that
writing does not occur on the same frame being read; the delay is merely
one frame at all times.

The frames of the shared memory queue are NV12 instead of YUYV, which
reduces the memory used and the data copied, as well as eliminating
unnecessary conversion from NV12.  Some programs (such as Chrome, which
uses WebRTC to capture) do not support NV12 however, so an I420
conversion is provided, which is far less expensive than YUYV.  The CPU
cost of NV12 -> I420 is negligible in comparison.

The virtual camera filter itself is based upon the output filter within
the libdshowcapture library, which was originally implemented for other
purposes.  This is more ideal than the Microsoft example code because,
for one, it's far less convoluted; two, it allows us to customize the
filter to our needs a bit more easily; and three, it has much better
RAII.  The Microsoft CBaseFilter/etc code comprises about 30 source
files, whereas the output filter comprises two or three required source
files which we already had, so it's a huge win for compile time.

Scaling is avoided whenever possible to minimize CPU usage.  When the
virtual camera is activated in OBS, the width, height, and frame
interval are saved, that way if the filter is activated, it will always
remember the last OBS resolution/interval that the virtual camera was
activated with, even if OBS is not active.  If for some reason the
filter activates before OBS starts up, and OBS starts up with a
different resolution, it will use simple point scaling in the interim,
and then will remember the new resolution in the future.  The scaler
could use some optimization.  FFmpeg was not chosen because the FFmpeg DLLs
would have to be provided for both architectures, which would be about
30 megabytes in total, and would make writing the plugin much more
painful.  Thus a simple point scaling algorithm is used, and scaling is
avoided whenever possible.

(If another willing participant wants to have a go at improving the
scaling then go for it.  But otherwise, it avoids scaling whenever
possible anyway, so it's not a huge deal)
2020-07-07 06:09:59 -07:00

135 lines
3.2 KiB
C

#include <string.h>
#include "tiny-nv12-scale.h"
/* TODO: optimize this stuff later, or replace with something better. it's
* kind of garbage. although normally it shouldn't be called that often. plus
* it's nearest neighbor so not really a huge deal. at the very least it
* should be sse2 at some point. */
/* Fills in the scaler state `s` with the source and destination frame
 * dimensions and with the flag that selects I420 output instead of NV12
 * output.  The values are stored as given; nothing is validated here. */
void nv12_scale_init(nv12_scale_t *s, bool convert_to_i420, int dst_cx,
		     int dst_cy, int src_cx, int src_cy)
{
	/* input frame size */
	s->src_cx = src_cx;
	s->src_cy = src_cy;

	/* output frame size */
	s->dst_cx = dst_cx;
	s->dst_cy = dst_cy;

	/* output layout selector */
	s->convert_to_i420 = convert_to_i420;
}
/* Nearest-neighbor scale from `src` into `dst_start`, keeping the chroma
 * bytes interleaved in the output.
 *
 * The first src_cx * src_cy bytes of `src` are treated as the luma plane
 * and the bytes after that as rows of interleaved two-byte chroma pairs,
 * each row being src_cx bytes wide.  The output is written sequentially:
 * dst_cx * dst_cy luma bytes followed by (dst_cx / 2) * (dst_cy / 2)
 * chroma pairs.  All index math uses truncating integer division. */
static void nv12_scale_nearest(nv12_scale_t *s, uint8_t *dst_start,
			       const uint8_t *src)
{
	const int in_w = s->src_cx;
	const int in_h = s->src_cy;
	const int out_w = s->dst_cx;
	const int out_h = s->dst_cy;
	uint8_t *out = dst_start;

	/* luma: pick the nearest source row, then the nearest source column */
	for (int row = 0; row < out_h; row++) {
		const uint8_t *in_row = src + (row * in_h / out_h) * in_w;

		for (int col = 0; col < out_w; col++)
			*out++ = in_row[col * in_w / out_w];
	}

	/* chroma starts right after the source luma plane */
	const uint8_t *in_uv = src + in_w * in_h;
	const int half_out_w = out_w / 2;
	const int half_out_h = out_h / 2;

	for (int row = 0; row < half_out_h; row++) {
		const uint8_t *in_row = in_uv + (row * in_h / out_h) * in_w;

		for (int col = 0; col < half_out_w; col++) {
			/* "* 2" converts a pair index into a byte offset */
			const uint8_t *pair = in_row + (col * in_w / out_w) * 2;

			out[0] = pair[0];
			out[1] = pair[1];
			out += 2;
		}
	}
}
/* Nearest-neighbor scale from `src` into `dst_start`, splitting the
 * interleaved source chroma pairs into two separate output planes.
 *
 * The first src_cx * src_cy bytes of `src` are treated as the luma plane
 * and the bytes after that as rows of interleaved two-byte chroma pairs,
 * each row being src_cx bytes wide.  The output is dst_cx * dst_cy luma
 * bytes, then a plane receiving the first byte of every sampled pair,
 * then (dst_cx * dst_cy / 4 bytes further on) a plane receiving the
 * second byte.  All index math uses truncating integer division. */
static void nv12_scale_nearest_to_i420(nv12_scale_t *s, uint8_t *dst_start,
				       const uint8_t *src)
{
	const int in_w = s->src_cx;
	const int in_h = s->src_cy;
	const int out_w = s->dst_cx;
	const int out_h = s->dst_cy;
	const int in_luma_size = in_w * in_h;
	uint8_t *out = dst_start;

	/* luma: pick the nearest source row, then the nearest source column */
	for (int row = 0; row < out_h; row++) {
		const uint8_t *in_row = src + (row * in_h / out_h) * in_w;

		for (int col = 0; col < out_w; col++)
			*out++ = in_row[col * in_w / out_w];
	}

	/* chroma starts right after the source luma plane */
	const uint8_t *in_uv = src + in_luma_size;
	const int half_out_w = out_w / 2;
	const int half_out_h = out_h / 2;

	/* `out` now sits just past the output luma; that is the first chroma
	 * plane, and the second one begins a quarter-frame later */
	uint8_t *first_plane = out;
	uint8_t *second_plane = out + out_w * out_h / 4;

	for (int row = 0; row < half_out_h; row++) {
		const uint8_t *in_row = in_uv + (row * in_h / out_h) * in_w;

		for (int col = 0; col < half_out_w; col++) {
			/* "* 2" converts a pair index into a byte offset */
			const uint8_t *pair = in_row + (col * in_w / out_w) * 2;

			*first_plane++ = pair[0];
			*second_plane++ = pair[1];
		}
	}
}
/* Same-size conversion from interleaved chroma to two separate chroma
 * planes, with no scaling.  Only the source dimensions in `s` are used.
 *
 * The first src_cx * src_cy bytes are copied verbatim.  The bytes that
 * follow in `src_start` are then de-interleaved: even-indexed bytes go to
 * the plane immediately after the copied region, odd-indexed bytes go to
 * the plane after that.  Each output plane is (src_cx * src_cy) / 4 bytes. */
static void nv12_convert_to_i420(nv12_scale_t *s, uint8_t *dst_start,
				 const uint8_t *src_start)
{
	const int luma_size = s->src_cx * s->src_cy;
	const int plane_size = luma_size / 4;

	/* luma is laid out identically in both formats */
	memcpy(dst_start, src_start, luma_size);

	const uint8_t *pairs = src_start + luma_size;
	uint8_t *first_plane = dst_start + luma_size;
	uint8_t *second_plane = first_plane + plane_size;

	for (int i = 0; i < plane_size; i++) {
		first_plane[i] = pairs[2 * i];
		second_plane[i] = pairs[2 * i + 1];
	}
}
/* Processes one frame from `src` into `dst` according to the state in `s`.
 *
 * If the source and destination dimensions differ, one of the
 * nearest-neighbor scalers is used.  If they match, the frame is either
 * converted to separate chroma planes or copied as-is
 * (src_cx * src_cy * 3 / 2 bytes).  In both cases `convert_to_i420`
 * chooses between the I420-producing path and the pass-through layout. */
void nv12_do_scale(nv12_scale_t *s, uint8_t *dst, const uint8_t *src)
{
	const bool same_size = s->src_cx == s->dst_cx &&
			       s->src_cy == s->dst_cy;

	if (!same_size) {
		if (s->convert_to_i420)
			nv12_scale_nearest_to_i420(s, dst, src);
		else
			nv12_scale_nearest(s, dst, src);
		return;
	}

	if (s->convert_to_i420) {
		nv12_convert_to_i420(s, dst, src);
		return;
	}

	/* nothing to scale or convert: straight copy of the whole frame */
	memcpy(dst, src, s->src_cx * s->src_cy * 3 / 2);
}