From 9aacc99b3eeb73fedc291369be4a771a43070715 Mon Sep 17 00:00:00 2001
From: jpark37 <jpark37@users.noreply.github.com>
Date: Fri, 26 Jul 2019 23:21:41 -0700
Subject: [PATCH] libobs: Separate textures for YUV output, fix chroma

The shaders to pack YUV information into the same texture were rather
complicated and suffered from precision issues. Breaking them up into
separate textures makes the shaders much simpler and avoids having to
compute large integer offsets. Unfortunately, the code to handle
multiple textures is not as pleasant, but at least the NV12 rendering
path is no longer separate.

In addition, write chroma samples to "standard" offsets. For I444,
there's no difference, but I420/NV12 formats now have chroma shifted to
the left, matching how 4:2:0 is shown in the H.264 specification.

Intel GPA, SetStablePowerState, Intel HD Graphics 530

Expect speed increase:
I420: 844 us -> 493 us (254 us + 190 us + 274 us)
I444: 837 us -> 747 us (258 us + 276 us + 272 us)
NV12: 450 us -> 368 us (319 us + 168 us)

Expect no change:
NV12 (HW): 580 us (481 us + 166 us) -> 588 us (468 us + 247 us)
RGB: 359 us -> 387 us

Fixes https://obsproject.com/mantis/view.php?id=624
Fixes https://obsproject.com/mantis/view.php?id=1512
---
 libobs/data/format_conversion.effect | 339 +++++++----------------
 libobs/obs-internal.h                |  16 +-
 libobs/obs-video.c                   | 400 +++++++++++++--------------
 libobs/obs.c                         | 281 ++++++++++---------
 4 files changed, 443 insertions(+), 593 deletions(-)

diff --git a/libobs/data/format_conversion.effect b/libobs/data/format_conversion.effect
index f1d284144..0f096cd21 100644
--- a/libobs/data/format_conversion.effect
+++ b/libobs/data/format_conversion.effect
@@ -15,25 +15,12 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
 
-//#define DEBUGGING
-
-uniform float     u_plane_offset;
-uniform float     v_plane_offset;
-
 uniform float     width;
 uniform float     height;
 uniform float     width_i;
-uniform float     height_i;
 uniform float     width_d2;
-uniform float     height_d2;
 uniform float     width_d2_i;
-uniform float     height_d2_i;
-uniform float     input_width;
-uniform float     input_height;
-uniform float     input_width_i;
-uniform float     input_height_i;
 uniform float     input_width_i_d2;
-uniform float     input_height_i_d2;
 
 uniform int       int_width;
 uniform int       int_input_width;
@@ -65,8 +52,17 @@ struct VertTexPos {
 	float4 pos : POSITION;
 };
 
+struct VertTexPosWide {
+	float3 uuv : TEXCOORD0;
+	float4 pos : POSITION;
+};
+
 struct FragTex {
-	float2 uv  : TEXCOORD0;
+	float2 uv : TEXCOORD0;
+};
+
+struct FragTexWide {
+	float3 uuv : TEXCOORD0;
 };
 
 FragPos VSPos(uint id : VERTEXID)
@@ -82,7 +78,7 @@ FragPos VSPos(uint id : VERTEXID)
 	return vert_out;
 }
 
-VertTexPos VSPosTex(uint id : VERTEXID)
+VertTexPos VSTexPos(uint id : VERTEXID)
 {
 	float idHigh = float(id >> 1);
 	float idLow = float(id & uint(1));
@@ -99,225 +95,76 @@ VertTexPos VSPosTex(uint id : VERTEXID)
 	return vert_out;
 }
 
+VertTexPosWide VSTexPosLeft(uint id : VERTEXID)
+{
+	float idHigh = float(id >> 1);
+	float idLow = float(id & uint(1));
+
+	float x = idHigh * 4.0 - 1.0;
+	float y = idLow * 4.0 - 1.0;
+
+	float u_right = idHigh * 2.0;
+	float u_left = u_right - width_i;
+	float v = obs_glsl_compile ? (idLow * 2.0) : (1.0 - idLow * 2.0);
+
+	VertTexPosWide vert_out;
+	vert_out.uuv.x = u_left;
+	vert_out.uuv.y = u_right;
+	vert_out.uuv.z = v;
+	vert_out.pos = float4(x, y, 0.0, 1.0);
+	return vert_out;
+}
+
 /* used to prevent internal GPU precision issues width fmod in particular */
 #define PRECISION_OFFSET 0.2
 
-float4 PSNV12(FragTex frag_in) : TARGET
-{
-	float v_mul = floor(frag_in.uv.y * input_height);
-
-	float byte_offset = floor((v_mul + frag_in.uv.x) * width) * 4.0;
-	byte_offset += PRECISION_OFFSET;
-
-	if (byte_offset < u_plane_offset) {
-#ifdef DEBUGGING
-		return float4(1.0, 1.0, 1.0, 1.0);
-#endif
-
-		float lum_u = floor(fmod(byte_offset, width)) * width_i;
-		float lum_v = floor(byte_offset * width_i)    * height_i;
-
-		/* move to texel centers to sample the 4 pixels properly */
-		lum_u += width_i  * 0.5;
-		lum_v += height_i * 0.5;
-
-		float2 sample_pos0 = float2(lum_u,            lum_v);
-		float2 sample_pos1 = float2(lum_u += width_i, lum_v);
-		float2 sample_pos2 = float2(lum_u += width_i, lum_v);
-		float2 sample_pos3 = float2(lum_u +  width_i, lum_v);
-
-		float3 rgb0 = image.Sample(def_sampler, sample_pos0).rgb;
-		float3 rgb1 = image.Sample(def_sampler, sample_pos1).rgb;
-		float3 rgb2 = image.Sample(def_sampler, sample_pos2).rgb;
-		float3 rgb3 = image.Sample(def_sampler, sample_pos3).rgb;
-
-		float4 out_val = float4(
-			dot(color_vec_y.xyz, rgb0) + color_vec_y.w,
-			dot(color_vec_y.xyz, rgb1) + color_vec_y.w,
-			dot(color_vec_y.xyz, rgb2) + color_vec_y.w,
-			dot(color_vec_y.xyz, rgb3) + color_vec_y.w
-		);
-
-		return out_val;
-	} else {
-#ifdef DEBUGGING
-		return float4(0.5, 0.2, 0.5, 0.2);
-#endif
-
-		float new_offset = byte_offset - u_plane_offset;
-
-		float ch_u = floor(fmod(new_offset, width)) * width_i;
-		float ch_v = floor(new_offset * width_i)    * height_d2_i;
-		float width_i2 = width_i*2.0;
-
-		/* move to the borders of each set of 4 pixels to force it
-		 * to do bilinear averaging */
-		ch_u += width_i;
-		ch_v += height_i;
-
-		float2 sample_pos0 = float2(ch_u,             ch_v);
-		float2 sample_pos1 = float2(ch_u + width_i2,  ch_v);
-
-		float3 rgb0 = image.Sample(def_sampler, sample_pos0).rgb;
-		float3 rgb1 = image.Sample(def_sampler, sample_pos1).rgb;
-
-		return float4(
-			dot(color_vec_u.xyz, rgb0) + color_vec_u.w,
-			dot(color_vec_v.xyz, rgb0) + color_vec_v.w,
-			dot(color_vec_u.xyz, rgb1) + color_vec_u.w,
-			dot(color_vec_v.xyz, rgb1) + color_vec_v.w
-		);
-	}
-}
-
-float PSNV12_Y(FragPos frag_in) : TARGET
+float PS_Y(FragPos frag_in) : TARGET
 {
 	float3 rgb = image.Load(int3(frag_in.pos.xy, 0)).rgb;
 	float y = dot(color_vec_y.xyz, rgb) + color_vec_y.w;
 	return y;
 }
 
-float2 PSNV12_UV(FragTex frag_in) : TARGET
+float2 PS_UV_Wide(FragTexWide frag_in) : TARGET
 {
-	float3 rgb = image.Sample(def_sampler, frag_in.uv).rgb;
+	float3 rgb_left = image.Sample(def_sampler, frag_in.uuv.xz).rgb;
+	float3 rgb_right = image.Sample(def_sampler, frag_in.uuv.yz).rgb;
+	float3 rgb = (rgb_left + rgb_right) * 0.5;
 	float u = dot(color_vec_u.xyz, rgb) + color_vec_u.w;
 	float v = dot(color_vec_v.xyz, rgb) + color_vec_v.w;
 	return float2(u, v);
 }
 
-float4 PSPlanar420(FragTex frag_in) : TARGET
+float PS_U(FragTex frag_in) : TARGET
 {
-	float v_mul = floor(frag_in.uv.y * input_height);
-
-	float byte_offset = floor((v_mul + frag_in.uv.x) * width) * 4.0;
-	byte_offset += PRECISION_OFFSET;
-
-	float2 sample_pos0, sample_pos1, sample_pos2, sample_pos3;
-
-	if (byte_offset < u_plane_offset) {
-#ifdef DEBUGGING
-		return float4(1.0, 1.0, 1.0, 1.0);
-#endif
-
-		float lum_u = floor(fmod(byte_offset, width)) * width_i;
-		float lum_v = floor(byte_offset * width_i)    * height_i;
-
-		/* move to texel centers to sample the 4 pixels properly */
-		lum_u += width_i  * 0.5;
-		lum_v += height_i * 0.5;
-
-		sample_pos0 = float2(lum_u,            lum_v);
-		sample_pos1 = float2(lum_u += width_i, lum_v);
-		sample_pos2 = float2(lum_u += width_i, lum_v);
-		sample_pos3 = float2(lum_u +  width_i, lum_v);
-
-	} else {
-#ifdef DEBUGGING
-		return ((byte_offset < v_plane_offset) ?
-				float4(0.5, 0.5, 0.5, 0.5) :
-				float4(0.2, 0.2, 0.2, 0.2));
-#endif
-
-		float new_offset = byte_offset -
-				((byte_offset < v_plane_offset) ?
-				u_plane_offset : v_plane_offset);
-
-		float ch_u = floor(fmod(new_offset, width_d2)) * width_d2_i;
-		float ch_v = floor(new_offset * width_d2_i)    * height_d2_i;
-		float width_i2 = width_i*2.0;
-
-		/* move to the borders of each set of 4 pixels to force it
-		 * to do bilinear averaging */
-		ch_u += width_i;
-		ch_v += height_i;
-
-		/* set up coordinates for next chroma line, in case
-		 * (width / 2) % 4 == 2, i.e. the current set of 4 pixels is split
-		 * between the current and the next chroma line; do note that the next
-		 * chroma line is two source lines below the current source line */
-		float ch_u_n = 0.   + width_i;
-		float ch_v_n = ch_v + height_i * 3;
-
-		sample_pos0 = float2(ch_u,             ch_v);
-		sample_pos1 = float2(ch_u += width_i2, ch_v);
-
-		ch_u += width_i2;
-		// check if ch_u overflowed the current source and chroma line
-		if (ch_u > 1.0) {
-			sample_pos2 = float2(ch_u_n,            ch_v_n);
-			sample_pos2 = float2(ch_u_n + width_i2, ch_v_n);
-		} else {
-			sample_pos2 = float2(ch_u,            ch_v);
-			sample_pos3 = float2(ch_u + width_i2, ch_v);
-		}
-	}
-
-	float3 rgb0 = image.Sample(def_sampler, sample_pos0).rgb;
-	float3 rgb1 = image.Sample(def_sampler, sample_pos1).rgb;
-	float3 rgb2 = image.Sample(def_sampler, sample_pos2).rgb;
-	float3 rgb3 = image.Sample(def_sampler, sample_pos3).rgb;
-
-	float4 color_vec;
-	if (byte_offset < u_plane_offset)
-		color_vec = color_vec_y;
-	else if (byte_offset < v_plane_offset)
-		color_vec = color_vec_u;
-	else
-		color_vec = color_vec_v;
-
-	return float4(
-		dot(color_vec.xyz, rgb0) + color_vec.w,
-		dot(color_vec.xyz, rgb1) + color_vec.w,
-		dot(color_vec.xyz, rgb2) + color_vec.w,
-		dot(color_vec.xyz, rgb3) + color_vec.w
-	);
+	float3 rgb = image.Sample(def_sampler, frag_in.uv).rgb;
+	float u = dot(color_vec_u.xyz, rgb) + color_vec_u.w;
+	return u;
 }
 
-float4 PSPlanar444(FragTex frag_in) : TARGET
+float PS_V(FragTex frag_in) : TARGET
 {
-	float v_mul = floor(frag_in.uv.y * input_height);
+	float3 rgb = image.Sample(def_sampler, frag_in.uv).rgb;
+	float v = dot(color_vec_v.xyz, rgb) + color_vec_v.w;
+	return v;
+}
 
-	float byte_offset = floor((v_mul + frag_in.uv.x) * width) * 4.0;
-	byte_offset += PRECISION_OFFSET;
+float PS_U_Wide(FragTexWide frag_in) : TARGET
+{
+	float3 rgb_left = image.Sample(def_sampler, frag_in.uuv.xz).rgb;
+	float3 rgb_right = image.Sample(def_sampler, frag_in.uuv.yz).rgb;
+	float3 rgb = (rgb_left + rgb_right) * 0.5;
+	float u = dot(color_vec_u.xyz, rgb) + color_vec_u.w;
+	return u;
+}
 
-	float new_byte_offset = byte_offset;
-
-	if (byte_offset >= v_plane_offset)
-		new_byte_offset -= v_plane_offset;
-	else if (byte_offset >= u_plane_offset)
-		new_byte_offset -= u_plane_offset;
-
-	float u_val = floor(fmod(new_byte_offset, width)) * width_i;
-	float v_val = floor(new_byte_offset * width_i)    * height_i;
-
-	/* move to texel centers to sample the 4 pixels properly */
-	u_val += width_i  * 0.5;
-	v_val += height_i * 0.5;
-
-	float2 sample_pos0 = float2(u_val,            v_val);
-	float2 sample_pos1 = float2(u_val += width_i, v_val);
-	float2 sample_pos2 = float2(u_val += width_i, v_val);
-	float2 sample_pos3 = float2(u_val +  width_i, v_val);
-
-	float3 rgb0 = image.Sample(def_sampler, sample_pos0).rgb;
-	float3 rgb1 = image.Sample(def_sampler, sample_pos1).rgb;
-	float3 rgb2 = image.Sample(def_sampler, sample_pos2).rgb;
-	float3 rgb3 = image.Sample(def_sampler, sample_pos3).rgb;
-
-	float4 color_vec;
-	if (byte_offset < u_plane_offset)
-		color_vec = color_vec_y;
-	else if (byte_offset < v_plane_offset)
-		color_vec = color_vec_u;
-	else
-		color_vec = color_vec_v;
-
-	return float4(
-		dot(color_vec.xyz, rgb0) + color_vec.w,
-		dot(color_vec.xyz, rgb1) + color_vec.w,
-		dot(color_vec.xyz, rgb2) + color_vec.w,
-		dot(color_vec.xyz, rgb3) + color_vec.w
-	);
+float PS_V_Wide(FragTexWide frag_in) : TARGET
+{
+	float3 rgb_left = image.Sample(def_sampler, frag_in.uuv.xz).rgb;
+	float3 rgb_right = image.Sample(def_sampler, frag_in.uuv.yz).rgb;
+	float3 rgb = (rgb_left + rgb_right) * 0.5;
+	float v = dot(color_vec_v.xyz, rgb) + color_vec_v.w;
+	return v;
 }
 
 float GetIntOffsetColor(int offset)
@@ -473,30 +320,48 @@ float4 PSBGR3_Full(FragTex frag_in) : TARGET
 	return float4(rgb, 1.0);
 }
 
-technique Planar420
+technique Planar_Y
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
-		pixel_shader  = PSPlanar420(frag_in);
+		vertex_shader = VSPos(id);
+		pixel_shader  = PS_Y(frag_in);
 	}
 }
 
-technique Planar444
+technique Planar_U
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
-		pixel_shader  = PSPlanar444(frag_in);
+		vertex_shader = VSTexPos(id);
+		pixel_shader  = PS_U(frag_in);
 	}
 }
 
-technique NV12
+technique Planar_V
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
-		pixel_shader  = PSNV12(frag_in);
+		vertex_shader = VSTexPos(id);
+		pixel_shader  = PS_V(frag_in);
+	}
+}
+
+technique Planar_U_Left
+{
+	pass
+	{
+		vertex_shader = VSTexPosLeft(id);
+		pixel_shader  = PS_U_Wide(frag_in);
+	}
+}
+
+technique Planar_V_Left
+{
+	pass
+	{
+		vertex_shader = VSTexPosLeft(id);
+		pixel_shader  = PS_V_Wide(frag_in);
 	}
 }
 
@@ -505,7 +370,7 @@ technique NV12_Y
 	pass
 	{
 		vertex_shader = VSPos(id);
-		pixel_shader  = PSNV12_Y(frag_in);
+		pixel_shader  = PS_Y(frag_in);
 	}
 }
 
@@ -513,8 +378,8 @@ technique NV12_UV
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
-		pixel_shader  = PSNV12_UV(frag_in);
+		vertex_shader = VSTexPosLeft(id);
+		pixel_shader  = PS_UV_Wide(frag_in);
 	}
 }
 
@@ -522,7 +387,7 @@ technique UYVY_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSPacked422_Reverse(frag_in, 2, 0, 1, 3);
 	}
 }
@@ -531,7 +396,7 @@ technique YUY2_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSPacked422_Reverse(frag_in, 1, 3, 2, 0);
 	}
 }
@@ -540,7 +405,7 @@ technique YVYU_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSPacked422_Reverse(frag_in, 3, 1, 2, 0);
 	}
 }
@@ -549,7 +414,7 @@ technique I420_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSPlanar420_Reverse(frag_in);
 	}
 }
@@ -558,7 +423,7 @@ technique I422_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSPlanar422_Reverse(frag_in);
 	}
 }
@@ -567,7 +432,7 @@ technique I444_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSPlanar444_Reverse(frag_in);
 	}
 }
@@ -576,7 +441,7 @@ technique NV12_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSNV12_Reverse(frag_in);
 	}
 }
@@ -585,7 +450,7 @@ technique Y800_Limited
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSY800_Limited(frag_in);
 	}
 }
@@ -594,7 +459,7 @@ technique Y800_Full
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSY800_Full(frag_in);
 	}
 }
@@ -603,7 +468,7 @@ technique RGB_Limited
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSRGB_Limited(frag_in);
 	}
 }
@@ -612,7 +477,7 @@ technique BGR3_Limited
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSBGR3_Limited(frag_in);
 	}
 }
@@ -621,7 +486,7 @@ technique BGR3_Full
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSBGR3_Full(frag_in);
 	}
 }
diff --git a/libobs/obs-internal.h b/libobs/obs-internal.h
index 06377c25f..4efb93469 100644
--- a/libobs/obs-internal.h
+++ b/libobs/obs-internal.h
@@ -37,6 +37,7 @@
 #include "obs.h"
 
 #define NUM_TEXTURES 2
+#define NUM_CHANNELS 3
 #define MICROSECOND_DEN 1000000
 #define NUM_ENCODE_TEXTURES 3
 #define NUM_ENCODE_TEXTURE_FRAMES_TO_WAIT 1
@@ -235,11 +236,10 @@ struct obs_tex_frame {
 
 struct obs_core_video {
 	graphics_t *graphics;
-	gs_stagesurf_t *copy_surfaces[NUM_TEXTURES];
+	gs_stagesurf_t *copy_surfaces[NUM_TEXTURES][NUM_CHANNELS];
 	gs_texture_t *render_texture;
 	gs_texture_t *output_texture;
-	gs_texture_t *convert_texture;
-	gs_texture_t *convert_uv_texture;
+	gs_texture_t *convert_textures[NUM_CHANNELS];
 	bool texture_rendered;
 	bool textures_copied[NUM_TEXTURES];
 	bool texture_converted;
@@ -258,7 +258,7 @@ struct obs_core_video {
 	gs_effect_t *bilinear_lowres_effect;
 	gs_effect_t *premultiplied_alpha_effect;
 	gs_samplerstate_t *point_sampler;
-	gs_stagesurf_t *mapped_surface;
+	gs_stagesurf_t *mapped_surfaces[NUM_CHANNELS];
 	int cur_texture;
 	long raw_active;
 	long gpu_encoder_active;
@@ -283,11 +283,9 @@ struct obs_core_video {
 	bool thread_initialized;
 
 	bool gpu_conversion;
-	const char *conversion_tech;
-	uint32_t conversion_height;
-	uint32_t plane_offsets[3];
-	uint32_t plane_sizes[3];
-	uint32_t plane_linewidth[3];
+	const char *conversion_techs[NUM_CHANNELS];
+	bool conversion_needed;
+	float conversion_width_i;
 
 	uint32_t output_width;
 	uint32_t output_height;
diff --git a/libobs/obs-video.c b/libobs/obs-video.c
index b402c3be8..99bf4f9fe 100644
--- a/libobs/obs-video.c
+++ b/libobs/obs-video.c
@@ -109,9 +109,11 @@ static inline void set_render_size(uint32_t width, uint32_t height)
 
 static inline void unmap_last_surface(struct obs_core_video *video)
 {
-	if (video->mapped_surface) {
-		gs_stagesurface_unmap(video->mapped_surface);
-		video->mapped_surface = NULL;
+	for (int c = 0; c < NUM_CHANNELS; ++c) {
+		if (video->mapped_surfaces[c]) {
+			gs_stagesurface_unmap(video->mapped_surfaces[c]);
+			video->mapped_surfaces[c] = NULL;
+		}
 	}
 }
 
@@ -264,10 +266,24 @@ static inline gs_texture_t *render_output_texture(struct obs_core_video *video)
 	return target;
 }
 
-static inline void set_eparam(gs_effect_t *effect, const char *name, float val)
+static void render_convert_plane(gs_effect_t *effect, gs_texture_t *texture,
+				 gs_texture_t *target, const char *tech_name)
 {
-	gs_eparam_t *param = gs_effect_get_param_by_name(effect, name);
-	gs_effect_set_float(param, val);
+	gs_technique_t *tech = gs_effect_get_technique(effect, tech_name);
+
+	const uint32_t width = gs_texture_get_width(target);
+	const uint32_t height = gs_texture_get_height(target);
+
+	gs_set_render_target(target, NULL);
+	set_render_size(width, height);
+
+	size_t passes = gs_technique_begin(tech);
+	for (size_t i = 0; i < passes; i++) {
+		gs_technique_begin_pass(tech, i);
+		gs_draw(GS_TRIS, 0, 3);
+		gs_technique_end_pass(tech);
+	}
+	gs_technique_end(tech);
 }
 
 static const char *render_convert_texture_name = "render_convert_texture";
@@ -276,11 +292,6 @@ static void render_convert_texture(struct obs_core_video *video,
 {
 	profile_start(render_convert_texture_name);
 
-	gs_texture_t *target = video->convert_texture;
-	float fwidth = (float)video->output_width;
-	float fheight = (float)video->output_height;
-	size_t passes, i;
-
 	gs_effect_t *effect = video->conversion_effect;
 	gs_eparam_t *color_vec_y =
 		gs_effect_get_param_by_name(effect, "color_vec_y");
@@ -289,20 +300,7 @@ static void render_convert_texture(struct obs_core_video *video,
 	gs_eparam_t *color_vec_v =
 		gs_effect_get_param_by_name(effect, "color_vec_v");
 	gs_eparam_t *image = gs_effect_get_param_by_name(effect, "image");
-	gs_technique_t *tech =
-		gs_effect_get_technique(effect, video->conversion_tech);
-
-	set_eparam(effect, "u_plane_offset", (float)video->plane_offsets[1]);
-	set_eparam(effect, "v_plane_offset", (float)video->plane_offsets[2]);
-	set_eparam(effect, "width", fwidth);
-	set_eparam(effect, "height", fheight);
-	set_eparam(effect, "width_i", 1.0f / fwidth);
-	set_eparam(effect, "height_i", 1.0f / fheight);
-	set_eparam(effect, "width_d2", fwidth * 0.5f);
-	set_eparam(effect, "height_d2", fheight * 0.5f);
-	set_eparam(effect, "width_d2_i", 1.0f / (fwidth * 0.5f));
-	set_eparam(effect, "height_d2_i", 1.0f / (fheight * 0.5f));
-	set_eparam(effect, "input_height", (float)video->conversion_height);
+	gs_eparam_t *width_i = gs_effect_get_param_by_name(effect, "width_i");
 
 	struct vec4 vec_y, vec_u, vec_v;
 	vec4_set(&vec_y, video->color_matrix[4], video->color_matrix[5],
@@ -311,23 +309,39 @@ static void render_convert_texture(struct obs_core_video *video,
 		 video->color_matrix[2], video->color_matrix[3]);
 	vec4_set(&vec_v, video->color_matrix[8], video->color_matrix[9],
 		 video->color_matrix[10], video->color_matrix[11]);
-	gs_effect_set_vec4(color_vec_y, &vec_y);
-	gs_effect_set_vec4(color_vec_u, &vec_u);
-	gs_effect_set_vec4(color_vec_v, &vec_v);
-
-	gs_effect_set_texture(image, texture);
-
-	gs_set_render_target(target, NULL);
-	set_render_size(video->output_width, video->conversion_height);
 
 	gs_enable_blending(false);
-	passes = gs_technique_begin(tech);
-	for (i = 0; i < passes; i++) {
-		gs_technique_begin_pass(tech, i);
-		gs_draw(GS_TRIS, 0, 3);
-		gs_technique_end_pass(tech);
+
+	if (video->convert_textures[0]) {
+		gs_effect_set_texture(image, texture);
+		gs_effect_set_vec4(color_vec_y, &vec_y);
+		render_convert_plane(effect, texture,
+				     video->convert_textures[0],
+				     video->conversion_techs[0]);
+
+		if (video->convert_textures[1]) {
+			gs_effect_set_texture(image, texture);
+			gs_effect_set_vec4(color_vec_u, &vec_u);
+			if (!video->convert_textures[2])
+				gs_effect_set_vec4(color_vec_v, &vec_v);
+			gs_effect_set_float(width_i, video->conversion_width_i);
+			render_convert_plane(effect, texture,
+					     video->convert_textures[1],
+					     video->conversion_techs[1]);
+
+			if (video->convert_textures[2]) {
+				gs_effect_set_texture(image, texture);
+				gs_effect_set_vec4(color_vec_v, &vec_v);
+				gs_effect_set_float(width_i,
+						    video->conversion_width_i);
+				render_convert_plane(
+					effect, texture,
+					video->convert_textures[2],
+					video->conversion_techs[2]);
+			}
+		}
 	}
-	gs_technique_end(tech);
+
 	gs_enable_blending(true);
 
 	video->texture_converted = true;
@@ -335,90 +349,32 @@ static void render_convert_texture(struct obs_core_video *video,
 	profile_end(render_convert_texture_name);
 }
 
-static void render_nv12(struct obs_core_video *video, gs_texture_t *texture,
-			gs_texture_t *target, const char *tech_name,
-			uint32_t width, uint32_t height)
-{
-	gs_effect_t *effect = video->conversion_effect;
-	gs_eparam_t *color_vec_y =
-		gs_effect_get_param_by_name(effect, "color_vec_y");
-	gs_eparam_t *color_vec_u =
-		gs_effect_get_param_by_name(effect, "color_vec_u");
-	gs_eparam_t *color_vec_v =
-		gs_effect_get_param_by_name(effect, "color_vec_v");
-	gs_eparam_t *image = gs_effect_get_param_by_name(effect, "image");
-	gs_technique_t *tech = gs_effect_get_technique(effect, tech_name);
-	size_t passes, i;
-
-	struct vec4 vec_y, vec_u, vec_v;
-	vec4_set(&vec_y, video->color_matrix[4], video->color_matrix[5],
-		 video->color_matrix[6], video->color_matrix[7]);
-	vec4_set(&vec_u, video->color_matrix[0], video->color_matrix[1],
-		 video->color_matrix[2], video->color_matrix[3]);
-	vec4_set(&vec_v, video->color_matrix[8], video->color_matrix[9],
-		 video->color_matrix[10], video->color_matrix[11]);
-	gs_effect_set_vec4(color_vec_y, &vec_y);
-	gs_effect_set_vec4(color_vec_u, &vec_u);
-	gs_effect_set_vec4(color_vec_v, &vec_v);
-
-	gs_effect_set_texture(image, texture);
-
-	gs_set_render_target(target, NULL);
-	set_render_size(width, height);
-
-	gs_enable_blending(false);
-	passes = gs_technique_begin(tech);
-	for (i = 0; i < passes; i++) {
-		gs_technique_begin_pass(tech, i);
-		gs_draw(GS_TRIS, 0, 3);
-		gs_technique_end_pass(tech);
-	}
-	gs_technique_end(tech);
-	gs_enable_blending(true);
-}
-
-static const char *render_convert_nv12_name = "render_convert_texture_nv12";
-static void render_convert_texture_nv12(struct obs_core_video *video,
-					gs_texture_t *texture)
-{
-	profile_start(render_convert_nv12_name);
-
-	render_nv12(video, texture, video->convert_texture, "NV12_Y",
-		    video->output_width, video->output_height);
-	render_nv12(video, texture, video->convert_uv_texture, "NV12_UV",
-		    video->output_width / 2, video->output_height / 2);
-
-	video->texture_converted = true;
-
-	profile_end(render_convert_nv12_name);
-}
-
 static const char *stage_output_texture_name = "stage_output_texture";
 static inline void stage_output_texture(struct obs_core_video *video,
-					gs_texture_t *texture, int cur_texture)
+					int cur_texture)
 {
 	profile_start(stage_output_texture_name);
 
-	bool texture_ready;
-	gs_stagesurf_t *copy = video->copy_surfaces[cur_texture];
-
-	if (video->gpu_conversion) {
-		texture = video->convert_texture;
-		texture_ready = video->texture_converted;
-	} else {
-		texture_ready = true;
-	}
-
 	unmap_last_surface(video);
 
-	if (!texture_ready)
-		goto end;
+	if (!video->gpu_conversion) {
+		gs_stagesurf_t *copy = video->copy_surfaces[cur_texture][0];
+		if (copy)
+			gs_stage_texture(copy, video->output_texture);
 
-	gs_stage_texture(copy, texture);
+		video->textures_copied[cur_texture] = true;
+	} else if (video->texture_converted) {
+		for (int i = 0; i < NUM_CHANNELS; i++) {
+			gs_stagesurf_t *copy =
+				video->copy_surfaces[cur_texture][i];
+			if (copy)
+				gs_stage_texture(copy,
+						 video->convert_textures[i]);
+		}
 
-	video->textures_copied[cur_texture] = true;
+		video->textures_copied[cur_texture] = true;
+	}
 
-end:
 	profile_end(stage_output_texture_name);
 }
 
@@ -458,13 +414,13 @@ static inline bool queue_frame(struct obs_core_video *video, bool raw_active,
 	 * reason.  otherwise, it goes to the 'duplicate' case above, which
 	 * will ensure better performance. */
 	if (raw_active || vframe_info->count > 1) {
-		gs_copy_texture(tf.tex, video->convert_texture);
+		gs_copy_texture(tf.tex, video->convert_textures[0]);
 	} else {
-		gs_texture_t *tex = video->convert_texture;
-		gs_texture_t *tex_uv = video->convert_uv_texture;
+		gs_texture_t *tex = video->convert_textures[0];
+		gs_texture_t *tex_uv = video->convert_textures[1];
 
-		video->convert_texture = tf.tex;
-		video->convert_uv_texture = tf.tex_uv;
+		video->convert_textures[0] = tf.tex;
+		video->convert_textures[1] = tf.tex_uv;
 
 		tf.tex = tex;
 		tf.tex_uv = tex_uv;
@@ -529,17 +485,12 @@ static inline void render_video(struct obs_core_video *video, bool raw_active,
 		gs_texture_t *texture = render_output_texture(video);
 
 #ifdef _WIN32
-		if (gpu_active) {
+		if (gpu_active)
 			gs_flush();
-		}
 #endif
 
-		if (video->gpu_conversion) {
-			if (video->using_nv12_tex)
-				render_convert_texture_nv12(video, texture);
-			else
-				render_convert_texture(video, texture);
-		}
+		if (video->gpu_conversion)
+			render_convert_texture(video, texture);
 
 #ifdef _WIN32
 		if (gpu_active) {
@@ -547,8 +498,9 @@ static inline void render_video(struct obs_core_video *video, bool raw_active,
 			output_gpu_encoders(video, raw_active);
 		}
 #endif
+
 		if (raw_active)
-			stage_output_texture(video, texture, cur_texture);
+			stage_output_texture(video, cur_texture);
 	}
 
 	gs_set_render_target(NULL, NULL);
@@ -560,73 +512,41 @@ static inline void render_video(struct obs_core_video *video, bool raw_active,
 static inline bool download_frame(struct obs_core_video *video,
 				  int prev_texture, struct video_data *frame)
 {
-	gs_stagesurf_t *surface = video->copy_surfaces[prev_texture];
-
 	if (!video->textures_copied[prev_texture])
 		return false;
 
-	if (!gs_stagesurface_map(surface, &frame->data[0], &frame->linesize[0]))
-		return false;
+	for (int channel = 0; channel < NUM_CHANNELS; ++channel) {
+		gs_stagesurf_t *surface =
+			video->copy_surfaces[prev_texture][channel];
+		if (surface) {
+			if (!gs_stagesurface_map(surface, &frame->data[channel],
+						 &frame->linesize[channel]))
+				return false;
 
-	video->mapped_surface = surface;
+			video->mapped_surfaces[channel] = surface;
+		}
+	}
 	return true;
 }
 
-static inline uint32_t calc_linesize(uint32_t pos, uint32_t linesize)
+static const uint8_t *set_gpu_converted_plane(uint32_t width, uint32_t height,
+					      uint32_t linesize_input,
+					      uint32_t linesize_output,
+					      const uint8_t *in, uint8_t *out)
 {
-	uint32_t size = pos % linesize;
-	return size ? size : linesize;
-}
-
-static void copy_dealign(uint8_t *dst, uint32_t dst_pos, uint32_t dst_linesize,
-			 const uint8_t *src, uint32_t src_pos,
-			 uint32_t src_linesize, uint32_t remaining)
-{
-	while (remaining) {
-		uint32_t src_remainder = src_pos % src_linesize;
-		uint32_t dst_offset = dst_linesize - src_remainder;
-		uint32_t src_offset = src_linesize - src_remainder;
-
-		if (remaining < dst_offset) {
-			memcpy(dst + dst_pos, src + src_pos, remaining);
-			src_pos += remaining;
-			dst_pos += remaining;
-			remaining = 0;
-		} else {
-			memcpy(dst + dst_pos, src + src_pos, dst_offset);
-			src_pos += src_offset;
-			dst_pos += dst_offset;
-			remaining -= dst_offset;
+	if ((width == linesize_input) && (width == linesize_output)) {
+		size_t total = width * height;
+		memcpy(out, in, total);
+		in += total;
+	} else {
+		for (size_t y = 0; y < height; y++) {
+			memcpy(out, in, width);
+			out += linesize_output;
+			in += linesize_input;
 		}
 	}
-}
 
-static inline uint32_t make_aligned_linesize_offset(uint32_t offset,
-						    uint32_t dst_linesize,
-						    uint32_t src_linesize)
-{
-	uint32_t remainder = offset % dst_linesize;
-	return (offset / dst_linesize) * src_linesize + remainder;
-}
-
-static void fix_gpu_converted_alignment(struct obs_core_video *video,
-					struct video_frame *output,
-					const struct video_data *input)
-{
-	uint32_t src_linesize = input->linesize[0];
-	uint32_t dst_linesize = output->linesize[0] * 4;
-	uint32_t src_pos = 0;
-
-	for (size_t i = 0; i < 3; i++) {
-		if (video->plane_linewidth[i] == 0)
-			break;
-
-		src_pos = make_aligned_linesize_offset(
-			video->plane_offsets[i], dst_linesize, src_linesize);
-
-		copy_dealign(output->data[i], 0, dst_linesize, input->data[0],
-			     src_pos, src_linesize, video->plane_sizes[i]);
-	}
+	return in;
 }
 
 static void set_gpu_converted_data(struct obs_core_video *video,
@@ -634,41 +554,91 @@ static void set_gpu_converted_data(struct obs_core_video *video,
 				   const struct video_data *input,
 				   const struct video_output_info *info)
 {
-	if (input->linesize[0] == video->output_width * 4) {
-		struct video_frame frame;
+	if (video->using_nv12_tex) {
+		const uint32_t width = info->width;
+		const uint32_t height = info->height;
 
-		for (size_t i = 0; i < 3; i++) {
-			if (video->plane_linewidth[i] == 0)
-				break;
-
-			frame.linesize[i] = video->plane_linewidth[i];
-			frame.data[i] =
-				input->data[0] + video->plane_offsets[i];
-		}
-
-		video_frame_copy(output, &frame, info->format, info->height);
-
-	} else if (video->using_nv12_tex) {
-		size_t width = info->width;
-		size_t height = info->height;
-		size_t height_d2 = height / 2;
-		uint8_t *out_y = output->data[0];
-		uint8_t *out_uv = output->data[1];
-		uint8_t *in = input->data[0];
-
-		for (size_t y = 0; y < height; y++) {
-			memcpy(out_y, in, width);
-			out_y += output->linesize[0];
-			in += input->linesize[0];
-		}
-		for (size_t y = 0; y < height_d2; y++) {
-			memcpy(out_uv, in, width);
-			out_uv += output->linesize[0];
-			in += input->linesize[0];
-		}
+		const uint8_t *const in_uv = set_gpu_converted_plane(
+			width, height, input->linesize[0], output->linesize[0],
+			input->data[0], output->data[0]);
 
+		const uint32_t height_d2 = height / 2;
+		set_gpu_converted_plane(width, height_d2, input->linesize[0],
+					output->linesize[1], in_uv,
+					output->data[1]);
 	} else {
-		fix_gpu_converted_alignment(video, output, input);
+		switch (info->format) {
+		case VIDEO_FORMAT_I420: {
+			const uint32_t width = info->width;
+			const uint32_t height = info->height;
+
+			set_gpu_converted_plane(width, height,
+						input->linesize[0],
+						output->linesize[0],
+						input->data[0],
+						output->data[0]);
+
+			const uint32_t width_d2 = width / 2;
+			const uint32_t height_d2 = height / 2;
+
+			set_gpu_converted_plane(width_d2, height_d2,
+						input->linesize[1],
+						output->linesize[1],
+						input->data[1],
+						output->data[1]);
+
+			set_gpu_converted_plane(width_d2, height_d2,
+						input->linesize[2],
+						output->linesize[2],
+						input->data[2],
+						output->data[2]);
+
+			break;
+		}
+		case VIDEO_FORMAT_NV12: {
+			const uint32_t width = info->width;
+			const uint32_t height = info->height;
+
+			set_gpu_converted_plane(width, height,
+						input->linesize[0],
+						output->linesize[0],
+						input->data[0],
+						output->data[0]);
+
+			const uint32_t height_d2 = height / 2;
+			set_gpu_converted_plane(width, height_d2,
+						input->linesize[1],
+						output->linesize[1],
+						input->data[1],
+						output->data[1]);
+
+			break;
+		}
+		case VIDEO_FORMAT_I444: {
+			const uint32_t width = info->width;
+			const uint32_t height = info->height;
+
+			set_gpu_converted_plane(width, height,
+						input->linesize[0],
+						output->linesize[0],
+						input->data[0],
+						output->data[0]);
+
+			set_gpu_converted_plane(width, height,
+						input->linesize[1],
+						output->linesize[1],
+						input->data[1],
+						output->data[1]);
+
+			set_gpu_converted_plane(width, height,
+						input->linesize[2],
+						output->linesize[2],
+						input->data[2],
+						output->data[2]);
+
+			break;
+		}
+		}
 	}
 }
 
diff --git a/libobs/obs.c b/libobs/obs.c
index 1d33fcfb1..8f694f2a9 100644
--- a/libobs/obs.c
+++ b/libobs/obs.c
@@ -42,117 +42,35 @@ static inline void make_video_info(struct video_output_info *vi,
 	vi->cache_size = 6;
 }
 
-#define PIXEL_SIZE 4
-
-#define GET_ALIGN(val, align) (((val) + (align - 1)) & ~(align - 1))
-
-static inline void set_420p_sizes(const struct obs_video_info *ovi)
-{
-	struct obs_core_video *video = &obs->video;
-	uint32_t chroma_pixels;
-	uint32_t total_bytes;
-
-	chroma_pixels = (ovi->output_width * ovi->output_height / 4);
-	chroma_pixels = GET_ALIGN(chroma_pixels, PIXEL_SIZE);
-
-	video->plane_offsets[0] = 0;
-	video->plane_offsets[1] = ovi->output_width * ovi->output_height;
-	video->plane_offsets[2] = video->plane_offsets[1] + chroma_pixels;
-
-	video->plane_linewidth[0] = ovi->output_width;
-	video->plane_linewidth[1] = ovi->output_width / 2;
-	video->plane_linewidth[2] = ovi->output_width / 2;
-
-	video->plane_sizes[0] = video->plane_offsets[1];
-	video->plane_sizes[1] = video->plane_sizes[0] / 4;
-	video->plane_sizes[2] = video->plane_sizes[1];
-
-	total_bytes = video->plane_offsets[2] + chroma_pixels;
-
-	video->conversion_height =
-		(total_bytes / PIXEL_SIZE + ovi->output_width - 1) /
-		ovi->output_width;
-
-	video->conversion_height = GET_ALIGN(video->conversion_height, 2);
-	video->conversion_tech = "Planar420";
-}
-
-static inline void set_nv12_sizes(const struct obs_video_info *ovi)
-{
-	struct obs_core_video *video = &obs->video;
-	uint32_t chroma_pixels;
-	uint32_t total_bytes;
-
-	chroma_pixels = (ovi->output_width * ovi->output_height / 2);
-	chroma_pixels = GET_ALIGN(chroma_pixels, PIXEL_SIZE);
-
-	video->plane_offsets[0] = 0;
-	video->plane_offsets[1] = ovi->output_width * ovi->output_height;
-
-	video->plane_linewidth[0] = ovi->output_width;
-	video->plane_linewidth[1] = ovi->output_width;
-
-	video->plane_sizes[0] = video->plane_offsets[1];
-	video->plane_sizes[1] = video->plane_sizes[0] / 2;
-
-	total_bytes = video->plane_offsets[1] + chroma_pixels;
-
-	video->conversion_height =
-		(total_bytes / PIXEL_SIZE + ovi->output_width - 1) /
-		ovi->output_width;
-
-	video->conversion_height = GET_ALIGN(video->conversion_height, 2);
-	video->conversion_tech = "NV12";
-}
-
-static inline void set_444p_sizes(const struct obs_video_info *ovi)
-{
-	struct obs_core_video *video = &obs->video;
-	uint32_t chroma_pixels;
-	uint32_t total_bytes;
-
-	chroma_pixels = (ovi->output_width * ovi->output_height);
-	chroma_pixels = GET_ALIGN(chroma_pixels, PIXEL_SIZE);
-
-	video->plane_offsets[0] = 0;
-	video->plane_offsets[1] = chroma_pixels;
-	video->plane_offsets[2] = chroma_pixels + chroma_pixels;
-
-	video->plane_linewidth[0] = ovi->output_width;
-	video->plane_linewidth[1] = ovi->output_width;
-	video->plane_linewidth[2] = ovi->output_width;
-
-	video->plane_sizes[0] = chroma_pixels;
-	video->plane_sizes[1] = chroma_pixels;
-	video->plane_sizes[2] = chroma_pixels;
-
-	total_bytes = video->plane_offsets[2] + chroma_pixels;
-
-	video->conversion_height =
-		(total_bytes / PIXEL_SIZE + ovi->output_width - 1) /
-		ovi->output_width;
-
-	video->conversion_height = GET_ALIGN(video->conversion_height, 2);
-	video->conversion_tech = "Planar444";
-}
-
 static inline void calc_gpu_conversion_sizes(const struct obs_video_info *ovi)
 {
-	obs->video.conversion_height = 0;
-	memset(obs->video.plane_offsets, 0, sizeof(obs->video.plane_offsets));
-	memset(obs->video.plane_sizes, 0, sizeof(obs->video.plane_sizes));
-	memset(obs->video.plane_linewidth, 0,
-	       sizeof(obs->video.plane_linewidth));
+	struct obs_core_video *video = &obs->video;
+
+	video->conversion_needed = false; /* default: no GPU conversion */
+	video->conversion_techs[0] = NULL;
+	video->conversion_techs[1] = NULL;
+	video->conversion_techs[2] = NULL;
+	video->conversion_width_i = 0.f; /* set only for I420/NV12 below */
 
 	switch ((uint32_t)ovi->output_format) {
 	case VIDEO_FORMAT_I420:
-		set_420p_sizes(ovi);
+		video->conversion_needed = true; /* three techniques */
+		video->conversion_techs[0] = "Planar_Y";
+		video->conversion_techs[1] = "Planar_U_Left";
+		video->conversion_techs[2] = "Planar_V_Left";
+		video->conversion_width_i = 1.f / (float)ovi->output_width;
 		break;
 	case VIDEO_FORMAT_NV12:
-		set_nv12_sizes(ovi);
+		video->conversion_needed = true; /* [2] stays NULL */
+		video->conversion_techs[0] = "NV12_Y";
+		video->conversion_techs[1] = "NV12_UV";
+		video->conversion_width_i = 1.f / (float)ovi->output_width;
 		break;
 	case VIDEO_FORMAT_I444:
-		set_444p_sizes(ovi);
+		video->conversion_needed = true; /* width_i stays 0.f */
+		video->conversion_techs[0] = "Planar_Y";
+		video->conversion_techs[1] = "Planar_U";
+		video->conversion_techs[2] = "Planar_V";
 		break;
 	}
 }
@@ -167,7 +85,7 @@ static bool obs_init_gpu_conversion(struct obs_video_info *ovi)
 					? gs_nv12_available()
 					: false;
 
-	if (!video->conversion_height) {
+	if (!video->conversion_needed) {
 		blog(LOG_INFO, "GPU conversion not available for format: %u",
 		     (unsigned int)ovi->output_format);
 		video->gpu_conversion = false;
@@ -183,23 +101,96 @@ static bool obs_init_gpu_conversion(struct obs_video_info *ovi)
 
 #ifdef _WIN32
 	if (video->using_nv12_tex) {
-		gs_texture_create_nv12(&video->convert_texture,
-				       &video->convert_uv_texture,
+		gs_texture_create_nv12(&video->convert_textures[0],
+				       &video->convert_textures[1],
 				       ovi->output_width, ovi->output_height,
 				       GS_RENDER_TARGET | GS_SHARED_KM_TEX);
-		if (!video->convert_uv_texture)
-			return false;
 	} else {
 #endif
-		video->convert_texture = gs_texture_create(
-			ovi->output_width, video->conversion_height, GS_RGBA, 1,
-			NULL, GS_RENDER_TARGET);
+		video->convert_textures[0] =
+			gs_texture_create(ovi->output_width, ovi->output_height,
+					  GS_R8, 1, NULL, GS_RENDER_TARGET);
+
+		const struct video_output_info *info =
+			video_output_get_info(video->video);
+		switch (info->format) {
+		case VIDEO_FORMAT_I420:
+			video->convert_textures[1] = gs_texture_create(
+				ovi->output_width / 2, ovi->output_height / 2,
+				GS_R8, 1, NULL, GS_RENDER_TARGET);
+			video->convert_textures[2] = gs_texture_create(
+				ovi->output_width / 2, ovi->output_height / 2,
+				GS_R8, 1, NULL, GS_RENDER_TARGET);
+			if (!video->convert_textures[2])
+				return false;
+			break;
+		case VIDEO_FORMAT_NV12:
+			video->convert_textures[1] = gs_texture_create(
+				ovi->output_width / 2, ovi->output_height / 2,
+				GS_R8G8, 1, NULL, GS_RENDER_TARGET);
+			break;
+		case VIDEO_FORMAT_I444:
+			video->convert_textures[1] = gs_texture_create(
+				ovi->output_width, ovi->output_height, GS_R8, 1,
+				NULL, GS_RENDER_TARGET);
+			video->convert_textures[2] = gs_texture_create(
+				ovi->output_width, ovi->output_height, GS_R8, 1,
+				NULL, GS_RENDER_TARGET);
+			if (!video->convert_textures[2])
+				return false;
+			break;
+		}
 #ifdef _WIN32
 	}
 #endif
 
-	if (!video->convert_texture)
+	if (!video->convert_textures[0])
 		return false;
+	if (!video->convert_textures[1])
+		return false;
+
+	return true;
+}
+
+/* Create the staging surfaces for copy slot i, one per converted plane. */
+static bool obs_init_gpu_copy_surfaces(struct obs_video_info *ovi, size_t i)
+{
+	struct obs_core_video *video = &obs->video;
+	const uint32_t cx = ovi->output_width;
+	const uint32_t cy = ovi->output_height;
+	const uint32_t half_cx = cx / 2;
+	const uint32_t half_cy = cy / 2;
+
+	/* slot [0]: full output size, GS_R8, created for every format */
+	video->copy_surfaces[i][0] = gs_stagesurface_create(cx, cy, GS_R8);
+	if (!video->copy_surfaces[i][0])
+		return false;
+
+	/* remaining slots depend on the video output's format */
+	switch (video_output_get_info(video->video)->format) {
+	case VIDEO_FORMAT_I420:
+		video->copy_surfaces[i][1] =
+			gs_stagesurface_create(half_cx, half_cy, GS_R8);
+		if (!video->copy_surfaces[i][1])
+			return false;
+		video->copy_surfaces[i][2] =
+			gs_stagesurface_create(half_cx, half_cy, GS_R8);
+		return video->copy_surfaces[i][2] != NULL;
+	case VIDEO_FORMAT_NV12:
+		/* one GS_R8G8 surface at half width and half height */
+		video->copy_surfaces[i][1] =
+			gs_stagesurface_create(half_cx, half_cy, GS_R8G8);
+		return video->copy_surfaces[i][1] != NULL;
+	case VIDEO_FORMAT_I444:
+		video->copy_surfaces[i][1] =
+			gs_stagesurface_create(cx, cy, GS_R8);
+		if (!video->copy_surfaces[i][1])
+			return false;
+		video->copy_surfaces[i][2] =
+			gs_stagesurface_create(cx, cy, GS_R8);
+		return video->copy_surfaces[i][2] != NULL;
+	}
+
+	/* other formats: only the slot [0] surface is created here */
+	return true;
+}
@@ -207,25 +198,29 @@ static bool obs_init_gpu_conversion(struct obs_video_info *ovi)
 static bool obs_init_textures(struct obs_video_info *ovi)
 {
 	struct obs_core_video *video = &obs->video;
-	uint32_t output_height = video->gpu_conversion
-					 ? video->conversion_height
-					 : ovi->output_height;
-	size_t i;
 
-	for (i = 0; i < NUM_TEXTURES; i++) {
+	for (size_t i = 0; i < NUM_TEXTURES; i++) {
 #ifdef _WIN32
 		if (video->using_nv12_tex) {
-			video->copy_surfaces[i] = gs_stagesurface_create_nv12(
-				ovi->output_width, ovi->output_height);
-			if (!video->copy_surfaces[i])
+			video->copy_surfaces[i][0] =
+				gs_stagesurface_create_nv12(ovi->output_width,
+							    ovi->output_height);
+			if (!video->copy_surfaces[i][0])
 				return false;
 
 		} else {
 #endif
-			video->copy_surfaces[i] = gs_stagesurface_create(
-				ovi->output_width, output_height, GS_RGBA);
-			if (!video->copy_surfaces[i])
-				return false;
+			if (video->gpu_conversion) {
+				if (!obs_init_gpu_copy_surfaces(ovi, i))
+					return false;
+			} else {
+				video->copy_surfaces[i][0] =
+					gs_stagesurface_create(
+						ovi->output_width,
+						ovi->output_height, GS_RGBA);
+				if (!video->copy_surfaces[i][0])
+					return false;
+			}
 #ifdef _WIN32
 		}
 #endif
@@ -465,23 +460,45 @@ static void obs_free_video(void)
 
 		gs_enter_context(video->graphics);
 
-		if (video->mapped_surface) {
-			gs_stagesurface_unmap(video->mapped_surface);
-			video->mapped_surface = NULL;
+		for (size_t c = 0; c < NUM_CHANNELS; c++) {
+			if (video->mapped_surfaces[c]) {
+				gs_stagesurface_unmap(
+					video->mapped_surfaces[c]);
+				video->mapped_surfaces[c] = NULL;
+			}
 		}
 
 		for (size_t i = 0; i < NUM_TEXTURES; i++) {
-			gs_stagesurface_destroy(video->copy_surfaces[i]);
-			video->copy_surfaces[i] = NULL;
+			for (size_t c = 0; c < NUM_CHANNELS; c++) {
+				if (video->copy_surfaces[i][c]) {
+					gs_stagesurface_destroy(
+						video->copy_surfaces[i][c]);
+					video->copy_surfaces[i][c] = NULL;
+				}
+			}
 		}
 
 		gs_texture_destroy(video->render_texture);
-		gs_texture_destroy(video->convert_texture);
-		gs_texture_destroy(video->convert_uv_texture);
+
+		for (size_t c = 0; c < NUM_CHANNELS; c++) {
+			if (video->convert_textures[c]) {
+				gs_texture_destroy(video->convert_textures[c]);
+				video->convert_textures[c] = NULL;
+			}
+		}
+
+		for (size_t i = 0; i < NUM_TEXTURES; i++) {
+			for (size_t c = 0; c < NUM_CHANNELS; c++) {
+				if (video->copy_surfaces[i][c]) {
+					gs_stagesurface_destroy(
+						video->copy_surfaces[i][c]);
+					video->copy_surfaces[i][c] = NULL;
+				}
+			}
+		}
+
 		gs_texture_destroy(video->output_texture);
 		video->render_texture = NULL;
-		video->convert_texture = NULL;
-		video->convert_uv_texture = NULL;
 		video->output_texture = NULL;
 
 		gs_leave_context();