Merge pull request #1995 from jpark37/yuv-simplify

libobs: Separate textures for YUV output, fix chroma
2019-08-09 21:11:45 -07:00 · 2019-08-09 21:11:45 -07:00 · 164f731320
commit 164f731320
parent 0fa22233a4 9aacc99b3e
4 changed files with 443 additions and 593 deletions
--- a/libobs/data/format_conversion.effect
+++ b/libobs/data/format_conversion.effect
@@ -15,25 +15,12 @@
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/

-//#define DEBUGGING
-
-uniform float     u_plane_offset;
-uniform float     v_plane_offset;
-
 uniform float     width;
 uniform float     height;
 uniform float     width_i;
-uniform float     height_i;
 uniform float     width_d2;
-uniform float     height_d2;
 uniform float     width_d2_i;
-uniform float     height_d2_i;
-uniform float     input_width;
-uniform float     input_height;
-uniform float     input_width_i;
-uniform float     input_height_i;
 uniform float     input_width_i_d2;
-uniform float     input_height_i_d2;

 uniform int       int_width;
 uniform int       int_input_width;
@@ -65,8 +52,17 @@ struct VertTexPos {
 	float4 pos : POSITION;
 };

+struct VertTexPosWide {
+	float3 uuv : TEXCOORD0;
+	float4 pos : POSITION;
+};
+
 struct FragTex {
-	float2 uv  : TEXCOORD0;
+	float2 uv : TEXCOORD0;
+};
+
+struct FragTexWide {
+	float3 uuv : TEXCOORD0;
 };

 FragPos VSPos(uint id : VERTEXID)
@@ -82,7 +78,7 @@ FragPos VSPos(uint id : VERTEXID)
 	return vert_out;
 }

-VertTexPos VSPosTex(uint id : VERTEXID)
+VertTexPos VSTexPos(uint id : VERTEXID)
 {
 	float idHigh = float(id >> 1);
 	float idLow = float(id & uint(1));
@@ -99,225 +95,76 @@ VertTexPos VSPosTex(uint id : VERTEXID)
 	return vert_out;
 }

+VertTexPosWide VSTexPosLeft(uint id : VERTEXID)
+{
+	float idHigh = float(id >> 1);
+	float idLow = float(id & uint(1));
+
+	float x = idHigh * 4.0 - 1.0;
+	float y = idLow * 4.0 - 1.0;
+
+	float u_right = idHigh * 2.0;
+	float u_left = u_right - width_i;
+	float v = obs_glsl_compile ? (idLow * 2.0) : (1.0 - idLow * 2.0);
+
+	VertTexPosWide vert_out;
+	vert_out.uuv.x = u_left;
+	vert_out.uuv.y = u_right;
+	vert_out.uuv.z = v;
+	vert_out.pos = float4(x, y, 0.0, 1.0);
+	return vert_out;
+}
+
 /* used to prevent internal GPU precision issues width fmod in particular */
 #define PRECISION_OFFSET 0.2

-float4 PSNV12(FragTex frag_in) : TARGET
-{
-	float v_mul = floor(frag_in.uv.y * input_height);
-
-	float byte_offset = floor((v_mul + frag_in.uv.x) * width) * 4.0;
-	byte_offset += PRECISION_OFFSET;
-
-	if (byte_offset < u_plane_offset) {
-#ifdef DEBUGGING
-		return float4(1.0, 1.0, 1.0, 1.0);
-#endif
-
-		float lum_u = floor(fmod(byte_offset, width)) * width_i;
-		float lum_v = floor(byte_offset * width_i)    * height_i;
-
-		/* move to texel centers to sample the 4 pixels properly */
-		lum_u += width_i  * 0.5;
-		lum_v += height_i * 0.5;
-
-		float2 sample_pos0 = float2(lum_u,            lum_v);
-		float2 sample_pos1 = float2(lum_u += width_i, lum_v);
-		float2 sample_pos2 = float2(lum_u += width_i, lum_v);
-		float2 sample_pos3 = float2(lum_u +  width_i, lum_v);
-
-		float3 rgb0 = image.Sample(def_sampler, sample_pos0).rgb;
-		float3 rgb1 = image.Sample(def_sampler, sample_pos1).rgb;
-		float3 rgb2 = image.Sample(def_sampler, sample_pos2).rgb;
-		float3 rgb3 = image.Sample(def_sampler, sample_pos3).rgb;
-
-		float4 out_val = float4(
-			dot(color_vec_y.xyz, rgb0) + color_vec_y.w,
-			dot(color_vec_y.xyz, rgb1) + color_vec_y.w,
-			dot(color_vec_y.xyz, rgb2) + color_vec_y.w,
-			dot(color_vec_y.xyz, rgb3) + color_vec_y.w
-		);
-
-		return out_val;
-	} else {
-#ifdef DEBUGGING
-		return float4(0.5, 0.2, 0.5, 0.2);
-#endif
-
-		float new_offset = byte_offset - u_plane_offset;
-
-		float ch_u = floor(fmod(new_offset, width)) * width_i;
-		float ch_v = floor(new_offset * width_i)    * height_d2_i;
-		float width_i2 = width_i*2.0;
-
-		/* move to the borders of each set of 4 pixels to force it
-		 * to do bilinear averaging */
-		ch_u += width_i;
-		ch_v += height_i;
-
-		float2 sample_pos0 = float2(ch_u,             ch_v);
-		float2 sample_pos1 = float2(ch_u + width_i2,  ch_v);
-
-		float3 rgb0 = image.Sample(def_sampler, sample_pos0).rgb;
-		float3 rgb1 = image.Sample(def_sampler, sample_pos1).rgb;
-
-		return float4(
-			dot(color_vec_u.xyz, rgb0) + color_vec_u.w,
-			dot(color_vec_v.xyz, rgb0) + color_vec_v.w,
-			dot(color_vec_u.xyz, rgb1) + color_vec_u.w,
-			dot(color_vec_v.xyz, rgb1) + color_vec_v.w
-		);
-	}
-}
-
-float PSNV12_Y(FragPos frag_in) : TARGET
+float PS_Y(FragPos frag_in) : TARGET
 {
 	float3 rgb = image.Load(int3(frag_in.pos.xy, 0)).rgb;
 	float y = dot(color_vec_y.xyz, rgb) + color_vec_y.w;
 	return y;
 }

-float2 PSNV12_UV(FragTex frag_in) : TARGET
+float2 PS_UV_Wide(FragTexWide frag_in) : TARGET
 {
-	float3 rgb = image.Sample(def_sampler, frag_in.uv).rgb;
+	float3 rgb_left = image.Sample(def_sampler, frag_in.uuv.xz).rgb;
+	float3 rgb_right = image.Sample(def_sampler, frag_in.uuv.yz).rgb;
+	float3 rgb = (rgb_left + rgb_right) * 0.5;
 	float u = dot(color_vec_u.xyz, rgb) + color_vec_u.w;
 	float v = dot(color_vec_v.xyz, rgb) + color_vec_v.w;
 	return float2(u, v);
 }

-float4 PSPlanar420(FragTex frag_in) : TARGET
+float PS_U(FragTex frag_in) : TARGET
 {
-	float v_mul = floor(frag_in.uv.y * input_height);
-
-	float byte_offset = floor((v_mul + frag_in.uv.x) * width) * 4.0;
-	byte_offset += PRECISION_OFFSET;
-
-	float2 sample_pos0, sample_pos1, sample_pos2, sample_pos3;
-
-	if (byte_offset < u_plane_offset) {
-#ifdef DEBUGGING
-		return float4(1.0, 1.0, 1.0, 1.0);
-#endif
-
-		float lum_u = floor(fmod(byte_offset, width)) * width_i;
-		float lum_v = floor(byte_offset * width_i)    * height_i;
-
-		/* move to texel centers to sample the 4 pixels properly */
-		lum_u += width_i  * 0.5;
-		lum_v += height_i * 0.5;
-
-		sample_pos0 = float2(lum_u,            lum_v);
-		sample_pos1 = float2(lum_u += width_i, lum_v);
-		sample_pos2 = float2(lum_u += width_i, lum_v);
-		sample_pos3 = float2(lum_u +  width_i, lum_v);
-
-	} else {
-#ifdef DEBUGGING
-		return ((byte_offset < v_plane_offset) ?
-				float4(0.5, 0.5, 0.5, 0.5) :
-				float4(0.2, 0.2, 0.2, 0.2));
-#endif
-
-		float new_offset = byte_offset -
-				((byte_offset < v_plane_offset) ?
-				u_plane_offset : v_plane_offset);
-
-		float ch_u = floor(fmod(new_offset, width_d2)) * width_d2_i;
-		float ch_v = floor(new_offset * width_d2_i)    * height_d2_i;
-		float width_i2 = width_i*2.0;
-
-		/* move to the borders of each set of 4 pixels to force it
-		 * to do bilinear averaging */
-		ch_u += width_i;
-		ch_v += height_i;
-
-		/* set up coordinates for next chroma line, in case
-		 * (width / 2) % 4 == 2, i.e. the current set of 4 pixels is split
-		 * between the current and the next chroma line; do note that the next
-		 * chroma line is two source lines below the current source line */
-		float ch_u_n = 0.   + width_i;
-		float ch_v_n = ch_v + height_i * 3;
-
-		sample_pos0 = float2(ch_u,             ch_v);
-		sample_pos1 = float2(ch_u += width_i2, ch_v);
-
-		ch_u += width_i2;
-		// check if ch_u overflowed the current source and chroma line
-		if (ch_u > 1.0) {
-			sample_pos2 = float2(ch_u_n,            ch_v_n);
-			sample_pos2 = float2(ch_u_n + width_i2, ch_v_n);
-		} else {
-			sample_pos2 = float2(ch_u,            ch_v);
-			sample_pos3 = float2(ch_u + width_i2, ch_v);
-		}
-	}
-
-	float3 rgb0 = image.Sample(def_sampler, sample_pos0).rgb;
-	float3 rgb1 = image.Sample(def_sampler, sample_pos1).rgb;
-	float3 rgb2 = image.Sample(def_sampler, sample_pos2).rgb;
-	float3 rgb3 = image.Sample(def_sampler, sample_pos3).rgb;
-
-	float4 color_vec;
-	if (byte_offset < u_plane_offset)
-		color_vec = color_vec_y;
-	else if (byte_offset < v_plane_offset)
-		color_vec = color_vec_u;
-	else
-		color_vec = color_vec_v;
-
-	return float4(
-		dot(color_vec.xyz, rgb0) + color_vec.w,
-		dot(color_vec.xyz, rgb1) + color_vec.w,
-		dot(color_vec.xyz, rgb2) + color_vec.w,
-		dot(color_vec.xyz, rgb3) + color_vec.w
-	);
+	float3 rgb = image.Sample(def_sampler, frag_in.uv).rgb;
+	float u = dot(color_vec_u.xyz, rgb) + color_vec_u.w;
+	return u;
 }

-float4 PSPlanar444(FragTex frag_in) : TARGET
+float PS_V(FragTex frag_in) : TARGET
 {
-	float v_mul = floor(frag_in.uv.y * input_height);
+	float3 rgb = image.Sample(def_sampler, frag_in.uv).rgb;
+	float v = dot(color_vec_v.xyz, rgb) + color_vec_v.w;
+	return v;
+}

-	float byte_offset = floor((v_mul + frag_in.uv.x) * width) * 4.0;
-	byte_offset += PRECISION_OFFSET;
+float PS_U_Wide(FragTexWide frag_in) : TARGET
+{
+	float3 rgb_left = image.Sample(def_sampler, frag_in.uuv.xz).rgb;
+	float3 rgb_right = image.Sample(def_sampler, frag_in.uuv.yz).rgb;
+	float3 rgb = (rgb_left + rgb_right) * 0.5;
+	float u = dot(color_vec_u.xyz, rgb) + color_vec_u.w;
+	return u;
+}

-	float new_byte_offset = byte_offset;
-
-	if (byte_offset >= v_plane_offset)
-		new_byte_offset -= v_plane_offset;
-	else if (byte_offset >= u_plane_offset)
-		new_byte_offset -= u_plane_offset;
-
-	float u_val = floor(fmod(new_byte_offset, width)) * width_i;
-	float v_val = floor(new_byte_offset * width_i)    * height_i;
-
-	/* move to texel centers to sample the 4 pixels properly */
-	u_val += width_i  * 0.5;
-	v_val += height_i * 0.5;
-
-	float2 sample_pos0 = float2(u_val,            v_val);
-	float2 sample_pos1 = float2(u_val += width_i, v_val);
-	float2 sample_pos2 = float2(u_val += width_i, v_val);
-	float2 sample_pos3 = float2(u_val +  width_i, v_val);
-
-	float3 rgb0 = image.Sample(def_sampler, sample_pos0).rgb;
-	float3 rgb1 = image.Sample(def_sampler, sample_pos1).rgb;
-	float3 rgb2 = image.Sample(def_sampler, sample_pos2).rgb;
-	float3 rgb3 = image.Sample(def_sampler, sample_pos3).rgb;
-
-	float4 color_vec;
-	if (byte_offset < u_plane_offset)
-		color_vec = color_vec_y;
-	else if (byte_offset < v_plane_offset)
-		color_vec = color_vec_u;
-	else
-		color_vec = color_vec_v;
-
-	return float4(
-		dot(color_vec.xyz, rgb0) + color_vec.w,
-		dot(color_vec.xyz, rgb1) + color_vec.w,
-		dot(color_vec.xyz, rgb2) + color_vec.w,
-		dot(color_vec.xyz, rgb3) + color_vec.w
-	);
+float PS_V_Wide(FragTexWide frag_in) : TARGET
+{
+	float3 rgb_left = image.Sample(def_sampler, frag_in.uuv.xz).rgb;
+	float3 rgb_right = image.Sample(def_sampler, frag_in.uuv.yz).rgb;
+	float3 rgb = (rgb_left + rgb_right) * 0.5;
+	float v = dot(color_vec_v.xyz, rgb) + color_vec_v.w;
+	return v;
 }

 float GetIntOffsetColor(int offset)
@@ -473,30 +320,48 @@ float4 PSBGR3_Full(FragTex frag_in) : TARGET
 	return float4(rgb, 1.0);
 }

-technique Planar420
+technique Planar_Y
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
-		pixel_shader  = PSPlanar420(frag_in);
+		vertex_shader = VSPos(id);
+		pixel_shader  = PS_Y(frag_in);
 	}
 }

-technique Planar444
+technique Planar_U
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
-		pixel_shader  = PSPlanar444(frag_in);
+		vertex_shader = VSTexPos(id);
+		pixel_shader  = PS_U(frag_in);
 	}
 }

-technique NV12
+technique Planar_V
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
-		pixel_shader  = PSNV12(frag_in);
+		vertex_shader = VSTexPos(id);
+		pixel_shader  = PS_V(frag_in);
+	}
+}
+
+technique Planar_U_Left
+{
+	pass
+	{
+		vertex_shader = VSTexPosLeft(id);
+		pixel_shader  = PS_U_Wide(frag_in);
+	}
+}
+
+technique Planar_V_Left
+{
+	pass
+	{
+		vertex_shader = VSTexPosLeft(id);
+		pixel_shader  = PS_V_Wide(frag_in);
 	}
 }

@@ -505,7 +370,7 @@ technique NV12_Y
 	pass
 	{
 		vertex_shader = VSPos(id);
-		pixel_shader  = PSNV12_Y(frag_in);
+		pixel_shader  = PS_Y(frag_in);
 	}
 }

@@ -513,8 +378,8 @@ technique NV12_UV
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
-		pixel_shader  = PSNV12_UV(frag_in);
+		vertex_shader = VSTexPosLeft(id);
+		pixel_shader  = PS_UV_Wide(frag_in);
 	}
 }

@@ -522,7 +387,7 @@ technique UYVY_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSPacked422_Reverse(frag_in, 2, 0, 1, 3);
 	}
 }
@@ -531,7 +396,7 @@ technique YUY2_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSPacked422_Reverse(frag_in, 1, 3, 2, 0);
 	}
 }
@@ -540,7 +405,7 @@ technique YVYU_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSPacked422_Reverse(frag_in, 3, 1, 2, 0);
 	}
 }
@@ -549,7 +414,7 @@ technique I420_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSPlanar420_Reverse(frag_in);
 	}
 }
@@ -558,7 +423,7 @@ technique I422_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSPlanar422_Reverse(frag_in);
 	}
 }
@@ -567,7 +432,7 @@ technique I444_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSPlanar444_Reverse(frag_in);
 	}
 }
@@ -576,7 +441,7 @@ technique NV12_Reverse
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSNV12_Reverse(frag_in);
 	}
 }
@@ -585,7 +450,7 @@ technique Y800_Limited
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSY800_Limited(frag_in);
 	}
 }
@@ -594,7 +459,7 @@ technique Y800_Full
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSY800_Full(frag_in);
 	}
 }
@@ -603,7 +468,7 @@ technique RGB_Limited
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSRGB_Limited(frag_in);
 	}
 }
@@ -612,7 +477,7 @@ technique BGR3_Limited
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSBGR3_Limited(frag_in);
 	}
 }
@@ -621,7 +486,7 @@ technique BGR3_Full
 {
 	pass
 	{
-		vertex_shader = VSPosTex(id);
+		vertex_shader = VSTexPos(id);
 		pixel_shader  = PSBGR3_Full(frag_in);
 	}
 }
--- a/libobs/obs-internal.h
+++ b/libobs/obs-internal.h
@@ -37,6 +37,7 @@
 #include "obs.h"

 #define NUM_TEXTURES 2
+#define NUM_CHANNELS 3
 #define MICROSECOND_DEN 1000000
 #define NUM_ENCODE_TEXTURES 3
 #define NUM_ENCODE_TEXTURE_FRAMES_TO_WAIT 1
@@ -235,11 +236,10 @@ struct obs_tex_frame {

 struct obs_core_video {
 	graphics_t *graphics;
-	gs_stagesurf_t *copy_surfaces[NUM_TEXTURES];
+	gs_stagesurf_t *copy_surfaces[NUM_TEXTURES][NUM_CHANNELS];
 	gs_texture_t *render_texture;
 	gs_texture_t *output_texture;
-	gs_texture_t *convert_texture;
-	gs_texture_t *convert_uv_texture;
+	gs_texture_t *convert_textures[NUM_CHANNELS];
 	bool texture_rendered;
 	bool textures_copied[NUM_TEXTURES];
 	bool texture_converted;
@@ -258,7 +258,7 @@ struct obs_core_video {
 	gs_effect_t *bilinear_lowres_effect;
 	gs_effect_t *premultiplied_alpha_effect;
 	gs_samplerstate_t *point_sampler;
-	gs_stagesurf_t *mapped_surface;
+	gs_stagesurf_t *mapped_surfaces[NUM_CHANNELS];
 	int cur_texture;
 	long raw_active;
 	long gpu_encoder_active;
@@ -283,11 +283,9 @@ struct obs_core_video {
 	bool thread_initialized;

 	bool gpu_conversion;
-	const char *conversion_tech;
-	uint32_t conversion_height;
-	uint32_t plane_offsets[3];
-	uint32_t plane_sizes[3];
-	uint32_t plane_linewidth[3];
+	const char *conversion_techs[NUM_CHANNELS];
+	bool conversion_needed;
+	float conversion_width_i;

 	uint32_t output_width;
 	uint32_t output_height;
--- a/libobs/obs-video.c
+++ b/libobs/obs-video.c
@@ -109,9 +109,11 @@ static inline void set_render_size(uint32_t width, uint32_t height)

 static inline void unmap_last_surface(struct obs_core_video *video)
 {
-	if (video->mapped_surface) {
-		gs_stagesurface_unmap(video->mapped_surface);
-		video->mapped_surface = NULL;
+	for (int c = 0; c < NUM_CHANNELS; ++c) {
+		if (video->mapped_surfaces[c]) {
+			gs_stagesurface_unmap(video->mapped_surfaces[c]);
+			video->mapped_surfaces[c] = NULL;
+		}
 	}
 }

@@ -264,10 +266,24 @@ static inline gs_texture_t *render_output_texture(struct obs_core_video *video)
 	return target;
 }

-static inline void set_eparam(gs_effect_t *effect, const char *name, float val)
+static void render_convert_plane(gs_effect_t *effect, gs_texture_t *texture,
+				 gs_texture_t *target, const char *tech_name)
 {
-	gs_eparam_t *param = gs_effect_get_param_by_name(effect, name);
-	gs_effect_set_float(param, val);
+	gs_technique_t *tech = gs_effect_get_technique(effect, tech_name);
+
+	const uint32_t width = gs_texture_get_width(target);
+	const uint32_t height = gs_texture_get_height(target);
+
+	gs_set_render_target(target, NULL);
+	set_render_size(width, height);
+
+	size_t passes = gs_technique_begin(tech);
+	for (size_t i = 0; i < passes; i++) {
+		gs_technique_begin_pass(tech, i);
+		gs_draw(GS_TRIS, 0, 3);
+		gs_technique_end_pass(tech);
+	}
+	gs_technique_end(tech);
 }

 static const char *render_convert_texture_name = "render_convert_texture";
@@ -276,11 +292,6 @@ static void render_convert_texture(struct obs_core_video *video,
 {
 	profile_start(render_convert_texture_name);

-	gs_texture_t *target = video->convert_texture;
-	float fwidth = (float)video->output_width;
-	float fheight = (float)video->output_height;
-	size_t passes, i;
-
 	gs_effect_t *effect = video->conversion_effect;
 	gs_eparam_t *color_vec_y =
 		gs_effect_get_param_by_name(effect, "color_vec_y");
@@ -289,20 +300,7 @@ static void render_convert_texture(struct obs_core_video *video,
 	gs_eparam_t *color_vec_v =
 		gs_effect_get_param_by_name(effect, "color_vec_v");
 	gs_eparam_t *image = gs_effect_get_param_by_name(effect, "image");
-	gs_technique_t *tech =
-		gs_effect_get_technique(effect, video->conversion_tech);
-
-	set_eparam(effect, "u_plane_offset", (float)video->plane_offsets[1]);
-	set_eparam(effect, "v_plane_offset", (float)video->plane_offsets[2]);
-	set_eparam(effect, "width", fwidth);
-	set_eparam(effect, "height", fheight);
-	set_eparam(effect, "width_i", 1.0f / fwidth);
-	set_eparam(effect, "height_i", 1.0f / fheight);
-	set_eparam(effect, "width_d2", fwidth * 0.5f);
-	set_eparam(effect, "height_d2", fheight * 0.5f);
-	set_eparam(effect, "width_d2_i", 1.0f / (fwidth * 0.5f));
-	set_eparam(effect, "height_d2_i", 1.0f / (fheight * 0.5f));
-	set_eparam(effect, "input_height", (float)video->conversion_height);
+	gs_eparam_t *width_i = gs_effect_get_param_by_name(effect, "width_i");

 	struct vec4 vec_y, vec_u, vec_v;
 	vec4_set(&vec_y, video->color_matrix[4], video->color_matrix[5],
@@ -311,23 +309,39 @@ static void render_convert_texture(struct obs_core_video *video,
 		 video->color_matrix[2], video->color_matrix[3]);
 	vec4_set(&vec_v, video->color_matrix[8], video->color_matrix[9],
 		 video->color_matrix[10], video->color_matrix[11]);
-	gs_effect_set_vec4(color_vec_y, &vec_y);
-	gs_effect_set_vec4(color_vec_u, &vec_u);
-	gs_effect_set_vec4(color_vec_v, &vec_v);
-
-	gs_effect_set_texture(image, texture);
-
-	gs_set_render_target(target, NULL);
-	set_render_size(video->output_width, video->conversion_height);

 	gs_enable_blending(false);
-	passes = gs_technique_begin(tech);
-	for (i = 0; i < passes; i++) {
-		gs_technique_begin_pass(tech, i);
-		gs_draw(GS_TRIS, 0, 3);
-		gs_technique_end_pass(tech);
+
+	if (video->convert_textures[0]) {
+		gs_effect_set_texture(image, texture);
+		gs_effect_set_vec4(color_vec_y, &vec_y);
+		render_convert_plane(effect, texture,
+				     video->convert_textures[0],
+				     video->conversion_techs[0]);
+
+		if (video->convert_textures[1]) {
+			gs_effect_set_texture(image, texture);
+			gs_effect_set_vec4(color_vec_u, &vec_u);
+			if (!video->convert_textures[2])
+				gs_effect_set_vec4(color_vec_v, &vec_v);
+			gs_effect_set_float(width_i, video->conversion_width_i);
+			render_convert_plane(effect, texture,
+					     video->convert_textures[1],
+					     video->conversion_techs[1]);
+
+			if (video->convert_textures[2]) {
+				gs_effect_set_texture(image, texture);
+				gs_effect_set_vec4(color_vec_v, &vec_v);
+				gs_effect_set_float(width_i,
+						    video->conversion_width_i);
+				render_convert_plane(
+					effect, texture,
+					video->convert_textures[2],
+					video->conversion_techs[2]);
+			}
+		}
 	}
-	gs_technique_end(tech);
+
 	gs_enable_blending(true);

 	video->texture_converted = true;
@@ -335,90 +349,32 @@ static void render_convert_texture(struct obs_core_video *video,
 	profile_end(render_convert_texture_name);
 }

-static void render_nv12(struct obs_core_video *video, gs_texture_t *texture,
-			gs_texture_t *target, const char *tech_name,
-			uint32_t width, uint32_t height)
-{
-	gs_effect_t *effect = video->conversion_effect;
-	gs_eparam_t *color_vec_y =
-		gs_effect_get_param_by_name(effect, "color_vec_y");
-	gs_eparam_t *color_vec_u =
-		gs_effect_get_param_by_name(effect, "color_vec_u");
-	gs_eparam_t *color_vec_v =
-		gs_effect_get_param_by_name(effect, "color_vec_v");
-	gs_eparam_t *image = gs_effect_get_param_by_name(effect, "image");
-	gs_technique_t *tech = gs_effect_get_technique(effect, tech_name);
-	size_t passes, i;
-
-	struct vec4 vec_y, vec_u, vec_v;
-	vec4_set(&vec_y, video->color_matrix[4], video->color_matrix[5],
-		 video->color_matrix[6], video->color_matrix[7]);
-	vec4_set(&vec_u, video->color_matrix[0], video->color_matrix[1],
-		 video->color_matrix[2], video->color_matrix[3]);
-	vec4_set(&vec_v, video->color_matrix[8], video->color_matrix[9],
-		 video->color_matrix[10], video->color_matrix[11]);
-	gs_effect_set_vec4(color_vec_y, &vec_y);
-	gs_effect_set_vec4(color_vec_u, &vec_u);
-	gs_effect_set_vec4(color_vec_v, &vec_v);
-
-	gs_effect_set_texture(image, texture);
-
-	gs_set_render_target(target, NULL);
-	set_render_size(width, height);
-
-	gs_enable_blending(false);
-	passes = gs_technique_begin(tech);
-	for (i = 0; i < passes; i++) {
-		gs_technique_begin_pass(tech, i);
-		gs_draw(GS_TRIS, 0, 3);
-		gs_technique_end_pass(tech);
-	}
-	gs_technique_end(tech);
-	gs_enable_blending(true);
-}
-
-static const char *render_convert_nv12_name = "render_convert_texture_nv12";
-static void render_convert_texture_nv12(struct obs_core_video *video,
-					gs_texture_t *texture)
-{
-	profile_start(render_convert_nv12_name);
-
-	render_nv12(video, texture, video->convert_texture, "NV12_Y",
-		    video->output_width, video->output_height);
-	render_nv12(video, texture, video->convert_uv_texture, "NV12_UV",
-		    video->output_width / 2, video->output_height / 2);
-
-	video->texture_converted = true;
-
-	profile_end(render_convert_nv12_name);
-}
-
 static const char *stage_output_texture_name = "stage_output_texture";
 static inline void stage_output_texture(struct obs_core_video *video,
-					gs_texture_t *texture, int cur_texture)
+					int cur_texture)
 {
 	profile_start(stage_output_texture_name);

-	bool texture_ready;
-	gs_stagesurf_t *copy = video->copy_surfaces[cur_texture];
-
-	if (video->gpu_conversion) {
-		texture = video->convert_texture;
-		texture_ready = video->texture_converted;
-	} else {
-		texture_ready = true;
-	}
-
 	unmap_last_surface(video);

-	if (!texture_ready)
-		goto end;
+	if (!video->gpu_conversion) {
+		gs_stagesurf_t *copy = video->copy_surfaces[cur_texture][0];
+		if (copy)
+			gs_stage_texture(copy, video->output_texture);

-	gs_stage_texture(copy, texture);
+		video->textures_copied[cur_texture] = true;
+	} else if (video->texture_converted) {
+		for (int i = 0; i < NUM_CHANNELS; i++) {
+			gs_stagesurf_t *copy =
+				video->copy_surfaces[cur_texture][i];
+			if (copy)
+				gs_stage_texture(copy,
+						 video->convert_textures[i]);
+		}

-	video->textures_copied[cur_texture] = true;
+		video->textures_copied[cur_texture] = true;
+	}

-end:
 	profile_end(stage_output_texture_name);
 }

@@ -458,13 +414,13 @@ static inline bool queue_frame(struct obs_core_video *video, bool raw_active,
 	 * reason.  otherwise, it goes to the 'duplicate' case above, which
 	 * will ensure better performance. */
 	if (raw_active || vframe_info->count > 1) {
-		gs_copy_texture(tf.tex, video->convert_texture);
+		gs_copy_texture(tf.tex, video->convert_textures[0]);
 	} else {
-		gs_texture_t *tex = video->convert_texture;
-		gs_texture_t *tex_uv = video->convert_uv_texture;
+		gs_texture_t *tex = video->convert_textures[0];
+		gs_texture_t *tex_uv = video->convert_textures[1];

-		video->convert_texture = tf.tex;
-		video->convert_uv_texture = tf.tex_uv;
+		video->convert_textures[0] = tf.tex;
+		video->convert_textures[1] = tf.tex_uv;

 		tf.tex = tex;
 		tf.tex_uv = tex_uv;
@@ -529,17 +485,12 @@ static inline void render_video(struct obs_core_video *video, bool raw_active,
 		gs_texture_t *texture = render_output_texture(video);

 #ifdef _WIN32
-		if (gpu_active) {
+		if (gpu_active)
 			gs_flush();
-		}
 #endif

-		if (video->gpu_conversion) {
-			if (video->using_nv12_tex)
-				render_convert_texture_nv12(video, texture);
-			else
-				render_convert_texture(video, texture);
-		}
+		if (video->gpu_conversion)
+			render_convert_texture(video, texture);

 #ifdef _WIN32
 		if (gpu_active) {
@@ -547,8 +498,9 @@ static inline void render_video(struct obs_core_video *video, bool raw_active,
 			output_gpu_encoders(video, raw_active);
 		}
 #endif
+
 		if (raw_active)
-			stage_output_texture(video, texture, cur_texture);
+			stage_output_texture(video, cur_texture);
 	}

 	gs_set_render_target(NULL, NULL);
@@ -560,73 +512,41 @@ static inline void render_video(struct obs_core_video *video, bool raw_active,
 static inline bool download_frame(struct obs_core_video *video,
 				  int prev_texture, struct video_data *frame)
 {
-	gs_stagesurf_t *surface = video->copy_surfaces[prev_texture];
-
 	if (!video->textures_copied[prev_texture])
 		return false;

-	if (!gs_stagesurface_map(surface, &frame->data[0], &frame->linesize[0]))
-		return false;
+	for (int channel = 0; channel < NUM_CHANNELS; ++channel) {
+		gs_stagesurf_t *surface =
+			video->copy_surfaces[prev_texture][channel];
+		if (surface) {
+			if (!gs_stagesurface_map(surface, &frame->data[channel],
+						 &frame->linesize[channel]))
+				return false;

-	video->mapped_surface = surface;
+			video->mapped_surfaces[channel] = surface;
+		}
+	}
 	return true;
 }

-static inline uint32_t calc_linesize(uint32_t pos, uint32_t linesize)
+static const uint8_t *set_gpu_converted_plane(uint32_t width, uint32_t height,
+					      uint32_t linesize_input,
+					      uint32_t linesize_output,
+					      const uint8_t *in, uint8_t *out)
 {
-	uint32_t size = pos % linesize;
-	return size ? size : linesize;
-}
-
-static void copy_dealign(uint8_t *dst, uint32_t dst_pos, uint32_t dst_linesize,
-			 const uint8_t *src, uint32_t src_pos,
-			 uint32_t src_linesize, uint32_t remaining)
-{
-	while (remaining) {
-		uint32_t src_remainder = src_pos % src_linesize;
-		uint32_t dst_offset = dst_linesize - src_remainder;
-		uint32_t src_offset = src_linesize - src_remainder;
-
-		if (remaining < dst_offset) {
-			memcpy(dst + dst_pos, src + src_pos, remaining);
-			src_pos += remaining;
-			dst_pos += remaining;
-			remaining = 0;
-		} else {
-			memcpy(dst + dst_pos, src + src_pos, dst_offset);
-			src_pos += src_offset;
-			dst_pos += dst_offset;
-			remaining -= dst_offset;
+	if ((width == linesize_input) && (width == linesize_output)) {
+		size_t total = width * height;
+		memcpy(out, in, total);
+		in += total;
+	} else {
+		for (size_t y = 0; y < height; y++) {
+			memcpy(out, in, width);
+			out += linesize_output;
+			in += linesize_input;
 		}
 	}
-}

-static inline uint32_t make_aligned_linesize_offset(uint32_t offset,
-						    uint32_t dst_linesize,
-						    uint32_t src_linesize)
-{
-	uint32_t remainder = offset % dst_linesize;
-	return (offset / dst_linesize) * src_linesize + remainder;
-}
-
-static void fix_gpu_converted_alignment(struct obs_core_video *video,
-					struct video_frame *output,
-					const struct video_data *input)
-{
-	uint32_t src_linesize = input->linesize[0];
-	uint32_t dst_linesize = output->linesize[0] * 4;
-	uint32_t src_pos = 0;
-
-	for (size_t i = 0; i < 3; i++) {
-		if (video->plane_linewidth[i] == 0)
-			break;
-
-		src_pos = make_aligned_linesize_offset(
-			video->plane_offsets[i], dst_linesize, src_linesize);
-
-		copy_dealign(output->data[i], 0, dst_linesize, input->data[0],
-			     src_pos, src_linesize, video->plane_sizes[i]);
-	}
+	return in;
 }

 static void set_gpu_converted_data(struct obs_core_video *video,
@@ -634,41 +554,91 @@ static void set_gpu_converted_data(struct obs_core_video *video,
 				   const struct video_data *input,
 				   const struct video_output_info *info)
 {
-	if (input->linesize[0] == video->output_width * 4) {
-		struct video_frame frame;
+	if (video->using_nv12_tex) {
+		const uint32_t width = info->width;
+		const uint32_t height = info->height;

-		for (size_t i = 0; i < 3; i++) {
-			if (video->plane_linewidth[i] == 0)
-				break;
-
-			frame.linesize[i] = video->plane_linewidth[i];
-			frame.data[i] =
-				input->data[0] + video->plane_offsets[i];
-		}
-
-		video_frame_copy(output, &frame, info->format, info->height);
-
-	} else if (video->using_nv12_tex) {
-		size_t width = info->width;
-		size_t height = info->height;
-		size_t height_d2 = height / 2;
-		uint8_t *out_y = output->data[0];
-		uint8_t *out_uv = output->data[1];
-		uint8_t *in = input->data[0];
-
-		for (size_t y = 0; y < height; y++) {
-			memcpy(out_y, in, width);
-			out_y += output->linesize[0];
-			in += input->linesize[0];
-		}
-		for (size_t y = 0; y < height_d2; y++) {
-			memcpy(out_uv, in, width);
-			out_uv += output->linesize[0];
-			in += input->linesize[0];
-		}
+		const uint8_t *const in_uv = set_gpu_converted_plane(
+			width, height, input->linesize[0], output->linesize[0],
+			input->data[0], output->data[0]);

+		const uint32_t height_d2 = height / 2;
+		set_gpu_converted_plane(width, height_d2, input->linesize[0],
+					output->linesize[1], in_uv,
+					output->data[1]);
 	} else {
-		fix_gpu_converted_alignment(video, output, input);
+		switch (info->format) {
+		case VIDEO_FORMAT_I420: {
+			const uint32_t width = info->width;
+			const uint32_t height = info->height;
+
+			set_gpu_converted_plane(width, height,
+						input->linesize[0],
+						output->linesize[0],
+						input->data[0],
+						output->data[0]);
+
+			const uint32_t width_d2 = width / 2;
+			const uint32_t height_d2 = height / 2;
+
+			set_gpu_converted_plane(width_d2, height_d2,
+						input->linesize[1],
+						output->linesize[1],
+						input->data[1],
+						output->data[1]);
+
+			set_gpu_converted_plane(width_d2, height_d2,
+						input->linesize[2],
+						output->linesize[2],
+						input->data[2],
+						output->data[2]);
+
+			break;
+		}
+		case VIDEO_FORMAT_NV12: {
+			const uint32_t width = info->width;
+			const uint32_t height = info->height;
+
+			set_gpu_converted_plane(width, height,
+						input->linesize[0],
+						output->linesize[0],
+						input->data[0],
+						output->data[0]);
+
+			const uint32_t height_d2 = height / 2;
+			set_gpu_converted_plane(width, height_d2,
+						input->linesize[1],
+						output->linesize[1],
+						input->data[1],
+						output->data[1]);
+
+			break;
+		}
+		case VIDEO_FORMAT_I444: {
+			const uint32_t width = info->width;
+			const uint32_t height = info->height;
+
+			set_gpu_converted_plane(width, height,
+						input->linesize[0],
+						output->linesize[0],
+						input->data[0],
+						output->data[0]);
+
+			set_gpu_converted_plane(width, height,
+						input->linesize[1],
+						output->linesize[1],
+						input->data[1],
+						output->data[1]);
+
+			set_gpu_converted_plane(width, height,
+						input->linesize[2],
+						output->linesize[2],
+						input->data[2],
+						output->data[2]);
+
+			break;
+		}
+		}
 	}
 }

--- a/libobs/obs.c
+++ b/libobs/obs.c
@ -42,117 +42,35 @@ static inline void make_video_info(struct video_output_info *vi,
 	vi->cache_size = 6;
 }

-#define PIXEL_SIZE 4
-
-#define GET_ALIGN(val, align) (((val) + (align - 1)) & ~(align - 1))
-
-static inline void set_420p_sizes(const struct obs_video_info *ovi)
-{
-	struct obs_core_video *video = &obs->video;
-	uint32_t chroma_pixels;
-	uint32_t total_bytes;
-
-	chroma_pixels = (ovi->output_width * ovi->output_height / 4);
-	chroma_pixels = GET_ALIGN(chroma_pixels, PIXEL_SIZE);
-
-	video->plane_offsets[0] = 0;
-	video->plane_offsets[1] = ovi->output_width * ovi->output_height;
-	video->plane_offsets[2] = video->plane_offsets[1] + chroma_pixels;
-
-	video->plane_linewidth[0] = ovi->output_width;
-	video->plane_linewidth[1] = ovi->output_width / 2;
-	video->plane_linewidth[2] = ovi->output_width / 2;
-
-	video->plane_sizes[0] = video->plane_offsets[1];
-	video->plane_sizes[1] = video->plane_sizes[0] / 4;
-	video->plane_sizes[2] = video->plane_sizes[1];
-
-	total_bytes = video->plane_offsets[2] + chroma_pixels;
-
-	video->conversion_height =
-		(total_bytes / PIXEL_SIZE + ovi->output_width - 1) /
-		ovi->output_width;
-
-	video->conversion_height = GET_ALIGN(video->conversion_height, 2);
-	video->conversion_tech = "Planar420";
-}
-
-static inline void set_nv12_sizes(const struct obs_video_info *ovi)
-{
-	struct obs_core_video *video = &obs->video;
-	uint32_t chroma_pixels;
-	uint32_t total_bytes;
-
-	chroma_pixels = (ovi->output_width * ovi->output_height / 2);
-	chroma_pixels = GET_ALIGN(chroma_pixels, PIXEL_SIZE);
-
-	video->plane_offsets[0] = 0;
-	video->plane_offsets[1] = ovi->output_width * ovi->output_height;
-
-	video->plane_linewidth[0] = ovi->output_width;
-	video->plane_linewidth[1] = ovi->output_width;
-
-	video->plane_sizes[0] = video->plane_offsets[1];
-	video->plane_sizes[1] = video->plane_sizes[0] / 2;
-
-	total_bytes = video->plane_offsets[1] + chroma_pixels;
-
-	video->conversion_height =
-		(total_bytes / PIXEL_SIZE + ovi->output_width - 1) /
-		ovi->output_width;
-
-	video->conversion_height = GET_ALIGN(video->conversion_height, 2);
-	video->conversion_tech = "NV12";
-}
-
-static inline void set_444p_sizes(const struct obs_video_info *ovi)
-{
-	struct obs_core_video *video = &obs->video;
-	uint32_t chroma_pixels;
-	uint32_t total_bytes;
-
-	chroma_pixels = (ovi->output_width * ovi->output_height);
-	chroma_pixels = GET_ALIGN(chroma_pixels, PIXEL_SIZE);
-
-	video->plane_offsets[0] = 0;
-	video->plane_offsets[1] = chroma_pixels;
-	video->plane_offsets[2] = chroma_pixels + chroma_pixels;
-
-	video->plane_linewidth[0] = ovi->output_width;
-	video->plane_linewidth[1] = ovi->output_width;
-	video->plane_linewidth[2] = ovi->output_width;
-
-	video->plane_sizes[0] = chroma_pixels;
-	video->plane_sizes[1] = chroma_pixels;
-	video->plane_sizes[2] = chroma_pixels;
-
-	total_bytes = video->plane_offsets[2] + chroma_pixels;
-
-	video->conversion_height =
-		(total_bytes / PIXEL_SIZE + ovi->output_width - 1) /
-		ovi->output_width;
-
-	video->conversion_height = GET_ALIGN(video->conversion_height, 2);
-	video->conversion_tech = "Planar444";
-}
-
 static inline void calc_gpu_conversion_sizes(const struct obs_video_info *ovi)
 {
-	obs->video.conversion_height = 0;
-	memset(obs->video.plane_offsets, 0, sizeof(obs->video.plane_offsets));
-	memset(obs->video.plane_sizes, 0, sizeof(obs->video.plane_sizes));
-	memset(obs->video.plane_linewidth, 0,
-	       sizeof(obs->video.plane_linewidth));
+	struct obs_core_video *video = &obs->video;
+
+	video->conversion_needed = false; /* defaults; only the formats handled below enable conversion */
+	video->conversion_techs[0] = NULL;
+	video->conversion_techs[1] = NULL;
+	video->conversion_techs[2] = NULL;
+	video->conversion_width_i = 0.f; /* only I420 and NV12 overwrite this with 1 / output_width */

 	switch ((uint32_t)ovi->output_format) {
 	case VIDEO_FORMAT_I420:
-		set_420p_sizes(ovi);
+		video->conversion_needed = true;
+		video->conversion_techs[0] = "Planar_Y"; /* presumably technique names in format_conversion.effect -- confirm */
+		video->conversion_techs[1] = "Planar_U_Left";
+		video->conversion_techs[2] = "Planar_V_Left";
+		video->conversion_width_i = 1.f / (float)ovi->output_width;
 		break;
 	case VIDEO_FORMAT_NV12:
-		set_nv12_sizes(ovi);
+		video->conversion_needed = true; /* two entries only; conversion_techs[2] stays NULL */
+		video->conversion_techs[0] = "NV12_Y";
+		video->conversion_techs[1] = "NV12_UV";
+		video->conversion_width_i = 1.f / (float)ovi->output_width;
 		break;
 	case VIDEO_FORMAT_I444:
-		set_444p_sizes(ovi);
+		video->conversion_needed = true; /* conversion_width_i is left at 0.f for this format */
+		video->conversion_techs[0] = "Planar_Y";
+		video->conversion_techs[1] = "Planar_U";
+		video->conversion_techs[2] = "Planar_V";
 		break;
 	}
 }
@ -167,7 +85,7 @@ static bool obs_init_gpu_conversion(struct obs_video_info *ovi)
 					? gs_nv12_available()
 					: false;

-	if (!video->conversion_height) {
+	if (!video->conversion_needed) {
 		blog(LOG_INFO, "GPU conversion not available for format: %u",
 		     (unsigned int)ovi->output_format);
 		video->gpu_conversion = false;
@ -183,23 +101,96 @@ static bool obs_init_gpu_conversion(struct obs_video_info *ovi)

 #ifdef _WIN32
 	if (video->using_nv12_tex) {
-		gs_texture_create_nv12(&video->convert_texture,
-				       &video->convert_uv_texture,
+		gs_texture_create_nv12(&video->convert_textures[0],
+				       &video->convert_textures[1],
 				       ovi->output_width, ovi->output_height,
 				       GS_RENDER_TARGET | GS_SHARED_KM_TEX);
-		if (!video->convert_uv_texture)
-			return false;
 	} else {
 #endif
-		video->convert_texture = gs_texture_create(
-			ovi->output_width, video->conversion_height, GS_RGBA, 1,
-			NULL, GS_RENDER_TARGET);
+		video->convert_textures[0] =
+			gs_texture_create(ovi->output_width, ovi->output_height,
+					  GS_R8, 1, NULL, GS_RENDER_TARGET);
+
+		const struct video_output_info *info =
+			video_output_get_info(video->video);
+		switch (info->format) {
+		case VIDEO_FORMAT_I420:
+			video->convert_textures[1] = gs_texture_create(
+				ovi->output_width / 2, ovi->output_height / 2,
+				GS_R8, 1, NULL, GS_RENDER_TARGET);
+			video->convert_textures[2] = gs_texture_create(
+				ovi->output_width / 2, ovi->output_height / 2,
+				GS_R8, 1, NULL, GS_RENDER_TARGET);
+			if (!video->convert_textures[2])
+				return false;
+			break;
+		case VIDEO_FORMAT_NV12:
+			video->convert_textures[1] = gs_texture_create(
+				ovi->output_width / 2, ovi->output_height / 2,
+				GS_R8G8, 1, NULL, GS_RENDER_TARGET);
+			break;
+		case VIDEO_FORMAT_I444:
+			video->convert_textures[1] = gs_texture_create(
+				ovi->output_width, ovi->output_height, GS_R8, 1,
+				NULL, GS_RENDER_TARGET);
+			video->convert_textures[2] = gs_texture_create(
+				ovi->output_width, ovi->output_height, GS_R8, 1,
+				NULL, GS_RENDER_TARGET);
+			if (!video->convert_textures[2])
+				return false;
+			break;
+		}
 #ifdef _WIN32
 	}
 #endif

-	if (!video->convert_texture)
+	if (!video->convert_textures[0])
 		return false;
+	if (!video->convert_textures[1])
+		return false;
+
+	return true;
+}
+
+static bool obs_init_gpu_copy_surfaces(struct obs_video_info *ovi, size_t i)
+{ /* Creates the stage surfaces stored in copy_surfaces[i]; returns false as soon as one creation fails. */
+	struct obs_core_video *video = &obs->video;
+
+	video->copy_surfaces[i][0] = gs_stagesurface_create(
+		ovi->output_width, ovi->output_height, GS_R8); /* surface [0]: full output size for every format */
+	if (!video->copy_surfaces[i][0])
+		return false;
+
+	const struct video_output_info *info =
+		video_output_get_info(video->video); /* the switch uses the video output's format, not ovi */
+	switch (info->format) {
+	case VIDEO_FORMAT_I420: /* surfaces [1] and [2]: half width, half height, GS_R8 each */
+		video->copy_surfaces[i][1] = gs_stagesurface_create(
+			ovi->output_width / 2, ovi->output_height / 2, GS_R8);
+		if (!video->copy_surfaces[i][1])
+			return false;
+		video->copy_surfaces[i][2] = gs_stagesurface_create(
+			ovi->output_width / 2, ovi->output_height / 2, GS_R8);
+		if (!video->copy_surfaces[i][2])
+			return false;
+		break;
+	case VIDEO_FORMAT_NV12: /* surface [1] only: half width, half height, GS_R8G8 */
+		video->copy_surfaces[i][1] = gs_stagesurface_create(
+			ovi->output_width / 2, ovi->output_height / 2, GS_R8G8);
+		if (!video->copy_surfaces[i][1])
+			return false;
+		break;
+	case VIDEO_FORMAT_I444: /* surfaces [1] and [2]: full output size, GS_R8 each */
+		video->copy_surfaces[i][1] = gs_stagesurface_create(
+			ovi->output_width, ovi->output_height, GS_R8);
+		if (!video->copy_surfaces[i][1])
+			return false;
+		video->copy_surfaces[i][2] = gs_stagesurface_create(
+			ovi->output_width, ovi->output_height, GS_R8);
+		if (!video->copy_surfaces[i][2])
+			return false;
+		break;
+	} /* any other format: only surface [0] exists and the function still returns true */
+
 	return true;
 }
@ -207,25 +198,29 @@ static bool obs_init_gpu_conversion(struct obs_video_info *ovi)
 static bool obs_init_textures(struct obs_video_info *ovi)
 {
 	struct obs_core_video *video = &obs->video;
-	uint32_t output_height = video->gpu_conversion
-					 ? video->conversion_height
-					 : ovi->output_height;
-	size_t i;

-	for (i = 0; i < NUM_TEXTURES; i++) {
+	for (size_t i = 0; i < NUM_TEXTURES; i++) {
 #ifdef _WIN32
 		if (video->using_nv12_tex) {
-			video->copy_surfaces[i] = gs_stagesurface_create_nv12(
-				ovi->output_width, ovi->output_height);
-			if (!video->copy_surfaces[i])
+			video->copy_surfaces[i][0] =
+				gs_stagesurface_create_nv12(ovi->output_width,
+							    ovi->output_height);
+			if (!video->copy_surfaces[i][0])
 				return false;

 		} else {
 #endif
-			video->copy_surfaces[i] = gs_stagesurface_create(
-				ovi->output_width, output_height, GS_RGBA);
-			if (!video->copy_surfaces[i])
-				return false;
+			if (video->gpu_conversion) {
+				if (!obs_init_gpu_copy_surfaces(ovi, i))
+					return false;
+			} else {
+				video->copy_surfaces[i][0] =
+					gs_stagesurface_create(
+						ovi->output_width,
+						ovi->output_height, GS_RGBA);
+				if (!video->copy_surfaces[i][0])
+					return false;
+			}
 #ifdef _WIN32
 		}
 #endif
@ -465,23 +460,45 @@ static void obs_free_video(void)

 		gs_enter_context(video->graphics);

-		if (video->mapped_surface) {
-			gs_stagesurface_unmap(video->mapped_surface);
-			video->mapped_surface = NULL;
+		for (size_t c = 0; c < NUM_CHANNELS; c++) {
+			if (video->mapped_surfaces[c]) {
+				gs_stagesurface_unmap(
+					video->mapped_surfaces[c]);
+				video->mapped_surfaces[c] = NULL;
+			}
 		}

 		for (size_t i = 0; i < NUM_TEXTURES; i++) {
-			gs_stagesurface_destroy(video->copy_surfaces[i]);
-			video->copy_surfaces[i] = NULL;
+			for (size_t c = 0; c < NUM_CHANNELS; c++) {
+				if (video->copy_surfaces[i][c]) {
+					gs_stagesurface_destroy(
+						video->copy_surfaces[i][c]);
+					video->copy_surfaces[i][c] = NULL;
+				}
+			}
 		}

 		gs_texture_destroy(video->render_texture);
-		gs_texture_destroy(video->convert_texture);
-		gs_texture_destroy(video->convert_uv_texture);
+
+		for (size_t c = 0; c < NUM_CHANNELS; c++) {
+			if (video->convert_textures[c]) {
+				gs_texture_destroy(video->convert_textures[c]);
+				video->convert_textures[c] = NULL;
+			}
+		}
+
+		for (size_t i = 0; i < NUM_TEXTURES; i++) {
+			for (size_t c = 0; c < NUM_CHANNELS; c++) {
+				if (video->copy_surfaces[i][c]) {
+					gs_stagesurface_destroy(
+						video->copy_surfaces[i][c]);
+					video->copy_surfaces[i][c] = NULL;
+				}
+			}
+		}
+
 		gs_texture_destroy(video->output_texture);
 		video->render_texture = NULL;
-		video->convert_texture = NULL;
-		video->convert_uv_texture = NULL;
 		video->output_texture = NULL;

 		gs_leave_context();