obs-studio/build/data/libobs/format_conversion.effect

/******************************************************************************
    Copyright (C) 2014 by Hugh Bailey <obs.jim@gmail.com>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/

//#define DEBUGGING

/* Matrix applied to the incoming vertex position in VSDefault. */
uniform float4x4  ViewProj;

/* Byte offsets that split the output into planes.  Offsets below
 * u_plane_offset take the first (luminance) branch of both pixel shaders.
 * PSPlanar420 additionally compares against v_plane_offset to tell its
 * second and third planes apart; PSNV12 never reads v_plane_offset. */
uniform float     u_plane_offset;
uniform float     v_plane_offset;

/* Size-related values supplied by the host.
 * NOTE(review): the names suggest "_i" = reciprocal and "_d2" = half size
 * (e.g. width_i == 1.0 / width, width_d2 == width / 2.0) -- confirm against
 * the code that sets these parameters.
 * height and height_d2 are not read by any shader in this file. */
uniform float     width;
uniform float     height;
uniform float     width_i;
uniform float     height_i;
uniform float     width_d2;
uniform float     height_d2;
uniform float     width_d2_i;
uniform float     height_d2_i;
/* Multiplied by uv.y in the pixel shaders to get the output row index. */
uniform float     input_height;

/* Source texture sampled by PSNV12 and PSPlanar420. */
uniform texture2d image;

/* Sampler shared by both pixel shaders.  Linear filtering is relied upon by
 * the chroma branches, which sample on texel borders so that neighbouring
 * texels get averaged; clamp addressing is used on both axes. */
sampler_state def_sampler {
	Filter   = Linear;
	AddressU = Clamp;
	AddressV = Clamp;
};

/* Vertex data: used as both the input and output of VSDefault and as the
 * input of the pixel shaders. */
struct VertInOut {
	float4 pos : POSITION;  /* vertex position */
	float2 uv  : TEXCOORD0; /* texture coordinate, passed through unchanged */
};

/* Vertex shader shared by every technique in this file: transforms the
 * position by ViewProj (forcing w to 1.0) and forwards the texture
 * coordinate untouched. */
VertInOut VSDefault(VertInOut vert_in)
{
	float4 local_pos = float4(vert_in.pos.xyz, 1.0);

	VertInOut result;
	result.uv  = vert_in.uv;
	result.pos = mul(local_pos, ViewProj);
	return result;
}

/* Used to prevent internal GPU precision issues, with fmod in particular.
 * It is added to the byte offset before that value is passed to fmod, and
 * the fmod result is floored afterwards, so the fractional part never
 * reaches the computed texture coordinates. */
#define PRECISION_OFFSET 0.1

/* Pixel shader for the NV12 technique.
 *
 * Every output texel carries four consecutive bytes of the converted image
 * (one per colour component).  The texel's position is first turned into a
 * byte offset, which then selects one of two branches:
 *
 *   - offset <  u_plane_offset: four horizontally adjacent source texels are
 *     sampled and the second component (.g) of each is returned.
 *   - offset >= u_plane_offset: two source positions are sampled and the
 *     .r and .b components of each are returned as (r0, b0, r1, b1).
 *
 * NOTE(review): this presumably expects `image` to hold packed YUV with
 * Y in .g, U in .r and V in .b -- confirm against the code that renders
 * into `image`. */
float4 PSNV12(VertInOut vert_in) : TARGET
{
	/* output row index; the vertical direction is flipped under OpenGL */
#ifdef _OPENGL
	float v_mul = floor((1.0 - vert_in.uv.y) * input_height);
#else
	float v_mul = floor(vert_in.uv.y * input_height);
#endif

	/* texel index scaled by 4 bytes per texel, nudged so that the later
	 * fmod/floor calls do not land exactly on a multiple (see
	 * PRECISION_OFFSET) */
	float byte_offset = floor((v_mul + vert_in.uv.x) * width) * 4.0;
	byte_offset += PRECISION_OFFSET;

	/* only the first two entries are used by the chroma branch */
	float2 sample_pos[4];

	if (byte_offset < u_plane_offset) {
#ifdef DEBUGGING
		return float4(1.0, 1.0, 1.0, 1.0);
#endif

		/* column and row inside the plane, scaled by width_i/height_i
		 * to form texture coordinates */
		float lum_u = floor(fmod(byte_offset, width)) * width_i;
		float lum_v = floor(byte_offset * width_i)    * height_i;

		/* move to texel centers to sample the 4 pixels properly */
		lum_u += width_i  * 0.5;
		lum_v += height_i * 0.5;

		/* note: the "+=" forms advance lum_u itself, so each position
		 * is one width_i step to the right of the previous one */
		sample_pos[0] = float2(lum_u,            lum_v);
		sample_pos[1] = float2(lum_u += width_i, lum_v);
		sample_pos[2] = float2(lum_u += width_i, lum_v);
		sample_pos[3] = float2(lum_u +  width_i, lum_v);

		float4x4 out_val = float4x4(
			image.Sample(def_sampler, sample_pos[0]),
			image.Sample(def_sampler, sample_pos[1]),
			image.Sample(def_sampler, sample_pos[2]),
			image.Sample(def_sampler, sample_pos[3])
		);

		/* after the transpose, entry [1] gathers the second component
		 * of all four samples */
		return transpose(out_val)[1];
	} else {
#ifdef DEBUGGING
		return float4(0.5, 0.2, 0.5, 0.2);
#endif

		/* byte offset relative to the start of the second plane */
		float new_offset = byte_offset - u_plane_offset;

		/* rows of this plane are `width` bytes long; the row index is
		 * scaled by height_d2_i rather than height_i */
		float ch_u = floor(fmod(new_offset, width)) * width_i;
		float ch_v = floor(new_offset * width_i)    * height_d2_i;
		float width_i2 = width_i*2.0;

		/* move to the borders of each set of 4 pixels to force it
		 * to do bilinear averaging */
		ch_u += width_i;
		ch_v += height_i;

		/* two sample points, two width_i steps apart */
		sample_pos[0] = float2(ch_u,             ch_v);
		sample_pos[1] = float2(ch_u + width_i2,  ch_v);
		
		/* interleave .r/.b of both samples: (r0, b0, r1, b1) */
		return float4(
				image.Sample(def_sampler, sample_pos[0]).rb,
				image.Sample(def_sampler, sample_pos[1]).rb
				);
	}
}

/* Pixel shader for the Planar420 technique.
 *
 * Every output texel carries four consecutive bytes of the converted image
 * (one per colour component).  The texel's position is turned into a byte
 * offset which selects one of three planes:
 *
 *   - offset <  u_plane_offset: four adjacent source texels are sampled and
 *     the second component (.g) of each is returned.
 *   - offset <  v_plane_offset: four positions, two width_i steps apart, are
 *     sampled on texel borders and the first component (.r) of each is
 *     returned.
 *   - otherwise: same sampling as the previous case, but the third
 *     component (.b) of each sample is returned.
 *
 * NOTE(review): this presumably expects `image` to hold packed YUV with
 * Y in .g, U in .r and V in .b -- confirm against the code that renders
 * into `image`. */
float4 PSPlanar420(VertInOut vert_in) : TARGET
{
	/* output row index; the vertical direction is flipped under OpenGL */
#ifdef _OPENGL
	float v_mul = floor((1.0 - vert_in.uv.y) * input_height);
#else
	float v_mul = floor(vert_in.uv.y * input_height);
#endif

	/* texel index scaled by 4 bytes per texel, nudged so that the later
	 * fmod/floor calls do not land exactly on a multiple (see
	 * PRECISION_OFFSET) */
	float byte_offset = floor((v_mul + vert_in.uv.x) * width) * 4.0;
	byte_offset += PRECISION_OFFSET;

	float2 sample_pos[4];

	if (byte_offset < u_plane_offset) {
#ifdef DEBUGGING
		return float4(1.0, 1.0, 1.0, 1.0);
#endif

		/* column and row inside the plane, scaled by width_i/height_i
		 * to form texture coordinates */
		float lum_u = floor(fmod(byte_offset, width)) * width_i;
		float lum_v = floor(byte_offset * width_i)    * height_i;

		/* move to texel centers to sample the 4 pixels properly */
		lum_u += width_i  * 0.5;
		lum_v += height_i * 0.5;

		/* note: the "+=" forms advance lum_u itself, so each position
		 * is one width_i step to the right of the previous one */
		sample_pos[0] = float2(lum_u,            lum_v);
		sample_pos[1] = float2(lum_u += width_i, lum_v);
		sample_pos[2] = float2(lum_u += width_i, lum_v);
		sample_pos[3] = float2(lum_u +  width_i, lum_v);

	} else {
#ifdef DEBUGGING
		return ((byte_offset < v_plane_offset) ?
				float4(0.5, 0.5, 0.5, 0.5) :
				float4(0.2, 0.2, 0.2, 0.2));
#endif

		/* byte offset relative to the start of whichever of the two
		 * remaining planes this texel belongs to */
		float new_offset = byte_offset -
				((byte_offset < v_plane_offset) ?
				u_plane_offset : v_plane_offset);

		/* rows of these planes are width_d2 bytes long */
		float ch_u = floor(fmod(new_offset, width_d2)) * width_d2_i;
		float ch_v = floor(new_offset * width_d2_i)    * height_d2_i;
		float width_i2 = width_i*2.0;

		/* move to the borders of each set of 4 pixels to force it
		 * to do bilinear averaging */
		ch_u += width_i;
		ch_v += height_i;

		/* as above, "+=" advances ch_u; positions are two width_i
		 * steps apart */
		sample_pos[0] = float2(ch_u,             ch_v);
		sample_pos[1] = float2(ch_u += width_i2, ch_v);
		sample_pos[2] = float2(ch_u += width_i2, ch_v);
		sample_pos[3] = float2(ch_u +  width_i2, ch_v);
	}

	/* sampling is done once, outside the branches, for all three planes */
	float4x4 out_val = float4x4(
		image.Sample(def_sampler, sample_pos[0]),
		image.Sample(def_sampler, sample_pos[1]),
		image.Sample(def_sampler, sample_pos[2]),
		image.Sample(def_sampler, sample_pos[3])
	);

	/* after the transpose, entry [n] gathers component n of all four
	 * samples */
	out_val = transpose(out_val);

	/* pick the component that belongs to the plane being written */
	if (byte_offset < u_plane_offset)
		return out_val[1];
	else if (byte_offset < v_plane_offset)
		return out_val[0];
	else
		return out_val[2];
}

/* Single-pass technique pairing VSDefault with the PSPlanar420 pixel
 * shader. */
technique Planar420
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSPlanar420(vert_in);
	}
}

/* Single-pass technique pairing VSDefault with the PSNV12 pixel shader. */
technique NV12
{
	pass
	{
		vertex_shader = VSDefault(vert_in);
		pixel_shader  = PSNV12(vert_in);
	}
}
Make a number of key optimizations - Changed glMapBuffer to glMapBufferRange to allow invalidation. Using just glMapBuffer alone was causing some unacceptable stalls. - Changed dynamic buffers from GL_DYNAMIC_WRITE to GL_STREAM_WRITE because I had misunderstood the OpenGL specification - Added _OPENGL and _D3D11 builtin preprocessor macros to effects to allow special processing if needed - Added fmod support to shaders (NOTE: D3D and GL do not function identically with negative numbers when using this. Positive numbers however function identically) - Created a planar conversion shader that converts from packed YUV to planar 420 right on the GPU without any CPU processing. Reduces required GPU download size to approximately 37.5% of its normal rate as well. GPU usage down by 10 entire percentage points despite the extra required pass. 2014-02-16 19:28:21 -07:00			`/******************************************************************************`
			`Copyright (C) 2014 by Hugh Bailey <obs.jim@gmail.com>`

			`This program is free software: you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation, either version 2 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`******************************************************************************/`

			`//#define DEBUGGING`

			`uniform float4x4 ViewProj;`

			`uniform float u_plane_offset;`
			`uniform float v_plane_offset;`

			`uniform float width;`
			`uniform float height;`
			`uniform float width_i;`
			`uniform float height_i;`
			`uniform float width_d2;`
			`uniform float height_d2;`
			`uniform float width_d2_i;`
			`uniform float height_d2_i;`
			`uniform float input_height;`

			`uniform texture2d image;`

			`sampler_state def_sampler {`
			`Filter = Linear;`
			`AddressU = Clamp;`
			`AddressV = Clamp;`
			`};`

			`struct VertInOut {`
			`float4 pos : POSITION;`
			`float2 uv : TEXCOORD0;`
			`};`

			`VertInOut VSDefault(VertInOut vert_in)`
			`{`
			`VertInOut vert_out;`
			`vert_out.pos = mul(float4(vert_in.pos.xyz, 1.0), ViewProj);`
			`vert_out.uv = vert_in.uv;`
			`return vert_out;`
			`}`

Fix precision issues with new conversion shader Turns out that on some adapters, due to some sort of internal GPU precision error, fmod(x, y) can return x when x == y, wich is incorrect (and no, they were actually equal, not off due to precision errors). This would cause the shader to sample wrong coordinates on the edges sometimes. Just adding 0.1 to the x value before being put in to fmod and then flooring the result after fixes the issue. 2014-02-17 09:28:27 -07:00			`/* used to prevent internal GPU precision issues width fmod in particular */`
			`#define PRECISION_OFFSET 0.1`

Add NV12 conversion shader 2014-04-04 20:49:23 +02:00			`float4 PSNV12(VertInOut vert_in) : TARGET`
			`{`
			`#ifdef _OPENGL`
			`float v_mul = floor((1.0 - vert_in.uv.y) * input_height);`
			`#else`
			`float v_mul = floor(vert_in.uv.y * input_height);`
			`#endif`

			`float byte_offset = floor((v_mul + vert_in.uv.x) * width) * 4.0;`
			`byte_offset += PRECISION_OFFSET;`

			`float2 sample_pos[4];`

			`if (byte_offset < u_plane_offset) {`
			`#ifdef DEBUGGING`
			`return float4(1.0, 1.0, 1.0, 1.0);`
			`#endif`

			`float lum_u = floor(fmod(byte_offset, width)) * width_i;`
			`float lum_v = floor(byte_offset * width_i) * height_i;`

			`/* move to texel centers to sample the 4 pixels properly */`
			`lum_u += width_i * 0.5;`
			`lum_v += height_i * 0.5;`

			`sample_pos[0] = float2(lum_u, lum_v);`
			`sample_pos[1] = float2(lum_u += width_i, lum_v);`
			`sample_pos[2] = float2(lum_u += width_i, lum_v);`
			`sample_pos[3] = float2(lum_u + width_i, lum_v);`

			`float4x4 out_val = float4x4(`
			`image.Sample(def_sampler, sample_pos[0]),`
			`image.Sample(def_sampler, sample_pos[1]),`
			`image.Sample(def_sampler, sample_pos[2]),`
			`image.Sample(def_sampler, sample_pos[3])`
			`);`

			`return transpose(out_val)[1];`
			`} else {`
			`#ifdef DEBUGGING`
			`return float4(0.5, 0.2, 0.5, 0.2);`
			`#endif`

			`float new_offset = byte_offset - u_plane_offset;`

			`float ch_u = floor(fmod(new_offset, width)) * width_i;`
			`float ch_v = floor(new_offset * width_i) * height_d2_i;`
			`float width_i2 = width_i*2.0;`

			`/* move to the borders of each set of 4 pixels to force it`
			`* to do bilinear averaging */`
			`ch_u += width_i;`
			`ch_v += height_i;`

			`sample_pos[0] = float2(ch_u, ch_v);`
			`sample_pos[1] = float2(ch_u + width_i2, ch_v);`

			`return float4(`
			`image.Sample(def_sampler, sample_pos[0]).rb,`
			`image.Sample(def_sampler, sample_pos[1]).rb`
			`);`
			`}`
			`}`

Make a number of key optimizations - Changed glMapBuffer to glMapBufferRange to allow invalidation. Using just glMapBuffer alone was causing some unacceptable stalls. - Changed dynamic buffers from GL_DYNAMIC_WRITE to GL_STREAM_WRITE because I had misunderstood the OpenGL specification - Added _OPENGL and _D3D11 builtin preprocessor macros to effects to allow special processing if needed - Added fmod support to shaders (NOTE: D3D and GL do not function identically with negative numbers when using this. Positive numbers however function identically) - Created a planar conversion shader that converts from packed YUV to planar 420 right on the GPU without any CPU processing. Reduces required GPU download size to approximately 37.5% of its normal rate as well. GPU usage down by 10 entire percentage points despite the extra required pass. 2014-02-16 19:28:21 -07:00			`float4 PSPlanar420(VertInOut vert_in) : TARGET`
			`{`
			`#ifdef _OPENGL`
			`float v_mul = floor((1.0 - vert_in.uv.y) * input_height);`
			`#else`
			`float v_mul = floor(vert_in.uv.y * input_height);`
			`#endif`

Remove redundant constant from conversion shader 2014-02-16 19:55:59 -07:00			`float byte_offset = floor((v_mul + vert_in.uv.x) * width) * 4.0;`
Fix precision issues with new conversion shader Turns out that on some adapters, due to some sort of internal GPU precision error, fmod(x, y) can return x when x == y, wich is incorrect (and no, they were actually equal, not off due to precision errors). This would cause the shader to sample wrong coordinates on the edges sometimes. Just adding 0.1 to the x value before being put in to fmod and then flooring the result after fixes the issue. 2014-02-17 09:28:27 -07:00			`byte_offset += PRECISION_OFFSET;`
Make a number of key optimizations - Changed glMapBuffer to glMapBufferRange to allow invalidation. Using just glMapBuffer alone was causing some unacceptable stalls. - Changed dynamic buffers from GL_DYNAMIC_WRITE to GL_STREAM_WRITE because I had misunderstood the OpenGL specification - Added _OPENGL and _D3D11 builtin preprocessor macros to effects to allow special processing if needed - Added fmod support to shaders (NOTE: D3D and GL do not function identically with negative numbers when using this. Positive numbers however function identically) - Created a planar conversion shader that converts from packed YUV to planar 420 right on the GPU without any CPU processing. Reduces required GPU download size to approximately 37.5% of its normal rate as well. GPU usage down by 10 entire percentage points despite the extra required pass. 2014-02-16 19:28:21 -07:00
			`float2 sample_pos[4];`

			`if (byte_offset < u_plane_offset) {`
			`#ifdef DEBUGGING`
Fix precision issues with new conversion shader Turns out that on some adapters, due to some sort of internal GPU precision error, fmod(x, y) can return x when x == y, wich is incorrect (and no, they were actually equal, not off due to precision errors). This would cause the shader to sample wrong coordinates on the edges sometimes. Just adding 0.1 to the x value before being put in to fmod and then flooring the result after fixes the issue. 2014-02-17 09:28:27 -07:00			`return float4(1.0, 1.0, 1.0, 1.0);`
Make a number of key optimizations - Changed glMapBuffer to glMapBufferRange to allow invalidation. Using just glMapBuffer alone was causing some unacceptable stalls. - Changed dynamic buffers from GL_DYNAMIC_WRITE to GL_STREAM_WRITE because I had misunderstood the OpenGL specification - Added _OPENGL and _D3D11 builtin preprocessor macros to effects to allow special processing if needed - Added fmod support to shaders (NOTE: D3D and GL do not function identically with negative numbers when using this. Positive numbers however function identically) - Created a planar conversion shader that converts from packed YUV to planar 420 right on the GPU without any CPU processing. Reduces required GPU download size to approximately 37.5% of its normal rate as well. GPU usage down by 10 entire percentage points despite the extra required pass. 2014-02-16 19:28:21 -07:00			`#endif`

Fix precision issues with new conversion shader Turns out that on some adapters, due to some sort of internal GPU precision error, fmod(x, y) can return x when x == y, wich is incorrect (and no, they were actually equal, not off due to precision errors). This would cause the shader to sample wrong coordinates on the edges sometimes. Just adding 0.1 to the x value before being put in to fmod and then flooring the result after fixes the issue. 2014-02-17 09:28:27 -07:00			`float lum_u = floor(fmod(byte_offset, width)) * width_i;`
			`float lum_v = floor(byte_offset * width_i) * height_i;`
Make a number of key optimizations - Changed glMapBuffer to glMapBufferRange to allow invalidation. Using just glMapBuffer alone was causing some unacceptable stalls. - Changed dynamic buffers from GL_DYNAMIC_WRITE to GL_STREAM_WRITE because I had misunderstood the OpenGL specification - Added _OPENGL and _D3D11 builtin preprocessor macros to effects to allow special processing if needed - Added fmod support to shaders (NOTE: D3D and GL do not function identically with negative numbers when using this. Positive numbers however function identically) - Created a planar conversion shader that converts from packed YUV to planar 420 right on the GPU without any CPU processing. Reduces required GPU download size to approximately 37.5% of its normal rate as well. GPU usage down by 10 entire percentage points despite the extra required pass. 2014-02-16 19:28:21 -07:00
			`/* move to texel centers to sample the 4 pixels properly */`
			`lum_u += width_i * 0.5;`
			`lum_v += height_i * 0.5;`

			`sample_pos[0] = float2(lum_u, lum_v);`
			`sample_pos[1] = float2(lum_u += width_i, lum_v);`
			`sample_pos[2] = float2(lum_u += width_i, lum_v);`
			`sample_pos[3] = float2(lum_u + width_i, lum_v);`

			`} else {`
			`#ifdef DEBUGGING`
			`return ((byte_offset < v_plane_offset) ?`
Fix precision issues with new conversion shader Turns out that on some adapters, due to some sort of internal GPU precision error, fmod(x, y) can return x when x == y, wich is incorrect (and no, they were actually equal, not off due to precision errors). This would cause the shader to sample wrong coordinates on the edges sometimes. Just adding 0.1 to the x value before being put in to fmod and then flooring the result after fixes the issue. 2014-02-17 09:28:27 -07:00			`float4(0.5, 0.5, 0.5, 0.5) :`
			`float4(0.2, 0.2, 0.2, 0.2));`
Make a number of key optimizations - Changed glMapBuffer to glMapBufferRange to allow invalidation. Using just glMapBuffer alone was causing some unacceptable stalls. - Changed dynamic buffers from GL_DYNAMIC_WRITE to GL_STREAM_WRITE because I had misunderstood the OpenGL specification - Added _OPENGL and _D3D11 builtin preprocessor macros to effects to allow special processing if needed - Added fmod support to shaders (NOTE: D3D and GL do not function identically with negative numbers when using this. Positive numbers however function identically) - Created a planar conversion shader that converts from packed YUV to planar 420 right on the GPU without any CPU processing. Reduces required GPU download size to approximately 37.5% of its normal rate as well. GPU usage down by 10 entire percentage points despite the extra required pass. 2014-02-16 19:28:21 -07:00			`#endif`

			`float new_offset = byte_offset -`
			`((byte_offset < v_plane_offset) ?`
			`u_plane_offset : v_plane_offset);`

Fix precision issues with new conversion shader Turns out that on some adapters, due to some sort of internal GPU precision error, fmod(x, y) can return x when x == y, wich is incorrect (and no, they were actually equal, not off due to precision errors). This would cause the shader to sample wrong coordinates on the edges sometimes. Just adding 0.1 to the x value before being put in to fmod and then flooring the result after fixes the issue. 2014-02-17 09:28:27 -07:00			`float ch_u = floor(fmod(new_offset, width_d2)) * width_d2_i;`
			`float ch_v = floor(new_offset * width_d2_i) * height_d2_i;`
Make a number of key optimizations - Changed glMapBuffer to glMapBufferRange to allow invalidation. Using just glMapBuffer alone was causing some unacceptable stalls. - Changed dynamic buffers from GL_DYNAMIC_WRITE to GL_STREAM_WRITE because I had misunderstood the OpenGL specification - Added _OPENGL and _D3D11 builtin preprocessor macros to effects to allow special processing if needed - Added fmod support to shaders (NOTE: D3D and GL do not function identically with negative numbers when using this. Positive numbers however function identically) - Created a planar conversion shader that converts from packed YUV to planar 420 right on the GPU without any CPU processing. Reduces required GPU download size to approximately 37.5% of its normal rate as well. GPU usage down by 10 entire percentage points despite the extra required pass. 2014-02-16 19:28:21 -07:00			`float width_i2 = width_i*2.0;`

			`/* move to the borders of each set of 4 pixels to force it`
			`* to do bilinear averaging */`
			`ch_u += width_i;`
			`ch_v += height_i;`

			`sample_pos[0] = float2(ch_u, ch_v);`
			`sample_pos[1] = float2(ch_u += width_i2, ch_v);`
			`sample_pos[2] = float2(ch_u += width_i2, ch_v);`
			`sample_pos[3] = float2(ch_u + width_i2, ch_v);`
			`}`

Fix a NULL pointer deference Also, fixed an issue with the new conversion shader not compiling properly on some video devices 2014-02-16 22:42:35 -07:00			`float4x4 out_val = float4x4(`
Make a number of key optimizations - Changed glMapBuffer to glMapBufferRange to allow invalidation. Using just glMapBuffer alone was causing some unacceptable stalls. - Changed dynamic buffers from GL_DYNAMIC_WRITE to GL_STREAM_WRITE because I had misunderstood the OpenGL specification - Added _OPENGL and _D3D11 builtin preprocessor macros to effects to allow special processing if needed - Added fmod support to shaders (NOTE: D3D and GL do not function identically with negative numbers when using this. Positive numbers however function identically) - Created a planar conversion shader that converts from packed YUV to planar 420 right on the GPU without any CPU processing. Reduces required GPU download size to approximately 37.5% of its normal rate as well. GPU usage down by 10 entire percentage points despite the extra required pass. 2014-02-16 19:28:21 -07:00			`image.Sample(def_sampler, sample_pos[0]),`
			`image.Sample(def_sampler, sample_pos[1]),`
			`image.Sample(def_sampler, sample_pos[2]),`
			`image.Sample(def_sampler, sample_pos[3])`
Fix a NULL pointer deference Also, fixed an issue with the new conversion shader not compiling properly on some video devices 2014-02-16 22:42:35 -07:00			`);`
Make a number of key optimizations - Changed glMapBuffer to glMapBufferRange to allow invalidation. Using just glMapBuffer alone was causing some unacceptable stalls. - Changed dynamic buffers from GL_DYNAMIC_WRITE to GL_STREAM_WRITE because I had misunderstood the OpenGL specification - Added _OPENGL and _D3D11 builtin preprocessor macros to effects to allow special processing if needed - Added fmod support to shaders (NOTE: D3D and GL do not function identically with negative numbers when using this. Positive numbers however function identically) - Created a planar conversion shader that converts from packed YUV to planar 420 right on the GPU without any CPU processing. Reduces required GPU download size to approximately 37.5% of its normal rate as well. GPU usage down by 10 entire percentage points despite the extra required pass. 2014-02-16 19:28:21 -07:00
			`out_val = transpose(out_val);`

			`if (byte_offset < u_plane_offset)`
			`return out_val[1];`
			`else if (byte_offset < v_plane_offset)`
			`return out_val[0];`
			`else`
			`return out_val[2];`
			`}`

			`technique Planar420`
			`{`
			`pass`
			`{`
			`vertex_shader = VSDefault(vert_in);`
			`pixel_shader = PSPlanar420(vert_in);`
			`}`
			`}`
Add NV12 conversion shader 2014-04-04 20:49:23 +02:00
			`technique NV12`
			`{`
			`pass`
			`{`
			`vertex_shader = VSDefault(vert_in);`
			`pixel_shader = PSNV12(vert_in);`
			`}`
			`}`