obs-studio/libobs/media-io/format-conversion.c

/******************************************************************************
    Copyright (C) 2013 by Hugh Bailey <obs.jim@gmail.com>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/

#include "format-conversion.h"
#include <xmmintrin.h>
#include <emmintrin.h>

/*
 * Returns the lowest 32 bits (bits 0..31) of a 128-bit integer vector.
 *
 * Uses the dedicated SSE2 move intrinsic rather than reading the vector
 * through a casted pointer, so the value never has to be addressable and no
 * const qualifier is cast away.
 */
static inline uint32_t get_m128_32_0(const __m128i val)
{
	return (uint32_t)_mm_cvtsi128_si32(val);
}

/*
 * Returns the second 32-bit element (bits 32..63) of a 128-bit integer
 * vector.
 *
 * The vector is shifted right by four bytes so the wanted element lands in
 * the low lane, then extracted with the SSE2 move intrinsic.  This avoids
 * reading the vector through a casted pointer.
 */
static inline uint32_t get_m128_32_1(const __m128i val)
{
	return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(val, 4));
}

/*
 * Extracts one byte per 32-bit lane from two 128-bit lines and stores them
 * as two packed 4-byte groups.
 *
 * Each line is ANDed with lum_mask and the whole register is shifted right
 * by one byte.  The two results are then narrowed 32->16 bits (signed
 * saturation) and 16->8 bits (unsigned saturation), leaving the four bytes
 * of line1 in the low 32 bits and the four bytes of line2 in the next 32.
 *
 * lum_plane  base of the destination plane
 * lum_pos0   byte offset in lum_plane receiving the bytes taken from line1
 * lum_pos1   byte offset in lum_plane receiving the bytes taken from line2
 * lum_mask   per-lane selection mask; the callers in this file pass
 *            0x0000FF00 in every 32-bit lane (byte 1 of each lane)
 *
 * The stores are done as 32-bit writes through a casted pointer.
 */
static inline void pack_lum(uint8_t *lum_plane,
		uint32_t lum_pos0, uint32_t lum_pos1,
		const __m128i line1, const __m128i line2,
		const __m128i lum_mask)
{
	const __m128i row0_lanes =
		_mm_srli_si128(_mm_and_si128(line1, lum_mask), 1);
	const __m128i row1_lanes =
		_mm_srli_si128(_mm_and_si128(line2, lum_mask), 1);

	/* 32 -> 16 bit: row 0 lands in the low half, row 1 in the high half */
	const __m128i as_words = _mm_packs_epi32(row0_lanes, row1_lanes);
	/* 16 -> 8 bit: bytes 0..3 are row 0, bytes 4..7 are row 1 */
	const __m128i as_bytes = _mm_packus_epi16(as_words, as_words);

	uint32_t *dst_row0 = (uint32_t*)(lum_plane + lum_pos0);
	uint32_t *dst_row1 = (uint32_t*)(lum_plane + lum_pos1);

	*dst_row0 = get_m128_32_0(as_bytes);
	*dst_row1 = get_m128_32_1(as_bytes);
}

/*
 * Combines the masked 16-bit lanes of a 2x4 block (two lines of four 32-bit
 * elements each) into two horizontally adjacent averages and stores the
 * result as four consecutive bytes at uv_plane + chroma_pos.
 *
 * Steps:
 *   1. mask both lines with uv_mask and add them (vertical sum);
 *   2. add each 32-bit element to its horizontal neighbour, giving the sum
 *      of a 2x2 block in every element;
 *   3. shift each 16-bit lane right by 2 (divide by four, truncating);
 *   4. move the two distinct 2x2 results next to each other and narrow to
 *      bytes with unsigned saturation.
 *
 * The callers in this file pass uv_mask = 0x00FF in every 16-bit lane, i.e.
 * bytes 0 and 2 of each 32-bit element.  With that mask the four output
 * bytes are: byte0 and byte2 averages of the left 2x2 block, followed by the
 * same pair for the right 2x2 block.
 *
 * The additions are 64-bit lane adds; with the 0x00FF mask every 16-bit
 * partial sum stays below 1024, so no carry crosses a 16-bit lane.
 */
static inline void pack_chroma_1plane(uint8_t *uv_plane,
		uint32_t chroma_pos,
		const __m128i line1, const __m128i line2,
		const __m128i uv_mask)
{
	const __m128i top    = _mm_and_si128(line1, uv_mask);
	const __m128i bottom = _mm_and_si128(line2, uv_mask);
	const __m128i column_sum = _mm_add_epi64(top, bottom);

	/* swap 32-bit neighbours (0<->1, 2<->3) to sum horizontally */
	const __m128i neighbour =
		_mm_shuffle_epi32(column_sum, _MM_SHUFFLE(2, 3, 0, 1));
	__m128i result = _mm_add_epi64(column_sum, neighbour);

	result = _mm_srai_epi16(result, 2);
	/* gather elements 0 and 2 (the two distinct blocks) into the low half */
	result = _mm_shuffle_epi32(result, _MM_SHUFFLE(3, 1, 2, 0));
	result = _mm_packus_epi16(result, result);

	*(uint32_t*)(uv_plane + chroma_pos) = get_m128_32_0(result);
}

/*
 * Same 2x2 averaging as the single-plane variant, but the two components are
 * de-interleaved and written to separate planes.
 *
 * After averaging, the low four 16-bit lanes hold (with the 0x00FF-per-16-bit
 * mask the callers in this file pass) A0, B0, A1, B1, where A comes from
 * byte 0 and B from byte 2 of each 32-bit element, and 0/1 denote the left
 * and right 2x2 block.  The low-half 16-bit shuffle reorders them to
 * A0, A1, B0, B1 so that after narrowing to bytes the low 16 bits of the
 * result belong to u_plane and the next 16 bits to v_plane.
 *
 * u_plane / v_plane  destination plane bases
 * chroma_pos         byte offset applied to both planes; two bytes are
 *                    written to each plane as a single 16-bit store
 */
static inline void pack_chroma_2plane(uint8_t *u_plane, uint8_t *v_plane,
		uint32_t chroma_pos,
		const __m128i line1, const __m128i line2,
		const __m128i uv_mask)
{
	const __m128i top    = _mm_and_si128(line1, uv_mask);
	const __m128i bottom = _mm_and_si128(line2, uv_mask);
	const __m128i column_sum = _mm_add_epi64(top, bottom);

	/* swap 32-bit neighbours (0<->1, 2<->3) to sum horizontally */
	const __m128i neighbour =
		_mm_shuffle_epi32(column_sum, _MM_SHUFFLE(2, 3, 0, 1));
	__m128i result = _mm_add_epi64(column_sum, neighbour);
	uint32_t both;

	result = _mm_srai_epi16(result, 2);
	/* gather elements 0 and 2 (the two distinct blocks) into the low half */
	result = _mm_shuffle_epi32(result, _MM_SHUFFLE(3, 1, 2, 0));
	/* swap the middle two 16-bit lanes to group values per plane */
	result = _mm_shufflelo_epi16(result, _MM_SHUFFLE(3, 1, 2, 0));
	result = _mm_packus_epi16(result, result);

	both = get_m128_32_0(result);

	*(uint16_t*)(u_plane + chroma_pos) = (uint16_t)(both);
	*(uint16_t*)(v_plane + chroma_pos) = (uint16_t)(both >> 16);
}

/*
 * Converts rows [start_y, end_y) of a 4-bytes-per-pixel image into three
 * separate planes with 2x2 subsampled chroma.
 *
 * input_v    source image; each source row starts row_bytes after the
 *            previous one
 * width      pixels per row; processed four pixels (16 bytes) at a time
 * height     not used by this function
 * row_bytes  source row stride in bytes
 * start_y    first row to convert; rows are consumed in pairs (y, y+1)
 * end_y      one past the last row to convert
 * output     output[0] = luminance plane (stride: width bytes),
 *            output[1] / output[2] = chroma planes (stride: width/2 bytes)
 *
 * Source pixels are fetched with _mm_load_si128, the aligned-load intrinsic,
 * so every 16-byte group read from the source is expected to be 16-byte
 * aligned.  Luminance is taken from byte 1 of each pixel and chroma from
 * bytes 0 and 2, per the masks built below.
 */
void compress_uyvx_to_i420(const void *input_v, uint32_t width, uint32_t height,
		uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
		void **output)
{
	const uint8_t *src        = input_v;
	uint8_t       *plane_y    = output[0];
	uint8_t       *plane_u    = output[1];
	uint8_t       *plane_v    = output[2];
	const uint32_t half_width = width >> 1;
	const __m128i  y_mask     = _mm_set1_epi32(0x0000FF00);
	const __m128i  c_mask     = _mm_set1_epi16(0x00FF);
	uint32_t row;

	(void)height;

	for (row = start_y; row < end_y; row += 2) {
		const uint32_t src_row_off = row * row_bytes;
		const uint32_t c_row_off   = (row>>1) * half_width;
		const uint32_t y_row_off   = row * width;
		uint32_t col;

		for (col = 0; col < width; col += 4) {
			const uint8_t *top    = src + src_row_off + col*4;
			const uint8_t *bottom = top + row_bytes;
			const uint32_t y_top    = y_row_off + col;
			const uint32_t y_bottom = y_top + width;

			const __m128i px_top =
				_mm_load_si128((const __m128i*)top);
			const __m128i px_bottom =
				_mm_load_si128((const __m128i*)bottom);

			pack_lum(plane_y, y_top, y_bottom,
					px_top, px_bottom, y_mask);
			/* four source pixels yield two chroma samples per plane */
			pack_chroma_2plane(plane_u, plane_v,
					c_row_off + (col>>1),
					px_top, px_bottom, c_mask);
		}
	}
}

/*
 * Shared worker for the two-plane conversions: converts rows
 * [start_y, end_y) of a 4-bytes-per-pixel image into a luminance plane and
 * one interleaved, 2x2 subsampled chroma plane.
 *
 * input          source image
 * width          pixels per row; processed four pixels (16 bytes) at a time
 * height         not used by this function
 * pitch          source row stride in bytes
 * start_y/end_y  row range; rows are consumed in pairs (y, y+1)
 * row_bytes_out  stride in bytes of both output planes
 * output         output[0] = luminance plane, output[1] = chroma plane
 *
 * Source pixels are fetched with _mm_load_si128, the aligned-load intrinsic,
 * so every 16-byte group read from the source is expected to be 16-byte
 * aligned.
 */
static inline void _compress_uyvx_to_nv12(const uint8_t *input,
		uint32_t width, uint32_t height, uint32_t pitch,
		uint32_t start_y, uint32_t end_y, uint32_t row_bytes_out,
		void **output)
{
	uint8_t      *plane_y  = output[0];
	uint8_t      *plane_uv = output[1];
	const __m128i y_mask   = _mm_set1_epi32(0x0000FF00);
	const __m128i c_mask   = _mm_set1_epi16(0x00FF);
	uint32_t row;

	(void)height;

	for (row = start_y; row < end_y; row += 2) {
		const uint32_t src_row_off = row * pitch;
		const uint32_t uv_row_off  = (row>>1) * row_bytes_out;
		const uint32_t y_row_off   = row * row_bytes_out;
		uint32_t col;

		for (col = 0; col < width; col += 4) {
			const uint8_t *top    = input + src_row_off + col*4;
			const uint8_t *bottom = top + pitch;
			const uint32_t y_top    = y_row_off + col;
			const uint32_t y_bottom = y_top + row_bytes_out;

			const __m128i px_top =
				_mm_load_si128((const __m128i*)top);
			const __m128i px_bottom =
				_mm_load_si128((const __m128i*)bottom);

			pack_lum(plane_y, y_top, y_bottom,
					px_top, px_bottom, y_mask);
			/* four source pixels yield four interleaved chroma bytes */
			pack_chroma_1plane(plane_uv, uv_row_off + col,
					px_top, px_bottom, c_mask);
		}
	}
}

/*
 * Two-plane conversion with tightly packed output: forwards to the shared
 * worker using `width` as the output row stride.
 *
 * See compress_uyvx_to_nv12_aligned for a variant with an explicit output
 * stride.
 */
void compress_uyvx_to_nv12(const void *input, uint32_t width, uint32_t height,
		uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
		void **output)
{
	/* output rows are exactly `width` bytes apart */
	const uint32_t packed_stride = width;

	_compress_uyvx_to_nv12(input, width, height, row_bytes,
			start_y, end_y, packed_stride, output);
}

/*
 * Two-plane conversion with a caller-supplied output stride: forwards every
 * argument unchanged to the shared worker.
 *
 * row_bytes      source row stride in bytes
 * row_bytes_out  stride in bytes used for both output planes
 */
void compress_uyvx_to_nv12_aligned(const void *input,
		uint32_t width, uint32_t height, uint32_t row_bytes,
		uint32_t start_y, uint32_t end_y, uint32_t row_bytes_out,
		void **output)
{
	const uint32_t src_stride = row_bytes;
	const uint32_t dst_stride = row_bytes_out;

	_compress_uyvx_to_nv12(input, width, height, src_stride,
			start_y, end_y, dst_stride, output);
}

/*
 * Expands a three-plane image with 2x2 subsampled chroma into one 32-bit
 * value per pixel.
 *
 * input_v    source buffer: width*height luminance bytes, followed by the
 *            first chroma plane, followed width*height/4 bytes later by the
 *            second chroma plane
 * width      pixels per row
 * height     total rows; only used to locate the chroma planes
 * row_bytes  output row stride in bytes
 * start_y    first row to convert (halved: work is done per row pair)
 * end_y      one past the last row to convert (halved likewise)
 * output_v   destination, written as 32-bit values
 *
 * Every output value is  lum | (chroma0 << 8) | (chroma1 << 16); the four
 * pixels of a 2x2 block share one chroma pair.
 */
void decompress_420(const void *input_v, uint32_t width, uint32_t height,
		uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
		void *output_v)
{
	const uint8_t *src       = input_v;
	uint8_t       *dst       = output_v;
	const uint8_t *plane_c0  = src + width * height;
	const uint8_t *plane_c1  = plane_c0 + width * height / 4;
	const uint32_t half_w    = width/2;
	const uint32_t pair_end  = end_y/2;
	uint32_t pair;

	for (pair = start_y/2; pair < pair_end; pair++) {
		const uint8_t *c0_row = plane_c0 + pair * half_w;
		const uint8_t *c1_row = plane_c1 + pair * half_w;
		const uint8_t *y_top  = src + pair * 2*width;
		const uint8_t *y_bot  = y_top + width;
		uint32_t *out_top = (uint32_t*)(dst + pair * 2*row_bytes);
		uint32_t *out_bot = (uint32_t*)((uint8_t*)out_top + row_bytes);
		uint32_t i;

		for (i = 0; i < half_w; i++) {
			/* one chroma pair shared by a 2x2 block of pixels */
			const uint32_t chroma =
				(c0_row[i] << 8) | (c1_row[i] << 16);
			const uint32_t left  = 2*i;
			const uint32_t right = left + 1;

			out_top[left]  = y_top[left]  | chroma;
			out_top[right] = y_top[right] | chroma;

			out_bot[left]  = y_bot[left]  | chroma;
			out_bot[right] = y_bot[right] | chroma;
		}
	}
}

/*
 * Expands a two-plane image (luminance plane followed by one interleaved,
 * 2x2 subsampled chroma plane) into one 32-bit value per pixel.
 *
 * input_v    source buffer: width*height luminance bytes, followed by the
 *            chroma plane whose rows are `width` bytes long and hold
 *            alternating first/second chroma bytes
 * width      pixels per row
 * height     total rows; only used to locate the chroma plane
 * row_bytes  output row stride in bytes
 * start_y    first row to convert (halved: work is done per row pair)
 * end_y      one past the last row to convert (halved likewise)
 * output_v   destination, written as 32-bit values
 *
 * Every output value is  lum | (chroma_byte0 << 8) | (chroma_byte1 << 16),
 * the same layout decompress_420 produces.  The chroma bytes are read
 * individually rather than as one 16-bit load, so the result does not depend
 * on host byte order or on the chroma row being 2-byte aligned.
 */
void decompress_nv12(const void *input_v, uint32_t width, uint32_t height,
		uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
		void *output_v)
{
	uint8_t       *output = output_v;
	const uint8_t *input  = input_v;
	const uint8_t *input2 = input + width * height;

	uint32_t start_y_d2 = start_y/2;
	uint32_t width_d2   = width/2;
	uint32_t height_d2  = end_y/2;
	uint32_t y;

	for (y = start_y_d2; y < height_d2; y++) {
		const uint8_t *chroma = input2 + y * width;
		const uint8_t *lum0, *lum1;
		uint32_t *output0, *output1;
		uint32_t x;

		lum0 = input + y * 2*width;
		lum1 = lum0 + width;
		output0 = (uint32_t*)(output + y * 2*row_bytes);
		output1 = (uint32_t*)((uint8_t*)output0 + row_bytes);

		for (x = 0; x < width_d2; x++) {
			/* one interleaved chroma pair shared by a 2x2 block */
			uint32_t out = ((uint32_t)chroma[0] << 8) |
			               ((uint32_t)chroma[1] << 16);
			chroma += 2;

			*(output0++) = *(lum0++) | out;
			*(output0++) = *(lum0++) | out;

			*(output1++) = *(lum1++) | out;
			*(output1++) = *(lum1++) | out;
		}
	}
}

/*
 * Expands a packed 2-bytes-per-pixel image (one 32-bit word per pixel pair)
 * into one 32-bit value per pixel, for rows [start_y, end_y).
 *
 * input_v      source image; rows are width*2 bytes apart
 * width        pixels per row (width/2 source words are read per row)
 * height       not used by this function
 * row_bytes    output row stride in bytes
 * output_v     destination, written as 32-bit values
 * leading_lum  selects which bytes of the source word are per-pixel:
 *                true  - bytes 0 and 2 (byte 2 is copied into byte 0 for
 *                        the second pixel)
 *                false - bytes 1 and 3 (byte 3 is copied into byte 1 for
 *                        the second pixel)
 *
 * The first pixel of each pair is the source word unchanged; the second is
 * the same word with its per-pixel byte replaced as described above.
 */
void decompress_422(const void *input_v, uint32_t width, uint32_t height,
		uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
		void *output_v, bool leading_lum)
{
	const uint8_t *src = input_v;
	uint8_t       *dst = output_v;

	const uint32_t words_per_row = width >> 1;
	const uint32_t src_stride    = width * 2;

	/* byte replaced in the second pixel, and everything that is kept */
	const uint32_t replace_mask = leading_lum ? 0x000000FF : 0x0000FF00;
	const uint32_t keep_mask    = leading_lum ? 0xFFFFFF00 : 0xFFFF00FF;
	uint32_t row;

	(void)height;

	for (row = start_y; row < end_y; row++) {
		const uint32_t *in  = (const uint32_t*)(src + row*src_stride);
		uint32_t       *out = (uint32_t*)(dst + row*row_bytes);
		uint32_t i;

		for (i = 0; i < words_per_row; i++) {
			const uint32_t first  = in[i];
			const uint32_t second = (first & keep_mask) |
			                        ((first >> 16) & replace_mask);

			out[2*i]     = first;
			out[2*i + 1] = second;
		}
	}
}
add format conversion functions 2013-10-25 10:23:11 -07:00			`/******************************************************************************`
			`Copyright (C) 2013 by Hugh Bailey <obs.jim@gmail.com>`

			`This program is free software: you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
With the permission of my fellow contributors, I'm switching obs-studio back to GPL v2+ to prevent issues between this project and the original OBS project, and for personal reasons to avoid legal ambiguity (not political reasons, I admittedly would prefer GPL v3+) 2013-12-02 21:24:38 -08:00			`the Free Software Foundation, either version 2 of the License, or`
add format conversion functions 2013-10-25 10:23:11 -07:00			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`******************************************************************************/`

			`#include "format-conversion.h"`
			`#include <xmmintrin.h>`
			`#include <emmintrin.h>`

			`static inline uint32_t get_m128_32_0(const __m128i val)`
			`{`
			`return (uint32_t const)&val;`
			`}`

			`static inline uint32_t get_m128_32_1(const __m128i val)`
			`{`
			`return (((uint32_t const)&val)+1);`
			`}`

			`static inline void pack_lum(uint8_t *lum_plane,`
			`uint32_t lum_pos0, uint32_t lum_pos1,`
			`const __m128i line1, const __m128i line2,`
			`const __m128i lum_mask)`
			`{`
			`__m128i pack_val = _mm_packs_epi32(`
			`_mm_srli_si128(_mm_and_si128(line1, lum_mask), 1),`
			`_mm_srli_si128(_mm_and_si128(line2, lum_mask), 1));`
			`pack_val = _mm_packus_epi16(pack_val, pack_val);`

			`(uint32_t)(lum_plane+lum_pos0) = get_m128_32_0(pack_val);`
			`(uint32_t)(lum_plane+lum_pos1) = get_m128_32_1(pack_val);`
			`}`

			`static inline void pack_chroma_1plane(uint8_t *uv_plane,`
			`uint32_t chroma_pos,`
			`const __m128i line1, const __m128i line2,`
			`const __m128i uv_mask)`
			`{`
			`__m128i add_val = _mm_add_epi64(`
			`_mm_and_si128(line1, uv_mask),`
			`_mm_and_si128(line2, uv_mask));`
			`__m128i avg_val = _mm_add_epi64(`
			`add_val,`
			`_mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1)));`
			`avg_val = _mm_srai_epi16(avg_val, 2);`
			`avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0));`
			`avg_val = _mm_packus_epi16(avg_val, avg_val);`

			`(uint32_t)(uv_plane+chroma_pos) = get_m128_32_0(avg_val);`
			`}`

			`static inline void pack_chroma_2plane(uint8_t u_plane, uint8_t v_plane,`
			`uint32_t chroma_pos,`
			`const __m128i line1, const __m128i line2,`
			`const __m128i uv_mask)`
			`{`
			`uint32_t packed_vals;`

			`__m128i add_val = _mm_add_epi64(`
			`_mm_and_si128(line1, uv_mask),`
			`_mm_and_si128(line2, uv_mask));`
			`__m128i avg_val = _mm_add_epi64(`
			`add_val,`
			`_mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1)));`
			`avg_val = _mm_srai_epi16(avg_val, 2);`
			`avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0));`
			`avg_val = _mm_shufflelo_epi16(avg_val, _MM_SHUFFLE(3, 1, 2, 0));`
			`avg_val = _mm_packus_epi16(avg_val, avg_val);`

			`packed_vals = get_m128_32_0(avg_val);`

			`(uint16_t)(u_plane+chroma_pos) = (uint16_t)(packed_vals);`
			`(uint16_t)(v_plane+chroma_pos) = (uint16_t)(packed_vals>>16);`
			`}`

			`void compress_uyvx_to_i420(const void *input_v, uint32_t width, uint32_t height,`
			`uint32_t row_bytes, uint32_t start_y, uint32_t end_y,`
			`void **output)`
			`{`
			`const uint8_t *input = input_v;`
			`uint8_t *lum_plane = output[0];`
			`uint8_t *u_plane = output[1];`
			`uint8_t *v_plane = output[2];`
			`uint32_t chroma_pitch = width >> 1;`
			`uint32_t y;`

			`__m128i lum_mask = _mm_set1_epi32(0x0000FF00);`
			`__m128i uv_mask = _mm_set1_epi16(0x00FF);`

			`for (y = start_y; y < end_y; y += 2) {`
			`uint32_t y_pos = y * row_bytes;`
			`uint32_t chroma_y_pos = (y>>1) * chroma_pitch;`
			`uint32_t lum_y_pos = y * width;`
			`uint32_t x;`

			`for (x = 0; x < width; x += 4) {`
			`const uint8_t img = input + y_pos + x4;`
			`uint32_t lum_pos0 = lum_y_pos + x;`
			`uint32_t lum_pos1 = lum_pos0 + width;`

			`__m128i line1 = _mm_load_si128((const __m128i*)img);`
			`__m128i line2 = _mm_load_si128(`
			`(const __m128i*)(img + row_bytes));`

			`pack_lum(lum_plane, lum_pos0, lum_pos1,`
			`line1, line2, lum_mask);`
			`pack_chroma_2plane(u_plane, v_plane,`
			`chroma_y_pos + (x>>1),`
			`line1, line2, uv_mask);`
			`}`
			`}`
			`}`

			`static inline void _compress_uyvx_to_nv12(const uint8_t *input,`
			`uint32_t width, uint32_t height, uint32_t pitch,`
			`uint32_t start_y, uint32_t end_y, uint32_t row_bytes_out,`
			`void **output)`
			`{`
			`uint8_t *lum_plane = output[0];`
			`uint8_t *chroma_plane = output[1];`
			`uint32_t y;`

			`__m128i lum_mask = _mm_set1_epi32(0x0000FF00);`
			`__m128i uv_mask = _mm_set1_epi16(0x00FF);`

			`for (y = start_y; y < end_y; y += 2) {`
			`uint32_t y_pos = y * pitch;`
			`uint32_t chroma_y_pos = (y>>1) * row_bytes_out;`
			`uint32_t lum_y_pos = y * row_bytes_out;`
			`uint32_t x;`

			`for (x = 0; x < width; x += 4) {`
			`const uint8_t img = input + y_pos + x4;`
			`uint32_t lum_pos0 = lum_y_pos + x;`
			`uint32_t lum_pos1 = lum_pos0 + row_bytes_out;`

			`__m128i line1 = _mm_load_si128((const __m128i*)img);`
			`__m128i line2 = _mm_load_si128(`
			`(const __m128i*)(img + pitch));`

			`pack_lum(lum_plane, lum_pos0, lum_pos1,`
			`line1, line2, lum_mask);`
			`pack_chroma_1plane(chroma_plane, chroma_y_pos + x,`
			`line1, line2, uv_mask);`
			`}`
			`}`
			`}`

			`void compress_uyvx_to_nv12(const void *input, uint32_t width, uint32_t height,`
			`uint32_t row_bytes, uint32_t start_y, uint32_t end_y,`
			`void **output)`
			`{`
			`_compress_uyvx_to_nv12(input, width, height, row_bytes,`
			`start_y, end_y, width, output);`
			`}`

			`void compress_uyvx_to_nv12_aligned(const void *input,`
			`uint32_t width, uint32_t height, uint32_t row_bytes,`
			`uint32_t start_y, uint32_t end_y, uint32_t row_bytes_out,`
			`void **output)`
			`{`
			`_compress_uyvx_to_nv12(input, width, height, row_bytes,`
			`start_y, end_y, row_bytes_out, output);`
			`}`

fix function definition names for two of the conversion functions 2013-10-25 10:30:50 -07:00			`void decompress_420(const void *input_v, uint32_t width, uint32_t height,`
add format conversion functions 2013-10-25 10:23:11 -07:00			`uint32_t row_bytes, uint32_t start_y, uint32_t end_y,`
			`void *output_v)`
			`{`
			`uint8_t *output = output_v;`
			`const uint8_t *input = input_v;`
			`const uint8_t input2 = input + width height;`
			`const uint8_t input3 = input2 + width height / 4;`

			`uint32_t start_y_d2 = start_y/2;`
			`uint32_t width_d2 = width/2;`
			`uint32_t height_d2 = end_y/2;`
			`uint32_t y;`

			`for (y = start_y_d2; y < height_d2; y++) {`
			`const uint8_t chroma0 = input2 + y width_d2;`
			`const uint8_t chroma1 = input3 + y width_d2;`
			`register const uint8_t lum0, lum1;`
			`register uint32_t output0, output1;`
			`uint32_t x;`

			`lum0 = input + y * 2*width;`
			`lum1 = lum0 + width;`
			`output0 = (uint32_t)(output + y 2*row_bytes);`
			`output1 = (uint32_t)((uint8_t)output0 + row_bytes);`

			`for (x = 0; x < width_d2; x++) {`
			`uint32_t out;`
			`out = ((chroma0++) << 8) \| ((chroma1++) << 16);`

			`(output0++) = (lum0++) \| out;`
			`(output0++) = (lum0++) \| out;`

			`(output1++) = (lum1++) \| out;`
			`(output1++) = (lum1++) \| out;`
			`}`
			`}`
			`}`

fill out the rest of the source video frame functions, added nv12 decompression function, and cleaned up the design of the source video frame stuff 2013-10-26 14:32:06 -07:00			`void decompress_nv12(const void *input_v, uint32_t width, uint32_t height,`
			`uint32_t row_bytes, uint32_t start_y, uint32_t end_y,`
			`void *output_v)`
			`{`
			`uint8_t *output = output_v;`
			`const uint8_t *input = input_v;`
			`const uint8_t input2 = input + width height;`

			`uint32_t start_y_d2 = start_y/2;`
			`uint32_t width_d2 = width/2;`
			`uint32_t height_d2 = end_y/2;`
			`uint32_t y;`

			`for (y = start_y_d2; y < height_d2; y++) {`
			`const uint16_t chroma = (uint16_t)(input2 + y * width);`
			`register const uint8_t lum0, lum1;`
			`register uint32_t output0, output1;`
			`uint32_t x;`

			`lum0 = input + y * 2*width;`
			`lum1 = lum0 + width;`
			`output0 = (uint32_t)(output + y 2*row_bytes);`
			`output1 = (uint32_t)((uint8_t)output0 + row_bytes);`

			`for (x = 0; x < width_d2; x++) {`
			`uint32_t out = *(chroma++) << 8;`

			`(output0++) = (lum0++) \| out;`
			`(output0++) = (lum0++) \| out;`

			`(output1++) = (lum1++) \| out;`
			`(output1++) = (lum1++) \| out;`
			`}`
			`}`
			`}`

fix function definition names for two of the conversion functions 2013-10-25 10:30:50 -07:00			`void decompress_422(const void *input_v, uint32_t width, uint32_t height,`
add format conversion functions 2013-10-25 10:23:11 -07:00			`uint32_t row_bytes, uint32_t start_y, uint32_t end_y,`
			`void *output_v, bool leading_lum)`
			`{`
			`const uint8_t *input = input_v;`
			`uint8_t *output = output_v;`

fill out the rest of the source video frame functions, added nv12 decompression function, and cleaned up the design of the source video frame stuff 2013-10-26 14:32:06 -07:00			`uint32_t width_d2 = width >> 1;`
			`uint32_t line_size = width * 2;`
add format conversion functions 2013-10-25 10:23:11 -07:00			`uint32_t y;`

			`register const uint32_t *input32;`
			`register const uint32_t *input32_end;`
			`register uint32_t *output32;`

			`if (leading_lum) {`
fill out the rest of the source video frame functions, added nv12 decompression function, and cleaned up the design of the source video frame stuff 2013-10-26 14:32:06 -07:00			`for (y = start_y; y < end_y; y++) {`
add format conversion functions 2013-10-25 10:23:11 -07:00			`input32 = (uint32_t)(input + yline_size);`
			`input32_end = input32 + width_d2;`
			`output32 = (uint32_t)(output + yrow_bytes);`

			`while(input32 < input32_end) {`
			`register uint32_t dw = *input32;`

			`output32[0] = dw;`
			`dw &= 0xFFFFFF00;`
			`dw \|= (uint8_t)(dw>>16);`
			`output32[1] = dw;`

			`output32 += 2;`
			`input32++;`
			`}`
			`}`
			`} else {`
fill out the rest of the source video frame functions, added nv12 decompression function, and cleaned up the design of the source video frame stuff 2013-10-26 14:32:06 -07:00			`for (y = start_y; y < end_y; y++) {`
add format conversion functions 2013-10-25 10:23:11 -07:00			`input32 = (uint32_t)(input + yline_size);`
			`input32_end = input32 + width_d2;`
			`output32 = (uint32_t)(output + yrow_bytes);`

			`while (input32 < input32_end) {`
			`register uint32_t dw = *input32;`

			`output32[0] = dw;`
			`dw &= 0xFFFF00FF;`
			`dw \|= (dw>>16) & 0xFF00;`
			`output32[1] = dw;`

			`output32 += 2;`
			`input32++;`
			`}`
			`}`
			`}`
			`}`