2013-10-25 10:23:11 -07:00
|
|
|
/******************************************************************************
    Copyright (C) 2013 by Hugh Bailey <obs.jim@gmail.com>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
|
|
|
|
|
|
|
|
#include "format-conversion.h"
|
|
|
|
#include <xmmintrin.h>
|
|
|
|
#include <emmintrin.h>
|
|
|
|
|
2014-02-09 06:59:00 -08:00
|
|
|
/*
 * Returns the lowest 32-bit lane (bits 0..31) of an SSE2 integer register.
 *
 * The value is extracted with the dedicated move intrinsic rather than by
 * reinterpreting the address of the vector as a uint32_t pointer, so the
 * register never has to be read back through a type-punned pointer.
 */
static FORCE_INLINE uint32_t get_m128_32_0(const __m128i val)
{
	return (uint32_t)_mm_cvtsi128_si32(val);
}
|
|
|
|
|
2014-02-09 06:59:00 -08:00
|
|
|
/*
 * Returns the second 32-bit lane (bits 32..63) of an SSE2 integer register.
 *
 * The register is shifted right by four bytes so that lane 1 lands in lane 0,
 * which is then moved out with the dedicated intrinsic; no pointer
 * reinterpretation of the vector is involved.
 */
static FORCE_INLINE uint32_t get_m128_32_1(const __m128i val)
{
	return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(val, 4));
}
|
|
|
|
|
2014-02-09 06:59:00 -08:00
|
|
|
/*
 * Extracts one masked byte from every 32-bit lane of two 128-bit rows and
 * stores them as two groups of four bytes.
 *
 * lum_plane  destination byte plane
 * lum_pos0   byte offset in lum_plane receiving the four bytes of line1
 * lum_pos1   byte offset in lum_plane receiving the four bytes of line2
 * line1      first source row (four 32-bit lanes)
 * line2      second source row (four 32-bit lanes)
 * lum_mask   per-lane mask; the callers in this file pass 0x0000FF00 in
 *            every lane, i.e. the second byte of each lane
 *
 * Each destination is written as a single 32-bit store.
 */
static FORCE_INLINE void pack_lum(uint8_t *lum_plane,
		uint32_t lum_pos0, uint32_t lum_pos1,
		const __m128i line1, const __m128i line2,
		const __m128i lum_mask)
{
	/* isolate the masked byte and shift the register down by one byte */
	const __m128i row0 = _mm_srli_si128(_mm_and_si128(line1, lum_mask), 1);
	const __m128i row1 = _mm_srli_si128(_mm_and_si128(line2, lum_mask), 1);

	/* 32 -> 16 bit: lanes 0..3 come from row0, lanes 4..7 from row1 */
	const __m128i as_words = _mm_packs_epi32(row0, row1);

	/* 16 -> 8 bit: bytes 0..3 come from row0, bytes 4..7 from row1 */
	const __m128i as_bytes = _mm_packus_epi16(as_words, as_words);

	uint32_t *dst0 = (uint32_t*)(lum_plane + lum_pos0);
	uint32_t *dst1 = (uint32_t*)(lum_plane + lum_pos1);

	*dst0 = get_m128_32_0(as_bytes);
	*dst1 = get_m128_32_1(as_bytes);
}
|
|
|
|
|
2014-02-09 06:59:00 -08:00
|
|
|
/*
 * Averages the masked 16-bit lanes of a 2-row by 4-lane block over each 2x2
 * group and stores the result as four interleaved bytes.
 *
 * uv_plane    destination byte plane
 * chroma_pos  byte offset in uv_plane that receives the four result bytes
 * line1       first source row (four 32-bit lanes)
 * line2       second source row (four 32-bit lanes)
 * uv_mask     per-16-bit mask; the caller in this file passes 0x00FF, which
 *             keeps bytes 0 and 2 of every 32-bit lane
 *
 * Output byte order: byte0 and byte2 averages of lanes 0-1, followed by
 * byte0 and byte2 averages of lanes 2-3.  The average is the sum of four
 * samples shifted right by two (truncating).
 */
static FORCE_INLINE void pack_chroma_1plane(uint8_t *uv_plane,
		uint32_t chroma_pos,
		const __m128i line1, const __m128i line2,
		const __m128i uv_mask)
{
	const __m128i top    = _mm_and_si128(line1, uv_mask);
	const __m128i bottom = _mm_and_si128(line2, uv_mask);

	/* vertical sum, then add the horizontally neighbouring 32-bit lane */
	const __m128i vsum    = _mm_add_epi64(top, bottom);
	const __m128i swapped = _mm_shuffle_epi32(vsum, _MM_SHUFFLE(2, 3, 0, 1));
	__m128i result        = _mm_add_epi64(vsum, swapped);

	/* divide the four-sample sums by four */
	result = _mm_srai_epi16(result, 2);

	/* bring lanes 0 and 2 (one per 2x2 group) into the low 64 bits */
	result = _mm_shuffle_epi32(result, _MM_SHUFFLE(3, 1, 2, 0));
	result = _mm_packus_epi16(result, result);

	*(uint32_t*)(uv_plane + chroma_pos) = get_m128_32_0(result);
}
|
|
|
|
|
2014-02-09 06:59:00 -08:00
|
|
|
/*
 * Averages the masked 16-bit lanes of a 2-row by 4-lane block over each 2x2
 * group and stores the results into two separate planes, two bytes each.
 *
 * u_plane     plane receiving the averages derived from byte 0 of each lane
 * v_plane     plane receiving the averages derived from byte 2 of each lane
 * chroma_pos  byte offset used in both planes
 * line1       first source row (four 32-bit lanes)
 * line2       second source row (four 32-bit lanes)
 * uv_mask     per-16-bit mask; the caller in this file passes 0x00FF, which
 *             keeps bytes 0 and 2 of every 32-bit lane
 *
 * The average is the sum of four samples shifted right by two (truncating).
 * Each plane is written with one 16-bit store.
 */
static FORCE_INLINE void pack_chroma_2plane(uint8_t *u_plane, uint8_t *v_plane,
		uint32_t chroma_pos,
		const __m128i line1, const __m128i line2,
		const __m128i uv_mask)
{
	const __m128i top    = _mm_and_si128(line1, uv_mask);
	const __m128i bottom = _mm_and_si128(line2, uv_mask);
	const __m128i vsum   = _mm_add_epi64(top, bottom);
	uint16_t *u_dst      = (uint16_t*)(u_plane + chroma_pos);
	uint16_t *v_dst      = (uint16_t*)(v_plane + chroma_pos);
	uint32_t both;

	/* add the horizontally neighbouring lane, then divide by four */
	__m128i result = _mm_add_epi64(vsum,
			_mm_shuffle_epi32(vsum, _MM_SHUFFLE(2, 3, 0, 1)));
	result = _mm_srai_epi16(result, 2);

	/* gather one lane per 2x2 group into the low 64 bits ... */
	result = _mm_shuffle_epi32(result, _MM_SHUFFLE(3, 1, 2, 0));
	/* ... and de-interleave so both byte-0 words precede both byte-2 words */
	result = _mm_shufflelo_epi16(result, _MM_SHUFFLE(3, 1, 2, 0));
	result = _mm_packus_epi16(result, result);

	both = get_m128_32_0(result);

	*u_dst = (uint16_t)(both);
	*v_dst = (uint16_t)(both >> 16);
}
|
|
|
|
|
2014-02-07 02:03:54 -08:00
|
|
|
/*
 * Converts rows [start_y, end_y) of a packed 4-bytes-per-pixel image into
 * three planes: a full-resolution plane built from byte 1 of every pixel and
 * two half-resolution planes built from the 2x2 averages of byte 0 and
 * byte 2 respectively.
 *
 * input         packed source image
 * in_linesize   source row stride in bytes
 * width         pixels per row; processed four pixels at a time
 * height        not used by this function
 * start_y       first source row to convert; rows are consumed in pairs
 * end_y         one past the last source row to convert
 * output        output[0] = byte-1 plane, output[1] = byte-0 averages,
 *               output[2] = byte-2 averages
 * out_linesize  out_linesize[0] is the stride of output[0]; out_linesize[1]
 *               is used as the stride of both output[1] and output[2]
 *
 * Source rows are read with _mm_load_si128, which requires every 16-byte
 * group to start on a 16-byte-aligned address.
 */
void compress_uyvx_to_i420(
		const uint8_t *input, uint32_t in_linesize,
		uint32_t width, uint32_t height,
		uint32_t start_y, uint32_t end_y,
		uint8_t *output[], const uint32_t out_linesize[])
{
	uint8_t *const y_out = output[0];
	uint8_t *const u_out = output[1];
	uint8_t *const v_out = output[2];
	const __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
	const __m128i uv_mask  = _mm_set1_epi16(0x00FF);
	uint32_t row;

	for (row = start_y; row < end_y; row += 2) {
		const uint8_t *src_top    = input + row * in_linesize;
		const uint8_t *src_bottom = src_top + in_linesize;
		const uint32_t lum_top    = row * out_linesize[0];
		const uint32_t lum_bottom = lum_top + out_linesize[0];
		const uint32_t chroma_row = (row >> 1) * out_linesize[1];
		uint32_t col;

		for (col = 0; col < width; col += 4) {
			/* four pixels (16 bytes) from each of the two rows */
			const __m128i top = _mm_load_si128(
					(const __m128i*)(src_top + col*4));
			const __m128i bottom = _mm_load_si128(
					(const __m128i*)(src_bottom + col*4));

			pack_lum(y_out, lum_top + col, lum_bottom + col,
					top, bottom, lum_mask);

			/* two averaged samples per plane for four pixels */
			pack_chroma_2plane(u_out, v_out,
					chroma_row + (col >> 1),
					top, bottom, uv_mask);
		}
	}
}
|
|
|
|
|
2014-02-07 02:03:54 -08:00
|
|
|
/*
 * Converts rows [start_y, end_y) of a packed 4-bytes-per-pixel image into
 * two planes: a full-resolution plane built from byte 1 of every pixel and
 * one half-resolution plane holding interleaved 2x2 averages of byte 0 and
 * byte 2.
 *
 * input         packed source image
 * in_linesize   source row stride in bytes
 * width         pixels per row; processed four pixels at a time
 * height        not used by this function
 * start_y       first source row to convert; rows are consumed in pairs
 * end_y         one past the last source row to convert
 * output        output[0] = byte-1 plane, output[1] = interleaved averages
 * out_linesize  strides in bytes of output[0] and output[1]
 *
 * Source rows are read with _mm_load_si128, which requires every 16-byte
 * group to start on a 16-byte-aligned address.
 */
void compress_uyvx_to_nv12(
		const uint8_t *input, uint32_t in_linesize,
		uint32_t width, uint32_t height,
		uint32_t start_y, uint32_t end_y,
		uint8_t *output[], const uint32_t out_linesize[])
{
	uint8_t *const y_out  = output[0];
	uint8_t *const uv_out = output[1];
	const __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
	const __m128i uv_mask  = _mm_set1_epi16(0x00FF);
	uint32_t row;

	for (row = start_y; row < end_y; row += 2) {
		const uint8_t *src_top    = input + row * in_linesize;
		const uint8_t *src_bottom = src_top + in_linesize;
		const uint32_t lum_top    = row * out_linesize[0];
		const uint32_t lum_bottom = lum_top + out_linesize[0];
		const uint32_t chroma_row = (row >> 1) * out_linesize[1];
		uint32_t col;

		for (col = 0; col < width; col += 4) {
			/* four pixels (16 bytes) from each of the two rows */
			const __m128i top = _mm_load_si128(
					(const __m128i*)(src_top + col*4));
			const __m128i bottom = _mm_load_si128(
					(const __m128i*)(src_bottom + col*4));

			pack_lum(y_out, lum_top + col, lum_bottom + col,
					top, bottom, lum_mask);

			/* four interleaved bytes for four pixels */
			pack_chroma_1plane(uv_out, chroma_row + col,
					top, bottom, uv_mask);
		}
	}
}
|
|
|
|
|
2014-02-07 02:03:54 -08:00
|
|
|
/*
 * Expands a three-plane 4:2:0 image into packed 32-bit pixels.
 *
 * Every output pixel is written as one uint32_t holding
 *     luma | (input[1] sample << 8) | (input[2] sample << 16)
 * where each sample of input[1]/input[2] is shared by a 2x2 block of pixels.
 *
 * input        input[0] = full-resolution plane, input[1] and input[2] =
 *              half-resolution planes
 * in_linesize  row stride in bytes of each input plane
 * width        pixels per row; width/2 pixel pairs are produced per row
 * height       not used by this function
 * start_y      first row to convert; halved, so rows are handled in pairs
 * end_y        one past the last row to convert; halved as well
 * output       packed destination image, accessed through uint32_t stores
 * out_linesize destination row stride in bytes
 *
 * Luma rows are stepped by in_linesize[0] and destination rows by
 * out_linesize, so padded source planes and destination strides that differ
 * from the source stride are handled correctly.
 */
void decompress_420(
		const uint8_t *const input[], const uint32_t in_linesize[],
		uint32_t width, uint32_t height,
		uint32_t start_y, uint32_t end_y,
		uint8_t *output, uint32_t out_linesize)
{
	uint32_t start_y_d2 = start_y/2;
	uint32_t width_d2   = width/2;
	uint32_t height_d2  = end_y/2;
	uint32_t y;

	(void)height;

	for (y = start_y_d2; y < height_d2; y++) {
		const uint8_t *chroma0 = input[1] + y * in_linesize[1];
		const uint8_t *chroma1 = input[2] + y * in_linesize[2];
		const uint8_t *lum0, *lum1;
		uint32_t *output0, *output1;
		uint32_t x;

		/* each chroma row covers two consecutive luma/output rows */
		lum0    = input[0] + y*2 * in_linesize[0];
		lum1    = lum0 + in_linesize[0];
		output0 = (uint32_t*)(output + y*2 * out_linesize);
		output1 = (uint32_t*)((uint8_t*)output0 + out_linesize);

		for (x = 0; x < width_d2; x++) {
			/* one chroma pair is shared by a 2x2 block of pixels */
			uint32_t out = ((uint32_t)*(chroma0++) << 8) |
			               ((uint32_t)*(chroma1++) << 16);

			*(output0++) = *(lum0++) | out;
			*(output0++) = *(lum0++) | out;

			*(output1++) = *(lum1++) | out;
			*(output1++) = *(lum1++) | out;
		}
	}
}
|
|
|
|
|
2014-02-07 02:03:54 -08:00
|
|
|
/*
 * Expands a two-plane 4:2:0 image (full-resolution plane plus one plane of
 * interleaved byte pairs) into packed 32-bit pixels.
 *
 * Every output pixel is written as one uint32_t holding
 *     luma | (first byte of the pair << 8) | (second byte of the pair << 16)
 * where each byte pair of input[1] is shared by a 2x2 block of pixels.
 *
 * input        input[0] = full-resolution plane, input[1] = interleaved
 *              half-resolution plane
 * in_linesize  row stride in bytes of each input plane
 * width        pixels per row; width/2 pixel pairs are produced per row
 * height       not used by this function
 * start_y      first row to convert; halved, so rows are handled in pairs
 * end_y        one past the last row to convert; halved as well
 * output       packed destination image, accessed through uint32_t stores
 * out_linesize destination row stride in bytes
 *
 * The interleaved plane is read byte by byte instead of through a uint16_t
 * pointer, so it has no alignment requirement and the result does not depend
 * on host byte order.
 */
void decompress_nv12(
		const uint8_t *const input[], const uint32_t in_linesize[],
		uint32_t width, uint32_t height,
		uint32_t start_y, uint32_t end_y,
		uint8_t *output, uint32_t out_linesize)
{
	uint32_t start_y_d2 = start_y/2;
	uint32_t width_d2   = width/2;
	uint32_t height_d2  = end_y/2;
	uint32_t y;

	(void)height;

	for (y = start_y_d2; y < height_d2; y++) {
		const uint8_t *chroma;
		const uint8_t *lum0, *lum1;
		uint32_t *output0, *output1;
		uint32_t x;

		/* each chroma row covers two consecutive luma/output rows */
		chroma  = input[1] + y * in_linesize[1];
		lum0    = input[0] + y*2 * in_linesize[0];
		lum1    = lum0 + in_linesize[0];
		output0 = (uint32_t*)(output + y*2 * out_linesize);
		output1 = (uint32_t*)((uint8_t*)output0 + out_linesize);

		for (x = 0; x < width_d2; x++) {
			/* one byte pair is shared by a 2x2 block of pixels */
			uint32_t out = ((uint32_t)chroma[0] << 8) |
			               ((uint32_t)chroma[1] << 16);
			chroma += 2;

			*(output0++) = *(lum0++) | out;
			*(output0++) = *(lum0++) | out;

			*(output1++) = *(lum1++) | out;
			*(output1++) = *(lum1++) | out;
		}
	}
}
|
|
|
|
|
2014-02-07 02:03:54 -08:00
|
|
|
/*
 * Expands a packed image that stores two pixels per 32-bit word into one
 * 32-bit word per pixel, for rows [start_y, end_y).
 *
 * For every source word two words are written: the source word unchanged,
 * followed by a copy in which one byte has been replaced:
 *   - leading_lum true:  byte 0 is replaced with byte 2 of the source word
 *   - leading_lum false: byte 1 is replaced with byte 3 of the source word
 *
 * input        packed source image, read through uint32_t loads
 * in_linesize  source row stride in bytes
 * width        pixels per row; width/2 source words are read per row
 * height       not used by this function
 * start_y      first row to convert
 * end_y        one past the last row to convert
 * output       destination image, written through uint32_t stores
 * out_linesize destination row stride in bytes
 * leading_lum  selects which byte is replaced (see above)
 */
void decompress_422(
		const uint8_t *input, uint32_t in_linesize,
		uint32_t width, uint32_t height,
		uint32_t start_y, uint32_t end_y,
		uint8_t *output, uint32_t out_linesize,
		bool leading_lum)
{
	const uint32_t words_per_row = width >> 1;
	uint32_t row;

	for (row = start_y; row < end_y; row++) {
		const uint32_t *src =
			(const uint32_t*)(input + row*in_linesize);
		uint32_t *dst = (uint32_t*)(output + row*out_linesize);
		uint32_t i;

		for (i = 0; i < words_per_row; i++) {
			const uint32_t first = src[i];
			uint32_t second;

			if (leading_lum) {
				/* move byte 2 into byte 0 */
				second = (first & 0xFFFFFF00) |
				         ((first >> 16) & 0x000000FF);
			} else {
				/* move byte 3 into byte 1 */
				second = (first & 0xFFFF00FF) |
				         ((first >> 16) & 0x0000FF00);
			}

			dst[2*i]     = first;
			dst[2*i + 1] = second;
		}
	}
}
|